Example #1
def get_variable_initializer(hparams):
  """Get variable initializer from hparams."""
  if not hparams.initializer:
    return None

  mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_INITIALIZER_GAIN,
                               value=hparams.initializer_gain,
                               hparams=hparams)

  if not tf.contrib.eager.in_eager_mode():
    tf.logging.info("Using variable initializer: %s", hparams.initializer)
  if hparams.initializer == "orthogonal":
    return tf.orthogonal_initializer(gain=hparams.initializer_gain)
  elif hparams.initializer == "uniform":
    max_val = 0.1 * hparams.initializer_gain
    return tf.random_uniform_initializer(-max_val, max_val)
  elif hparams.initializer == "normal_unit_scaling":
    return tf.variance_scaling_initializer(
        hparams.initializer_gain, mode="fan_avg", distribution="normal")
  elif hparams.initializer == "uniform_unit_scaling":
    return tf.variance_scaling_initializer(
        hparams.initializer_gain, mode="fan_avg", distribution="uniform")
  elif hparams.initializer == "xavier":
    return tf.contrib.layers.xavier_initializer()
  else:
    raise ValueError("Unrecognized initializer: %s" % hparams.initializer)
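
The dispatch above is driven entirely by two hparams fields. Below is a minimal, framework-free sketch of the same pattern, with types.SimpleNamespace standing in for the real HParams object and make_uniform as an illustrative helper (not a tensor2tensor function).

import types

import numpy as np


def make_uniform(gain, shape, seed=0):
  # Mirrors the "uniform" branch above: values in [-0.1 * gain, 0.1 * gain].
  max_val = 0.1 * gain
  rng = np.random.default_rng(seed)
  return rng.uniform(-max_val, max_val, size=shape)


hparams = types.SimpleNamespace(initializer="uniform", initializer_gain=1.0)
if hparams.initializer == "uniform":
  weights = make_uniform(hparams.initializer_gain, shape=(4, 8))
  print(weights.shape, weights.min() >= -0.1, weights.max() <= 0.1)
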
Example #2
  def train(self, max_steps=None):
    mlperf_log.transformer_print(key=mlperf_log.TRAIN_LOOP)
    mlperf_log.transformer_print(key=mlperf_log.TRAIN_EPOCH, value=0)
    self._estimator.train(
        self._train_spec.input_fn,
        hooks=self._train_spec.hooks,
        max_steps=max_steps or self._train_spec.max_steps)
def generate_files(generator, output_filenames,
                   max_cases=None, cycle_every_n=1):
  """Generate cases from a generator and save as TFRecord files.

  Generated cases are transformed to tf.Example protos and saved as TFRecords
  in sharded files named output_dir/output_name-00..N-of-00..M=num_shards.

  Args:
    generator: a generator yielding (string -> int/float/str list) dictionaries.
    output_filenames: List of output file paths.
    max_cases: maximum number of cases to get from the generator;
      if None (default), we use the generator until StopIteration is raised.
    cycle_every_n: how many cases from the generator to take before
      switching to the next shard; by default set to 1, switch every case.
  """
  if outputs_exist(output_filenames):
    tf.logging.info("Skipping generator because output files exist at {}"
                    .format(output_filenames))
    return
  tmp_filenames = [fname + ".incomplete" for fname in output_filenames]
  num_shards = len(output_filenames)
  # Check if is training or eval, ref: train_data_filenames().
  if num_shards > 0:
    if "-train" in output_filenames[0]:
      tag = "train"
    elif "-dev" in output_filenames[0]:
      tag = "eval"
    else:
      tag = "other"

  writers = [tf.python_io.TFRecordWriter(fname) for fname in tmp_filenames]
  counter, shard = 0, 0
  for case in generator:
    if case is None:
      continue
    if counter % 100000 == 0:
      tf.logging.info("Generating case %d." % counter)
    counter += 1
    if max_cases and counter > max_cases:
      break
    example = to_example(case)
    writers[shard].write(example.SerializeToString())
    if counter % cycle_every_n == 0:
      shard = (shard + 1) % num_shards

  for writer in writers:
    writer.close()

  for tmp_name, final_name in zip(tmp_filenames, output_filenames):
    tf.gfile.Rename(tmp_name, final_name)

  if num_shards > 0:
    if tag == "train":
      mlperf_log.transformer_print(
          key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES, value=counter)
    elif tag == "eval":
      mlperf_log.transformer_print(
          key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES, value=counter)

  tf.logging.info("Generated %s Examples", counter)
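
The core of generate_files is the shard-cycling loop. A self-contained sketch of just that logic is shown below, with plain Python lists standing in for the TFRecord writers; the toy generator and shard count are invented for illustration.

def cycle_into_shards(cases, num_shards, cycle_every_n=1, max_cases=None):
  shards = [[] for _ in range(num_shards)]
  counter, shard = 0, 0
  for case in cases:
    if case is None:
      continue
    counter += 1
    if max_cases and counter > max_cases:
      break
    shards[shard].append(case)
    if counter % cycle_every_n == 0:
      shard = (shard + 1) % num_shards
  return shards


print(cycle_into_shards(range(10), num_shards=3, cycle_every_n=2))
# [[0, 1, 6, 7], [2, 3, 8, 9], [4, 5]]
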
def generate_dataset_and_shuffle(train_gen,
                                 train_paths,
                                 dev_gen,
                                 dev_paths,
                                 shuffle=True):
  generate_files(train_gen, train_paths)
  generate_files(dev_gen, dev_paths)
  mlperf_log.transformer_print(key=mlperf_log.INPUT_ORDER)
  if shuffle:
    shuffle_dataset(train_paths + dev_paths)
def learning_rate_schedule(hparams):
  """Learning rate schedule based on hparams."""
  mlperf_log.transformer_print(key=mlperf_log.OPT_LR, deferred=True)
  mlperf_log.transformer_print(
      key=mlperf_log.OPT_LR_WARMUP_STEPS,
      value=hparams.learning_rate_warmup_steps)
  step_num = _global_step(hparams)
  schedule_string = hparams.learning_rate_schedule
  names = schedule_string.split("*")
  names = [name.strip() for name in names if name.strip()]
  ret = tf.constant(1.0)
  for name in names:
    ret *= learning_rate_factor(name, step_num, hparams)
  return ret
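
The schedule string is simply a "*"-separated list of factor names whose values are multiplied together. The sketch below shows the parsing and multiplication with simplified stand-in factors; the real factor definitions live in learning_rate_factor and differ in detail.

import math


def toy_factor(name, step, warmup_steps=16000, constant=2.0):
  # Simplified stand-ins for learning_rate_factor(); values are illustrative.
  if name == "constant":
    return constant
  if name == "linear_warmup":
    return min(1.0, step / float(warmup_steps))
  if name == "rsqrt_decay":
    return 1.0 / math.sqrt(max(step, warmup_steps))
  raise ValueError("Unknown factor: %s" % name)


def toy_schedule(schedule_string, step):
  names = [n.strip() for n in schedule_string.split("*") if n.strip()]
  ret = 1.0
  for name in names:
    ret *= toy_factor(name, step)
  return ret


print(toy_schedule("constant*linear_warmup*rsqrt_decay", step=1000))
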
Example #6
def create_estimator(model_name,
                     hparams,
                     run_config,
                     schedule="train_and_evaluate",
                     decode_hparams=None,
                     use_tpu=False,
                     use_tpu_estimator=False,
                     use_xla=False):
  """Create a T2T Estimator."""
  model_fn = t2t_model.T2TModel.make_estimator_model_fn(
      model_name, hparams, decode_hparams=decode_hparams, use_tpu=use_tpu)


  del use_xla
  if use_tpu or use_tpu_estimator:
    problem = hparams.problem
    batch_size = (
        problem.tpu_batch_size_per_shard(hparams) *
        run_config.tpu_config.num_shards)
    mlperf_log.transformer_print(
        key=mlperf_log.INPUT_BATCH_SIZE, value=batch_size)
    if getattr(hparams, "mtf_mode", False):
      batch_size = problem.tpu_batch_size_per_shard(hparams)
    predict_batch_size = batch_size
    if decode_hparams and decode_hparams.batch_size:
      predict_batch_size = decode_hparams.batch_size
    if decode_hparams and run_config.tpu_config:
      decode_hparams.add_hparam("iterations_per_loop",
                                run_config.tpu_config.iterations_per_loop)
    estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=model_fn,
        model_dir=run_config.model_dir,
        config=run_config,
        use_tpu=use_tpu,
        train_batch_size=batch_size,
        eval_batch_size=batch_size if "eval" in schedule else None,
        predict_batch_size=predict_batch_size)
  else:
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=run_config.model_dir,
        config=run_config,
    )
  return estimator
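
On TPU, the global batch size handed to TPUEstimator is the problem's per-shard batch size times the number of TPU shards. A back-of-the-envelope sketch with made-up numbers:

tpu_batch_size_per_shard = 16  # hypothetical problem-specific value
num_shards = 8                 # hypothetical run_config.tpu_config.num_shards
train_batch_size = tpu_batch_size_per_shard * num_shards
print(train_batch_size)  # 128 examples per step across all shards
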
Example #7
  def train_eval_and_decode(self):
    """Does eval and decode after training every eval_freq_in_steps."""
    eval_steps = self._hparams.eval_freq_in_steps
    packed_dataset = "_packed" in self._hparams.problem.name
    mlperf_log.transformer_print(key=mlperf_log.TRAIN_LOOP)
    for i in range(0, self._train_spec.max_steps, eval_steps):
      mlperf_log.transformer_print(
          key=mlperf_log.TRAIN_EPOCH, value=i // eval_steps)
      if packed_dataset and i > 0:
        problem = registry.problem(self._hparams.problem.name + "_packed")
        p_hparams = problem.get_hparams(self._hparams)
        self._hparams.problem = problem
        self._hparams.problem_hparams = p_hparams
      self._estimator.train(
          self._train_spec.input_fn,
          steps=eval_steps,
          hooks=self._train_spec.hooks)
      self._estimator.evaluate(
          self._eval_spec.input_fn,
          steps=self._eval_spec.steps,
          hooks=self._eval_spec.hooks)
      if packed_dataset:
        problem = registry.problem(
            self._hparams.problem.name.replace("_packed", ""))
        p_hparams = problem.get_hparams(self._hparams)
        self._hparams.problem = problem
        self._hparams.problem_hparams = p_hparams
      mlperf_log.transformer_print(key=mlperf_log.EVAL_START)
      if self._hparams.mlperf_mode:
        self._decode_hparams.mlperf_decode_step = i + eval_steps
      self.decode(dataset_split=tf.estimator.ModeKeys.EVAL)
      d_hparams = self._decode_hparams
      if self._hparams.mlperf_mode and d_hparams.mlperf_success:
        mlperf_log.transformer_print(
            key=mlperf_log.RUN_STOP, value={"success": "true"})
        break

    d_hparams = self._decode_hparams
    if self._hparams.mlperf_mode and not d_hparams.mlperf_success:
      mlperf_log.transformer_print(
          key=mlperf_log.RUN_STOP, value={"success": "false"})
Example #8
  def continuous_decode_on_eval_data(self):
    """Decode from dataset on new checkpoint."""
    if self._hparams.mlperf_mode:
      ckpt_generator = next_undecoded_checkpoint(self._hparams.model_dir)
    else:
      ckpt_generator = next_checkpoint(self._hparams.model_dir)

    for ckpt in ckpt_generator:
      current_step = int(os.path.basename(ckpt).split("-")[1])
      tf.logging.info("Decoding step %d" % current_step)
      # Skip checkpoint 0.
      if current_step == 0:
        continue
      # Decode the latest checkpoint by default.
      checkpoint_path = None
      if self._hparams.mlperf_mode:
        self._decode_hparams.mlperf_decode_step = current_step
        checkpoint_path = ckpt

      mlperf_log.transformer_print(key=mlperf_log.EVAL_START)
      self.decode(
          dataset_split=tf.estimator.ModeKeys.EVAL,
          checkpoint_path=checkpoint_path)
      d_hparams = self._decode_hparams
      if self._hparams.mlperf_mode and d_hparams.mlperf_success:
        mlperf_log.transformer_print(
            key=mlperf_log.RUN_STOP, value={"success": "true"})
        break

    d_hparams = self._decode_hparams
    if self._hparams.mlperf_mode and not d_hparams.mlperf_success:
      mlperf_log.transformer_print(
          key=mlperf_log.RUN_STOP, value={"success": "false"})
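
The decoding step is recovered from the checkpoint filename convention model.ckpt-<step>. A small sketch with a made-up path:

import os

ckpt = "/tmp/t2t_train/model.ckpt-25000"  # hypothetical checkpoint path
current_step = int(os.path.basename(ckpt).split("-")[1])
print(current_step)  # 25000
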
Example #9
def corpus_token_counts(
    text_filepattern, corpus_max_lines, split_on_newlines=True):
  """Read the corpus and compute a dictionary of token counts.

  Args:
    text_filepattern: A pattern matching one or more files.
    corpus_max_lines: An integer; maximum total lines to read.
    split_on_newlines: A boolean. If true, then split files by lines and strip
        leading and trailing whitespace from each line. Otherwise, treat each
        file as a single string.

  Returns:
    a dictionary mapping token to count.
  """
  counts = collections.Counter()
  for doc in _read_filepattern(
      text_filepattern,
      max_lines=corpus_max_lines,
      split_on_newlines=split_on_newlines):
    counts.update(encode(_native_to_unicode(doc)))

  mlperf_log.transformer_print(
      key=mlperf_log.PREPROC_VOCAB_SIZE, value=len(counts))
  return counts
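
A framework-free sketch of the counting loop above, assuming a simple whitespace tokenizer in place of the real encode() and _read_filepattern():

import collections


def toy_corpus_token_counts(docs, split_on_newlines=True):
  counts = collections.Counter()
  for doc in docs:
    lines = doc.splitlines() if split_on_newlines else [doc]
    for line in lines:
      counts.update(line.strip().split())
  return counts


print(toy_corpus_token_counts(["a b b\nc", "b c c"]))
# Counter({'b': 3, 'c': 3, 'a': 1})
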
Example #10
    def continuous_decode_on_eval_data(self):
        """Decode from dataset on new checkpoint."""
        for ckpt in next_checkpoint(self._hparams.model_dir):
            current_step = int(os.path.basename(ckpt).split("-")[1])
            # Skip checkpoint 0.
            if current_step == 0:
                continue
            mlperf_log.transformer_print(key=mlperf_log.EVAL_START)
            self.decode(dataset_split=tf.estimator.ModeKeys.EVAL)
            d_hparams = self._decode_hparams
            if d_hparams.mlperf_mode and d_hparams.mlperf_success:
                mlperf_log.transformer_print(key=mlperf_log.RUN_STOP,
                                             value={"success": "true"})
                break

        d_hparams = self._decode_hparams
        if d_hparams.mlperf_mode and not d_hparams.mlperf_success:
            mlperf_log.transformer_print(key=mlperf_log.RUN_STOP,
                                         value={"success": "false"})
Example #11
def main(argv):
  tf.logging.set_verbosity(tf.logging.INFO)

  usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)

  # If we just have to print the registry, do that and exit early.
  maybe_log_registry_and_exit()

  # Create HParams.
  if argv:
    set_hparams_from_args(argv[1:])
  if FLAGS.schedule != "run_std_server":
    hparams = create_hparams()
  if FLAGS.gpu_automatic_mixed_precision:
    setattr(hparams, "gpu_automatic_mixed_precision", True)

  if FLAGS.schedule == "train" or FLAGS.schedule == "train_eval_and_decode":
    mlperf_log.transformer_print(key=mlperf_log.RUN_START, hparams=hparams)
  if FLAGS.schedule == "run_std_server":
    run_std_server()
  mlperf_log.transformer_print(
      key=mlperf_log.RUN_SET_RANDOM_SEED, value=FLAGS.random_seed,
      hparams=hparams)
  trainer_lib.set_random_seed(FLAGS.random_seed)

  if FLAGS.cloud_mlengine:
    cloud_mlengine.launch()
    return

  if FLAGS.generate_data:
    generate_data()

  if cloud_mlengine.job_dir():
    FLAGS.output_dir = cloud_mlengine.job_dir()

  exp_fn = create_experiment_fn()
  exp = exp_fn(create_run_config(hparams), hparams)
  if is_chief():
    save_metadata(hparams)
  execute_schedule(exp)
  if FLAGS.schedule != "train":
    mlperf_log.transformer_print(key=mlperf_log.RUN_FINAL,
                                 hparams=hparams)
Example #12
def compute_bleu_summaries(hook_args):
    """Compute BLEU core summaries using the decoder output.

  Args:
    hook_args: DecodeHookArgs namedtuple
  Returns:
    A list of tf.Summary values if hook_args.hparams contains the
    reference file and the translated file.
  """
    decode_hparams = hook_args.decode_hparams
    estimator = hook_args.estimator
    current_step = estimator.get_variable_value(tf.GraphKeys.GLOBAL_STEP)
    has_iters = hasattr(decode_hparams, "iterations_per_loop")
    if current_step and has_iters and decode_hparams.iterations_per_loop:
        iterations_per_loop = decode_hparams.iterations_per_loop
        current_epoch = np.asscalar(current_step) // iterations_per_loop
    else:
        current_epoch = 0

    if (decode_hparams.decode_reference is None
            or decode_hparams.decode_to_file is None):
        return None

    values = []
    bleu = 100 * bleu_hook.bleu_wrapper(decode_hparams.decode_reference,
                                        decode_hparams.decode_to_file)
    values.append(tf.Summary.Value(tag="BLEU", simple_value=bleu))
    tf.logging.info("%s: BLEU = %6.2f" % (decode_hparams.decode_to_file, bleu))
    if decode_hparams.mlperf_mode:
        mlperf_log.transformer_print(key=mlperf_log.EVAL_TARGET,
                                     value=decode_hparams.mlperf_threshold)
        mlperf_log.transformer_print(key=mlperf_log.EVAL_ACCURACY,
                                     value={
                                         "epoch": max(current_epoch - 1, 0),
                                         "value": bleu
                                     })
        mlperf_log.transformer_print(key=mlperf_log.EVAL_STOP)

    if bleu >= decode_hparams.mlperf_threshold:
        decode_hparams.set_hparam("mlperf_success", True)

    return values
Example #13
def compute_bleu_summaries(hook_args):
    """Compute BLEU core summaries using the decoder output.

  Args:
    hook_args: DecodeHookArgs namedtuple
  Returns:
    A list of tf.Summary values if hook_args.hparams contains the
    reference file and the translated file.
  """
    decode_hparams = hook_args.decode_hparams

    if (decode_hparams.decode_reference is None
            or decode_hparams.decode_to_file is None):
        return None

    values = []
    bleu = 100 * bleu_hook.bleu_wrapper(decode_hparams.decode_reference,
                                        decode_hparams.decode_to_file)
    values.append(tf.Summary.Value(tag="BLEU", simple_value=bleu))
    tf.logging.info("%s: BLEU = %6.2f" % (decode_hparams.decode_to_file, bleu))
    if hook_args.hparams.mlperf_mode:
        current_step = decode_hparams.mlperf_decode_step
        mlperf_log.transformer_print(key=mlperf_log.EVAL_TARGET,
                                     value=decode_hparams.mlperf_threshold)
        mlperf_log.transformer_print(
            key=mlperf_log.EVAL_ACCURACY,
            value={
                "epoch":
                max(current_step // decode_hparams.iterations_per_loop - 1, 0),
                "value":
                bleu
            })
        mlperf_log.transformer_print(key=mlperf_log.EVAL_STOP)

    if bleu >= decode_hparams.mlperf_threshold:
        decode_hparams.set_hparam("mlperf_success", True)

    return values
Example #14
def compute_bleu_summaries(hook_args):
  """Compute BLEU core summaries using the decoder output.

  Args:
    hook_args: DecodeHookArgs namedtuple
  Returns:
    A list of tf.Summary values if hook_args.hparams contains the
    reference file and the translated file.
  """
  decode_hparams = hook_args.decode_hparams

  if (decode_hparams.decode_reference is None or
      decode_hparams.decode_to_file is None):
    return None

  values = []
  bleu = 100 * bleu_hook.bleu_wrapper(
      decode_hparams.decode_reference, decode_hparams.decode_to_file)
  values.append(tf.Summary.Value(tag="BLEU", simple_value=bleu))
  tf.logging.info("%s: BLEU = %6.2f" % (decode_hparams.decode_to_file, bleu))
  if hook_args.hparams.mlperf_mode:
    current_step = decode_hparams.mlperf_decode_step
    mlperf_log.transformer_print(
        key=mlperf_log.EVAL_TARGET, value=decode_hparams.mlperf_threshold)
    mlperf_log.transformer_print(
        key=mlperf_log.EVAL_ACCURACY,
        value={
            "epoch": max(current_step // decode_hparams.iterations_per_loop - 1,
                         0),
            "value": bleu
        })
    mlperf_log.transformer_print(key=mlperf_log.EVAL_STOP)

  if bleu >= decode_hparams.mlperf_threshold:
    decode_hparams.set_hparam("mlperf_success", True)

  return values
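
The epoch reported under EVAL_ACCURACY is derived from the decode step by integer division with iterations_per_loop, clamped at zero. A quick sketch with invented numbers:

iterations_per_loop = 1000  # hypothetical value
for current_step in (0, 1000, 2500, 7000):
  epoch = max(current_step // iterations_per_loop - 1, 0)
  print(current_step, epoch)  # -> 0, 0, 1, 6
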
Example #15
    def continuous_decode_on_eval_data(self):
        """Decode from dataset on new checkpoint."""
        if self._hparams.mlperf_mode:
            ckpt_generator = next_undecoded_checkpoint(
                self._hparams.model_dir,
                self._decode_hparams.decode_timeout_mins)
        else:
            ckpt_generator = next_checkpoint(
                self._hparams.model_dir,
                self._decode_hparams.decode_timeout_mins)

        for ckpt in ckpt_generator:
            current_step = decoding.get_step_from_ckpt_path(ckpt)
            tf.logging.info("Decoding step %d" % current_step)
            # Skip checkpoint 0.
            if current_step == 0:
                continue
            # Decode the latest checkpoint by default.
            checkpoint_path = None
            if self._hparams.mlperf_mode:
                self._decode_hparams.mlperf_decode_step = current_step
                checkpoint_path = ckpt

            mlperf_log.transformer_print(key=mlperf_log.EVAL_START)
            self.decode(dataset_split=tf.estimator.ModeKeys.EVAL,
                        checkpoint_path=checkpoint_path)
            d_hparams = self._decode_hparams
            if self._hparams.mlperf_mode and d_hparams.mlperf_success:
                mlperf_log.transformer_print(key=mlperf_log.RUN_STOP,
                                             value={"success": "true"})
                break

        d_hparams = self._decode_hparams
        if self._hparams.mlperf_mode and not d_hparams.mlperf_success:
            mlperf_log.transformer_print(key=mlperf_log.RUN_STOP,
                                         value={"success": "false"})
Example #16
def main(argv):
    tf.logging.set_verbosity(tf.logging.INFO)
    hparams = create_hparams()
    if FLAGS.schedule == "train" or FLAGS.schedule == "train_eval_and_decode":
        mlperf_log.transformer_print(key=mlperf_log.RUN_START,
                                     mlperf_mode=hparams.mlperf_mode)
    if FLAGS.schedule == "run_std_server":
        run_std_server()
    mlperf_log.transformer_print(key=mlperf_log.RUN_SET_RANDOM_SEED,
                                 value=FLAGS.random_seed,
                                 mlperf_mode=hparams.mlperf_mode)
    trainer_lib.set_random_seed(FLAGS.random_seed)
    usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)
    maybe_log_registry_and_exit()

    if FLAGS.cloud_mlengine:
        cloud_mlengine.launch()
        return

    if FLAGS.generate_data:
        generate_data()

    if cloud_mlengine.job_dir():
        FLAGS.output_dir = cloud_mlengine.job_dir()

    if argv:
        set_hparams_from_args(argv[1:])

    exp_fn = create_experiment_fn()
    exp = exp_fn(create_run_config(hparams), hparams)
    if is_chief():
        save_metadata(hparams)
    execute_schedule(exp)
    if FLAGS.schedule != "train":
        mlperf_log.transformer_print(key=mlperf_log.RUN_FINAL,
                                     mlperf_mode=hparams.mlperf_mode)
Example #17
    def input_fn(self,
                 mode,
                 hparams,
                 data_dir=None,
                 params=None,
                 config=None,
                 force_repeat=False,
                 prevent_repeat=False,
                 dataset_kwargs=None):
        """Builds input pipeline for problem.

    Args:
      mode: tf.estimator.ModeKeys
      hparams: HParams, model hparams
      data_dir: str, data directory; if None, will use hparams.data_dir
      params: dict, may include "batch_size"
      config: RunConfig; should have the data_parallelism attribute if not using
        TPU
      force_repeat: bool, whether to repeat the data even if not training
      prevent_repeat: bool, whether to not repeat when in training mode.
        Overrides force_repeat.
      dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
        method when called

    Returns:
      (features_dict<str name, Tensor feature>, Tensor targets)
    """
        partition_id, num_partitions = self._dataset_partition(mode, config)

        is_training = mode == tf.estimator.ModeKeys.TRAIN
        if config and config.use_tpu:
            num_threads = 64
        else:
            num_threads = cpu_count() if is_training else 1

        if config and hasattr(config,
                              "data_parallelism") and config.data_parallelism:
            num_shards = config.data_parallelism.n
        else:
            num_shards = 1

        max_length = self.max_length(hparams)
        mlperf_log.transformer_print(key=mlperf_log.INPUT_MAX_LENGTH,
                                     value=max_length)

        def tpu_valid_size(example):
            return data_reader.example_valid_size(example, hparams.min_length,
                                                  max_length)

        def gpu_valid_size(example):
            drop_long_sequences = is_training or hparams.eval_drop_long_sequences
            return data_reader.example_valid_size(
                example, hparams.min_length,
                max_length if drop_long_sequences else 10**9)

        def define_shapes(example):
            batch_size = config and config.use_tpu and params["batch_size"]
            return standardize_shapes(example, batch_size=batch_size)

        # Read and preprocess
        data_dir = data_dir or (hasattr(hparams, "data_dir")
                                and hparams.data_dir)

        dataset_kwargs = dataset_kwargs or {}
        dataset_kwargs.update({
            "mode": mode,
            "data_dir": data_dir,
            "num_threads": num_threads,
            "hparams": hparams,
            "partition_id": partition_id,
            "num_partitions": num_partitions,
        })

        dataset = self.dataset(**dataset_kwargs)
        if (force_repeat or is_training) and not prevent_repeat:
            # Repeat and skip a random number of records
            dataset = dataset.repeat()

        if is_training and self.skip_random_fraction_when_training:
            data_files = tf.contrib.slim.parallel_reader.get_data_files(
                self.filepattern(data_dir, mode))
            #  In continuous_train_and_eval when switching between train and
            #  eval, this input_fn method gets called multiple times and it
            #  would give you the exact same samples from the last call
            #  (because the Graph seed is set). So this skip gives you some
            #  shuffling.
            dataset = skip_random_fraction(dataset, data_files[0])

        dataset = dataset.map(data_reader.cast_ints_to_int32,
                              num_parallel_calls=num_threads)

        if self.batch_size_means_tokens:
            batch_size_means_tokens = True
        else:
            if _are_shapes_fully_defined(dataset.output_shapes):
                batch_size_means_tokens = False
            else:
                tf.logging.warning(
                    "Shapes are not fully defined. Assuming batch_size means tokens."
                )
                batch_size_means_tokens = True

        # Batching
        if not batch_size_means_tokens:
            # Batch size means examples per datashard.
            if config and config.use_tpu:
                # on TPU, we use params["batch_size"], which specifies the number of
                # examples across all datashards
                batch_size = params["batch_size"]
                dataset = dataset.batch(batch_size, drop_remainder=True)
            else:
                batch_size = hparams.batch_size * num_shards
                dataset = dataset.batch(batch_size)
        else:
            # batch_size means tokens per datashard
            if config and config.use_tpu:
                dataset = dataset.filter(tpu_valid_size)
                padded_shapes = self._pad_for_tpu(dataset.output_shapes,
                                                  hparams)
                # on TPU, we use params["batch_size"], which specifies the number of
                # examples across all datashards
                batch_size = params["batch_size"]
                if hparams.pad_batch:
                    tf.logging.warn(
                        "Padding the batch to ensure that remainder eval batches are "
                        "processed. This may lead to incorrect metrics for "
                        "non-zero-padded features, e.g. images. Use a smaller batch "
                        "size that has no remainder in that case.")
                    dataset = dataset.padded_batch(batch_size,
                                                   padded_shapes,
                                                   drop_remainder=False)
                    dataset = dataset.map(functools.partial(
                        pad_batch, batch_multiple=batch_size),
                                          num_parallel_calls=num_threads)
                else:
                    dataset = dataset.padded_batch(batch_size,
                                                   padded_shapes,
                                                   drop_remainder=True)
            else:
                # On GPU, bucket by length
                dataset = dataset.filter(gpu_valid_size)
                batching_scheme = data_reader.hparams_to_batching_scheme(
                    hparams,
                    shard_multiplier=num_shards,
                    length_multiplier=self.get_hparams().batch_size_multiplier)
                if hparams.use_fixed_batch_size:
                    # Here  batch_size really means examples per datashard.
                    batching_scheme["batch_sizes"] = [hparams.batch_size]
                    batching_scheme["boundaries"] = []
                dataset = dataset.apply(
                    tf.contrib.data.bucket_by_sequence_length(
                        data_reader.example_length,
                        batching_scheme["boundaries"],
                        batching_scheme["batch_sizes"]))

                if not is_training:
                    batch_multiple = num_shards
                    if hparams.use_fixed_batch_size:
                        # Make sure the last batch has the same fixed size as the rest.
                        batch_multiple *= hparams.batch_size
                    if batch_multiple > 1:
                        tf.logging.warn(
                            "Padding the batch to ensure that remainder eval batches have "
                            "a batch size divisible by the number of data shards. This may "
                            "lead to incorrect metrics for non-zero-padded features, e.g. "
                            "images. Use a single datashard (i.e. 1 GPU) in that case."
                        )
                        dataset = dataset.map(functools.partial(
                            pad_batch, batch_multiple=batch_multiple),
                                              num_parallel_calls=num_threads)

        dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)

        # Add shuffling for training batches. This is necessary along with record
        # level shuffling in the dataset generation. Record shuffling will shuffle
        # the examples. However, in some cases, it's possible that the shuffle
        # buffer size for record shuffling is smaller than the batch size. In such
        # cases, adding batch shuffling ensures that the data is in random order
        # during training
        if (is_training and hasattr(hparams, "batch_shuffle_size")
                and hparams.batch_shuffle_size):
            dataset = dataset.shuffle(hparams.batch_shuffle_size)

        def prepare_for_output(example):
            if not config or not config.use_tpu:
                _summarize_features(example, num_shards)
            if mode == tf.estimator.ModeKeys.PREDICT:
                example["infer_targets"] = example.pop("targets")
                return example
            else:
                return example, example["targets"]

        dataset = dataset.map(prepare_for_output,
                              num_parallel_calls=num_threads)
        dataset = dataset.prefetch(2)

        if mode == tf.estimator.ModeKeys.PREDICT:
            # This is because of a bug in the Estimator that short-circuits prediction
            # if it doesn't see a QueueRunner. DummyQueueRunner implements the
            # minimal expected interface but does nothing.
            tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                                 data_reader.DummyQueueRunner())

        return dataset
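
Several branches above pad the final batch so its size is a multiple of the number of data shards (or of the fixed batch size). The sketch below illustrates that idea with numpy standing in for the TF ops; it is not the real pad_batch implementation.

import numpy as np


def pad_batch_to_multiple(batch, batch_multiple):
  # Zero-pad along the batch dimension up to the next multiple.
  remainder = batch.shape[0] % batch_multiple
  if remainder == 0:
    return batch
  padding = np.zeros((batch_multiple - remainder,) + batch.shape[1:],
                     dtype=batch.dtype)
  return np.concatenate([batch, padding], axis=0)


print(pad_batch_to_multiple(np.ones((5, 3)), batch_multiple=4).shape)  # (8, 3)
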
Example #18
  def __init__(self, optimizer_name, lr, hparams, use_tpu=False):  # pylint: disable=super-init-not-called
    tf.logging.info("Using optimizer %s", optimizer_name)

    mlperf_log.transformer_print(key=mlperf_log.OPT_NAME,
                                 value=optimizer_name,
                                 hparams=hparams)
    mlperf_log.transformer_print(
        key=mlperf_log.OPT_HP_ADAM_BETA1, value=hparams.optimizer_adam_beta1,
        hparams=hparams)
    mlperf_log.transformer_print(
        key=mlperf_log.OPT_HP_ADAM_BETA2, value=hparams.optimizer_adam_beta2,
        hparams=hparams)
    mlperf_log.transformer_print(
        key=mlperf_log.OPT_HP_ADAM_EPSILON,
        value=hparams.optimizer_adam_epsilon,
        hparams=hparams)

    if optimizer_name == "Adam":
      # We change the default epsilon for Adam.
      # Using LazyAdam as it's much faster for large vocabulary embeddings.
      self._opt = tf.contrib.opt.LazyAdamOptimizer(
          lr,
          beta1=hparams.optimizer_adam_beta1,
          beta2=hparams.optimizer_adam_beta2,
          epsilon=hparams.optimizer_adam_epsilon)
    elif optimizer_name == "MultistepAdam":
      self._opt = multistep_optimizer.MultistepAdamOptimizer(
          lr,
          beta1=hparams.optimizer_adam_beta1,
          beta2=hparams.optimizer_adam_beta2,
          epsilon=hparams.optimizer_adam_epsilon,
          n=hparams.optimizer_multistep_accumulate_steps)
    elif optimizer_name == "Momentum":
      self._opt = tf.train.MomentumOptimizer(
          lr,
          momentum=hparams.optimizer_momentum_momentum,
          use_nesterov=hparams.optimizer_momentum_nesterov)
    elif optimizer_name == "YellowFin":
      self._opt = yellowfin.YellowFinOptimizer(
          learning_rate=lr, momentum=hparams.optimizer_momentum_momentum)
    elif optimizer_name == "TrueAdam":
      self._opt = tf.train.AdamOptimizer(
          lr,
          beta1=hparams.optimizer_adam_beta1,
          beta2=hparams.optimizer_adam_beta2,
          epsilon=hparams.optimizer_adam_epsilon)
    elif optimizer_name == "AdamW":
      # Openai gpt used weight decay.
      # Given the internals of AdamW, weight decay dependent on the
      # learning rate is chosen to match the openai implementation.
      # The weight decay update to each parameter is applied before the adam
      # gradients computation, which is different from that described
      # in the paper and in the openai implementation:
      # https://arxiv.org/pdf/1711.05101.pdf
      self._opt = tf.contrib.opt.AdamWOptimizer(
          0.01*lr,
          lr,
          beta1=hparams.optimizer_adam_beta1,
          beta2=hparams.optimizer_adam_beta2,
          epsilon=hparams.optimizer_adam_epsilon)
    elif optimizer_name == "Adafactor":
      self._opt = adafactor.adafactor_optimizer_from_hparams(hparams, lr)
    else:
      self._opt = tf.contrib.layers.OPTIMIZER_CLS_NAMES[optimizer_name](lr)
    if _mixed_precision_is_enabled(hparams):
      if not hparams.mixed_precision_optimizer_loss_scaler:
        tf.logging.warning("Using mixed precision without a loss scaler will "
                           "likely cause numerical errors.")
      elif hparams.mixed_precision_optimizer_loss_scaler != "exponential":
        raise ValueError("Mixed precision training only supports the "
                         "exponential loss scaler")
      else:
        tf.logging.info("Using Exponential Update Loss Scaler")
        manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(
            init_loss_scale=2**15,
            incr_every_n_steps=2000,
            decr_every_n_nan_or_inf=2,
            incr_ratio=2,
            decr_ratio=0.5)
        self._opt = LossScaleOptimizer(self._opt, manager)

    self._zero_grads = hparams.optimizer_zero_grads
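
The exponential loss-scale manager configured above doubles the scale after 2000 consecutive finite-gradient steps and halves it once 2 non-finite steps are seen. A toy, pure-Python illustration of that policy (the real manager tracks these counters inside the TF graph):

scale = 2.0 ** 15
clean, bad = 0, 0
for grads_are_finite in [True] * 2000 + [False, False]:
  if grads_are_finite:
    clean += 1
    if clean >= 2000:       # incr_every_n_steps
      scale *= 2            # incr_ratio
      clean = 0
  else:
    bad += 1
    if bad >= 2:            # decr_every_n_nan_or_inf
      scale *= 0.5          # decr_ratio
      bad = 0
print(scale)  # 32768.0: one doubling followed by one halving
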
Example #19
  def train(self, max_steps=None):
    mlperf_log.transformer_print(key=mlperf_log.TRAIN_LOOP)
    self._estimator.train(self._train_spec.input_fn,
                          hooks=self._train_spec.hooks,
                          max_steps=max_steps or self._train_spec.max_steps)
Example #20
def decode_once(estimator,
                problem_name,
                hparams,
                infer_input_fn,
                decode_hp,
                decode_to_file,
                output_dir,
                log_results=True,
                checkpoint_path=None):
    """Decodes once."""

    # Get the predictions as an iterable
    predictions = estimator.predict(infer_input_fn,
                                    checkpoint_path=checkpoint_path)

    if not log_results:
        return list(predictions)

    # Prepare output file writers if decode_to_file passed
    decode_to_file = decode_to_file or decode_hp.decode_to_file
    if decode_to_file:
        output_filepath = _decode_filename(decode_to_file, problem_name,
                                           decode_hp)
        parts = output_filepath.split(".")
        parts[-1] = "targets"
        target_filepath = ".".join(parts)
        parts[-1] = "inputs"
        input_filepath = ".".join(parts)

        output_file = tf.gfile.Open(output_filepath, "w")
        target_file = tf.gfile.Open(target_filepath, "w")
        input_file = tf.gfile.Open(input_filepath, "w")

    problem_hparams = hparams.problem_hparams
    # Inputs vocabulary is set to targets if there are no inputs in the problem,
    # e.g., for language models where the inputs are just a prefix of targets.
    has_input = "inputs" in problem_hparams.vocabulary
    inputs_vocab_key = "inputs" if has_input else "targets"
    inputs_vocab = problem_hparams.vocabulary[inputs_vocab_key]
    targets_vocab = problem_hparams.vocabulary["targets"]

    num_eval_samples = 0
    for num_predictions, prediction in enumerate(predictions):
        num_eval_samples += 1
        num_predictions += 1
        inputs = prediction.get("inputs")
        targets = prediction.get("targets")
        outputs = prediction.get("outputs")

        # Log predictions
        decoded_outputs = []
        decoded_scores = []
        if decode_hp.return_beams:
            output_beams = np.split(outputs, decode_hp.beam_size, axis=0)
            scores = None
            if "scores" in prediction:
                scores = np.split(prediction["scores"],
                                  decode_hp.beam_size,
                                  axis=0)
            for i, beam in enumerate(output_beams):
                tf.logging.info("BEAM %d:" % i)
                score = scores and scores[i]
                decoded = log_decode_results(
                    inputs,
                    beam,
                    problem_name,
                    num_predictions,
                    inputs_vocab,
                    targets_vocab,
                    save_images=decode_hp.save_images,
                    output_dir=output_dir,
                    identity_output=decode_hp.identity_output,
                    targets=targets,
                    log_results=decode_hp.log_results)
                decoded_outputs.append(decoded)
                if decode_hp.write_beam_scores:
                    decoded_scores.append(score)
        else:
            decoded = log_decode_results(
                inputs,
                outputs,
                problem_name,
                num_predictions,
                inputs_vocab,
                targets_vocab,
                save_images=decode_hp.save_images,
                output_dir=output_dir,
                identity_output=decode_hp.identity_output,
                targets=targets,
                log_results=decode_hp.log_results)
            decoded_outputs.append(decoded)

        # Write out predictions if decode_to_file passed
        if decode_to_file:
            for i, (d_input, d_output, d_target) in enumerate(decoded_outputs):
                # Skip if all padding
                if d_input and re.match("^({})+$".format(text_encoder.PAD),
                                        d_input):
                    continue
                beam_score_str = ""
                if decode_hp.write_beam_scores:
                    beam_score_str = "\t%.2f" % decoded_scores[i]
                output_file.write(
                    str(d_output) + beam_score_str + decode_hp.delimiter)
                target_file.write(str(d_target) + decode_hp.delimiter)
                input_file.write(str(d_input) + decode_hp.delimiter)

        if (decode_hp.num_samples >= 0
                and num_predictions >= decode_hp.num_samples):
            break

    mlperf_log.transformer_print(key=mlperf_log.EVAL_SIZE,
                                 value=num_eval_samples,
                                 hparams=hparams)

    if decode_to_file:
        output_file.close()
        target_file.close()
        input_file.close()
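
The three sibling files written above share a basename and differ only in the final extension. A sketch of that extension swap with a made-up output path (the real name comes from _decode_filename):

output_filepath = "translate_ende.beam4.alpha0.6.decodes"  # hypothetical
parts = output_filepath.split(".")
parts[-1] = "targets"
target_filepath = ".".join(parts)
parts[-1] = "inputs"
input_filepath = ".".join(parts)
print(target_filepath)  # translate_ende.beam4.alpha0.6.targets
print(input_filepath)   # translate_ende.beam4.alpha0.6.inputs
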
Example #21
  def __init__(self, optimizer_name, lr, hparams, use_tpu=False):  # pylint: disable=super-init-not-called
    tf.logging.info("Using optimizer %s", optimizer_name)

    mlperf_log.transformer_print(key=mlperf_log.OPT_NAME,
                                 value=optimizer_name,
                                 hparams=hparams)
    mlperf_log.transformer_print(
        key=mlperf_log.OPT_HP_ADAM_BETA1, value=hparams.optimizer_adam_beta1,
        hparams=hparams)
    mlperf_log.transformer_print(
        key=mlperf_log.OPT_HP_ADAM_BETA2, value=hparams.optimizer_adam_beta2,
        hparams=hparams)
    mlperf_log.transformer_print(
        key=mlperf_log.OPT_HP_ADAM_EPSILON,
        value=hparams.optimizer_adam_epsilon,
        hparams=hparams)

    if optimizer_name == "Adam":
      # We change the default epsilon for Adam.
      # Using LazyAdam as it's much faster for large vocabulary embeddings.
      self._opt = tf.contrib.opt.LazyAdamOptimizer(
          lr,
          beta1=hparams.optimizer_adam_beta1,
          beta2=hparams.optimizer_adam_beta2,
          epsilon=hparams.optimizer_adam_epsilon)
    elif optimizer_name == "MultistepAdam":
      self._opt = multistep_optimizer.MultistepAdamOptimizer(
          lr,
          beta1=hparams.optimizer_adam_beta1,
          beta2=hparams.optimizer_adam_beta2,
          epsilon=hparams.optimizer_adam_epsilon,
          n=hparams.optimizer_multistep_accumulate_steps)
    elif optimizer_name == "Momentum":
      self._opt = tf.train.MomentumOptimizer(
          lr,
          momentum=hparams.optimizer_momentum_momentum,
          use_nesterov=hparams.optimizer_momentum_nesterov)
    elif optimizer_name == "YellowFin":
      self._opt = yellowfin.YellowFinOptimizer(
          learning_rate=lr, momentum=hparams.optimizer_momentum_momentum)
    elif optimizer_name == "TrueAdam":
      self._opt = tf.train.AdamOptimizer(
          lr,
          beta1=hparams.optimizer_adam_beta1,
          beta2=hparams.optimizer_adam_beta2,
          epsilon=hparams.optimizer_adam_epsilon)
    elif optimizer_name == "AdamW":
      # Openai gpt used weight decay.
      # Given the internals of AdamW, weight decay dependent on the
      # learning rate is chosen to match the openai implementation.
      # The weight decay update to each parameter is applied before the adam
      # gradients computation, which is different from that described
      # in the paper and in the openai implementation:
      # https://arxiv.org/pdf/1711.05101.pdf
      self._opt = tf.contrib.opt.AdamWOptimizer(
          0.01*lr,
          lr,
          beta1=hparams.optimizer_adam_beta1,
          beta2=hparams.optimizer_adam_beta2,
          epsilon=hparams.optimizer_adam_epsilon)
    elif optimizer_name == "Adafactor":
      self._opt = adafactor.adafactor_optimizer_from_hparams(hparams, lr)
    else:
      self._opt = tf.contrib.layers.OPTIMIZER_CLS_NAMES[optimizer_name](lr)

    self._zero_grads = hparams.optimizer_zero_grads
def transformer_ffn_layer(x, hparams, pad_remover=None):
    """Feed-forward layer in the transformer.

  Args:
    x: a Tensor of shape [batch_size, length, hparams.hidden_size]
    hparams: hyperparameters for model
    pad_remover: an expert_utils.PadRemover object tracking the padding
      positions. If provided, when using convolutional settings, the padding
      is removed before applying the convolution, and restored afterward. This
      can give a significant speedup.

  Returns:
    a Tensor of shape [batch_size, length, hparams.hidden_size]

  Raises:
    ValueError: If losses arg is None, but layer generates extra losses.
  """
    ffn_layer = hparams.ffn_layer
    if ffn_layer != "dense_relu_dense":
        raise ValueError(
            "sparse transformer only supports dense_relu_dense ffn.")

    relu_dropout_broadcast_dims = (
        common_layers.comma_separated_string_to_integer_list(
            getattr(hparams, "relu_dropout_broadcast_dims", "")))
    # In simple convolution mode, use `pad_remover` to speed up processing.
    mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_FFN_FILTER_DENSE,
                                 value={
                                     "filter_size": hparams.filter_size,
                                     "use_bias": "True",
                                     "activation": mlperf_log.RELU
                                 })
    mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_FFN_OUTPUT_DENSE,
                                 value={
                                     "hidden_size": hparams.hidden_size,
                                     "use_bias": "True",
                                 })
    mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_RELU_DROPOUT,
                                 value=hparams.relu_dropout)
    if pad_remover:
        original_shape = common_layers.shape_list(x)
        # Collapse `x` across examples, and remove padding positions.
        x = tf.reshape(x, tf.concat([[-1], original_shape[2:]], axis=0))
        x = tf.expand_dims(pad_remover.remove(x), axis=0)

    initial_sparsity = None
    if hparams.get("load_masks_from"):
        initial_sparsity = hparams.get("initial_sparsity")

    conv_output = sparse_layers.dense_relu_dense(
        x,
        hparams.filter_size,
        hparams.hidden_size,
        dropout=hparams.relu_dropout,
        dropout_broadcast_dims=relu_dropout_broadcast_dims,
        sparsity_technique=hparams.get("sparsity_technique"),
        threshold=hparams.get("log_alpha_threshold"),
        training=hparams.get("mode") == tf_estimator.ModeKeys.TRAIN,
        clip_alpha=hparams.get("clip_log_alpha"),
        initial_sparsity=initial_sparsity)
    if pad_remover:
        # Restore `conv_output` to the original shape of `x`, including padding.
        conv_output = tf.reshape(
            pad_remover.restore(tf.squeeze(conv_output, axis=0)),
            original_shape)
    return conv_output
def transformer_decode(decoder_function,
                       decoder_input,
                       encoder_output,
                       encoder_decoder_attention_bias,
                       decoder_self_attention_bias,
                       hparams,
                       attention_weights=None,
                       cache=None,
                       decode_loop_step=None,
                       nonpadding=None,
                       losses=None,
                       **kwargs):
    """Decode Transformer outputs from encoder representation.

  Args:
    decoder_function: the decoder function
    decoder_input: inputs to bottom of the model. [batch_size, decoder_length,
      hidden_dim]
    encoder_output: Encoder representation. [batch_size, input_length,
      hidden_dim]
    encoder_decoder_attention_bias: Bias and mask weights for encoder-decoder
      attention. [batch_size, input_length]
    decoder_self_attention_bias: Bias and mask weights for decoder
      self-attention. [batch_size, decoder_length]
    hparams: hyperparameters for model.
    attention_weights: weight to store attention to.
    cache: dict, containing tensors which are the results of previous
      attentions, used for fast decoding.
    decode_loop_step: An integer, step number of the decoding loop. Only used
      for inference on TPU.
    nonpadding: optional Tensor with shape [batch_size, decoder_length]
    losses: optional list onto which to append extra training losses
    **kwargs: additional arguments to pass to decoder_function

  Returns:
    Final decoder representation. [batch_size, decoder_length, hidden_dim]
  """
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_LAYER_POSTPROCESS_DROPOUT,
        value=hparams.layer_prepostprocess_dropout,
        hparams=hparams)
    decoder_input = tf.nn.dropout(decoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)

    decoder_output = decoder_function(decoder_input,
                                      encoder_output,
                                      decoder_self_attention_bias,
                                      encoder_decoder_attention_bias,
                                      hparams,
                                      cache=cache,
                                      decode_loop_step=decode_loop_step,
                                      nonpadding=nonpadding,
                                      save_weights_to=attention_weights,
                                      losses=losses,
                                      **kwargs)

    if (common_layers.is_xla_compiled()
            and hparams.mode == tf.estimator.ModeKeys.TRAIN):
        # TPU does not react kindly to extra dimensions.
        # TODO(noam): remove this once TPU is more forgiving of extra dims.
        return decoder_output
    else:
        # Expand since t2t expects 4d tensors.
        return tf.expand_dims(decoder_output, axis=2)
def transformer_encoder(encoder_input,
                        encoder_self_attention_bias,
                        hparams,
                        name="encoder",
                        nonpadding=None,
                        save_weights_to=None,
                        make_image_summary=True,
                        losses=None):
  """A stack of transformer layers.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
       (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This must either be
      passed in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias.  The knowledge about padding is used
      for pad_remover(efficiency) and to mask out padding in convolutional
      layers.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    losses: optional list onto which to append extra training losses

  Returns:
    y: a Tensors
  """
  x = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
      value=hparams.num_encoder_layers or hparams.num_hidden_layers)
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT,
      value=hparams.attention_dropout)
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_ATTENTION_DENSE,
      value={
          "use_bias": "false",
          "num_heads": hparams.num_heads,
          "hidden_size": hparams.hidden_size
      })

  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      padding = common_attention.attention_bias_to_padding(
          encoder_self_attention_bias)
      nonpadding = 1.0 - padding
    pad_remover = None
    if hparams.use_pad_remover and not common_layers.is_xla_compiled():
      pad_remover = expert_utils.PadRemover(padding)
    for layer in range(hparams.num_encoder_layers or hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          y = common_attention.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              encoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              max_relative_position=hparams.max_relative_position,
              heads_share_relative_embedding=(
                  hparams.heads_share_relative_embedding),
              add_relative_to_values=hparams.add_relative_to_values,
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=hparams.get("max_length"),
              vars_3d=hparams.get("attention_variables_3d"))
          x = common_layers.layer_postprocess(x, y, hparams)
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(
              common_layers.layer_preprocess(x, hparams),
              hparams,
              pad_remover,
              conv_padding="SAME",
              nonpadding_mask=nonpadding,
              losses=losses)
          x = common_layers.layer_postprocess(x, y, hparams)
    # if normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NORM,
        value={"hidden_size": hparams.hidden_size})
    return common_layers.layer_preprocess(x, hparams)
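
The padding bookkeeping above works on a per-position mask: nonpadding is 1.0 at real tokens, padding is its complement, and the pad remover effectively gathers only the non-padded positions. A minimal numpy sketch with invented shapes:

import numpy as np

nonpadding = np.array([[1., 1., 1., 0., 0.],
                       [1., 1., 0., 0., 0.]])
padding = 1.0 - nonpadding

x = np.arange(10).reshape(2, 5)          # stand-in for per-position values
flat_keep = nonpadding.reshape(-1) > 0   # collapse batch and length
print(x.reshape(-1)[flat_keep])          # [0 1 2 5 6]
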
def transformer_ffn_layer(x,
                          hparams,
                          pad_remover=None,
                          conv_padding="LEFT",
                          nonpadding_mask=None,
                          losses=None,
                          cache=None,
                          decode_loop_step=None,
                          readout_filter_size=0):
  """Feed-forward layer in the transformer.

  Args:
    x: a Tensor of shape [batch_size, length, hparams.hidden_size]
    hparams: hyperparameters for model
    pad_remover: an expert_utils.PadRemover object tracking the padding
      positions. If provided, when using convolutional settings, the padding
      is removed before applying the convolution, and restored afterward. This
      can give a significant speedup.
    conv_padding: a string - either "LEFT" or "SAME".
    nonpadding_mask: an optional Tensor with shape [batch_size, length].
      needed for convolutional layers with "SAME" padding.
      Contains 1.0 in positions corresponding to nonpadding.
    losses: optional list onto which to append extra training losses
    cache: dict, containing tensors which are the results of previous
        attentions, used for fast decoding.
    decode_loop_step: An integer, step number of the decoding loop.
        Only used for inference on TPU.
    readout_filter_size: if it's greater than 0, then it will be used instead of
      filter_size


  Returns:
    a Tensor of shape [batch_size, length, hparams.hidden_size]

  Raises:
    ValueError: If losses arg is None, but layer generates extra losses.
  """
  ffn_layer = hparams.ffn_layer
  relu_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "relu_dropout_broadcast_dims", "")))
  if ffn_layer == "conv_hidden_relu":
    # Backwards compatibility
    ffn_layer = "dense_relu_dense"
  if ffn_layer == "dense_relu_dense":
    # In simple convolution mode, use `pad_remover` to speed up processing.
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_FFN_FILTER_DENSE,
        value={
            "filter_size": hparams.filter_size,
            "use_bias": "True",
            "activation": mlperf_log.RELU
        })
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_FFN_OUTPUT_DENSE,
        value={
            "hidden_size": hparams.hidden_size,
            "use_bias": "True",
        })
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_RELU_DROPOUT, value=hparams.relu_dropout)
    if pad_remover:
      original_shape = common_layers.shape_list(x)
      # Collapse `x` across examples, and remove padding positions.
      x = tf.reshape(x, tf.concat([[-1], original_shape[2:]], axis=0))
      x = tf.expand_dims(pad_remover.remove(x), axis=0)
    conv_output = common_layers.dense_relu_dense(
        x,
        hparams.filter_size,
        hparams.hidden_size,
        dropout=hparams.relu_dropout,
        dropout_broadcast_dims=relu_dropout_broadcast_dims)
    if pad_remover:
      # Restore `conv_output` to the original shape of `x`, including padding.
      conv_output = tf.reshape(
          pad_remover.restore(tf.squeeze(conv_output, axis=0)), original_shape)
    return conv_output
  elif ffn_layer == "conv_relu_conv":
    return common_layers.conv_relu_conv(
        x,
        readout_filter_size or hparams.filter_size,
        hparams.hidden_size,
        first_kernel_size=hparams.conv_first_kernel,
        second_kernel_size=1,
        padding=conv_padding,
        nonpadding_mask=nonpadding_mask,
        dropout=hparams.relu_dropout,
        cache=cache,
        decode_loop_step=decode_loop_step)
  elif ffn_layer == "parameter_attention":
    return common_attention.parameter_attention(
        x, hparams.parameter_attention_key_channels or hparams.hidden_size,
        hparams.parameter_attention_value_channels or hparams.hidden_size,
        hparams.hidden_size, readout_filter_size or hparams.filter_size,
        hparams.num_heads,
        hparams.attention_dropout)
  elif ffn_layer == "conv_hidden_relu_with_sepconv":
    return common_layers.conv_hidden_relu(
        x,
        readout_filter_size or hparams.filter_size,
        hparams.hidden_size,
        kernel_size=(3, 1),
        second_kernel_size=(31, 1),
        padding="LEFT",
        dropout=hparams.relu_dropout)
  elif ffn_layer == "sru":
    return common_layers.sru(x)
  elif ffn_layer == "local_moe_tpu":
    overhead = (
        hparams.moe_overhead_train
        if hparams.mode == tf.estimator.ModeKeys.TRAIN else
        hparams.moe_overhead_eval)
    ret, loss = expert_utils.local_moe_tpu(
        x,
        hparams.filter_size // 2,
        hparams.hidden_size,
        hparams.moe_num_experts,
        overhead=overhead,
        loss_coef=hparams.moe_loss_coef)
    # This branch also produces an auxiliary loss; surface it like local_moe.
    if losses is None:
      raise ValueError("ffn_layer %s generates extra losses; a losses list "
                       "must be passed in." % ffn_layer)
    losses.append(loss)
    return ret
  elif ffn_layer == "local_moe":
    overhead = (
        hparams.moe_overhead_train
        if hparams.mode == tf.estimator.ModeKeys.TRAIN else
        hparams.moe_overhead_eval)
    ret, loss = expert_utils.local_moe(
        x,
        True,
        expert_utils.ffn_expert_fn(hparams.hidden_size, [hparams.filter_size],
                                   hparams.hidden_size),
        hparams.moe_num_experts,
        k=hparams.moe_k,
        hparams=hparams)
    losses.append(loss)
    return ret
  else:
    assert ffn_layer == "none"
    return x
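The pad_remover trick above is what makes the dense_relu_dense branch fast on padded batches: padding positions are dropped before the two dense layers and scattered back afterwards. Below is a minimal, illustrative sketch of that round trip (not from the original file), assuming TF 1.x and tensor2tensor's expert_utils; the toy shapes are made up.

import tensorflow as tf
from tensor2tensor.utils import expert_utils

padding = tf.constant([[0., 0., 1.],
                       [0., 1., 1.]])        # 1.0 marks padding positions
x = tf.random_normal([2, 3, 4])              # [batch, length, hidden]
pad_remover = expert_utils.PadRemover(padding)

flat = tf.reshape(x, [-1, 4])                # [batch * length, hidden]
compact = pad_remover.remove(flat)           # keeps only the three nonpadding rows
restored = pad_remover.restore(compact)      # zeros scattered back into padding rows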
Exemple #26
0
def main(argv):
  tf.logging.set_verbosity(tf.logging.INFO)

  if FLAGS.jax:
    # Setup trax FLAGS
    dataset = FLAGS.problem
    model = FLAGS.model
    data_dir = FLAGS.data_dir
    output_dir = FLAGS.output_dir
    config_file = [FLAGS.hparams_set]
    config = [
        "train.train_steps=%d" % FLAGS.train_steps,
        "train.eval_steps=%d" % FLAGS.eval_steps,
        "train.eval_frequency=%d" % FLAGS.local_eval_frequency,
    ] + str(FLAGS.hparams).split(",")

    # Copied _setup_gin exactly from trax/trainer.py and removed "FLAGS."

    def _setup_gin():
      """Setup gin configuration."""
      # Imports for configurables
      # pylint: disable=g-import-not-at-top,unused-import,g-bad-import-order,reimported,unused-variable
      from tensor2tensor.trax import inputs as _trax_inputs
      from tensor2tensor.trax import models as _trax_models
      from tensor2tensor.trax import optimizers as _trax_opt
      # pylint: disable=g-import-not-at-top,unused-import,g-bad-import-order,reimported,unused-variable

      configs = config or []
      # Override with --dataset and --model
      if dataset:
        configs.append("inputs.dataset_name='%s'" % dataset)
        configs.append("inputs.data_dir='%s'" % data_dir)
        configs.append("[email protected]")
      if model:
        configs.append("[email protected].%s" % model)
      gin.parse_config_files_and_bindings(config_file, configs)

    _setup_gin()
    trax.train(output_dir=output_dir)
    # The trax/JAX path is self-contained; skip the TF Estimator flow below.
    return

  usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)

  # If we just have to print the registry, do that and exit early.
  maybe_log_registry_and_exit()

  # Create HParams.
  if argv:
    set_hparams_from_args(argv[1:])
  hparams = create_hparams()

  if FLAGS.schedule == "train" or FLAGS.schedule == "train_eval_and_decode":
    mlperf_log.transformer_print(key=mlperf_log.RUN_START, hparams=hparams)
  if FLAGS.schedule == "run_std_server":
    run_std_server()
  mlperf_log.transformer_print(
      key=mlperf_log.RUN_SET_RANDOM_SEED, value=FLAGS.random_seed,
      hparams=hparams)
  trainer_lib.set_random_seed(FLAGS.random_seed)

  if FLAGS.cloud_mlengine:
    cloud_mlengine.launch()
    return

  if FLAGS.generate_data:
    generate_data()

  if cloud_mlengine.job_dir():
    FLAGS.output_dir = cloud_mlengine.job_dir()

  exp_fn = create_experiment_fn()
  exp = exp_fn(create_run_config(hparams), hparams)
  if is_chief():
    save_metadata(hparams)
  execute_schedule(exp)
  if FLAGS.schedule != "train":
    mlperf_log.transformer_print(key=mlperf_log.RUN_FINAL,
                                 hparams=hparams)
def transformer_encoder(encoder_input,
                        encoder_self_attention_bias,
                        hparams,
                        name="encoder",
                        nonpadding=None,
                        save_weights_to=None,
                        make_image_summary=True,
                        losses=None,
                        attn_bias_for_padding=None):
    """A stack of transformer layers.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
       (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This must either be
      passed in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias.  The knowledge about padding is used
      for pad_remover(efficiency) and to mask out padding in convolutional
      layers.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    losses: optional list onto which to append extra training losses
    attn_bias_for_padding: Padded attention bias in case a unidirectional
      encoder is being used where future attention is masked.

  Returns:
    y: a Tensor
  """
    x = encoder_input
    attention_dropout_broadcast_dims = (
        common_layers.comma_separated_string_to_integer_list(
            getattr(hparams, "attention_dropout_broadcast_dims", "")))
    mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
                                 value=hparams.num_encoder_layers
                                 or hparams.num_hidden_layers)
    mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT,
                                 value=hparams.attention_dropout)
    mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_ATTENTION_DENSE,
                                 value={
                                     "use_bias": "false",
                                     "num_heads": hparams.num_heads,
                                     "hidden_size": hparams.hidden_size
                                 })

    with tf.variable_scope(name):
        if nonpadding is not None:
            padding = 1.0 - nonpadding
        else:
            attention_bias = encoder_self_attention_bias
            if attn_bias_for_padding is not None:
                attention_bias = attn_bias_for_padding
            padding = common_attention.attention_bias_to_padding(
                attention_bias)
            nonpadding = 1.0 - padding
        pad_remover = None
        if hparams.use_pad_remover and not common_layers.is_xla_compiled():
            pad_remover = expert_utils.PadRemover(padding)
        for layer in range(hparams.num_encoder_layers
                           or hparams.num_hidden_layers):
            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    if layer < hparams.get("num_area_layers", 0):
                        max_area_width = hparams.get("max_area_width", 1)
                        max_area_height = hparams.get("max_area_height", 1)
                        memory_height = hparams.get("memory_height", 1)
                    else:
                        max_area_width = 1
                        max_area_height = 1
                        memory_height = 1
                    y = common_attention.multihead_attention(
                        common_layers.layer_preprocess(x, hparams),
                        None,
                        encoder_self_attention_bias,
                        hparams.attention_key_channels or hparams.hidden_size,
                        hparams.attention_value_channels
                        or hparams.hidden_size,
                        hparams.hidden_size,
                        hparams.num_heads,
                        hparams.attention_dropout,
                        attention_type=hparams.self_attention_type,
                        max_relative_position=hparams.max_relative_position,
                        heads_share_relative_embedding=(
                            hparams.heads_share_relative_embedding),
                        add_relative_to_values=hparams.add_relative_to_values,
                        save_weights_to=save_weights_to,
                        make_image_summary=make_image_summary,
                        dropout_broadcast_dims=attention_dropout_broadcast_dims,
                        max_length=hparams.get("max_length"),
                        vars_3d=hparams.get("attention_variables_3d"),
                        activation_dtype=hparams.get("activation_dtype",
                                                     "float32"),
                        weight_dtype=hparams.get("weight_dtype", "float32"),
                        hard_attention_k=hparams.get("hard_attention_k", 0),
                        gumbel_noise_weight=hparams.get(
                            "gumbel_noise_weight", 0.0),
                        max_area_width=max_area_width,
                        max_area_height=max_area_height,
                        memory_height=memory_height,
                        area_key_mode=hparams.get("area_key_mode", "none"),
                        area_value_mode=hparams.get("area_value_mode", "none"),
                        training=(hparams.get("mode",
                                              tf.estimator.ModeKeys.TRAIN) ==
                                  tf.estimator.ModeKeys.TRAIN))
                    x = common_layers.layer_postprocess(x, y, hparams)
                with tf.variable_scope("ffn"):
                    y = transformer_ffn_layer(common_layers.layer_preprocess(
                        x, hparams),
                                              hparams,
                                              pad_remover,
                                              conv_padding="SAME",
                                              nonpadding_mask=nonpadding,
                                              losses=losses)
                    x = common_layers.layer_postprocess(x, y, hparams)
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        mlperf_log.transformer_print(
            key=mlperf_log.MODEL_HP_NORM,
            value={"hidden_size": hparams.hidden_size})
        return common_layers.layer_preprocess(x, hparams)
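A rough usage sketch for the encoder stack above (not from the original file): the stock transformer_base() hyperparameters are enough to drive it, and the self-attention bias comes from common_attention; the toy batch below has no padding.

import tensorflow as tf
from tensor2tensor.layers import common_attention
from tensor2tensor.models import transformer

hparams = transformer.transformer_base()               # 6 layers, hidden_size 512
encoder_input = tf.zeros([2, 5, hparams.hidden_size])  # [batch, length, hidden]
memory_padding = tf.zeros([2, 5])                      # 1.0 would mark padding
bias = common_attention.attention_bias_ignore_padding(memory_padding)
encoder_output = transformer.transformer_encoder(encoder_input, bias, hparams)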
Exemple #28
0
  def __init__(self, optimizer_name, lr, hparams, use_tpu=False):  # pylint: disable=super-init-not-called
    tf.logging.info("Using optimizer %s", optimizer_name)

    mlperf_log.transformer_print(key=mlperf_log.OPT_NAME,
                                 value=optimizer_name,
                                 hparams=hparams)
    mlperf_log.transformer_print(
        key=mlperf_log.OPT_HP_ADAM_BETA1, value=hparams.optimizer_adam_beta1,
        hparams=hparams)
    mlperf_log.transformer_print(
        key=mlperf_log.OPT_HP_ADAM_BETA2, value=hparams.optimizer_adam_beta2,
        hparams=hparams)
    mlperf_log.transformer_print(
        key=mlperf_log.OPT_HP_ADAM_EPSILON,
        value=hparams.optimizer_adam_epsilon,
        hparams=hparams)

    if optimizer_name == "Adam":
      # We change the default epsilon for Adam.
      # Using LazyAdam as it's much faster for large vocabulary embeddings.
      self._opt = tf.contrib.opt.LazyAdamOptimizer(
          lr,
          beta1=hparams.optimizer_adam_beta1,
          beta2=hparams.optimizer_adam_beta2,
          epsilon=hparams.optimizer_adam_epsilon)
    elif optimizer_name == "MultistepAdam":
      self._opt = multistep_optimizer.MultistepAdamOptimizer(
          lr,
          beta1=hparams.optimizer_adam_beta1,
          beta2=hparams.optimizer_adam_beta2,
          epsilon=hparams.optimizer_adam_epsilon,
          n=hparams.optimizer_multistep_accumulate_steps)
    elif optimizer_name == "Momentum":
      self._opt = tf.train.MomentumOptimizer(
          lr,
          momentum=hparams.optimizer_momentum_momentum,
          use_nesterov=hparams.optimizer_momentum_nesterov)
    elif optimizer_name == "YellowFin":
      self._opt = yellowfin.YellowFinOptimizer(
          learning_rate=lr, momentum=hparams.optimizer_momentum_momentum)
    elif optimizer_name == "TrueAdam":
      self._opt = tf.train.AdamOptimizer(
          lr,
          beta1=hparams.optimizer_adam_beta1,
          beta2=hparams.optimizer_adam_beta2,
          epsilon=hparams.optimizer_adam_epsilon)
    elif optimizer_name == "AdamW":
      # Openai gpt used weight decay.
      # Given the internals of AdamW, weight decay dependent on the
      # learning rate is chosen to match the openai implementation.
      # The weight decay update to each parameter is applied before the adam
      # gradients computation, which is different from that described
      # in the paper and in the openai implementation:
      # https://arxiv.org/pdf/1711.05101.pdf
      self._opt = tf.contrib.opt.AdamWOptimizer(
          0.01*lr,
          lr,
          beta1=hparams.optimizer_adam_beta1,
          beta2=hparams.optimizer_adam_beta2,
          epsilon=hparams.optimizer_adam_epsilon)
    elif optimizer_name == "Adafactor":
      self._opt = adafactor.adafactor_optimizer_from_hparams(hparams, lr)
    else:
      self._opt = tf.contrib.layers.OPTIMIZER_CLS_NAMES[optimizer_name](lr)
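For context, a standalone sketch of the "Adam" branch above (TF 1.x contrib): LazyAdam only updates optimizer slots for the embedding rows that actually appear in a batch, which is why it is preferred for large vocabularies. The values below are illustrative, not pulled from any particular hparams set.

import tensorflow as tf

opt = tf.contrib.opt.LazyAdamOptimizer(
    learning_rate=2e-4,   # placeholder; normally produced by the lr schedule
    beta1=0.9,
    beta2=0.997,
    epsilon=1e-9)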
Exemple #29
0
  def dataset(self,
              mode,
              data_dir=None,
              num_threads=None,
              output_buffer_size=None,
              shuffle_files=None,
              hparams=None,
              preprocess=True,
              dataset_split=None,
              shard=None,
              partition_id=0,
              num_partitions=1,
              shuffle_buffer_size=1024,
              max_records=-1):
    """Build a Dataset for this problem.

    Args:
      mode: tf.estimator.ModeKeys; determines which files to read from.
      data_dir: directory that contains data files.
      num_threads: int, number of threads to use for decode and preprocess
        Dataset.map calls.
      output_buffer_size: int, how many elements to prefetch at end of pipeline.
      shuffle_files: whether to shuffle input files. Default behavior (i.e. when
        shuffle_files=None) is to shuffle if mode == TRAIN.
      hparams: HParams; hparams to be passed to
        Problem.preprocess_example and Problem.hparams. If None, will use a
        default set that is a no-op.
      preprocess: bool, whether to map the Dataset through
        Problem.preprocess_example.
      dataset_split: DatasetSplit, which split to read data
        from (TRAIN:"-train", EVAL:"-dev", "test":"-test"). Defaults to mode.
      shard: int, if provided, will only read data from the specified shard.
      partition_id: integer - which partition of the dataset to read from
      num_partitions: how many partitions in the dataset
      shuffle_buffer_size: if shuffle_files is True, this is the buffer size
        used to shuffle records.
      max_records: int, number of records to truncate to.

    Returns:
      Dataset containing dict<feature name, Tensor>.

    Raises:
      ValueError: if num_partitions is greater than the number of data files.
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    shuffle_files = shuffle_files or shuffle_files is None and is_training

    dataset_split = dataset_split or mode
    assert data_dir

    if hparams is None:
      hparams = default_model_hparams()

    if not hasattr(hparams, "data_dir"):
      hparams.add_hparam("data_dir", data_dir)
    if not hparams.data_dir:
      hparams.data_dir = data_dir
    # Construct the Problem's hparams so that items within it are accessible
    _ = self.get_hparams(hparams)

    data_filepattern = self.filepattern(data_dir, dataset_split, shard=shard)
    tf.logging.info("Reading data files from %s", data_filepattern)
    data_files = sorted(slim.parallel_reader.get_data_files(
        data_filepattern))

    # Functions used in dataset transforms below. `filenames` can be either a
    # `tf.string` tensor or `tf.data.Dataset` containing one or more filenames.
    def _load_records_and_preprocess(filenames):
      """Reads files from a string tensor or a dataset of filenames."""
      # Load records from file(s) with an 8MiB read buffer.
      dataset = tf.data.TFRecordDataset(filenames, buffer_size=8 * 1024 * 1024)
      # Decode.
      dataset = dataset.map(self.decode_example, num_parallel_calls=num_threads)
      # Preprocess if requested.
      # Note that preprocessing should happen per-file as order may matter.
      if preprocess:
        dataset = self.preprocess(dataset, mode, hparams,
                                  interleave=shuffle_files)
      return dataset

    if len(data_files) < num_partitions:
      raise ValueError(
          "number of data files (%d) must be at least the number of hosts (%d)"
          % (len(data_files), num_partitions))
    data_files = [f for (i, f) in enumerate(data_files)
                  if i % num_partitions == partition_id]
    tf.logging.info(
        "partition: %d num_data_files: %d" % (partition_id, len(data_files)))
    if shuffle_files:
      mlperf_log.transformer_print(key=mlperf_log.INPUT_ORDER)
      random.shuffle(data_files)

    dataset = tf.data.Dataset.from_tensor_slices(tf.constant(data_files))
    # Create data-set from files by parsing, pre-processing and interleaving.
    if shuffle_files:
      dataset = dataset.apply(
          tf.data.experimental.parallel_interleave(
              _load_records_and_preprocess, sloppy=True, cycle_length=8))
    else:
      dataset = _load_records_and_preprocess(dataset)

    dataset = dataset.map(
        self.maybe_reverse_and_copy, num_parallel_calls=num_threads)
    dataset = dataset.take(max_records)

    ## Shuffle records only for training examples.
    if shuffle_files and is_training:
      dataset = dataset.shuffle(shuffle_buffer_size)
    if hparams.get("pack_dataset", False):
      dataset = generator_utils.pack_dataset(
          dataset, hparams.max_length, keys=["inputs", "targets"],
          use_custom_ops=hparams.get("use_custom_ops", False))
    if output_buffer_size:
      dataset = dataset.prefetch(output_buffer_size)

    return dataset
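A rough usage sketch for the dataset() method above, assuming the problem's TFRecords have already been generated under the (placeholder) data_dir:

import tensorflow as tf
from tensor2tensor import problems

problem = problems.problem("translate_ende_wmt32k")
dataset = problem.dataset(tf.estimator.ModeKeys.TRAIN, data_dir="/tmp/t2t_data")
features = dataset.make_one_shot_iterator().get_next()  # dict with "inputs" and "targets" tensors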
Exemple #30
0
def moe_transformer_decoder(decoder_input,
                            encoder_output,
                            decoder_self_attention_bias,
                            encoder_decoder_attention_bias,
                            hparams,
                            cache=None,
                            decode_loop_step=None,
                            name="decoder",
                            save_weights_to=None,
                            make_image_summary=True,
                            layer_collection=None,
                            recurrent_memory_by_layer=None,
                            chunk_number=None):
  """A stack of transformer layers.
  Args:
    decoder_input: a Tensor
    encoder_output: a Tensor
    decoder_self_attention_bias: bias Tensor for self-attention (see
      common_attention.attention_bias())
    encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention
      (see common_attention.attention_bias())
    hparams: hyperparameters for model
    cache: dict, containing tensors which are the results of previous
      attentions, used for fast decoding.
    decode_loop_step: An integer, step number of the decoding loop. Only used
      for inference on TPU.
    name: a string
    save_weights_to: an optional dictionary to capture attention weights for
      visualization; the weights tensor will be appended there under a string
      key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    layer_collection: A tensorflow_kfac.LayerCollection. Only used by the KFAC
      optimizer. Default is None.
    recurrent_memory_by_layer: Optional dict, mapping layer names to instances
      of transformer_memory.RecurrentMemory. Default is None.
    chunk_number: an optional integer Tensor with shape [batch] used to operate
      the recurrent_memory.
  Returns:
    y: a Tensor
  """
  x = decoder_input

  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
      value=hparams.num_decoder_layers or hparams.num_hidden_layers)
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT,
      value=hparams.attention_dropout)
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_ATTENTION_DENSE,
      value={
          "use_bias": "false",
          "num_heads": hparams.num_heads,
          "hidden_size": hparams.hidden_size
      })

  with tf.variable_scope(name):
    for layer_idx in range(hparams.num_decoder_layers or
                           hparams.num_hidden_layers):
      x = moe_transformer_decoder_layer(
          x,
          decoder_self_attention_bias,
          layer_idx,
          hparams,
          encoder_decoder_attention_bias=encoder_decoder_attention_bias,
          encoder_output=encoder_output,
          cache=cache,
          decode_loop_step=decode_loop_step,
          save_weights_to=save_weights_to,
          make_image_summary=make_image_summary,
          layer_collection=layer_collection,
          recurrent_memory_by_layer=recurrent_memory_by_layer,
          chunk_number=chunk_number
          )

    # if normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NORM,
        value={"hidden_size": hparams.hidden_size})
    return common_layers.layer_preprocess(
        x, hparams)
def mel_perf_transformer_encode(encoder_function,
                                perf_inputs,
                                mel_inputs,
                                target_space,
                                hparams,
                                attention_weights=None,
                                features=None,
                                losses=None,
                                prepare_encoder_fn=None,
                                **kwargs):
    """Encode transformer inputs. Used for melody & performance autoencoder.

  Performance is mean-aggregated across time and combined with melody in a
  variety of different ways.

  Args:
    encoder_function: the encoder function
    perf_inputs: Transformer inputs [batch_size, input_length, 1, hidden_dim]
      which will be flattened along the two spatial dimensions.
    mel_inputs: Transformer inputs [batch_size, input_length, 1, hidden_dim]
      which will be flattened along the two spatial dimensions.
    target_space: scalar, target space ID.
    hparams: hyperparameters for model.
    attention_weights: weight to store attention to.
    features: optionally pass the entire features dictionary as well. This is
      needed now for "packed" datasets.
    losses: optional list onto which to append extra training losses
    prepare_encoder_fn: optional, alternative to transformer_prepare_encoder.
    **kwargs: additional arguments to pass to encoder_function

  Returns:
    Tuple of:
        encoder_output: Encoder representation.
            [batch_size, input_length, hidden_dim]
        encoder_decoder_attention_bias: Bias and mask weights for
            encoder-decoder attention. [batch_size, input_length]
  """
    perf_inputs = common_layers.flatten4d3d(perf_inputs)
    mel_inputs = common_layers.flatten4d3d(mel_inputs)

    if not prepare_encoder_fn:
        prepare_encoder_fn = transformer_prepare_encoder
    perf_encoder_input, perf_self_attention_bias, perf_encdec_attention_bias = (
        prepare_encoder_fn(perf_inputs,
                           target_space,
                           hparams,
                           features=features,
                           reuse_target_embedding=tf.AUTO_REUSE))

    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_LAYER_POSTPROCESS_DROPOUT,
        value=hparams.layer_prepostprocess_dropout,
        hparams=hparams)

    perf_encoder_input = tf.nn.dropout(
        perf_encoder_input, 1.0 - hparams.layer_prepostprocess_dropout)

    perf_attn_bias_for_padding = None
    # Otherwise the encoder will just use encoder_self_attention_bias.
    if hparams.unidirectional_encoder:
        perf_attn_bias_for_padding = perf_encdec_attention_bias

    # do the same thing for melody
    mel_encoder_input, mel_self_attention_bias, mel_encdec_attention_bias = (
        prepare_encoder_fn(mel_inputs,
                           target_space,
                           hparams,
                           features=features,
                           reuse_target_embedding=tf.AUTO_REUSE))

    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_LAYER_POSTPROCESS_DROPOUT,
        value=hparams.layer_prepostprocess_dropout,
        hparams=hparams)

    mel_encoder_input = tf.nn.dropout(
        mel_encoder_input, 1.0 - hparams.layer_prepostprocess_dropout)

    mel_attn_bias_for_padding = None
    # Otherwise the encoder will just use encoder_self_attention_bias.
    if hparams.unidirectional_encoder:
        mel_attn_bias_for_padding = mel_encdec_attention_bias

    # use the proper encoder function for perf/melody
    perf_encoder_output = encoder_function(
        perf_encoder_input,
        perf_self_attention_bias,
        hparams,
        name="perf_encoder",
        nonpadding=features_to_nonpadding(features, "inputs"),
        save_weights_to=attention_weights,
        make_image_summary=not common_layers.is_xla_compiled(),
        losses=losses,
        attn_bias_for_padding=perf_attn_bias_for_padding,
        **kwargs)
    # same thing for melody
    mel_encoder_output = encoder_function(
        mel_encoder_input,
        mel_self_attention_bias,
        hparams,
        name="mel_encoder",
        nonpadding=features_to_nonpadding(features, "inputs"),
        save_weights_to=attention_weights,
        make_image_summary=not common_layers.is_xla_compiled(),
        losses=losses,
        attn_bias_for_padding=mel_attn_bias_for_padding,
        **kwargs)

    # concatenate the global mean vector/bias term with the full melody encoding
    perf_mean_vector = tf.math.reduce_mean(perf_encoder_output,
                                           axis=1,
                                           keep_dims=True)

    # different methods of aggregating over the performance + melody vectors!
    if hparams.aggregation == "sum":
        # add both mean performance and melody vectors together
        perf_mean_bias = tf.math.reduce_mean(perf_encdec_attention_bias,
                                             axis=-1,
                                             keep_dims=True)
        encoder_output = mel_encoder_output + perf_mean_vector
        encoder_decoder_attention_bias = mel_encdec_attention_bias + perf_mean_bias
    elif hparams.aggregation == "concat":
        # concatenate melody with mean-aggregated performance embedding
        stop_token = tf.zeros((1, 1, 384))
        encoder_output = tf.concat(
            [mel_encoder_output, stop_token, perf_mean_vector], axis=1)
        perf_mean_bias = tf.math.reduce_mean(perf_encdec_attention_bias,
                                             axis=-1,
                                             keep_dims=True)
        stop_bias = tf.zeros((1, 1, 1, 1))
        encoder_decoder_attention_bias = tf.concat(
            [mel_encdec_attention_bias, stop_bias, perf_mean_bias], axis=-1)
    elif hparams.aggregation == "tile":
        # tile performance embedding across each dimension of melody embedding!
        dynamic_val = tf.shape(mel_encoder_output)[1]
        shp = tf.convert_to_tensor([1, dynamic_val, 1], dtype=tf.int32)
        tiled_mean = tf.tile(perf_mean_vector, shp)

        encoder_output = tf.concat([mel_encoder_output, tiled_mean], axis=-1)
        encoder_decoder_attention_bias = mel_encdec_attention_bias
    else:
        raise NotImplementedError(
            "aggregation method must be in [sum, concat, tile].")

    return encoder_output, encoder_decoder_attention_bias
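A shape-only sketch of the "tile" aggregation above (toy sizes; the 384 matches the hard-coded stop-token width in the snippet): the mean performance vector is repeated along the melody time axis and concatenated on the channel axis.

import tensorflow as tf

mel_encoding = tf.zeros([2, 10, 384])   # [batch, mel_length, hidden]
perf_mean = tf.zeros([2, 1, 384])       # performance mean over its time axis
multiples = tf.convert_to_tensor([1, tf.shape(mel_encoding)[1], 1], dtype=tf.int32)
tiled = tf.tile(perf_mean, multiples)
combined = tf.concat([mel_encoding, tiled], axis=-1)   # [2, 10, 768]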
Exemple #32
0
def generate_files(generator,
                   output_filenames,
                   max_cases=None,
                   cycle_every_n=1):
    """Generate cases from a generator and save as TFRecord files.

  Generated cases are transformed to tf.Example protos and saved as TFRecords
  in sharded files named output_dir/output_name-00..N-of-00..M=num_shards.

  Args:
    generator: a generator yielding (string -> int/float/str list) dictionaries.
    output_filenames: List of output file paths.
    max_cases: maximum number of cases to get from the generator;
      if None (default), we use the generator until StopIteration is raised.
    cycle_every_n: how many cases from the generator to take before
      switching to the next shard; by default set to 1, switch every case.
  """
    if outputs_exist(output_filenames):
        tf.logging.info(
            "Skipping generator because outputs files exists at {}".format(
                output_filenames))
        return
    tmp_filenames = [fname + ".incomplete" for fname in output_filenames]
    num_shards = len(output_filenames)
    # Check if is training or eval, ref: train_data_filenames().
    if num_shards > 0:
        if "-train" in output_filenames[0]:
            tag = "train"
        elif "-dev" in output_filenames[0]:
            tag = "eval"
        else:
            tag = "other"

    writers = [tf.python_io.TFRecordWriter(fname) for fname in tmp_filenames]
    counter, shard = 0, 0
    for case in generator:
        if case is None:
            continue
        if counter % 100000 == 0:
            tf.logging.info("Generating case %d." % counter)
        counter += 1
        if max_cases and counter > max_cases:
            break
        example = to_example(case)
        writers[shard].write(example.SerializeToString())
        if counter % cycle_every_n == 0:
            shard = (shard + 1) % num_shards

    for writer in writers:
        writer.close()

    for tmp_name, final_name in zip(tmp_filenames, output_filenames):
        tf.gfile.Rename(tmp_name, final_name)

    if num_shards > 0:
        if tag == "train":
            mlperf_log.transformer_print(
                key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES, value=counter)
        elif tag == "eval":
            mlperf_log.transformer_print(
                key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES, value=counter)

    tf.logging.info("Generated %s Examples", counter)
def perf_transformer_encode(encoder_function,
                            inputs,
                            target_space,
                            hparams,
                            baseline,
                            attention_weights=None,
                            features=None,
                            losses=None,
                            prepare_encoder_fn=None,
                            **kwargs):
    """Encoding for performance autoencoder, which mean-aggregates across time.

  Args:
    encoder_function: the encoder function
    inputs: Transformer inputs [batch_size, input_length, 1, hidden_dim] which
      will be flattened along the two spatial dimensions.
    target_space: scalar, target space ID.
    hparams: hyperparameters for model.
    baseline: if True, does not mean-aggregate the encoder output.
    attention_weights: weight to store attention to.
    features: optionally pass the entire features dictionary as well. This is
      needed now for "packed" datasets.
    losses: optional list onto which to append extra training losses
    prepare_encoder_fn: optional, alternative to transformer_prepare_encoder.
    **kwargs: additional arguments to pass to encoder_function

  Returns:
    Tuple of:
        encoder_output: Encoder representation.
            [batch_size, input_length, hidden_dim]
        encoder_decoder_attention_bias: Bias and mask weights for
            encoder-decoder attention. [batch_size, input_length]
  """
    inputs = common_layers.flatten4d3d(inputs)

    if not prepare_encoder_fn:
        prepare_encoder_fn = transformer_prepare_encoder
    encoder_input, self_attention_bias, encoder_decoder_attention_bias = (
        prepare_encoder_fn(inputs,
                           target_space,
                           hparams,
                           features=features,
                           reuse_target_embedding=tf.AUTO_REUSE))

    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_LAYER_POSTPROCESS_DROPOUT,
        value=hparams.layer_prepostprocess_dropout,
        hparams=hparams)

    encoder_input = tf.nn.dropout(encoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)

    attn_bias_for_padding = None
    # Otherwise the encoder will just use encoder_self_attention_bias.
    if hparams.unidirectional_encoder:
        attn_bias_for_padding = encoder_decoder_attention_bias

    encoder_output = encoder_function(
        encoder_input,
        self_attention_bias,
        hparams,
        name="encoder",
        nonpadding=features_to_nonpadding(features, "inputs"),
        save_weights_to=attention_weights,
        make_image_summary=not common_layers.is_xla_compiled(),
        losses=losses,
        attn_bias_for_padding=attn_bias_for_padding,
        **kwargs)

    if not baseline:
        encoder_output = tf.math.reduce_mean(encoder_output,
                                             axis=1,
                                             keep_dims=True)
        encoder_decoder_attention_bias = tf.math.reduce_mean(
            encoder_decoder_attention_bias, axis=-1, keep_dims=True)

    return encoder_output, encoder_decoder_attention_bias
Exemple #34
0
def transformer_ffn_layer(x,
                          hparams,
                          pad_remover=None,
                          conv_padding="LEFT",
                          nonpadding_mask=None,
                          losses=None,
                          cache=None,
                          decode_loop_step=None,
                          readout_filter_size=0):
  """Feed-forward layer in the transformer.

  Args:
    x: a Tensor of shape [batch_size, length, hparams.hidden_size]
    hparams: hyperparameters for model
    pad_remover: an expert_utils.PadRemover object tracking the padding
      positions. If provided, when using convolutional settings, the padding
      is removed before applying the convolution, and restored afterward. This
      can give a significant speedup.
    conv_padding: a string - either "LEFT" or "SAME".
    nonpadding_mask: an optional Tensor with shape [batch_size, length].
      needed for convolutional layers with "SAME" padding.
      Contains 1.0 in positions corresponding to nonpadding.
    losses: optional list onto which to append extra training losses
    cache: dict, containing tensors which are the results of previous
        attentions, used for fast decoding.
    decode_loop_step: An integer, step number of the decoding loop.
        Only used for inference on TPU.
    readout_filter_size: if greater than 0, used in place of hparams.filter_size.

  Returns:
    a Tensor of shape [batch_size, length, hparams.hidden_size]

  Raises:
    ValueError: If losses arg is None, but layer generates extra losses.
  """
  ffn_layer = hparams.ffn_layer
  relu_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "relu_dropout_broadcast_dims", "")))
  if ffn_layer == "conv_hidden_relu":
    # Backwards compatibility
    ffn_layer = "dense_relu_dense"
  if ffn_layer == "dense_relu_dense":
    # In simple convolution mode, use `pad_remover` to speed up processing.
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_FFN_FILTER_DENSE,
        value={
            "filter_size": hparams.filter_size,
            "use_bias": "True",
            "activation": mlperf_log.RELU
        })
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_FFN_OUTPUT_DENSE,
        value={
            "hidden_size": hparams.hidden_size,
            "use_bias": "True",
        })
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_RELU_DROPOUT, value=hparams.relu_dropout)
    if pad_remover:
      original_shape = common_layers.shape_list(x)
      # Collapse `x` across examples, and remove padding positions.
      x = tf.reshape(x, tf.concat([[-1], original_shape[2:]], axis=0))
      x = tf.expand_dims(pad_remover.remove(x), axis=0)
    conv_output = common_layers.dense_relu_dense(
        x,
        hparams.filter_size,
        hparams.hidden_size,
        dropout=hparams.relu_dropout,
        dropout_broadcast_dims=relu_dropout_broadcast_dims)
    if pad_remover:
      # Restore `conv_output` to the original shape of `x`, including padding.
      conv_output = tf.reshape(
          pad_remover.restore(tf.squeeze(conv_output, axis=0)), original_shape)
    return conv_output
  elif ffn_layer == "conv_relu_conv":
    return common_layers.conv_relu_conv(
        x,
        readout_filter_size or hparams.filter_size,
        hparams.hidden_size,
        first_kernel_size=hparams.conv_first_kernel,
        second_kernel_size=1,
        padding=conv_padding,
        nonpadding_mask=nonpadding_mask,
        dropout=hparams.relu_dropout,
        cache=cache,
        decode_loop_step=decode_loop_step)
  elif ffn_layer == "parameter_attention":
    return common_attention.parameter_attention(
        x, hparams.parameter_attention_key_channels or hparams.hidden_size,
        hparams.parameter_attention_value_channels or hparams.hidden_size,
        hparams.hidden_size, readout_filter_size or hparams.filter_size,
        hparams.num_heads,
        hparams.attention_dropout)
  elif ffn_layer == "conv_hidden_relu_with_sepconv":
    return common_layers.conv_hidden_relu(
        x,
        readout_filter_size or hparams.filter_size,
        hparams.hidden_size,
        kernel_size=(3, 1),
        second_kernel_size=(31, 1),
        padding="LEFT",
        dropout=hparams.relu_dropout)
  elif ffn_layer == "sru":
    return common_layers.sru(x)
  elif ffn_layer == "local_moe_tpu":
    overhead = (
        hparams.moe_overhead_train
        if hparams.mode == tf.estimator.ModeKeys.TRAIN else
        hparams.moe_overhead_eval)
    ret, loss = expert_utils.local_moe_tpu(
        x,
        hparams.filter_size // 2,
        hparams.hidden_size,
        hparams.moe_num_experts,
        overhead=overhead,
        loss_coef=hparams.moe_loss_coef)
    # This branch also produces an auxiliary loss; surface it like local_moe.
    if losses is None:
      raise ValueError("ffn_layer %s generates extra losses; a losses list "
                       "must be passed in." % ffn_layer)
    losses.append(loss)
    return ret
  elif ffn_layer == "local_moe":
    overhead = (
        hparams.moe_overhead_train
        if hparams.mode == tf.estimator.ModeKeys.TRAIN else
        hparams.moe_overhead_eval)
    ret, loss = expert_utils.local_moe(
        x,
        True,
        expert_utils.ffn_expert_fn(hparams.hidden_size, [hparams.filter_size],
                                   hparams.hidden_size),
        hparams.moe_num_experts,
        k=hparams.moe_k,
        hparams=hparams)
    losses.append(loss)
    return ret
  else:
    assert ffn_layer == "none"
    return x
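The branch taken above is driven entirely by hparams.ffn_layer; a minimal sketch with toy shapes and stock hyperparameters (not from the original file):

import tensorflow as tf
from tensor2tensor.models import transformer

hparams = transformer.transformer_base()      # ffn_layer == "dense_relu_dense"
x = tf.zeros([2, 7, hparams.hidden_size])
ffn_out = transformer.transformer_ffn_layer(x, hparams)

hparams.ffn_layer = "none"
identity = transformer.transformer_ffn_layer(x, hparams)   # returns x unchanged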
Exemple #35
0
def input_fn(dataset,
             filepattern,
             skip_random_fraction_when_training,
             batch_size_means_tokens_param,
             batch_size_multiplier,
             max_length,
             mode,
             hparams,
             data_dir=None,
             params=None,
             config=None,
             force_repeat=False,
             prevent_repeat=False):
    """Builds input pipeline for problem.

  Args:
    dataset: the dataset to make input function from.
    filepattern: the pattern of files to read from.
    skip_random_fraction_when_training: whether to skip randomly when training.
    batch_size_means_tokens_param: whether batch size should mean tokens.
    batch_size_multiplier: how to multiply batch size when bucketing.
    max_length: maximum length,
    mode: tf.estimator.ModeKeys
    hparams: HParams, model hparams
    data_dir: str, data directory; if None, will use hparams.data_dir
    params: dict, may include "batch_size"
    config: RunConfig; should have the data_parallelism attribute if not using
      TPU
    force_repeat: bool, whether to repeat the data even if not training
    prevent_repeat: bool, whether to not repeat when in training mode.
      Overrides force_repeat.

  Returns:
    (features_dict<str name, Tensor feature>, Tensor targets)
  """
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    if config and config.use_tpu:
        num_threads = 64
    else:
        num_threads = cpu_count() if is_training else 1

    if config and hasattr(config,
                          "data_parallelism") and config.data_parallelism:
        num_shards = config.data_parallelism.n
    else:
        num_shards = 1

    mlperf_log.transformer_print(key=mlperf_log.INPUT_MAX_LENGTH,
                                 value=max_length)

    def tpu_valid_size(example):
        return example_valid_size(example, hparams.min_length, max_length)

    def gpu_valid_size(example):
        drop_long_sequences = is_training or hparams.eval_drop_long_sequences
        max_validate_length = max_length if drop_long_sequences else 10**9
        return example_valid_size(example, hparams.min_length,
                                  max_validate_length)

    def define_shapes(example):
        batch_size = config and config.use_tpu and params["batch_size"]
        return standardize_shapes(example, batch_size=batch_size)

    # Read and preprocess
    data_dir = data_dir or (hasattr(hparams, "data_dir") and hparams.data_dir)

    if (force_repeat or is_training) and not prevent_repeat:
        # Repeat and skip a random number of records
        dataset = dataset.repeat()

    if is_training and skip_random_fraction_when_training:
        data_files = contrib.slim().parallel_reader.get_data_files(filepattern)
        #  In continuous_train_and_eval when switching between train and
        #  eval, this input_fn method gets called multiple times and it
        #  would give you the exact same samples from the last call
        #  (because the Graph seed is set). So this skip gives you some
        #  shuffling.
        dataset = skip_random_fraction(dataset, data_files[0])

    dataset = dataset.map(cast_ints_to_int32, num_parallel_calls=num_threads)

    if batch_size_means_tokens_param:
        batch_size_means_tokens = True
    else:
        if _are_shapes_fully_defined(dataset.output_shapes):
            batch_size_means_tokens = False
        else:
            tf.logging.warning(
                "Shapes are not fully defined. Assuming batch_size means tokens."
            )
            batch_size_means_tokens = True

    # Batching
    if not batch_size_means_tokens:
        # Batch size means examples per datashard.
        if config and config.use_tpu:
            # on TPU, we use params["batch_size"], which specifies the number of
            # examples across all datashards
            batch_size = params["batch_size"]
            dataset = dataset.batch(batch_size, drop_remainder=True)
        else:
            batch_size = hparams.batch_size * num_shards
            dataset = dataset.batch(batch_size)
    else:
        # batch_size means tokens per datashard
        if config and config.use_tpu:
            dataset = dataset.filter(tpu_valid_size)
            padded_shapes = pad_for_tpu(dataset.output_shapes, hparams,
                                        max_length)
            # on TPU, we use params["batch_size"], which specifies the number of
            # examples across all datashards
            batch_size = params["batch_size"]
            if hparams.pad_batch:
                tf.logging.warn(
                    "Padding the batch to ensure that remainder eval batches are "
                    "processed. This may lead to incorrect metrics for "
                    "non-zero-padded features, e.g. images. Use a smaller batch "
                    "size that has no remainder in that case.")
                dataset = dataset.padded_batch(batch_size,
                                               padded_shapes,
                                               drop_remainder=False)
                dataset = dataset.map(functools.partial(
                    pad_batch, batch_multiple=batch_size),
                                      num_parallel_calls=num_threads)
            else:
                dataset = dataset.padded_batch(batch_size,
                                               padded_shapes,
                                               drop_remainder=True)
        else:
            # On GPU, bucket by length
            dataset = dataset.filter(gpu_valid_size)
            cur_batching_scheme = hparams_to_batching_scheme(
                hparams,
                shard_multiplier=num_shards,
                length_multiplier=batch_size_multiplier)
            if hparams.use_fixed_batch_size:
                # Here  batch_size really means examples per datashard.
                cur_batching_scheme["batch_sizes"] = [hparams.batch_size]
                cur_batching_scheme["boundaries"] = []
            dataset = dataset.apply(
                tf.data.experimental.bucket_by_sequence_length(
                    example_length, cur_batching_scheme["boundaries"],
                    cur_batching_scheme["batch_sizes"]))

            if not is_training:
                batch_multiple = num_shards
                if hparams.use_fixed_batch_size:
                    # Make sure the last batch has the same fixed size as the rest.
                    batch_multiple *= hparams.batch_size
                if batch_multiple > 1:
                    tf.logging.warn(
                        "Padding the batch to ensure that remainder eval batches have "
                        "a batch size divisible by the number of data shards. This may "
                        "lead to incorrect metrics for non-zero-padded features, e.g. "
                        "images. Use a single datashard (i.e. 1 GPU) in that case."
                    )
                    dataset = dataset.map(functools.partial(
                        pad_batch, batch_multiple=batch_multiple),
                                          num_parallel_calls=num_threads)

    dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)

    # Add shuffling for training batches. This is necessary along with record
    # level shuffling in the dataset generation. Record shuffling will shuffle
    # the examples. However, in some cases, it's possible that the shuffle
    # buffer size for record shuffling is smaller than the batch size. In such
    # cases, adding batch shuffling ensures that the data is in random order
    # during training
    if (is_training and hasattr(hparams, "batch_shuffle_size")
            and hparams.batch_shuffle_size):
        dataset = dataset.shuffle(hparams.batch_shuffle_size)

    # Split batches into chunks if targets are too long.
    # The new "chunk_number" feature is 0 for the first chunk and goes up then.
    # Chunks are reversed so the 0th chunk comes first, then the 1st and so on,
    # so models can attend to them in the order they arrive. The last chunk is
    # usually the one containing the end of the target sentence (EOS).
    chunk_length = hparams.get("split_targets_chunk_length", 0)
    max_chunks = hparams.get("split_targets_max_chunks", 100)
    if chunk_length > 0:

        def is_nonzero_chunk(example):
            """A chunk is zero if all targets are 0s."""
            return tf.less(0, tf.reduce_sum(tf.abs(example["targets"])))

        def split_on_length(example):
            """Split a batch of ditcs on length."""
            x = example["targets"]
            # TODO(kitaev): This code breaks if chunk_length * max_chunks < batch_size
            length_diff = chunk_length * max_chunks - tf.shape(x)[1]
            padded_x = tf.pad(x, [(0, 0), (0, length_diff), (0, 0), (0, 0)])
            chunks = [
                padded_x[:, i * chunk_length:(i + 1) * chunk_length, :, :]
                for i in range(max_chunks - 1)
            ]
            chunks.append(padded_x[:, (max_chunks - 1) * chunk_length:, :, :])
            new_example = {}
            # Setting chunk_number to be tf.range(max_chunks) is incompatible with TPU
            new_example["chunk_number"] = tf.concat([
                tf.expand_dims(tf.ones_like(c) * n, axis=0)
                for n, c in enumerate(chunks)
            ],
                                                    axis=0)
            new_example["targets"] = tf.concat(
                [tf.expand_dims(c, axis=0) for c in chunks], axis=0)
            for k in example:
                if k != "targets":
                    assert k != "chunk_number", (
                        "Chunking code expects the chunk_number feature name to be "
                        "available")
                    new_example[k] = tf.concat([
                        tf.expand_dims(example[k], axis=0)
                        for _ in range(max_chunks)
                    ],
                                               axis=0)
            return tf.data.Dataset.from_tensor_slices(new_example)

        dataset = dataset.flat_map(split_on_length)
        dataset = dataset.filter(is_nonzero_chunk)

        # The chunking data pipeline thus far creates batches of examples where all
        # of the examples have the same chunk number. This can lead to periodic
        # fluctuations in the loss; for example, when all examples in the batch have
        # chunk number 0 the loss may be higher than midway through a sequence.
        # Enabling split_targets_strided_training adjusts the data so that each
        # batch includes examples at various points within a sequence.
        if is_training and hparams.split_targets_strided_training:
            # TODO(kitaev): make sure that shape inference works on GPU, not just TPU.
            inferred_batch_size = dataset.output_shapes["targets"].as_list()[0]
            if inferred_batch_size is None:
                raise ValueError(
                    "Strided training is only implemented when the batch size can be "
                    "inferred statically, for example when training on TPU.")
            chunk_stride = inferred_batch_size * max(
                1, max_chunks // inferred_batch_size) + 1

            def collapse_nested_datasets(example):
                """Converts a dataset of datasets to a dataset of tensor features."""
                new_example = {}
                for k, v in example.items():
                    v = tf.data.experimental.get_single_element(
                        v.batch(inferred_batch_size, drop_remainder=True))
                    new_example[k] = v
                return tf.data.Dataset.from_tensor_slices(new_example)

            dataset = dataset.unbatch()
            dataset = dataset.window(inferred_batch_size, inferred_batch_size,
                                     chunk_stride)
            dataset = dataset.flat_map(collapse_nested_datasets)
            dataset = dataset.batch(inferred_batch_size, drop_remainder=True)

    def prepare_for_output(example):
        if not config or not config.use_tpu:
            _summarize_features(example, num_shards)
        if mode == tf.estimator.ModeKeys.PREDICT:
            example["infer_targets"] = example.pop("targets")
            return example
        else:
            return example, example[hparams.get(key="labels_feature_name",
                                                default="targets")]

    dataset = dataset.map(prepare_for_output, num_parallel_calls=num_threads)
    dataset = dataset.prefetch(2)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # This is because of a bug in the Estimator that short-circuits prediction
        # if it doesn't see a QueueRunner. DummyQueueRunner implements the
        # minimal expected interface but does nothing.
        tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS, DummyQueueRunner())

    return dataset
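The GPU path above relies on tf.data.experimental.bucket_by_sequence_length to batch examples of similar length together. A self-contained toy sketch of that primitive (the boundaries and batch sizes are made up, not what hparams_to_batching_scheme would produce):

import tensorflow as tf

def gen():
  for n in (2, 5, 3, 9, 1):
    yield {"targets": list(range(n))}

ds = tf.data.Dataset.from_generator(
    gen,
    output_types={"targets": tf.int32},
    output_shapes={"targets": tf.TensorShape([None])})
ds = ds.apply(tf.data.experimental.bucket_by_sequence_length(
    element_length_func=lambda ex: tf.shape(ex["targets"])[0],
    bucket_boundaries=[4, 8],
    bucket_batch_sizes=[4, 2, 1]))   # one more batch size than boundaries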
Exemple #36
0
def decode_once(estimator,
                problem_name,
                hparams,
                infer_input_fn,
                decode_hp,
                decode_to_file,
                output_dir,
                log_results=True,
                checkpoint_path=None):
  """Decodes once.

  Args:
    estimator: tf.estimator.Estimator instance. Used to generate encoded
      predictions.
    problem_name: str. Name of problem.
    hparams: tf.HParams instance. HParams for model training.
    infer_input_fn: zero-arg function. Input function for estimator.
    decode_hp: tf.HParams instance. See decode_hparams() above.
    decode_to_file: str. Prefix for filenames. Used to generated filenames to
      which decoded predictions are written.
    output_dir: str. Output directory. Only used for writing images.
    log_results: bool. If False, return encoded predictions without any
      further processing.
    checkpoint_path: str. Path to load model checkpoint from. If unspecified,
      Estimator's default is used.

  Returns:
    If decode_hp.decode_in_memory is True:
      List of dicts, one per example. Values are either numpy arrays or decoded
      strings.
    If decode_hp.decode_in_memory is False:
      An empty list.
  """

  # Get the predictions as an iterable
  predictions = estimator.predict(infer_input_fn,
                                  checkpoint_path=checkpoint_path)

  if not log_results:
    return list(predictions)

  # Prepare output file writers if decode_to_file passed
  decode_to_file = decode_to_file or decode_hp.decode_to_file
  if decode_to_file:
    output_filepath = _decode_filename(decode_to_file, problem_name, decode_hp)
    parts = output_filepath.split(".")
    parts[-1] = "targets"
    target_filepath = ".".join(parts)
    parts[-1] = "inputs"
    input_filepath = ".".join(parts)

    output_file = tf.gfile.Open(output_filepath, "w")
    target_file = tf.gfile.Open(target_filepath, "w")
    input_file = tf.gfile.Open(input_filepath, "w")

  problem_hparams = hparams.problem_hparams
  # Inputs vocabulary is set to targets if there are no inputs in the problem,
  # e.g., for language models where the inputs are just a prefix of targets.
  has_input = "inputs" in problem_hparams.vocabulary
  inputs_vocab_key = "inputs" if has_input else "targets"
  inputs_vocab = problem_hparams.vocabulary[inputs_vocab_key]
  targets_vocab = problem_hparams.vocabulary["targets"]

  num_eval_samples = 0

  # all_outputs[i][j] = (input: str, output: str, target: str). Input,
  # decoded output, and target strings for example i, beam rank j.
  all_outputs = []
  for num_predictions, prediction in enumerate(predictions):
    num_eval_samples += 1
    num_predictions += 1
    inputs = prediction.get("inputs")
    targets = prediction.get("targets")
    outputs = prediction.get("outputs")

    # Log predictions
    decoded_outputs = []  # [(str, str, str)]. See all_outputs above.
    if decode_hp.decode_in_memory:
      all_outputs.append(decoded_outputs)
    decoded_scores = []
    if decode_hp.return_beams or decode_hp.multi_targets:
      output_beams = np.split(outputs, decode_hp.beam_size, axis=0)
      scores = None
      if "scores" in prediction:
        scores = np.split(prediction["scores"], 
                          prediction["scores"].shape[0],
                          axis=0)
      target_i = targets
      for i, beam in enumerate(output_beams):
        if decode_hp.multi_targets and decode_hp.return_beams:
          beam_id = (i % decode_hp.beam_size)
          target_id = i // (decode_hp.beam_size)
          target_i = targets[target_id]
          if not beam_id:
            tf.logging.info("TARGET %d:" % target_id)
          tf.logging.info("BEAM %d:" % beam_id)
        elif decode_hp.multi_targets:
          target_i = targets[i]
          tf.logging.info("TARGET %d:" % i)
        else:
          tf.logging.info("BEAM %d:" % i)
        score = scores and scores[i]
        decoded = log_decode_results(
            inputs,
            beam,
            problem_name,
            num_predictions,
            inputs_vocab,
            targets_vocab,
            save_images=decode_hp.save_images,
            output_dir=output_dir,
            identity_output=decode_hp.identity_output,
            targets=target_i,
            log_results=log_results)
        decoded_outputs.append(decoded)
        if decode_hp.write_beam_scores:
          decoded_scores.append(score)
    else:
      decoded = log_decode_results(
          inputs,
          outputs,
          problem_name,
          num_predictions,
          inputs_vocab,
          targets_vocab,
          save_images=decode_hp.save_images,
          output_dir=output_dir,
          identity_output=decode_hp.identity_output,
          targets=targets,
          log_results=log_results)
      decoded_outputs.append(decoded)

    # Write out predictions if decode_to_file passed
    if decode_to_file:
      for i, (d_input, d_output, d_target) in enumerate(decoded_outputs):
        # Skip if all padding
        if d_input and re.match("^({})+$".format(text_encoder.PAD), d_input):
          continue
        beam_score_str = ""
        if decode_hp.write_beam_scores:
          beam_score_str = "\t%.2f" % decoded_scores[i]
        output_file.write(str(d_output) + beam_score_str + decode_hp.delimiter)
        target_file.write(str(d_target) + decode_hp.delimiter)
        input_file.write(str(d_input) + decode_hp.delimiter)

    if (decode_hp.num_samples >= 0 and
        num_predictions >= decode_hp.num_samples):
      break

  mlperf_log.transformer_print(key=mlperf_log.EVAL_SIZE,
                               value=num_eval_samples,
                               hparams=hparams)

  if decode_to_file:
    output_file.close()
    target_file.close()
    input_file.close()

  return all_outputs
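
A toy sketch of the np.split call used above when return_beams is set: the stacked per-example output is cut into one array per beam rank. The beam_size and token ids below are made up purely for illustration.

import numpy as np

# Hypothetical stand-in for prediction["outputs"] with return_beams on:
# beam_size decoded sequences stacked along axis 0.
beam_size = 3
outputs = np.array([[5, 8, 2, 0],   # beam 0 (best)
                    [5, 9, 2, 0],   # beam 1
                    [7, 8, 2, 0]])  # beam 2

output_beams = np.split(outputs, beam_size, axis=0)
for i, beam in enumerate(output_beams):
  print("BEAM %d: %s" % (i, beam.ravel().tolist()))
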
Exemple #37
0
def create_estimator(model_name,
                     hparams,
                     run_config,
                     schedule="train_and_evaluate",
                     decode_hparams=None,
                     use_tpu=False,
                     use_tpu_estimator=False,
                     use_xla=False,
                     export_saved_model_api_version=1,
                     use_guarantee_const_getter=False):
    """Create a T2T Estimator."""
    model_fn = t2t_model.T2TModel.make_estimator_model_fn(
        model_name, hparams, decode_hparams=decode_hparams, use_tpu=use_tpu)

    del use_xla
    if use_tpu or use_tpu_estimator:
        from tensorflow.contrib.tpu.python.tpu import tpu_estimator  # pylint: disable=g-import-not-at-top
        problem = hparams.problem
        batch_size = (problem.tpu_batch_size_per_shard(hparams) *
                      run_config.tpu_config.num_shards)
        mlperf_log.transformer_print(key=mlperf_log.INPUT_BATCH_SIZE,
                                     value=batch_size)
        if getattr(hparams, "mtf_mode", False):
            batch_size = problem.tpu_batch_size_per_shard(hparams)
        predict_batch_size = batch_size
        if decode_hparams and decode_hparams.batch_size:
            predict_batch_size = decode_hparams.batch_size
        if decode_hparams and run_config.tpu_config:
            decode_hparams.add_hparam(
                "iterations_per_loop",
                run_config.tpu_config.iterations_per_loop)
        if export_saved_model_api_version == 1:
            api_version_enum_name = tpu_estimator.ExportSavedModelApiVersion.V1
            estimator_model_fn = model_fn
        elif export_saved_model_api_version == 2:
            api_version_enum_name = tpu_estimator.ExportSavedModelApiVersion.V2

            def maybe_use_guarantee_const_getter_model_fn(
                    features, labels, mode, params):
                """Wrapper model_fn with guarantee_const getter."""
                if not use_guarantee_const_getter:
                    return model_fn(features, labels, mode, params)

                # It marks all weights as constant, which may improve TPU
                # inference performance because it prevents the weights from
                # being transferred to the TPU. It will increase HBM "program"
                # usage and reduce HBM "arguments" usage during TPU model
                # serving.
                def guarantee_const_getter(getter, name, *args, **kwargs):
                    with tf.control_dependencies(None):
                        return tf.guarantee_const(
                            getter(name, *args, **kwargs),
                            name=name + "/GuaranteeConst")

                @contextlib.contextmanager
                def guarantee_const_scope():
                    var_scope = tf.get_variable_scope()
                    prev_custom_getter = var_scope.custom_getter
                    prev_caching_device = var_scope.caching_device
                    var_scope.set_custom_getter(guarantee_const_getter)
                    var_scope.set_caching_device(lambda op: op.device)
                    yield
                    var_scope.set_custom_getter(prev_custom_getter)
                    var_scope.set_caching_device(prev_caching_device)

                with guarantee_const_scope():
                    return model_fn(features, labels, mode, params)

            def tpu_model_fn(features, labels, mode, params):
                """Wrapper model_fn with tpu.rewrite / TPUPartitionedCall."""
                if mode == tf.estimator.ModeKeys.PREDICT and params["use_tpu"]:
                    batch_config = tpu_estimator.BatchConfig(
                        num_batch_threads=2,
                        max_batch_size=predict_batch_size,
                        batch_timeout_micros=60 * 1000,
                        allowed_batch_sizes=[predict_batch_size])
                    return tpu_estimator.model_fn_inference_on_tpu(
                        maybe_use_guarantee_const_getter_model_fn,
                        features=features,
                        labels=labels,
                        config=None,
                        params=params,
                        batch_config=batch_config)
                else:
                    return model_fn(features, labels, mode, params)

            estimator_model_fn = tpu_model_fn
        else:
            raise ValueError(
                "Flag export_saved_model_api_version must be 1 or 2.")
        estimator = contrib.tpu().TPUEstimator(
            model_fn=estimator_model_fn,
            model_dir=run_config.model_dir,
            config=run_config,
            use_tpu=use_tpu,
            train_batch_size=batch_size,
            eval_batch_size=batch_size if "eval" in schedule else None,
            predict_batch_size=predict_batch_size,
            export_saved_model_api_version=api_version_enum_name)
    else:
        estimator = tf.estimator.Estimator(
            model_fn=model_fn,
            model_dir=run_config.model_dir,
            config=run_config,
        )
    return estimator
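
A small sketch of the TPU batch-size bookkeeping in create_estimator above. The numbers are invented; in the real code they come from problem.tpu_batch_size_per_shard(hparams), run_config.tpu_config.num_shards, and decode_hparams.batch_size.

# Hypothetical values purely for illustration.
tpu_batch_size_per_shard = 16   # problem.tpu_batch_size_per_shard(hparams)
num_shards = 8                  # run_config.tpu_config.num_shards
decode_batch_size = 32          # decode_hparams.batch_size, if set

# Global train/eval batch size spans all shards (unless mtf_mode is set,
# in which case the per-shard size is used directly).
batch_size = tpu_batch_size_per_shard * num_shards  # 128

# Prediction defaults to the same size but can be overridden by decode_hparams.
predict_batch_size = decode_batch_size or batch_size  # 32
print(batch_size, predict_batch_size)
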
Exemple #38
0
def decode_once(estimator,
                problem_name,
                hparams,
                infer_input_fn,
                decode_hp,
                decode_to_file,
                output_dir,
                log_results=True,
                checkpoint_path=None):
  """Decodes once."""

  # Get the predictions as an iterable
  predictions = estimator.predict(infer_input_fn,
                                  checkpoint_path=checkpoint_path)

  if not log_results:
    return list(predictions)

  # Prepare output file writers if decode_to_file passed
  decode_to_file = decode_to_file or decode_hp.decode_to_file
  if decode_to_file:
    output_filepath = _decode_filename(decode_to_file, problem_name, decode_hp)
    parts = output_filepath.split(".")
    parts[-1] = "targets"
    target_filepath = ".".join(parts)
    parts[-1] = "inputs"
    input_filepath = ".".join(parts)

    output_file = tf.gfile.Open(output_filepath, "w")
    target_file = tf.gfile.Open(target_filepath, "w")
    input_file = tf.gfile.Open(input_filepath, "w")

  problem_hparams = hparams.problem_hparams
  # Inputs vocabulary is set to targets if there are no inputs in the problem,
  # e.g., for language models where the inputs are just a prefix of targets.
  has_input = "inputs" in problem_hparams.vocabulary
  inputs_vocab_key = "inputs" if has_input else "targets"
  inputs_vocab = problem_hparams.vocabulary[inputs_vocab_key]
  targets_vocab = problem_hparams.vocabulary["targets"]

  num_eval_samples = 0
  for num_predictions, prediction in enumerate(predictions):
    num_eval_samples += 1
    num_predictions += 1
    inputs = prediction.get("inputs")
    targets = prediction.get("targets")
    outputs = prediction.get("outputs")

    # Log predictions
    decoded_outputs = []
    decoded_scores = []
    if decode_hp.return_beams:
      output_beams = np.split(outputs, decode_hp.beam_size, axis=0)
      scores = None
      if "scores" in prediction:
        scores = np.split(prediction["scores"], decode_hp.beam_size, axis=0)
      for i, beam in enumerate(output_beams):
        tf.logging.info("BEAM %d:" % i)
        score = scores and scores[i]
        decoded = log_decode_results(
            inputs,
            beam,
            problem_name,
            num_predictions,
            inputs_vocab,
            targets_vocab,
            save_images=decode_hp.save_images,
            output_dir=output_dir,
            identity_output=decode_hp.identity_output,
            targets=targets,
            log_results=decode_hp.log_results)
        decoded_outputs.append(decoded)
        if decode_hp.write_beam_scores:
          decoded_scores.append(score)
    else:
      decoded = log_decode_results(
          inputs,
          outputs,
          problem_name,
          num_predictions,
          inputs_vocab,
          targets_vocab,
          save_images=decode_hp.save_images,
          output_dir=output_dir,
          identity_output=decode_hp.identity_output,
          targets=targets,
          log_results=decode_hp.log_results)
      decoded_outputs.append(decoded)

    # Write out predictions if decode_to_file passed
    if decode_to_file:
      for i, (d_input, d_output, d_target) in enumerate(decoded_outputs):
        # Skip if all padding
        if d_input and re.match("^({})+$".format(text_encoder.PAD), d_input):
          continue
        beam_score_str = ""
        if decode_hp.write_beam_scores:
          beam_score_str = "\t%.2f" % decoded_scores[i]
        output_file.write(str(d_output) + beam_score_str + decode_hp.delimiter)
        target_file.write(str(d_target) + decode_hp.delimiter)
        input_file.write(str(d_input) + decode_hp.delimiter)

    if (decode_hp.num_samples >= 0 and
        num_predictions >= decode_hp.num_samples):
      break

  mlperf_log.transformer_print(key=mlperf_log.EVAL_SIZE,
                               value=num_eval_samples,
                               hparams=hparams)

  if decode_to_file:
    output_file.close()
    target_file.close()
    input_file.close()
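
A minimal sketch of the file naming used when decode_to_file is passed: the last dot-separated component of the decode output path is swapped out to derive the targets and inputs paths. The example path below is hypothetical; the real one comes from _decode_filename.

# Hypothetical output path of the kind _decode_filename might produce.
output_filepath = "wmt_ende.beam4.alpha0.6.decodes"

parts = output_filepath.split(".")
parts[-1] = "targets"
target_filepath = ".".join(parts)  # wmt_ende.beam4.alpha0.6.targets
parts[-1] = "inputs"
input_filepath = ".".join(parts)   # wmt_ende.beam4.alpha0.6.inputs
print(target_filepath, input_filepath)
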
Exemple #39
0
def transformer_encoder(encoder_input,
                        encoder_self_attention_bias,
                        hparams,
                        name="encoder",
                        nonpadding=None,
                        save_weights_to=None,
                        make_image_summary=True):
    """A stack of transformer layers.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
       (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This must either be
      passed in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias.  The knowledge about padding is used
      for pad_remover(efficiency) and to mask out padding in convolutional
      layers.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.

  Returns:
    y: a Tensor
  """
    x = encoder_input
    attention_dropout_broadcast_dims = (
        common_layers.comma_separated_string_to_integer_list(
            getattr(hparams, "attention_dropout_broadcast_dims", "")))
    mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
                                 value=hparams.num_encoder_layers
                                 or hparams.num_hidden_layers)
    mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT,
                                 value=hparams.attention_dropout)
    mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_ATTENTION_DENSE,
                                 value={
                                     "use_bias": "false",
                                     "num_heads": hparams.num_heads,
                                     "hidden_size": hparams.hidden_size
                                 })

    with tf.variable_scope(name):
        if nonpadding is not None:
            padding = 1.0 - nonpadding
        else:
            padding = common_attention.attention_bias_to_padding(
                encoder_self_attention_bias)
            nonpadding = 1.0 - padding
        pad_remover = None
        if hparams.use_pad_remover and not common_layers.is_xla_compiled():
            pad_remover = expert_utils.PadRemover(padding)
        for layer in range(hparams.num_encoder_layers
                           or hparams.num_hidden_layers):

            initial_sparsity = None
            if hparams.get("load_masks_from"):
                initial_sparsity = hparams.get("initial_sparsity")

            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    y = sparse_attention.multihead_attention(
                        common_layers.layer_preprocess(x, hparams),
                        None,
                        encoder_self_attention_bias,
                        hparams.attention_key_channels or hparams.hidden_size,
                        hparams.attention_value_channels
                        or hparams.hidden_size,
                        hparams.hidden_size,
                        hparams.num_heads,
                        hparams.attention_dropout,
                        attention_type=hparams.self_attention_type,
                        max_relative_position=hparams.max_relative_position,
                        heads_share_relative_embedding=(
                            hparams.heads_share_relative_embedding),
                        add_relative_to_values=hparams.add_relative_to_values,
                        save_weights_to=save_weights_to,
                        make_image_summary=make_image_summary,
                        dropout_broadcast_dims=attention_dropout_broadcast_dims,
                        max_length=hparams.get("max_length"),
                        vars_3d=hparams.get("attention_variables_3d"),
                        sparsity_technique=hparams.get("sparsity_technique"),
                        threshold=hparams.get("log_alpha_threshold"),
                        training=hparams.get(
                            "mode") == tf_estimator.ModeKeys.TRAIN,
                        clip_alpha=hparams.get("clip_log_alpha"),
                        initial_sparsity=initial_sparsity,
                        split_heads=hparams.get("split_heads"))
                    x = common_layers.layer_postprocess(x, y, hparams)
                with tf.variable_scope("ffn"):
                    y = transformer_ffn_layer(
                        common_layers.layer_preprocess(x, hparams), hparams,
                        pad_remover)
                    x = common_layers.layer_postprocess(x, y, hparams)
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        mlperf_log.transformer_print(
            key=mlperf_log.MODEL_HP_NORM,
            value={"hidden_size": hparams.hidden_size})
        return common_layers.layer_preprocess(x, hparams)
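
For context, layer_preprocess and layer_postprocess apply a configurable sequence of ops around each sublayer; assuming the common Transformer settings (preprocess "n" = layer norm, postprocess "da" = dropout then residual add), each encoder block reduces to the pre-norm residual pattern sketched below with NumPy stand-ins. The sublayer, shapes, and dropout rate are made up.

import numpy as np

def layer_norm(x, epsilon=1e-6):
  # Normalize over the hidden dimension; learned gamma/beta omitted.
  mean = x.mean(axis=-1, keepdims=True)
  var = x.var(axis=-1, keepdims=True)
  return (x - mean) / np.sqrt(var + epsilon)

def sublayer(x):
  # Stand-in for self-attention or the FFN; any shape-preserving map works.
  return np.tanh(x)

def prenorm_residual_block(x, dropout_rate=0.1, seed=0):
  rng = np.random.default_rng(seed)
  y = sublayer(layer_norm(x))                         # "n": layer norm first
  keep = rng.random(y.shape) >= dropout_rate
  y = np.where(keep, y / (1.0 - dropout_rate), 0.0)   # "d": inverted dropout
  return x + y                                        # "a": residual add

x = np.random.default_rng(1).normal(size=(2, 5, 8))  # [batch, length, hidden]
print(prenorm_residual_block(x).shape)  # (2, 5, 8)

This is also why a final layer_preprocess is applied to the encoder output above: each block only adds to an unnormalized residual stream, so the sum needs one last normalization.
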
Exemple #40
0
def moe_transformer_encoder(encoder_input,
                            encoder_self_attention_bias,
                            hparams,
                            name="moe-encoder",
                            nonpadding=None,
                            save_weights_to=None,
                            make_image_summary=True,
                            attn_bias_for_padding=None):
  """A stack of transformer moe layers.
  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
       (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This must either be
      passed in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias.  The knowledge about padding is used
      for pad_remover(efficiency) and to mask out padding in convolutional
      layers.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    attn_bias_for_padding: Padded attention bias in case a unidirectional
      encoder is being used where future attention is masked.
  Returns:
    y: a Tensor
  """
  x = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
      value=hparams.num_encoder_layers or hparams.num_hidden_layers)
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT,
      value=hparams.attention_dropout)
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_ATTENTION_DENSE,
      value={
          "use_bias": "false",
          "num_heads": hparams.num_heads,
          "hidden_size": hparams.hidden_size
      })

  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      attention_bias = encoder_self_attention_bias
      if attn_bias_for_padding is not None:
        attention_bias = attn_bias_for_padding
      padding = common_attention.attention_bias_to_padding(attention_bias)
      nonpadding = 1.0 - padding
    for layer in range(hparams.num_encoder_layers or hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        x = moe_transformer_encoder_layer(
            layer,
            x,
            encoder_self_attention_bias,
            hparams,
            attention_dropout_broadcast_dims,
            save_weights_to,
            make_image_summary)
    # if normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NORM,
        value={"hidden_size": hparams.hidden_size})
    return common_layers.layer_preprocess(x, hparams)
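
When nonpadding is not supplied, both encoders above recover it from an attention bias, which by convention is roughly -1e9 at padded key positions and 0 elsewhere. The hedged NumPy sketch below mimics what attention_bias_to_padding does by thresholding that bias; the shapes and the -1 threshold are assumptions.

import numpy as np

# Toy additive attention bias, shape [batch, 1, 1, length]: 0.0 at real
# tokens, a large negative value at padded positions.
NEG_INF = -1e9
encoder_self_attention_bias = np.array(
    [[[[0.0, 0.0, 0.0, NEG_INF, NEG_INF]]],
     [[[0.0, 0.0, NEG_INF, NEG_INF, NEG_INF]]]])

# Padded wherever the bias is very negative; nonpadding is its complement.
padding = (encoder_self_attention_bias < -1.0).astype(np.float32)[:, 0, 0, :]
nonpadding = 1.0 - padding
print(padding)     # [[0. 0. 0. 1. 1.]
                   #  [0. 0. 1. 1. 1.]]
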