def dual_decode(self,
                  decoder_input,
                  wav_encoder_output,
                  txt_encoder_output,
                  wav_enc_dec_attention_bias,
                  txt_enc_dec_attention_bias,
                  decoder_self_attention_bias,
                  hparams,
                  cache=None,
                  nonpadding=None,
                  losses=None):
    """ dual transformer decoder, attention to both inputs """
    decoder_input = tf.nn.dropout(decoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)
    decoder_output = transformer_dual_decoder(
            decoder_input,
            wav_encoder_output,
            txt_encoder_output,
            decoder_self_attention_bias,
            wav_enc_dec_attention_bias,
            txt_enc_dec_attention_bias,
            hparams,
            cache=cache,
            nonpadding=nonpadding,
            save_weights_to=self.attention_weights,
            losses=losses)

    if (common_layers.is_on_tpu() and
        hparams.mode == tf.estimator.ModeKeys.TRAIN):
      # TPU does not react kindly to extra dimensions.
      # TODO(noam): remove this once TPU is more forgiving of extra dims.
      return decoder_output
    else:
      # Expand since t2t expects 4d tensors.
      return tf.expand_dims(decoder_output, axis=2)
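Both decode paths in these examples end with the same pattern: on TPU during training the 3-D decoder output is returned as-is, otherwise it is expanded to the 4-D layout the rest of t2t expects. A minimal standalone sketch of that shape change (the sizes here are made up for illustration):

import tensorflow as tf

# Fake decoder output with layout [batch_size, decoder_length, hidden_dim].
decoder_output = tf.zeros([2, 7, 512])

# The non-TPU branch inserts a singleton axis at position 2, giving the
# 4-D layout [batch_size, decoder_length, 1, hidden_dim] that t2t expects.
expanded = tf.expand_dims(decoder_output, axis=2)
print(expanded.shape)  # (2, 7, 1, 512)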
Example #2
    def estimator_spec_eval(self, features, logits, labels, loss):
        """Construct EstimatorSpec for EVAL mode."""
        hparams = self.hparams

        if not hasattr(hparams, "problem_instances"):
            raise NotImplementedError(_no_problem_err("estimator_spec_eval"))

        problem = hparams.problem_instances[0]
        if common_layers.is_on_tpu():
            eval_metrics_fn = _create_tpu_eval_metrics_fn(problem, hparams)
            _remove_summaries()
            return tf.contrib.tpu.TPUEstimatorSpec(
                tf.estimator.ModeKeys.EVAL,
                eval_metrics=(eval_metrics_fn, [logits, labels]),
                loss=loss)
        else:
            eval_metrics_fns = metrics.create_evaluation_metrics([problem],
                                                                 hparams)
            eval_metrics = {}
            for metric_name, metric_fn in six.iteritems(eval_metrics_fns):
                eval_metrics[metric_name] = metric_fn(logits, features)

            return tf.estimator.EstimatorSpec(
                tf.estimator.ModeKeys.EVAL,
                predictions={"predictions": logits},
                eval_metric_ops=eval_metrics,
                loss=loss)
Example #3
  def estimator_spec_eval(self, features, logits, labels, loss, losses_dict):
    """Construct EstimatorSpec for EVAL mode."""
    hparams = self.hparams

    if not hasattr(hparams, "problem_instances"):
      raise NotImplementedError(_no_problem_err("estimator_spec_eval"))

    problem = hparams.problem_instances[0]
    if common_layers.is_on_tpu():
      eval_metrics_fn = _create_tpu_eval_metrics_fn(problem, hparams)
      _remove_summaries()
      if isinstance(logits, dict):
        # For TPU, logits dict will be passed as keyword arguments to
        # eval_metrics_fn. Here we add the labels to those arguments.
        logits.update({"labels": labels})
        return tf.contrib.tpu.TPUEstimatorSpec(
            tf.estimator.ModeKeys.EVAL,
            eval_metrics=(eval_metrics_fn, logits),
            loss=loss)
      else:
        return tf.contrib.tpu.TPUEstimatorSpec(
            tf.estimator.ModeKeys.EVAL,
            eval_metrics=(eval_metrics_fn, [logits, labels]),
            loss=loss)
    else:
      eval_metrics_fns = metrics.create_evaluation_metrics([problem], hparams)
      eval_metrics = {}
      for metric_name, metric_fn in six.iteritems(eval_metrics_fns):
        eval_metrics[metric_name] = metric_fn(logits, features)

      return tf.estimator.EstimatorSpec(
          tf.estimator.ModeKeys.EVAL,
          predictions={"predictions": logits},
          eval_metric_ops=eval_metrics,
          loss=loss)
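The comment in the TPU branch above notes that a logits dict is passed to eval_metrics_fn as keyword arguments, with the labels folded into the same dict. A schematic illustration of that calling convention (the function name and keys here are hypothetical, not the actual T2T metrics function):

# Hypothetical metrics function: the dict is expanded into keyword
# arguments, so "labels" arrives alongside each logits entry.
def example_eval_metrics_fn(labels, **logits_dict):
    # logits_dict holds the remaining entries of the original logits dict.
    return {}

tensors = {"targets": "<logits tensor>", "labels": "<labels tensor>"}
example_eval_metrics_fn(**tensors)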
Example #4
    def body(self, features):
        assert self._hparams.block_size > 0
        assert not common_layers.is_on_tpu()

        hparams = copy.copy(self._hparams)
        targets = features["targets"]
        inputs = features["inputs"]
        if not (tf.get_variable_scope().reuse
                or hparams.mode == tf.contrib.learn.ModeKeys.INFER):
            tf.summary.image("inputs", inputs, max_outputs=1)
            tf.summary.image("targets", targets, max_outputs=1)

        encoder_input = cia.prepare_encoder(inputs, hparams)
        encoder_output = cia.transformer_encoder_layers(
            encoder_input,
            hparams.num_encoder_layers,
            hparams,
            attention_type=hparams.enc_attention_type,
            name="encoder")
        decoder_input, rows, cols = cia.prepare_decoder(targets, hparams)
        decoder_output = cia.transformer_decoder_layers(
            decoder_input,
            encoder_output,
            hparams.num_decoder_layers,
            hparams,
            attention_type=hparams.dec_attention_type,
            name="decoder")

        assert not isinstance(decoder_output, tuple)
        assert len(decoder_output.shape) == 4

        relu_dropout_broadcast_dims = (
            common_layers.comma_separated_string_to_integer_list(
                getattr(self._hparams, "relu_dropout_broadcast_dims", "")))

        with tf.variable_scope("block_size_%d" % self._hparams.block_size):
            tf.logging.info("Using block_size %d", self._hparams.block_size)
            block_output = common_layers.dense_relu_dense(
                decoder_output,
                self._hparams.block_size * self._hparams.filter_size,
                self._hparams.block_size * self._hparams.hidden_size,
                dropout=self._hparams.relu_dropout,
                dropout_broadcast_dims=relu_dropout_broadcast_dims)

        batch_size, rows, cols = common_layers.shape_list(decoder_output)[:3]
        decoder_output = tf.reshape(
            decoder_output,
            [batch_size, rows, cols, 1, self._hparams.hidden_size])
        block_output = tf.reshape(block_output, [
            batch_size, rows, cols, self._hparams.block_size,
            self._hparams.hidden_size
        ])

        block_output = common_layers.layer_postprocess(decoder_output,
                                                       block_output,
                                                       self._hparams)

        return block_output
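The block above widens each position to block_size * hidden_size channels with dense_relu_dense and then reshapes to expose an explicit block axis. A small shape-only sketch of that reshape, with made-up sizes:

import tensorflow as tf

# Made-up sizes purely to illustrate the reshape in body() above.
batch_size, rows, cols = 2, 8, 8
block_size, hidden_size = 4, 64

block_output = tf.zeros([batch_size, rows, cols, block_size * hidden_size])
block_output = tf.reshape(
    block_output, [batch_size, rows, cols, block_size, hidden_size])
print(block_output.shape)  # (2, 8, 8, 4, 64)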
Example #5
def transformer_n_encoder(encoder_input,
                          encoder_self_attention_bias,
                          hparams,
                          customize_params,
                          name="encoder",
                          nonpadding=None,
                          save_weights_to=None,
                          make_image_summary=True,
                          losses=None):
  """ transformer with 2 sets of encoders """
  x = encoder_input
  attention_dropout_broadcast_dims = (
    common_layers.comma_separated_string_to_integer_list(
      getattr(hparams, "attention_dropout_broadcast_dims", "")))
  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      padding = common_attention.attention_bias_to_padding(
        encoder_self_attention_bias)
      nonpadding = 1.0 - padding
    pad_remover = None
    if hparams.use_pad_remover and not common_layers.is_on_tpu():
      pad_remover = expert_utils.PadRemover(padding)
    for layer in range(customize_params.num_layers or
                       hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          y = common_attention.multihead_attention(
            common_layers.layer_preprocess(x, hparams),
            None,
            encoder_self_attention_bias,
            hparams.attention_key_channels or hparams.hidden_size,
            hparams.attention_value_channels or hparams.hidden_size,
            hparams.hidden_size,
            customize_params.num_heads or hparams.num_heads,
            hparams.attention_dropout,
            attention_type=hparams.self_attention_type,
            save_weights_to=save_weights_to,
            max_relative_position=hparams.max_relative_position,
            make_image_summary=make_image_summary,
            dropout_broadcast_dims=attention_dropout_broadcast_dims,
            max_length=customize_params.get("max_length"))
          x = common_layers.layer_postprocess(x, y, hparams)
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(
            common_layers.layer_preprocess(x, hparams),
            customized_ffn=customize_params.ffn_layer,
            hparams=hparams,
            pad_remover=pad_remover,
            conv_padding="SAME", nonpadding_mask=nonpadding,
            losses=losses)
          x = common_layers.layer_postprocess(x, y, hparams)
    # if normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    return common_layers.layer_preprocess(x, hparams)
Example #6
  def decode(self,
             decoder_input,
             encoder_output,
             encoder_decoder_attention_bias,
             decoder_self_attention_bias,
             hparams,
             name,
             cache=None,
             decode_loop_step=None,
             nonpadding=None,
             losses=None):
    """Decode Transformer outputs from encoder representation.

    Args:
      decoder_input: inputs to bottom of the model.
          [batch_size, decoder_length, hidden_dim]
      encoder_output: Encoder representation.
          [batch_size, input_length, hidden_dim]
      encoder_decoder_attention_bias: Bias and mask weights for
          encoder-decoder attention. [batch_size, input_length]
      decoder_self_attention_bias: Bias and mask weights for decoder
          self-attention. [batch_size, decoder_length]
      hparams: hyperparameters for model.
      name: a string, used as the decoder variable scope name.
      cache: dict, containing tensors which are the results of previous
          attentions, used for fast decoding.
      decode_loop_step: An integer, step number of the decoding loop.
          Only used for inference on TPU.
      nonpadding: optional Tensor with shape [batch_size, decoder_length]
      losses: optional list onto which to append extra training losses

    Returns:
      Final decoder representation. [batch_size, decoder_length, hidden_dim]
    """
    decoder_input = tf.nn.dropout(decoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)

    decoder_output = transformer_decoder(
        decoder_input,
        encoder_output,
        decoder_self_attention_bias,
        encoder_decoder_attention_bias,
        hparams,
        name=name,
        cache=cache,
        decode_loop_step=decode_loop_step,
        nonpadding=nonpadding,
        save_weights_to=self.attention_weights,
        losses=losses)

    if (common_layers.is_on_tpu() and
        hparams.mode == tf.estimator.ModeKeys.TRAIN):
      # TPU does not react kindly to extra dimensions.
      # TODO(noam): remove this once TPU is more forgiving of extra dims.
      return decoder_output
    else:
      # Expand since t2t expects 4d tensors.
      return tf.expand_dims(decoder_output, axis=2)
Example #7
 def optimize(self, loss, num_async_replicas=1):
   """Return a training op minimizing loss."""
   log_info("Base learning rate: %f", self.hparams.learning_rate)
   lr = learning_rate.learning_rate_schedule(self.hparams)
   if num_async_replicas > 1:
     log_info("Dividing learning rate by num_async_replicas: %d",
              num_async_replicas)
   lr /= math.sqrt(float(num_async_replicas))
   train_op = optimize.optimize(
       loss, lr, self.hparams, use_tpu=common_layers.is_on_tpu())
   return train_op
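Note that the division by sqrt(num_async_replicas) happens unconditionally (only the log message is guarded), which is harmless for a single replica since sqrt(1) == 1. A quick arithmetic check with an example replica count:

import math

lr = 0.2
num_async_replicas = 4  # example value
lr /= math.sqrt(float(num_async_replicas))
print(lr)  # 0.1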
Example #8
 def optimize(self, loss, num_async_replicas=1):
   """Return a training op minimizing loss."""
   log_info("Base learning rate: %f", self.hparams.learning_rate)
   lr = learning_rate.learning_rate_schedule(self.hparams)
   if num_async_replicas > 1:
     log_info("Dividing learning rate by num_async_replicas: %d",
              num_async_replicas)
   lr /= math.sqrt(float(num_async_replicas))
   train_op = optimize.optimize(
       loss, lr, self.hparams, use_tpu=common_layers.is_on_tpu())
   return train_op
Example #9
  def estimator_spec_train(self, loss, num_async_replicas=1):
    """Construct EstimatorSpec for TRAIN mode."""
    train_op = self.optimize(loss, num_async_replicas=num_async_replicas)

    if common_layers.is_on_tpu():
      _remove_summaries()  # summaries not currently working on TPU
      return tf.contrib.tpu.TPUEstimatorSpec(
          tf.estimator.ModeKeys.TRAIN, loss=loss, train_op=train_op)
    else:
      return tf.estimator.EstimatorSpec(
          tf.estimator.ModeKeys.TRAIN, loss=loss, train_op=train_op)
Example #10
  def estimator_spec_train(self, loss, num_async_replicas=1):
    """Construct EstimatorSpec for TRAIN mode."""
    train_op = self.optimize(loss, num_async_replicas=num_async_replicas)

    if common_layers.is_on_tpu():
      _remove_summaries()  # summaries not currently working on TPU
      return tf.contrib.tpu.TPUEstimatorSpec(
          tf.estimator.ModeKeys.TRAIN, loss=loss, train_op=train_op)
    else:
      return tf.estimator.EstimatorSpec(
          tf.estimator.ModeKeys.TRAIN, loss=loss, train_op=train_op)
Example #11
    def decode(self,
               decoder_input,
               encoder_output,
               encoder_decoder_attention_bias,
               decoder_self_attention_bias,
               hparams,
               cache=None,
               nonpadding=None):
        """Decode Transformer outputs from encoder representation.

    Args:
      decoder_input: inputs to bottom of the model.
          [batch_size, decoder_length, hidden_dim]
      encoder_output: Encoder representation.
          [batch_size, input_length, hidden_dim]
      encoder_decoder_attention_bias: Bias and mask weights for
          encoder-decoder attention. [batch_size, input_length]
      decoder_self_attention_bias: Bias and mask weights for decoder
          self-attention. [batch_size, decoder_length]
      hparams: hyperparameters for model.
      cache: dict, containing tensors which are the results of previous
          attentions, used for fast decoding.
      nonpadding: optional Tensor with shape [batch_size, decoder_length]

    Returns:
      Final decoder representation. [batch_size, decoder_length, hidden_dim]
    """
        decoder_input = tf.nn.dropout(
            decoder_input, 1.0 - hparams.layer_prepostprocess_dropout)

        decoder_output = transformer_decoder(
            decoder_input,
            encoder_output,
            decoder_self_attention_bias,
            encoder_decoder_attention_bias,
            hparams,
            cache=cache,
            nonpadding=nonpadding,
            save_weights_to=self.attention_weights)

        if (common_layers.is_on_tpu()
                and hparams.mode == tf.estimator.ModeKeys.TRAIN):
            # TPU does not react kindly to extra dimensions.
            # TODO(noam): remove this once TPU is more forgiving of extra dims.
            return decoder_output
        else:
            # Expand since t2t expects 4d tensors.
            m = tf.py_func(self.sentence_cache.QueryMultipleEntries,
                           [decoder_output], tf.float32)
            m.set_shape(decoder_output.get_shape())
            lambd = self.calculate_mixing_weight(decoder_output, m)
            return tf.expand_dims(lambd * decoder_output + (1.0 - lambd) * m,
                                  axis=2)
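Example #11 blends the decoder output with a sentence-cache lookup via a convex combination before restoring the 4-D layout. A standalone sketch with a fixed mixing weight (the real model computes lambd from the two tensors):

import tensorflow as tf

decoder_output = tf.zeros([2, 7, 512])   # stand-in decoder states
m = tf.ones([2, 7, 512])                 # stand-in cache lookup result
lambd = 0.8                              # assumed fixed weight for illustration

mixed = lambd * decoder_output + (1.0 - lambd) * m
mixed = tf.expand_dims(mixed, axis=2)    # back to [batch, length, 1, hidden]
print(mixed.shape)  # (2, 7, 1, 512)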
Example #12
  def decode(self,
             decoder_input,
             encoder_output,
             encoder_decoder_attention_bias,
             decoder_self_attention_bias,
             hparams,
             cache=None,
             nonpadding=None):
    """Decode Transformer outputs from encoder representation.

    Args:
      decoder_input: inputs to bottom of the model.
          [batch_size, decoder_length, hidden_dim]
      encoder_output: Encoder representation.
          [batch_size, input_length, hidden_dim]
      encoder_decoder_attention_bias: Bias and mask weights for
          encoder-decoder attention. [batch_size, input_length]
      decoder_self_attention_bias: Bias and mask weights for decoder
          self-attention. [batch_size, decoder_length]
      hparams: hyperparameters for model.
      cache: dict, containing tensors which are the results of previous
          attentions, used for fast decoding.
      nonpadding: optional Tensor with shape [batch_size, decoder_length]

    Returns:
      Final decoder representation. [batch_size, decoder_length, hidden_dim]
    """
    decoder_input = tf.nn.dropout(decoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)

    decoder_output = transformer_decoder(
        decoder_input,
        encoder_output,
        decoder_self_attention_bias,
        encoder_decoder_attention_bias,
        hparams,
        cache=cache,
        nonpadding=nonpadding,
        save_weights_to=self.attention_weights)

    if (common_layers.is_on_tpu() and
        hparams.mode == tf.estimator.ModeKeys.TRAIN):
      # TPU does not react kindly to extra dimensions.
      # TODO(noam): remove this once TPU is more forgiving of extra dims.
      return decoder_output
    else:
      # Expand since t2t expects 4d tensors.
      return tf.expand_dims(decoder_output, axis=2)
Example #13
  def estimator_spec_eval(self, features, logits, labels, loss, losses_dict):
    """Construct EstimatorSpec for EVAL mode."""
    hparams = self.hparams

    if not hasattr(hparams, "problem_instances"):
      raise NotImplementedError(_no_problem_err("estimator_spec_eval"))

    problem = hparams.problem_instances[0]
    if common_layers.is_on_tpu():
      _remove_summaries()
      if isinstance(logits, dict):
        eval_metrics_fn = _create_tpu_eval_metrics_fn(problem, hparams)
        # For TPU, logits dict will be passed as keyword arguments to
        # eval_metrics_fn. Here we add the labels to those arguments.
        logits.update({"labels": labels})
        return tf.contrib.tpu.TPUEstimatorSpec(
            tf.estimator.ModeKeys.EVAL,
            eval_metrics=(eval_metrics_fn, logits),
            loss=loss)
      else:
        eval_metrics_fn = _create_tpu_eval_metrics_fn(problem, hparams)
        return tf.contrib.tpu.TPUEstimatorSpec(
            tf.estimator.ModeKeys.EVAL,
            eval_metrics=(eval_metrics_fn, [logits, labels]),
            loss=loss)
    else:
      eval_metrics_fns = metrics.create_evaluation_metrics([problem], hparams)
      eval_metrics = {}

      for metric_name, metric_fn in six.iteritems(eval_metrics_fns):
        if isinstance(logits, dict):
          # the key is located in the center of metric_name: "metrics-%s/%s/%s"
          k = metric_name.split("/")[1]
          eval_metrics[metric_name] = metric_fn(logits[k], features)
        else:
          eval_metrics[metric_name] = metric_fn(logits, features)

      if isinstance(logits, dict):
        predictions = logits
      else:
        predictions = {"predictions": logits}

      return tf.estimator.EstimatorSpec(
          tf.estimator.ModeKeys.EVAL,
          predictions=predictions,
          eval_metric_ops=eval_metrics,
          loss=loss)
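The non-TPU branch in Example #13 extracts the logits key from the middle component of each metric name, following the "metrics-%s/%s/%s" pattern mentioned in the comment. A tiny illustration with a hypothetical metric name:

metric_name = "metrics-my_problem/targets/accuracy"  # hypothetical name
k = metric_name.split("/")[1]
print(k)  # 'targets'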
Example #14
 def optimize(self, loss, num_async_replicas=1):
     """Return a training op minimizing loss."""
     tf.logging.info("Base learning rate: %f", self.hparams.learning_rate)
     lr = self.hparams.learning_rate
     decay_rate = optimize.learning_rate_schedule(self.hparams)
     lr *= decay_rate
     if self.hparams.learning_rate_minimum:
         lr_min = float(self.hparams.learning_rate_minimum)
         tf.logging.info("Applying learning rate minimum: %f", lr_min)
         lr = tf.maximum(lr, tf.to_float(lr_min))
     if num_async_replicas > 1:
         tf.logging.info("Dividing learning rate by num_async_replicas: %d",
                         num_async_replicas)
     lr /= math.sqrt(float(num_async_replicas))
     train_op = optimize.optimize(loss,
                                  lr,
                                  self.hparams,
                                  use_tpu=common_layers.is_on_tpu())
     return train_op
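The learning-rate floor above is an elementwise clamp of the scheduled rate from below. A minimal sketch of that clamp with assumed values:

import tensorflow as tf

lr = tf.constant(0.0005)   # assumed scheduled learning rate
lr_min = 0.001             # assumed configured minimum
lr = tf.maximum(lr, tf.cast(lr_min, tf.float32))  # lr becomes 0.001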
Example #15
    def top(self, body_output, _):
        """Generate logits.

    Args:
      body_output: A Tensor with shape [batch, p0, p1, body_input_depth]
    Returns:
      logits: A Tensor with shape  [batch, p0, p1, ?, vocab_size].
    """
        if self._model_hparams.symbol_modality_skip_top:
            return tf.expand_dims(body_output, 3)

        if self._model_hparams.shared_embedding_and_softmax_weights:
            scope_name = "shared"
            reuse = True
        else:
            scope_name = "softmax"
            reuse = False

        with tf.variable_scope(scope_name, reuse=reuse):
            body_output_shape = common_layers.shape_list(body_output)
            var = self._get_weights(body_output_shape[-1])
            if (self._model_hparams.factored_logits and
                    self._model_hparams.mode == tf.estimator.ModeKeys.TRAIN):
                # insert channels dimension
                body_output = tf.expand_dims(body_output, 3)
                return common_layers.FactoredTensor(body_output, var)
            else:
                body_output = tf.reshape(body_output,
                                         [-1, body_output_shape[-1]])
                logits = tf.matmul(body_output, var, transpose_b=True)
                if (common_layers.is_on_tpu() and self._model_hparams.mode
                        == tf.estimator.ModeKeys.TRAIN):
                    # TPU does not react kindly to extra dimensions.
                    # TODO(noam): remove this once TPU is more forgiving of extra dims.
                    return logits
                else:
                    return tf.reshape(
                        logits, body_output_shape[:-1] + [1, self._vocab_size])
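Example #15's non-factored, non-TPU path flattens the body output, multiplies by the (transposed) shared weight matrix, and restores the leading dimensions plus a singleton axis. A shape-only sketch with made-up sizes:

import tensorflow as tf

batch, p0, p1, hidden, vocab = 2, 3, 5, 16, 100  # made-up sizes
body_output = tf.zeros([batch, p0, p1, hidden])
var = tf.zeros([vocab, hidden])  # shared embedding / softmax weights

flat = tf.reshape(body_output, [-1, hidden])
logits = tf.matmul(flat, var, transpose_b=True)
logits = tf.reshape(logits, [batch, p0, p1, 1, vocab])
print(logits.shape)  # (2, 3, 5, 1, 100)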
Example #16
  def top(self, body_output, _):
    """Generate logits.

    Args:
      body_output: A Tensor with shape [batch, p0, p1, body_input_depth]
    Returns:
      logits: A Tensor with shape  [batch, p0, p1, ?, vocab_size].
    """
    if self._model_hparams.symbol_modality_skip_top:
      return tf.expand_dims(body_output, 3)

    if self._model_hparams.shared_embedding_and_softmax_weights:
      scope_name = "shared"
      reuse = True
    else:
      scope_name = "softmax"
      reuse = False

    with tf.variable_scope(scope_name, reuse=reuse):
      body_output_shape = common_layers.shape_list(body_output)
      var = self._get_weights(body_output_shape[-1])
      if (self._model_hparams.factored_logits and
          self._model_hparams.mode == tf.estimator.ModeKeys.TRAIN):
        # insert channels dimension
        body_output = tf.expand_dims(body_output, 3)
        return common_layers.FactoredTensor(body_output, var)
      else:
        body_output = tf.reshape(body_output, [-1, body_output_shape[-1]])
        logits = tf.matmul(body_output, var, transpose_b=True)
        if (common_layers.is_on_tpu() and
            self._model_hparams.mode == tf.estimator.ModeKeys.TRAIN):
          # TPU does not react kindly to extra dimensions.
          # TODO(noam): remove this once TPU is more forgiving of extra dims.
          return logits
        else:
          return tf.reshape(logits,
                            body_output_shape[:-1] + [1, self._vocab_size])
Example #17
def transformer_encoder(encoder_input,
                        encoder_self_attention_bias,
                        hparams,
                        name="encoder",
                        nonpadding=None,
                        save_weights_to=None,
                        make_image_summary=True):
  """A stack of transformer layers.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
       (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This must either be
      passed in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias.  The knowledge about padding is used
      for pad_remover(efficiency) and to mask out padding in convolutional
      layers.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.

  Returns:
    y: a Tensor
  """
  x = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      padding = common_attention.attention_bias_to_padding(
          encoder_self_attention_bias)
      nonpadding = 1.0 - padding
    pad_remover = None
    if hparams.use_pad_remover and not common_layers.is_on_tpu():
      pad_remover = expert_utils.PadRemover(padding)
    for layer in xrange(hparams.num_encoder_layers or
                        hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          y = common_attention.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              encoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              save_weights_to=save_weights_to,
              max_relative_position=hparams.max_relative_position,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims)
          x = common_layers.layer_postprocess(x, y, hparams)
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(
              common_layers.layer_preprocess(x, hparams), hparams, pad_remover,
              conv_padding="SAME", nonpadding_mask=nonpadding)
          x = common_layers.layer_postprocess(x, y, hparams)
    # if normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    return common_layers.layer_preprocess(x, hparams)
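In the encoder examples, nonpadding marks real tokens with 1.0 and padded positions with 0.0, and padding is simply its complement; that complement is what feeds the PadRemover above. A trivial illustration with a made-up mask:

import tensorflow as tf

nonpadding = tf.constant([[1.0, 1.0, 1.0, 0.0, 0.0]])  # [batch, length]
padding = 1.0 - nonpadding  # 1.0 exactly at the padded positions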
Example #18
    def __init__(self, gpu, checkpoints, config=None):
        self._logger = logging.getLogger('TransformerDecoder')
        self._settings = (config.settings if config is not None
                          else TransformerDecoder.Settings())
        self._checkpoints = checkpoints
        self._checkpoint = None
        self._nn_needs_reset = True

        with tf.device('/device:GPU:0' if gpu is not None else '/cpu:0'):
            self._restorer = checkpoints.restorer()

            # Prepare features for feeding into the model.
            self._ph_decode_length = tf.placeholder(dtype=tf.int32)
            self._ph_infer_inputs = tf.placeholder(dtype=tf.int32)
            self._ph_train_inputs = tf.reshape(tf.placeholder(dtype=tf.int32),
                                               shape=[-1, -1, 1, 1])
            self._ph_train_targets = tf.reshape(tf.placeholder(dtype=tf.int32),
                                                shape=[-1, -1, 1, 1])
            self._ph_learning_rate = tf.placeholder(tf.float32, [],
                                                    name='learning_rate')

            # Prepare the model for training
            self._model = registry.model('transformer')(
                self._checkpoints.hparams, tf.estimator.ModeKeys.TRAIN)

            _, losses = self._model({
                "inputs": self._ph_train_inputs,
                "targets": self._ph_train_targets
            })

            self._loss = losses['training']
            self._train_op = optimize.optimize(
                self._loss,
                self._ph_learning_rate,
                self._model.hparams,
                use_tpu=common_layers.is_on_tpu())

            tf.get_variable_scope().reuse_variables()

            # Prepare the model for infer
            self._attention_mats_op = [
                self._model.attention_weights[
                    'transformer/body/decoder/layer_%i/encdec_attention/multihead_attention/dot_product_attention'
                    % i] for i in xrange(self._model.hparams.num_hidden_layers)
            ]

            self._predictions_ops = []
            infer_inputs = tf.reshape(self._ph_infer_inputs,
                                      [1, -1, 1, 1])  # Make it 4D.
            infer_out = self._model.infer({"inputs": infer_inputs},
                                          beam_size=4,
                                          top_beams=1,
                                          alpha=0.6,
                                          decode_length=self._ph_decode_length)

            self._predictions_op = {
                "outputs": infer_out["outputs"],
                "inputs": infer_inputs,
            }

        session_config = tf.ConfigProto(allow_soft_placement=True)
        session_config.gpu_options.allow_growth = True
        if gpu is not None:
            session_config.gpu_options.force_gpu_compatible = True
            session_config.gpu_options.visible_device_list = str(gpu)

        self._session = tf.Session(config=session_config)

        # Init model
        self._warmup()
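Example #18 reshapes a flat sequence of token ids into the 4-D [batch, length, 1, 1] layout before calling infer. A standalone illustration of that reshape with a made-up sequence:

import tensorflow as tf

token_ids = tf.constant([17, 4, 99, 2])               # made-up token ids
infer_inputs = tf.reshape(token_ids, [1, -1, 1, 1])   # [1, length, 1, 1]
print(infer_inputs.shape)  # (1, 4, 1, 1)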
Example #19
def transformer_encoder(encoder_input,
                        encoder_self_attention_bias,
                        hparams,
                        name="encoder",
                        nonpadding=None,
                        save_weights_to=None,
                        make_image_summary=True,
                        losses=None):
    """A stack of transformer layers.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
       (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This must either be
      passed in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias.  The knowledge about padding is used
      for pad_remover(efficiency) and to mask out padding in convolutional
      layers.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    losses: optional list onto which to append extra training losses

  Returns:
    y: a Tensor
  """
    x = encoder_input
    attention_dropout_broadcast_dims = (
        common_layers.comma_separated_string_to_integer_list(
            getattr(hparams, "attention_dropout_broadcast_dims", "")))
    with tf.variable_scope(name):
        if nonpadding is not None:
            padding = 1.0 - nonpadding
        else:
            padding = common_attention.attention_bias_to_padding(
                encoder_self_attention_bias)
            nonpadding = 1.0 - padding
        pad_remover = None
        if hparams.use_pad_remover and not common_layers.is_on_tpu():
            pad_remover = expert_utils.PadRemover(padding)
        for layer in range(hparams.num_encoder_layers
                           or hparams.num_hidden_layers):
            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    # sg: imdb comments
                    y = common_attention.multihead_attention(
                        common_layers.layer_preprocess(x, hparams),  # added layer norm
                        None,
                        encoder_self_attention_bias,
                        hparams.attention_key_channels or hparams.hidden_size,  # 128
                        hparams.attention_value_channels or hparams.hidden_size,  # 128
                        hparams.hidden_size,  # 128
                        hparams.num_heads,  # 4
                        hparams.attention_dropout,  # 0.1
                        attention_type=hparams.self_attention_type,  # 'dot_product'
                        save_weights_to=save_weights_to,
                        max_relative_position=hparams.max_relative_position,  # 0
                        make_image_summary=make_image_summary,
                        dropout_broadcast_dims=attention_dropout_broadcast_dims,
                        max_length=hparams.get("max_length"))  # 256
                    x = common_layers.layer_postprocess(x, y, hparams)
                with tf.variable_scope("ffn"):
                    y = transformer_ffn_layer(
                        common_layers.layer_preprocess(x, hparams),
                        hparams,
                        pad_remover,
                        conv_padding="SAME",
                        nonpadding_mask=nonpadding,
                        losses=losses)
                    x = common_layers.layer_postprocess(x, y, hparams)
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        return common_layers.layer_preprocess(x, hparams)