Example #1
def transformer_prepare_decoder(targets, hparams, features=None):
  """Prepare one shard of the model for the decoder.

  Args:
    targets: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_self_attention_bias: a bias tensor for use in decoder self-attention
  """
  decoder_self_attention_bias = (
      common_attention.attention_bias_lower_triangle(
          common_layers.shape_list(targets)[1]))
  if features and "targets_segmentation" in features:
    # "Packed" dataset - keep the examples from seeing each other.
    targets_segmentation = features["targets_segmentation"]
    targets_position = features["targets_position"]
    decoder_self_attention_bias += common_attention.attention_bias_same_segment(
        targets_segmentation, targets_segmentation)
  else:
    targets_position = None
  if hparams.proximity_bias:
    decoder_self_attention_bias += common_attention.attention_bias_proximal(
        common_layers.shape_list(targets)[1])
  decoder_input = common_layers.shift_right_3d(targets)
  if hparams.pos == "timing":
    if targets_position is not None:
      decoder_input = common_attention.add_timing_signal_1d_given_position(
          decoder_input, targets_position)
    else:
      decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return (decoder_input, decoder_self_attention_bias)
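A minimal usage sketch for the function above, assuming it is in scope together with the usual common_attention / common_layers imports; the hparams fields, shapes, and values below are illustrative assumptions (TF1 graph mode), not part of the example itself.

import tensorflow as tf

# Hypothetical hparams carrying only the fields transformer_prepare_decoder reads.
hparams = tf.contrib.training.HParams(proximity_bias=False, pos="timing")

# Embedded gold targets: [batch, target_length, hidden_size] (shapes assumed).
targets = tf.random_normal([8, 20, 128])

decoder_input, decoder_self_attention_bias = transformer_prepare_decoder(
    targets, hparams)
# decoder_input: targets shifted right by one position, with timing signals added.
# decoder_self_attention_bias: a [1, 1, length, length] lower-triangle bias that
# blocks attention to future positions.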
Example #2
    def decode(self, decoder_inputs, timestep):
        """
        Args:
            decoder_inputs: targets of shape [batch_size,sequence_length,
                            hidden_size]. Sequence is shifter right
                            by one.
            timestep: used for timestep encoding during ACT
        Return:
            decoder_outputs: the result of passing the decoder_input
                             through the edecoderlayers.
                             Input shape is preserved.

        This function is one step of decoding.
        """
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            # positional encoding
            x = common_attention.add_timing_signal_1d(decoder_inputs)
            # timestep encoding for the ACT recurrence
            x = common_attention.add_timing_signal_1d_given_position(
                x, timestep)
            # decoder self-attention
            y = common_attention.multihead_attention(
                query_antecedent=x,
                memory_antecedent=None,
                bias=self.decoder_attention_bias,
                total_key_depth=self.hparams.hidden_size,
                total_value_depth=self.hparams.hidden_size,
                output_depth=self.hparams.hidden_size,
                num_heads=self.hparams.num_heads,
                dropout_rate=self.hparams.attention_dropout)
            # residual connection and dropout
            x = common_layers.layer_postprocess(x, y, self.hparams)
            # layer norm
            x = common_layers.layer_norm(x)
            # encoder-decoder attention
            y = common_attention.multihead_attention(
                query_antecedent=x,
                memory_antecedent=self.encoder_outputs,
                bias=self.encoder_attention_bias,
                total_key_depth=self.hparams.hidden_size,
                total_value_depth=self.hparams.hidden_size,
                output_depth=self.hparams.hidden_size,
                num_heads=self.hparams.num_heads,
                dropout_rate=self.hparams.attention_dropout)
            # residual connection and dropout
            x = common_layers.layer_postprocess(x, y, self.hparams)
            # layer norm
            x = common_layers.layer_norm(x)
            # transition function, implemented as a fully connected layer
            y = tf.layers.dense(x, self.hparams.hidden_size, name="transition")
            # residual connection and dropout
            x = common_layers.layer_postprocess(x, y, self.hparams)
            # layer norm
            x = common_layers.layer_norm(x)
            return x
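For orientation, a hypothetical driver for a step like this: the run_decoder helper, the loop, and the timestep construction are assumptions for illustration; only the decode(decoder_inputs, timestep) signature comes from the snippet above. Each pass reuses the same variables (reuse=tf.AUTO_REUSE) and re-applies the positional and timestep encodings.

import tensorflow as tf

def run_decoder(model, decoder_inputs, num_steps):
    # decoder_inputs: [batch, length, hidden_size], already shifted right.
    x = decoder_inputs
    for step in range(num_steps):
        # Broadcast the step index to [batch, length] so it can serve as the
        # position tensor for the timestep encoding inside decode().
        timestep = tf.fill(tf.shape(x)[:2], step)
        x = model.decode(x, timestep)
    return x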
Example #4
def transformer_prepare_encoder(inputs, target_space, hparams, features=None):
  """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
  ishape_static = inputs.shape.as_list()
  encoder_input = inputs
  if features and "inputs_segmentation" in features:
    # Packed dataset.  Keep the examples from seeing each other.
    inputs_segmentation = features["inputs_segmentation"]
    inputs_position = features["inputs_position"]
    targets_segmentation = features["targets_segmentation"]
    encoder_self_attention_bias = common_attention.attention_bias_same_segment(
        inputs_segmentation, inputs_segmentation)
    encoder_decoder_attention_bias = (
        common_attention.attention_bias_same_segment(
            targets_segmentation, inputs_segmentation))
  else:
    # Usual case - not a packed dataset.
    encoder_padding = common_attention.embedding_to_padding(encoder_input)
    ignore_padding = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    encoder_self_attention_bias = ignore_padding
    encoder_decoder_attention_bias = ignore_padding
    inputs_position = None
  if hparams.proximity_bias:
    encoder_self_attention_bias += common_attention.attention_bias_proximal(
        common_layers.shape_list(inputs)[1])
  # Append target_space_id embedding to inputs.
  emb_target_space = common_layers.embedding(
      target_space, 32, ishape_static[-1], name="target_space_embedding")
  emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
  encoder_input += emb_target_space
  if hparams.pos == "timing":
    if inputs_position is not None:
      encoder_input = common_attention.add_timing_signal_1d_given_position(
          encoder_input, inputs_position)
    else:
      encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
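As with the decoder variant, a minimal usage sketch, assuming the function above is in scope with its imports; the hparams fields, shapes, and SpaceID value are illustrative assumptions (TF1 graph mode).

import tensorflow as tf

# Hypothetical hparams carrying only the fields transformer_prepare_encoder reads.
hparams = tf.contrib.training.HParams(proximity_bias=False, pos="timing")

# Embedded source tokens [batch, input_length, hidden_size] and a scalar SpaceID.
inputs = tf.random_normal([8, 30, 128])
target_space = tf.constant(3, dtype=tf.int32)

(encoder_input, encoder_self_attention_bias,
 encoder_decoder_attention_bias) = transformer_prepare_encoder(
     inputs, target_space, hparams)
# In the unpacked case both bias tensors have shape [batch, 1, 1, input_length],
# with -1e9 at padded positions and 0 elsewhere.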
Example #5
def transformer_prepare_encoder(inputs, target_space, hparams, features=None):
    """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
      sg: inputs here have been flattened to 3d
        [batch, height, width, embed_size] ->
        [batch, height*width, embed_size]
    target_space: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
    ishape_static = inputs.shape.as_list()
    encoder_input = inputs
    if features and "inputs_segmentation" in features:
        # Packed dataset.  Keep the examples from seeing each other.
        inputs_segmentation = features["inputs_segmentation"]
        inputs_position = features["inputs_position"]
        targets_segmentation = features["targets_segmentation"]
        encoder_self_attention_bias = common_attention.attention_bias_same_segment(
            inputs_segmentation, inputs_segmentation)
        encoder_decoder_attention_bias = (
            common_attention.attention_bias_same_segment(
                targets_segmentation, inputs_segmentation))
    else:
        # Usual case - not a packed dataset.
        encoder_padding = common_attention.embedding_to_padding(encoder_input)
        # sg: [batch_size, sentence_len]
        ignore_padding = common_attention.attention_bias_ignore_padding(
            encoder_padding)
        # sg: [batch_size, 1, 1, sentence_len]
        # a bias tensor to be added to the attention logits:
        # padded positions get a bias of -1e9, non-padded positions get 0
        encoder_self_attention_bias = ignore_padding
        encoder_decoder_attention_bias = ignore_padding
        inputs_position = None
    if hparams.proximity_bias:
        encoder_self_attention_bias += common_attention.attention_bias_proximal(
            common_layers.shape_list(inputs)[1])
    # Append target_space_id embedding to inputs.
    emb_target_space = common_layers.embedding(
        target_space,
        32,
        # sg: 32 is the vocab size of the target-space embedding,
        # since t2t currently defines only SpaceIDs 1 to 32 in problem.py
        ishape_static[-1],
        # sg: embedding dimension
        name="target_space_embedding",
        dtype=tf.bfloat16
        if hparams.activation_dtype == "bfloat16" else tf.float32)
    # sg: [1,128], a dense vector representing the SpaceID
    emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
    # sg: [1,1,128]
    encoder_input += emb_target_space
    if hparams.pos == "timing":
        if inputs_position is not None:
            encoder_input = common_attention.add_timing_signal_1d_given_position(
                encoder_input, inputs_position)
        else:
            encoder_input = common_attention.add_timing_signal_1d(
                encoder_input)
    if hparams.activation_dtype == "bfloat16":
        encoder_self_attention_bias = tf.cast(encoder_self_attention_bias,
                                              tf.bfloat16)
        encoder_decoder_attention_bias = tf.cast(
            encoder_decoder_attention_bias, tf.bfloat16)
    return (encoder_input, encoder_self_attention_bias,
            encoder_decoder_attention_bias)
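To make the padding-bias semantics noted in the comments above concrete, here is a small standalone sketch; the toy embeddings and shapes are illustrative, while the two common_attention calls are the ones used in the example.

import tensorflow as tf
from tensor2tensor.layers import common_attention

# Two sequences of length 4; all-zero embedding vectors count as padding,
# so the second sequence has its last two positions padded.
emb = tf.constant([[[1.0], [1.0], [1.0], [1.0]],
                   [[1.0], [1.0], [0.0], [0.0]]])
padding = common_attention.embedding_to_padding(emb)            # [2, 4]
bias = common_attention.attention_bias_ignore_padding(padding)  # [2, 1, 1, 4]
# bias is 0.0 for real tokens and -1e9 at padded positions, so adding it to the
# attention logits effectively removes padded keys from the softmax.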