Example #1
    def test_get_decoder_self_attention_bias(self):
        length = 5
        bias = model_utils.get_decoder_self_attention_bias(length)

        self.assertAllEqual(
            [[[[0, NEG_INF, NEG_INF, NEG_INF, NEG_INF],
               [0, 0, NEG_INF, NEG_INF, NEG_INF],
               [0, 0, 0, NEG_INF, NEG_INF],
               [0, 0, 0, 0, NEG_INF],
               [0, 0, 0, 0, 0]]]], bias)
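The helper under test is not shown on this page; a minimal sketch that would reproduce the expected output above (assuming NEG_INF is a large negative constant such as -1e9, and with an illustrative function name) could look like:

import tensorflow as tf

NEG_INF = -1e9  # assumed value of the masking constant used in the test


def get_decoder_self_attention_bias_sketch(length, dtype=tf.float32):
    """Returns a causal bias of shape [1, 1, length, length].

    Position i may attend to positions 0..i; future positions get NEG_INF so
    the softmax assigns them (near) zero attention weight.
    """
    # Lower-triangular matrix of ones marks the allowed (past) positions.
    valid_locs = tf.linalg.band_part(tf.ones([length, length], dtype=dtype), -1, 0)
    bias = NEG_INF * (1.0 - valid_locs)
    return tf.reshape(bias, [1, 1, length, length])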
Example #2
    def decode(self, targets, encoder_outputs, training):
        """Generate logits for each value in the target sequence.

        Args:
          targets: target values for the output sequence. int tensor with shape
            [batch_size, target_length]
          encoder_outputs: continuous representation of input sequence. float tensor
            with shape [batch_size, input_length, hidden_size]
          training: boolean, whether in training mode or not.

        Returns:
          float32 tensor with shape [batch_size, target_length, vocab_size]
        """
        with tf.name_scope('decode'):
            length = tf.shape(targets)[1]
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)
            encoder_shape = tf.shape(encoder_outputs)
            # [batch, 1] as there is only one object as input for decoding.
            mask = tf.ones([encoder_shape[0], encoder_shape[1]])
            # In mask, 1 = valid object, 0 = padding; attn_bias will have NEG_INF for
            # paddings and 0 for valid objects.
            attention_bias = model_utils.get_padding_bias(mask)

            # Prepare inputs to decoder layers by shifting targets, adding positional
            # encoding and applying dropout.
            targets = tf.pad(targets, [[0, 0], [1, 0]],
                             constant_values=input_utils.START)
            # Remove last element.
            targets = targets[:, :-1]
            decoder_inputs = self._word_embedding_layer(targets)

            # No need to shift, use START above to shift.
            # with tf.name_scope('shift_targets'):
            #   # Shift targets to the right, and remove the last element
            #   decoder_inputs = tf.pad(decoder_inputs,
            #                           [[0, 0], [1, 0], [0, 0]])[:, :-1, :]

            with tf.name_scope('add_pos_encoding'):
                pos_encoding = self._position_embedding_layer(decoder_inputs)
                decoder_inputs += pos_encoding

            if training:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs,
                    rate=self._hparams['layer_postprocess_dropout'])

            decoder_outputs = self._decoder(decoder_inputs,
                                            encoder_outputs,
                                            decoder_self_attention_bias,
                                            attention_bias,
                                            training=training)
            logits = self._word_layer(decoder_outputs)
            return logits
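model_utils.get_padding_bias is also not shown here; a minimal sketch of the mask-to-bias mapping described in the comment inside decode above (the exact signature in model_utils may differ, this only illustrates the convention 1 = valid, 0 = padding) could be:

import tensorflow as tf

NEG_INF = -1e9  # assumed masking constant


def get_padding_bias_sketch(mask):
    """Converts a [batch_size, length] 0/1 mask into an additive attention bias
    of shape [batch_size, 1, 1, length]: 0 for valid positions, NEG_INF for
    padded positions."""
    bias = (1.0 - tf.cast(mask, tf.float32)) * NEG_INF
    return bias[:, tf.newaxis, tf.newaxis, :]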
Example #3
    def _get_symbols_to_logits_fn(self, max_decode_length, training):
        """Returns a decoding function that calculates logits of the next tokens."""
        timing_signal = self.position_embedding(inputs=None,
                                                length=max_decode_length + 1)
        timing_signal = tf.cast(timing_signal, self.params["dtype"])
        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            max_decode_length, dtype=self.params["dtype"])

        def symbols_to_logits_fn(ids, i, cache):
            """Generate logits for next potential IDs.

            Args:
              ids: Current decoded sequences. int tensor with shape [batch_size *
                beam_size, i + 1].
              i: Loop index.
              cache: dictionary of values storing the encoder output, encoder-decoder
                attention bias, and previous decoder attention values.

            Returns:
              Tuple of
                (logits with shape [batch_size * beam_size, vocab_size],
                 updated cache values)
            """
            # Set decoder input to the last generated IDs
            decoder_input = ids[:, -1:]

            # Preprocess decoder input by getting embeddings and adding timing signal.
            decoder_input = self.embedding_softmax_layer(decoder_input)
            decoder_input += timing_signal[i]
            if self.params["padded_decode"]:
                bias_shape = decoder_self_attention_bias.shape.as_list()
                self_attention_bias = tf.slice(
                    decoder_self_attention_bias, [0, 0, i, 0],
                    [bias_shape[0], bias_shape[1], 1, bias_shape[3]])
            else:
                self_attention_bias = decoder_self_attention_bias[
                    :, :, i:i + 1, :i + 1]

            decoder_outputs = self.decoder_stack(
                decoder_input,
                cache.get("encoder_outputs"),
                self_attention_bias,
                cache.get("encoder_decoder_attention_bias"),
                training=training,
                cache=cache,
                decode_loop_step=i if self.params["padded_decode"] else None)
            logits = self.embedding_softmax_layer(decoder_outputs,
                                                  mode="linear")
            logits = tf.squeeze(logits, axis=[1])
            return logits, cache

        return symbols_to_logits_fn
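For context, a closure with this (ids, i, cache) -> (logits, cache) signature is normally driven by a search loop (beam search in the original model); a hypothetical greedy-decoding sketch, with made-up names, would be:

import tensorflow as tf


def greedy_decode_sketch(symbols_to_logits_fn, cache, batch_size, start_id,
                         max_decode_length):
    """Repeatedly asks symbols_to_logits_fn for the next-token logits and
    appends the argmax token to the running sequences."""
    ids = tf.fill([batch_size, 1], start_id)  # seed every sequence with START
    for i in range(max_decode_length):
        logits, cache = symbols_to_logits_fn(ids, i, cache)  # [batch, vocab]
        next_id = tf.argmax(logits, axis=-1, output_type=tf.int32)
        ids = tf.concat([ids, next_id[:, tf.newaxis]], axis=1)
    return ids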
Example #4
    def _get_symbols_to_logits_fn(self, max_decode_length, training):
        """Returns a decoding function that calculates logits of the next tokens."""
        timing_signal = self._position_embedding_layer(
            inputs=None, length=max_decode_length)
        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            max_decode_length)

        def symbols_to_logits_fn(ids, i, cache):
            """Generate logits for next potential IDs.

            Args:
              ids: Current decoded sequences. int tensor with shape [batch_size *
                beam_size, i + 1].
              i: Loop index.
              cache: dictionary of values storing the encoder output, encoder-decoder
                attention bias, and previous decoder attention values.

            Returns:
              Tuple of
                (logits with shape [batch_size * beam_size, vocab_size],
                 updated cache values)
            """
            # Set decoder input to the last generated IDs. The previous ids attention
            # key/value are already stored in the cache.
            decoder_input = ids[:, -1:]

            # Preprocess decoder input by getting embeddings and adding timing signal.
            decoder_input = self._word_embedding_layer(decoder_input)

            decoder_input += timing_signal[i:i + 1]

            self_attention_bias = decoder_self_attention_bias[:, :,
                                                              i:i + 1, :i + 1]
            decoder_outputs = self._decoder(
                decoder_input,
                cache.get('encoder_outputs'),
                self_attention_bias,
                cache.get('encoder_decoder_attention_bias'),
                training=training,
                cache=cache)
            # Only use the last decoded state.
            decoder_outputs = decoder_outputs[:, -1, :]
            logits = self._word_layer(decoder_outputs)
            return logits, cache

        return symbols_to_logits_fn
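The self_attention_bias slice above keeps only the query row for the current step. Using the length-5 bias verified in Example #1, step i = 2 would give (values shown for illustration only):

# model_utils is the same module exercised in Example #1 above.
bias = model_utils.get_decoder_self_attention_bias(5)  # [1, 1, 5, 5]
step_bias = bias[:, :, 2:3, :3]                         # [1, 1, 1, 3]
# step_bias == [[[[0, 0, 0]]]] -- every already-generated position stays visible.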
Example #5
    def decode(self, targets, encoder_outputs, encoder_attn_bias, input_shape,
               training):
        """Generate logits for each value in the target sequence.

        Args:
          targets: target values for the output sequence. int tensor with shape
            [batch_size, target_length]
          encoder_outputs: continuous representation of input sequence. float tensor
            with shape [batch_size, input_length, hidden_size]
          encoder_attn_bias: encoder attention bias.
          input_shape: the shape of current data batch.
          training: boolean, whether in training mode or not.

        Returns:
          float32 tensor with shape [batch_size, target_length, vocab_size]
        """
        with tf.name_scope('decode'):
            length = tf.shape(targets)[1]
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)
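            # Collapse any extra dimensions of the encoder output into a single
            # sequence axis: [batch_size, -1, hidden_size].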
            encoder_outputs = tf.reshape(
                encoder_outputs,
                [input_shape[0], -1, self._hparams['hidden_size']])
            decoder_inputs = tf.pad(targets, [[0, 0], [1, 0]],
                                    constant_values=input_utils.START)

            # Remove last element.
            decoder_inputs = decoder_inputs[:, :-1]
            decoder_inputs = self._word_embedding_layer(decoder_inputs)

            with tf.name_scope('add_pos_encoding'):
                pos_encoding = self._position_embedding_layer(decoder_inputs)
                decoder_inputs += pos_encoding

            if training:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs,
                    rate=self._hparams['layer_postprocess_dropout'])

            decoder_outputs = self._decoder(decoder_inputs,
                                            encoder_outputs,
                                            decoder_self_attention_bias,
                                            encoder_attn_bias,
                                            training=training)
            logits = self._word_layer(decoder_outputs)
            return logits
Example #6
    def decode(self, targets, encoder_outputs, attention_bias, training):
        """Generate logits for each value in the target sequence.

        Args:
          targets: target values for the output sequence. int tensor with shape
            [batch_size, target_length]
          encoder_outputs: continuous representation of input sequence. float tensor
            with shape [batch_size, input_length, hidden_size]
          attention_bias: float tensor with shape [batch_size, 1, 1, input_length]
          training: boolean, whether in training mode or not.

        Returns:
          float32 tensor with shape [batch_size, target_length, vocab_size]
        """
        with tf.name_scope("decode"):
            # Prepare inputs to decoder layers by shifting targets, adding positional
            # encoding and applying dropout.
            with tf.name_scope("shift_targets"):
                # Shift targets to the right, and remove the last element
                targets = tf.pad(targets, [[0, 0], [1, 0]])[:, :-1]
            decoder_inputs = self.embedding_softmax_layer(targets)
            decoder_inputs = tf.cast(decoder_inputs, self.params["dtype"])
            attention_bias = tf.cast(attention_bias, self.params["dtype"])
            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(decoder_inputs)[1]
                pos_encoding = self.position_embedding(decoder_inputs)
                pos_encoding = tf.cast(pos_encoding, self.params["dtype"])
                decoder_inputs += pos_encoding
            if training:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs,
                    rate=self.params["layer_postprocess_dropout"])

            # Build the causal self-attention bias and run the decoder stack.
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length, dtype=self.params["dtype"])
            outputs = self.decoder_stack(decoder_inputs,
                                         encoder_outputs,
                                         decoder_self_attention_bias,
                                         attention_bias,
                                         training=training)
            logits = self.embedding_softmax_layer(outputs, mode="linear")
            logits = tf.cast(logits, tf.float32)
            return logits
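The shift_targets block implements standard teacher forcing: each target sequence is moved one position to the right so the decoder predicts token t from tokens before t. A small worked example (token ids are made up; this example pads with 0, whereas the examples above insert an explicit START id):

import tensorflow as tf

targets = tf.constant([[7, 8, 9]])
shifted = tf.pad(targets, [[0, 0], [1, 0]])[:, :-1]
# shifted == [[0, 7, 8]] -- position t now sees only tokens that precede t.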
Example #7
def get_attention_bias(input_tensor,
                       bias_type,
                       padding_value=0,
                       max_length=None):
    """A helper function to get various attention bias tensors."""
    if bias_type not in ("single_cross", "multi_cross", "decoder_self"):
        raise ValueError("Invalid attention bias type: %s" % bias_type)
    if bias_type == "single_cross":
        length = tf_utils.get_shape_list(input_tensor, expected_rank=2)[1]
        bias = transformer_utils.get_padding_bias(input_tensor,
                                                  padding_value=padding_value)
    elif bias_type == "multi_cross":
        length = tf_utils.get_shape_list(input_tensor, expected_rank=3)[2]
        padding = transformer_utils.get_padding(input_tensor,
                                                padding_value=padding_value)
        bias = padding * -1e9
    else:
        if max_length is not None:
            length = max_length
        else:
            length = tf_utils.get_shape_list(input_tensor, expected_rank=2)[1]
        bias = transformer_utils.get_decoder_self_attention_bias(length)

    return tf.where(bias < 0, tf.zeros_like(bias), tf.ones_like(bias))
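Note that the final tf.where converts the additive bias (0 for allowed positions, a large negative value for masked ones) into a 0/1 keep-mask. A hypothetical usage sketch (the input tensor is made up):

import tensorflow as tf

dummy_ids = tf.constant([[4, 5, 6]])                  # [batch=1, length=3]
mask = get_attention_bias(dummy_ids, "decoder_self")  # [1, 1, 3, 3], float
# mask == [[[[1., 0., 0.],
#            [1., 1., 0.],
#            [1., 1., 1.]]]] -- 1 = may attend, 0 = masked out.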