Example No. 1
    def _transform(self, inputs, encoder_self_attention_bias, **kwargs):
        """ Encodes the inputs.

        Args:
            inputs: A Tensor, [batch_size, timesteps, d_model]
            encoder_self_attention_bias: A Tensor, FLOAT_MIN
              for padding, 0 for non-padding, [batch_size, 1, 1, timesteps].
            **kwargs: Additional keyword arguments (unused here).

        Returns: A tuple `(outputs, encoder_self_attention_scores)`, where
          `outputs` is the transformed hidden state of TransformerEncoder,
          [batch_size, timesteps, d_model], and `encoder_self_attention_scores`
          is a list of per-layer self-attention score Tensors.

        """
        input_padding = attention_bias_to_padding(encoder_self_attention_bias)
        pad_remover = PadRemover(input_padding)
        x = dropout_wrapper(inputs, self.params["layer_prepostprocess_dropout_keep_prob"])
        encoder_self_attention_scores = []
        for layer in range(self.params["num_layers"]):
            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    # self attention layer
                    w_y, y = self._self_attention_layers[layer].build(
                        query=None,
                        memory=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        memory_bias=encoder_self_attention_bias)
                    encoder_self_attention_scores.append(w_y)
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
                with tf.variable_scope("ffn"):
                    y = transformer_ffn_layer(
                        x=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        filter_size=self.params["num_filter_units"],
                        output_size=self.params["num_hidden_units"],
                        pad_remover=pad_remover,
                        dropout_relu_keep_prob=self.params["dropout_relu_keep_prob"])
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
        x = layer_preprocess(
            x=x, process_sequence=self.params["layer_preprocess_sequence"],
            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
        return x, encoder_self_attention_scores
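
For reference, each layer above follows the tensor2tensor-style pre/post-processing convention: the sub-layer (self-attention or FFN) sees a layer-normalized input ("layer_preprocess_sequence"), and its output is passed through dropout and added back onto the residual stream ("layer_postprocess_sequence"). Below is a minimal NumPy sketch of that wrapper, assuming the usual norm-before / dropout-plus-residual-after ordering; layer_norm, residual_block, and sublayer_fn are illustrative names, not this library's API.

import numpy as np

def layer_norm(x, epsilon=1e-6):
    # Normalize the last dimension to zero mean / unit variance.
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    return (x - mean) / np.sqrt(var + epsilon)

def residual_block(x, sublayer_fn, dropout_keep_prob=1.0, rng=np.random):
    # Pre-process: layer norm, then the sub-layer (self-attention or FFN),
    # then post-process: dropout followed by a residual add.
    y = sublayer_fn(layer_norm(x))
    if dropout_keep_prob < 1.0:
        mask = rng.binomial(1, dropout_keep_prob, size=y.shape)
        y = y * mask / dropout_keep_prob
    return x + y

# Example: a no-op sub-layer on a [batch_size, timesteps, d_model] input.
x = np.random.randn(2, 5, 8)
out = residual_block(x, sublayer_fn=lambda h: h, dropout_keep_prob=0.9)
print(out.shape)  # (2, 5, 8)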
Example No. 2
    def _transform(self, inputs, encoder_self_attention_bias, **kwargs):
        """ Encodes the inputs.

        Args:
            inputs: A Tensor, [batch_size, timesteps, d_model]
            encoder_self_attention_bias: A Tensor, FLOAT_MIN
              for padding, 0 for non-padding, [batch_size, 1, 1, timesteps].
            **kwargs: Additional keyword arguments (unused here).

        Returns: A tuple `(outputs, encoder_self_attention_scores)`, where
          `outputs` is the transformed hidden state of TransformerEncoder,
          [batch_size, timesteps, d_model], and `encoder_self_attention_scores`
          is a list of per-layer self-attention score Tensors.

        """
        input_padding = attention_bias_to_padding(encoder_self_attention_bias)
        pad_remover = PadRemover(input_padding)
        x = dropout_wrapper(inputs, self.params["layer_prepostprocess_dropout_keep_prob"])
        encoder_self_attention_scores = []
        for layer in range(self.params["num_layers"]):
            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    # self attention layer
                    w_y, y = self._self_attention_layers[layer].build(
                        query=None,
                        memory=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        memory_bias=encoder_self_attention_bias)
                    encoder_self_attention_scores.append(w_y)
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
                with tf.variable_scope("ffn"):
                    y = transformer_ffn_layer(
                        x=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        filter_size=self.params["num_filter_units"],
                        output_size=self.params["num_hidden_units"],
                        pad_remover=pad_remover,
                        dropout_relu_keep_prob=self.params["dropout_relu_keep_prob"])
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
        x = layer_preprocess(
            x=x, process_sequence=self.params["layer_preprocess_sequence"],
            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
        return x, encoder_self_attention_scores
Example No. 3
    def att_fn(self, query, keys, bias=None):
        """ Computes attention scores.

        Args:
            query: Attention query tensor with shape
              [batch_size, channels_query]
            keys: Attention keys tensor with shape
              [batch_size, num_of_keys, channels_key]
            bias: The bias tensor for attention keys

        Returns: A Tensor, [batch_size, num_of_keys]
        """
        v_att = tf.get_variable("v_att", shape=[self.params["num_units"]], dtype=tf.float32)
        logits = tf.reduce_sum(v_att * tf.tanh(keys + tf.expand_dims(query, 1)), [2])
        if bias is not None:
            logits += bias
        attention_scores = advanced_softmax(logits)
        attention_scores = dropout_wrapper(attention_scores, self.params["dropout_attention_keep_prob"])
        return attention_scores
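
`att_fn` implements Bahdanau-style additive attention: the query is broadcast against every key, passed through `tanh`, and projected onto the learned vector `v_att` to produce one logit per key. Here is a NumPy sketch of just the scoring math, assuming query and keys have already been projected to the same `num_units` width (no dropout or variable scopes; the helper names are illustrative).

import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def additive_attention_scores(query, keys, v_att, bias=None):
    # query: [batch_size, num_units], keys: [batch_size, num_of_keys, num_units],
    # v_att: [num_units]. Mirrors the tanh / v_att scoring in att_fn above.
    logits = np.sum(v_att * np.tanh(keys + query[:, None, :]), axis=2)
    if bias is not None:
        logits = logits + bias
    return softmax(logits)  # [batch_size, num_of_keys]

q = np.random.randn(2, 4)
K = np.random.randn(2, 3, 4)
v = np.random.randn(4)
print(additive_attention_scores(q, K, v).shape)  # (2, 3)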
Example No. 4
def dot_product_attention(q, k, bias=None, dropout_keep_prob=1.0):
    """ Computes attention weight according to query and key.

    Args:
        q: A query Tensor with shape [..., length_q, depth].
        k: A keys Tensor with shape [..., length_k, depth].
        bias: A bias Tensor broadcastable to [..., length_q, length_k]
          (typically [..., 1, length_k]).
        dropout_keep_prob: A float scalar.

    Returns: The attention scores Tensor with shape
      [..., length_q, length_k].
    """
    with tf.variable_scope("dot_product_attention", values=[q, k]):
        logits = tf.matmul(q, k, transpose_b=True)
        if bias is not None:
            logits += bias
        weights = advanced_softmax(logits)
        # dropout the attention links for each of the heads
        weights = dropout_wrapper(weights, keep_prob=dropout_keep_prob)
        return weights
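
The bias here is the same FLOAT_MIN-for-padding convention used by the encoder: adding a very large negative number to padded key positions drives their softmax weight to zero. A NumPy sketch of the unscaled dot-product scoring with such a mask (like the code above, it does not divide by sqrt(depth); the FLOAT_MIN value and helper name are illustrative):

import numpy as np

FLOAT_MIN = -1e9  # stand-in for the library's FLOAT_MIN padding bias

def dot_product_attention_np(q, k, bias=None):
    # q: [..., length_q, depth], k: [..., length_k, depth]
    logits = q @ np.swapaxes(k, -1, -2)      # [..., length_q, length_k]
    if bias is not None:
        logits = logits + bias               # large negative bias masks padding
    logits = logits - logits.max(axis=-1, keepdims=True)
    e = np.exp(logits)
    return e / e.sum(axis=-1, keepdims=True)

q = np.random.randn(1, 2, 4)                 # [batch, length_q, depth]
k = np.random.randn(1, 3, 4)                 # [batch, length_k, depth]
bias = np.array([[[0.0, 0.0, FLOAT_MIN]]])   # mask the last (padded) key
w = dot_product_attention_np(q, k, bias)
print(w[0, :, -1])                           # ~0 weight on the masked position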
Example No. 5
    def _dot_product_attention(self, q, k, bias):
        """ Computes attention weight according to query and key.

        Args:
            q: A query Tensor with shape [batch_size, num_heads, length_q, depth / num_heads].
            k: A keys Tensor with shape [batch_size, num_heads, length_k, depth / num_heads].
            bias: A bias Tensor with shape [batch_size, 1, 1, length_k].

        Returns: The attention scores Tensor with shape
          [batch_size, num_heads, length_q, length_k].
        """
        with tf.variable_scope("dot_product_attention", values=[q, k]):
            logits = tf.matmul(q, k, transpose_b=True)
            if bias is not None:
                logits += bias
            weights = algebra_ops.advanced_softmax(logits)
            # dropout the attention links for each of the heads
            weights = dropout_wrapper(
                weights, keep_prob=self._dropout_attention_keep_prob)
            return weights
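
The shapes in this multi-head variant assume the queries and keys have already been split into heads, i.e. the model depth is divided evenly across num_heads before the matmul, which is why the weights come out as [batch_size, num_heads, length_q, length_k]. A NumPy sketch of that split/combine step (the function names are illustrative, not the layer's API):

import numpy as np

def split_heads(x, num_heads):
    # [batch, length, depth] -> [batch, num_heads, length, depth // num_heads]
    batch, length, depth = x.shape
    x = x.reshape(batch, length, num_heads, depth // num_heads)
    return x.transpose(0, 2, 1, 3)

def combine_heads(x):
    # [batch, num_heads, length, depth // num_heads] -> [batch, length, depth]
    batch, num_heads, length, head_dim = x.shape
    return x.transpose(0, 2, 1, 3).reshape(batch, length, num_heads * head_dim)

x = np.random.randn(2, 5, 8)
h = split_heads(x, num_heads=4)
print(h.shape)                            # (2, 4, 5, 2)
print(np.allclose(combine_heads(h), x))   # True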
Example No. 6
def dot_product_attention(q, k, bias=None, dropout_keep_prob=1.0):
    """ Computes attention weight according to query and key.

    Args:
        q: A query Tensor with shape [..., length_q, depth].
        k: A keys Tensor with shape [..., length_k, depth].
        bias: A bias Tensor broadcastable to [..., length_q, length_k]
          (typically [..., 1, length_k]).
        dropout_keep_prob: A float scalar.

    Returns: The attention scores Tensor with shape
      [..., length_q, length_k].
    """
    with tf.variable_scope("dot_product_attention", values=[q, k]):
        logits = tf.matmul(q, k, transpose_b=True)
        if bias is not None:
            logits += bias
        weights = advanced_softmax(logits)
        # dropout the attention links for each of the heads
        weights = dropout_wrapper(weights, keep_prob=dropout_keep_prob)
        return weights
Example No. 7
    def att_fn(self, query, keys, bias=None):
        """ Computes attention scores.

        Args:
            query: Attention query tensor with shape
              [batch_size, channels_query]
            keys: Attention keys tensor with shape
              [batch_size, num_of_keys, channels_key]
            bias: The bias tensor for attention keys

        Returns: A Tensor, [batch_size, num_of_keys]
        """
        v_att = tf.get_variable("v_att",
                                shape=[self.params["num_units"]],
                                dtype=tf.float32)
        logits = tf.reduce_sum(
            v_att * tf.tanh(keys + tf.expand_dims(query, 1)), [2])
        if bias is not None:
            logits += bias
        attention_scores = advanced_softmax(logits)
        attention_scores = dropout_wrapper(
            attention_scores, self.params["dropout_attention_keep_prob"])
        return attention_scores
Example No. 8
    def _transform(self, decoder_inputs, cache):
        """ Decodes one step

        Args:
            decoder_inputs: The decoder input for this timestep,
              A Tensor with shape [batch_size, timesteps, d_model].
              Note that when mode==INFER, timesteps=1.
            cache: A dict containing decoding states at previous
              timestep, attention values and attention length.

        Returns: A tuple `(outputs, decoder_self_attention_scores,
          encdec_attention_scores)`, where `outputs` is the transformed
          hidden state Tensor.
        """
        # [batch_size, max_len_src, dim]
        encdec_attention_values = cache["memory"]
        # [batch_size, 1, 1, max_len_src]
        encdec_attention_bias = cache["memory_bias"]

        decoder_self_attention_scores = []
        encdec_attention_scores = []

        # decoder_self_attention_bias: [1, 1, max_len_trg, max_len_trg]
        decoder_self_attention_bias = attention_bias_lower_triangle(
            tf.shape(decoder_inputs)[1])
        x = dropout_wrapper(decoder_inputs, self.params["layer_prepostprocess_dropout_keep_prob"])
        for layer in range(self.params["num_layers"]):
            layer_name = "layer_{}".format(layer)
            layer_cache = None if cache["decoding_states"] is None \
                else cache["decoding_states"][layer_name]
            selfatt_cache = None if layer_cache is None \
                else layer_cache["self_attention"]
            encdecatt_cache = None if layer_cache is None \
                else layer_cache["encdec_attention"]
            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    # self attention layer
                    w_y, y = self._self_attention_layers[layer].build(
                        query=None,
                        memory=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        memory_bias=decoder_self_attention_bias,
                        cache=selfatt_cache)
                    # [batch_size, num_heads, length_q, length_k]
                    decoder_self_attention_scores.append(w_y)
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
                with tf.variable_scope("encdec_attention"):
                    # encoder-decoder attention
                    w_y, y = self._encdec_attention_layers[layer].build(
                        query=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        memory=encdec_attention_values,
                        memory_bias=encdec_attention_bias,
                        cache=encdecatt_cache)
                    # [batch_size, num_heads, length_q, length_k]
                    encdec_attention_scores.append(w_y)
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
                with tf.variable_scope("ffn"):
                    y = transformer_ffn_layer(
                        x=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        filter_size=self.params["num_filter_units"],
                        output_size=self.params["num_hidden_units"],
                        pad_remover=None,
                        dropout_relu_keep_prob=self.params["dropout_relu_keep_prob"])
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
        x = layer_preprocess(
            x=x, process_sequence=self.params["layer_preprocess_sequence"],
            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
        return x, decoder_self_attention_scores, encdec_attention_scores
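
For orientation, this `_transform` only relies on three top-level keys of `cache`: `memory`, `memory_bias`, and `decoding_states`, the latter keyed by `layer_{i}` with `self_attention` and `encdec_attention` sub-caches. The sketch below shows that layout only; the per-layer contents are placeholders, since their real entries are whatever the attention layers' `build(..., cache=...)` contract expects (typically accumulated keys/values).

def init_decoding_cache(memory, memory_bias, num_layers):
    # Builds a cache with the keys read by _transform above. The inner
    # per-layer dicts are left empty here on purpose; they are assumptions
    # to be filled by the attention layers themselves.
    return {
        "memory": memory,            # [batch_size, max_len_src, dim]
        "memory_bias": memory_bias,  # [batch_size, 1, 1, max_len_src]
        "decoding_states": {
            "layer_{}".format(layer): {
                "self_attention": {},    # placeholder per-layer state
                "encdec_attention": {},  # placeholder per-layer state
            }
            for layer in range(num_layers)
        },
    }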
Example No. 9
    def _transform(self, decoder_inputs, decoding_params):
        """ Decodes one step

        Args:
            decoder_inputs: The decoder input for this timestep, an
              instance of `tf.Tensor`, [batch_size, timesteps, d_model].
            decoding_params: The same as the `decoding_params` returned
              by the `prepare()` function.

        Returns: A Tensor, the transformed hidden
          state of TransformerDecoder.
        """
        # [batch_size, max_len_src, dim]
        encdec_attention_values = decoding_params[0]
        # [batch_size, ]
        # encdec_attention_length = decoding_params[1]
        # [batch_size, 1, 1, max_len_src]
        encdec_attention_bias = decoding_params[2]

        # decoder_self_attention_bias: [1, 1, max_len_trg, max_len_trg]
        decoder_self_attention_bias = attention_bias_lower_triangle(
            tf.shape(decoder_inputs)[1])
        x = dropout_wrapper(
            decoder_inputs,
            self.params["layer_prepostprocess_dropout_keep_prob"])
        for layer in range(self.params["num_layers"]):
            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    # self attention layer
                    w_y, y = multihead_attention_layer(
                        params=self.params["selfattention.params"],
                        mode=self.mode,
                        query_antecedent=None,
                        memory_antecedent=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        memory_bias=decoder_self_attention_bias)
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
                with tf.variable_scope("encdec_attention"):
                    # encoder-decoder attention
                    w_y, y = multihead_attention_layer(
                        params=self.params["attention.params"],
                        mode=self.mode,
                        query_antecedent=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        memory_antecedent=encdec_attention_values,
                        memory_bias=encdec_attention_bias)
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
                with tf.variable_scope("ffn"):
                    y = transformer_ffn_layer(
                        x=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        filter_size=self.params["num_filter_units"],
                        output_size=self.params["num_hidden_units"],
                        pad_remover=None,
                        dropout_relu_keep_prob=self.params["dropout_relu_keep_prob"])
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
        return layer_preprocess(
            x=x, process_sequence=self.params["layer_preprocess_sequence"],
            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
Example No. 10
    def _transform(self, decoder_inputs, cache, pad_remover=None):
        """ Decodes one step

        Args:
            decoder_inputs: The decoder input for this timestep,
              A Tensor with shape [batch_size, timesteps, d_model].
              Note that when mode==INFER, timesteps=1.
            cache: A dict containing decoding states at previous
              timestep, attention values and attention length.
            pad_remover: An expert_utils.PadRemover object tracking the padding
              positions. If provided, the padding is removed before applying
              the convolution, and restored afterward. This can give a significant
              speedup (as noted in Google's tensor2tensor code).

        Returns: A tuple `(outputs, decoder_self_attention_scores,
          encdec_attention_scores)`, where `outputs` is the transformed
          hidden state Tensor.
        """
        # [batch_size, max_len_src, dim]
        encdec_attention_values = cache["memory"]
        # [batch_size, 1, 1, max_len_src]
        encdec_attention_bias = cache["memory_bias"]

        decoder_self_attention_scores = []
        encdec_attention_scores = []

        # decoder_self_attention_bias: [1, 1, max_len_trg, max_len_trg]
        decoder_self_attention_bias = attention_bias_lower_triangle(
            tf.shape(decoder_inputs)[1])
        x = dropout_wrapper(decoder_inputs, self.params["layer_prepostprocess_dropout_keep_prob"])
        for layer in range(self.params["num_layers"]):
            layer_name = "layer_{}".format(layer)
            layer_cache = None if cache["decoding_states"] is None \
                else cache["decoding_states"][layer_name]
            selfatt_cache = None if layer_cache is None \
                else layer_cache["self_attention"]
            encdecatt_cache = None if layer_cache is None \
                else layer_cache["encdec_attention"]
            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    # self attention layer
                    w_y, y = self._self_attention_layers[layer].build(
                        query=None,
                        memory=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        memory_bias=decoder_self_attention_bias,
                        cache=selfatt_cache)
                    # [batch_size, num_heads, length_q, length_k]
                    decoder_self_attention_scores.append(w_y)
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
                with tf.variable_scope("encdec_attention"):
                    # encoder-decoder attention
                    w_y, y = self._encdec_attention_layers[layer].build(
                        query=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        memory=encdec_attention_values,
                        memory_bias=encdec_attention_bias,
                        cache=encdecatt_cache)
                    # [batch_size, num_heads, length_q, length_k]
                    encdec_attention_scores.append(w_y)
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
                with tf.variable_scope("ffn"):
                    y = transformer_ffn_layer(
                        x=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        filter_size=self.params["num_filter_units"],
                        output_size=self.params["num_hidden_units"],
                        pad_remover=pad_remover,
                        dropout_relu_keep_prob=self.params["dropout_relu_keep_prob"])
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
        x = layer_preprocess(
            x=x, process_sequence=self.params["layer_preprocess_sequence"],
            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
        return x, decoder_self_attention_scores, encdec_attention_scores
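
The `pad_remover` argument exists because the FFN is position-wise: padded timesteps contribute nothing, so the tensor2tensor trick is to gather only the non-padded positions, run the FFN on that smaller matrix, and scatter the results back. A NumPy sketch of the idea (ffn_without_padding and its arguments are illustrative, not PadRemover's actual interface):

import numpy as np

def ffn_without_padding(x, padding, ffn_fn):
    # Sketch of the PadRemover idea: run a position-wise ffn_fn only on
    # non-padded positions, then scatter the results back.
    # x: [batch, length, d_model]; padding: [batch, length], 1.0 at pads.
    batch, length, d_model = x.shape
    flat_x = x.reshape(batch * length, d_model)
    keep = padding.reshape(-1) < 0.5          # indices of real tokens
    out = np.zeros_like(flat_x)
    out[keep] = ffn_fn(flat_x[keep])          # compute only where needed
    return out.reshape(batch, length, d_model)

x = np.random.randn(2, 4, 8)
padding = np.array([[0., 0., 1., 1.],         # last two positions are pads
                    [0., 0., 0., 1.]])
y = ffn_without_padding(x, padding, ffn_fn=lambda h: 2.0 * h)
print(y[0, 2:].sum())  # 0.0 -- padded rows stay zero and cost nothing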