def _transform(self, inputs, encoder_self_attention_bias, **kwargs):
        """ Encodes the inputs.

        Args:
            inputs: A Tensor, [batch_size, timesteps, d_model]
            encoder_self_attention_bias: A Tensor, FLOAT_MIN
              for padding, 0 for non-padding, [batch_size, 1, 1, timesteps].
            **kwargs:

        Returns: A tuple of a Tensor, the transformed hidden state of
          TransformerEncoder, [batch_size, timesteps, d_model], and a list
          of per-layer encoder self-attention scores.

        """
        input_padding = attention_bias_to_padding(encoder_self_attention_bias)
        pad_remover = PadRemover(input_padding)
        x = dropout_wrapper(inputs, self.params["layer_prepostprocess_dropout_keep_prob"])
        encoder_self_attention_scores = []
        for layer in range(self.params["num_layers"]):
            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    # self attention layer
                    w_y, y = self._self_attention_layers[layer].build(
                        query=None,
                        memory=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        memory_bias=encoder_self_attention_bias)
                    encoder_self_attention_scores.append(w_y)
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
                with tf.variable_scope("ffn"):
                    y = transformer_ffn_layer(
                        x=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        filter_size=self.params["num_filter_units"],
                        output_size=self.params["num_hidden_units"],
                        pad_remover=pad_remover,
                        dropout_relu_keep_prob=self.params["dropout_relu_keep_prob"])
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
        x = layer_preprocess(
            x=x, process_sequence=self.params["layer_preprocess_sequence"],
            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
        return x, encoder_self_attention_scores
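
# --- Illustrative sketch (not part of the original class) ---
# The encoder_self_attention_bias documented above holds FLOAT_MIN at padded
# positions and 0 at real tokens, with shape [batch_size, 1, 1, timesteps].
# A minimal way to build such a bias from sequence lengths could look like the
# standalone helper below; the helper name and the use of -1e9 as FLOAT_MIN
# are assumptions for illustration only.
import tensorflow as tf

def sketch_encoder_attention_bias(sequence_lengths, timesteps, float_min=-1e9):
    # padding mask: [batch_size, timesteps], 1.0 at padding, 0.0 at real tokens
    padding = 1.0 - tf.sequence_mask(
        sequence_lengths, maxlen=timesteps, dtype=tf.float32)
    # bias: [batch_size, 1, 1, timesteps]
    return tf.expand_dims(tf.expand_dims(padding * float_min, axis=1), axis=1)
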
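
# --- Illustrative sketch (an assumption, not the library's implementation) ---
# layer_preprocess / layer_postprocessing above are driven by short "process
# sequence" strings and, per the inline comments, apply dropout, layer norm
# and the residual connection. One plausible reading of such a sequence, in
# the spirit of tensor2tensor, is one operation per character:
# "n" = layer norm, "d" = dropout, "a" = add the residual.
import tensorflow as tf

def sketch_layer_process(x, previous_x, process_sequence, dropout_keep_prob):
    for op in process_sequence:
        if op == "n":    # layer normalization over the last dimension
            x = tf.contrib.layers.layer_norm(x, begin_norm_axis=-1)
        elif op == "d":  # dropout, TF 1.x keep_prob semantics
            x = tf.nn.dropout(x, keep_prob=dropout_keep_prob)
        elif op == "a":  # residual connection
            x = x + previous_x
    return x
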
    def _transform(self, decoder_inputs, cache):
        """ Decodes one step

        Args:
            decoder_inputs: The decoder input for this timestep,
              a Tensor with shape [batch_size, timesteps, d_model].
              Note that when mode==INFER, timesteps=1.
            cache: A dict containing decoding states at previous
              timestep, attention values and attention length.

        Returns: A tuple of the transformed Tensor, the per-layer decoder
          self-attention scores and the per-layer encoder-decoder attention scores.
        """
        # [batch_size, max_len_src, dim]
        encdec_attention_values = cache["memory"]
        # [batch_size, 1, 1, max_len_src]
        encdec_attention_bias = cache["memory_bias"]

        decoder_self_attention_scores = []
        encdec_attention_scores = []

        # decoder_self_attention_bias: [1, 1, max_len_trg, max_len_trg]
        decoder_self_attention_bias = attention_bias_lower_triangle(
            tf.shape(decoder_inputs)[1])
        x = dropout_wrapper(decoder_inputs, self.params["layer_prepostprocess_dropout_keep_prob"])
        for layer in range(self.params["num_layers"]):
            layer_name = "layer_{}".format(layer)
            layer_cache = None if cache["decoding_states"] is None \
                else cache["decoding_states"][layer_name]
            selfatt_cache = None if layer_cache is None \
                else layer_cache["self_attention"]
            encdecatt_cache = None if layer_cache is None \
                else layer_cache["encdec_attention"]
            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    # self attention layer
                    w_y, y = self._self_attention_layers[layer].build(
                        query=None,
                        memory=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        memory_bias=decoder_self_attention_bias,
                        cache=selfatt_cache)
                    # [batch_size, num_heads, length_q, length_k]
                    decoder_self_attention_scores.append(w_y)
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
                with tf.variable_scope("encdec_attention"):
                    # encoder-decoder attention
                    w_y, y = self._encdec_attention_layers[layer].build(
                        query=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        memory=encdec_attention_values,
                        memory_bias=encdec_attention_bias,
                        cache=encdecatt_cache)
                    # [batch_size, num_heads, length_q, length_k]
                    encdec_attention_scores.append(w_y)
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
                with tf.variable_scope("ffn"):
                    y = transformer_ffn_layer(
                        x=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        filter_size=self.params["num_filter_units"],
                        output_size=self.params["num_hidden_units"],
                        pad_remover=None,
                        dropout_relu_keep_prob=self.params["dropout_relu_keep_prob"])
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
        x = layer_preprocess(
            x=x, process_sequence=self.params["layer_preprocess_sequence"],
            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
        return x, decoder_self_attention_scores, encdec_attention_scores
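
# --- Illustrative sketch (an assumption, not the library's actual API) ---
# The per-layer cache consumed above is looked up as
# cache["decoding_states"]["layer_<i>"]["self_attention" / "encdec_attention"].
# Before incremental decoding it could be initialized with empty key/value
# buffers per layer, e.g. as below; the inner "keys"/"values" layout is a
# guess modeled on common Transformer decoder caches.
import tensorflow as tf

def sketch_init_decoding_states(num_layers, batch_size, d_model):
    return {
        "layer_{}".format(layer): {
            "self_attention": {
                "keys": tf.zeros([batch_size, 0, d_model]),
                "values": tf.zeros([batch_size, 0, d_model])},
            "encdec_attention": {
                "keys": tf.zeros([batch_size, 0, d_model]),
                "values": tf.zeros([batch_size, 0, d_model])}}
        for layer in range(num_layers)}
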
    def _transform(self, decoder_inputs, decoding_params):
        """ Decodes one step

        Args:
            decoder_inputs: The decoder input for this timestep, an
              instance of `tf.Tensor`, [batch_size, timesteps, d_model].
            decoding_params: The same `decoding_params` returned by the
              `prepare()` function.

        Returns: A Tensor, the transformed hidden
          state of TransformerDecoder.
        """
        # [batch_size, max_len_src, dim]
        encdec_attention_values = decoding_params[0]
        # [batch_size, ]
        # encdec_attention_length = decoding_params[1]
        # [batch_size, 1, 1, max_len_src]
        encdec_attention_bias = decoding_params[2]

        # decoder_self_attention_bias: [1, 1, max_len_trg, max_len_trg]
        decoder_self_attention_bias = attention_bias_lower_triangle(
            tf.shape(decoder_inputs)[1])
        x = dropout_wrapper(decoder_inputs, self.params["layer_prepostprocess_dropout_keep_prob"])
        for layer in range(self.params["num_layers"]):
            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    # self attention layer
                    w_y, y = multihead_attention_layer(
                        params=self.params["selfattention.params"],
                        mode=self.mode,
                        query_antecedent=None,
                        memory_antecedent=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        memory_bias=decoder_self_attention_bias)
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
                with tf.variable_scope("encdec_attention"):
                    # encoder-decoder attention
                    w_y, y = multihead_attention_layer(
                        params=self.params["attention.params"],
                        mode=self.mode,
                        query_antecedent=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        memory_antecedent=encdec_attention_values,
                        memory_bias=encdec_attention_bias)
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
                with tf.variable_scope("ffn"):
                    y = transformer_ffn_layer(
                        x=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        filter_size=self.params["num_filter_units"],
                        output_size=self.params["num_hidden_units"],
                        pad_remover=None,
                        dropout_relu_keep_prob=self.params["dropout_relu_keep_prob"])
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
        return layer_preprocess(
            x=x, process_sequence=self.params["layer_preprocess_sequence"],
            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
    def _transform(self, decoder_inputs, cache, pad_remover=None):
        """ Decodes one step

        Args:
            decoder_inputs: The decoder input for this timestep,
              a Tensor with shape [batch_size, timesteps, d_model].
              Note that when mode==INFER, timesteps=1.
            cache: A dict containing decoding states at previous
              timestep, attention values and attention length.
            pad_remover: An expert_utils.PadRemover object tracking the padding
              positions. If provided, the padding is removed before applying
              the convolution, and restored afterward. This can give a significant
              speedup (according to Google's tensor2tensor code).

        Returns: A tuple of the transformed Tensor, the per-layer decoder
          self-attention scores and the per-layer encoder-decoder attention scores.
        """
        # [batch_size, max_len_src, dim]
        encdec_attention_values = cache["memory"]
        # [batch_size, 1, 1, max_len_src]
        encdec_attention_bias = cache["memory_bias"]

        decoder_self_attention_scores = []
        encdec_attention_scores = []

        # decoder_self_attention_bias: [1, 1, max_len_trg, max_len_trg]
        decoder_self_attention_bias = attention_bias_lower_triangle(
            tf.shape(decoder_inputs)[1])
        x = dropout_wrapper(decoder_inputs, self.params["layer_prepostprocess_dropout_keep_prob"])
        for layer in range(self.params["num_layers"]):
            layer_name = "layer_{}".format(layer)
            layer_cache = None if cache["decoding_states"] is None \
                else cache["decoding_states"][layer_name]
            selfatt_cache = None if layer_cache is None \
                else layer_cache["self_attention"]
            encdecatt_cache = None if layer_cache is None \
                else layer_cache["encdec_attention"]
            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    # self attention layer
                    w_y, y = self._self_attention_layers[layer].build(
                        query=None,
                        memory=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        memory_bias=decoder_self_attention_bias,
                        cache=selfatt_cache)
                    # [batch_size, num_heads, length_q, length_k]
                    decoder_self_attention_scores.append(w_y)
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
                with tf.variable_scope("encdec_attention"):
                    # encoder-decoder attention
                    w_y, y = self._encdec_attention_layers[layer].build(
                        query=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        memory=encdec_attention_values,
                        memory_bias=encdec_attention_bias,
                        cache=encdecatt_cache)
                    # [batch_size, num_heads, length_q, length_k]
                    encdec_attention_scores.append(w_y)
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
                with tf.variable_scope("ffn"):
                    y = transformer_ffn_layer(
                        x=layer_preprocess(
                            x=x, process_sequence=self.params["layer_preprocess_sequence"],
                            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                        filter_size=self.params["num_filter_units"],
                        output_size=self.params["num_hidden_units"],
                        pad_remover=pad_remover,
                        dropout_relu_keep_prob=self.params["dropout_relu_keep_prob"])
                    # apply dropout, layer norm, residual
                    x = layer_postprocessing(
                        x=y, previous_x=x,
                        process_sequence=self.params["layer_postprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
        x = layer_preprocess(
            x=x, process_sequence=self.params["layer_preprocess_sequence"],
            dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
        return x, decoder_self_attention_scores, encdec_attention_scores
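
# --- Illustrative usage sketch (hypothetical driver code) ---
# At inference time the cached _transform above is typically called once per
# step with a single-timestep input (timesteps=1) together with the running
# cache holding the encoder memory, memory bias and per-layer decoding states.
# The output projection and the "vocab_size" parameter are assumptions for
# illustration only.
import tensorflow as tf

def sketch_greedy_step(decoder, current_embedding, cache):
    # current_embedding: [batch_size, 1, d_model]
    hidden, _self_att_scores, _encdec_att_scores = decoder._transform(
        current_embedding, cache)
    # hidden: [batch_size, 1, d_model] -> logits over the target vocabulary
    logits = tf.layers.dense(
        hidden, units=decoder.params["vocab_size"], name="softmax_projection")
    next_token = tf.argmax(logits, axis=-1)  # [batch_size, 1]
    return next_token
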