def _transform(self, inputs, encoder_self_attention_bias, **kwargs):
    """ Encodes the inputs.

    Args:
      inputs: A Tensor, [batch_size, timesteps, d_model].
      encoder_self_attention_bias: A Tensor, FLOAT_MIN for padding,
        0 for non-padding, [batch_size, 1, 1, timesteps].
      **kwargs: Unused keyword arguments.

    Returns:
      A tuple `(outputs, encoder_self_attention_scores)`, where `outputs`
        is the transformed hidden state of TransformerEncoder,
        [batch_size, timesteps, d_model], and
        `encoder_self_attention_scores` is a list with one self-attention
        score Tensor per layer.
    """
    input_padding = attention_bias_to_padding(encoder_self_attention_bias)
    pad_remover = PadRemover(input_padding)
    x = dropout_wrapper(inputs,
                        self.params["layer_prepostprocess_dropout_keep_prob"])
    encoder_self_attention_scores = []
    for layer in range(self.params["num_layers"]):
        with tf.variable_scope("layer_%d" % layer):
            with tf.variable_scope("self_attention"):
                # self attention layer
                w_y, y = self._self_attention_layers[layer].build(
                    query=None,
                    memory=layer_preprocess(
                        x=x,
                        process_sequence=self.params["layer_preprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                    memory_bias=encoder_self_attention_bias)
                encoder_self_attention_scores.append(w_y)
                # apply dropout, layer norm, residual
                x = layer_postprocessing(
                    x=y,
                    previous_x=x,
                    process_sequence=self.params["layer_postprocess_sequence"],
                    dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
            with tf.variable_scope("ffn"):
                y = transformer_ffn_layer(
                    x=layer_preprocess(
                        x=x,
                        process_sequence=self.params["layer_preprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                    filter_size=self.params["num_filter_units"],
                    output_size=self.params["num_hidden_units"],
                    pad_remover=pad_remover,
                    dropout_relu_keep_prob=self.params["dropout_relu_keep_prob"])
                # apply dropout, layer norm, residual
                x = layer_postprocessing(
                    x=y,
                    previous_x=x,
                    process_sequence=self.params["layer_postprocess_sequence"],
                    dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
    x = layer_preprocess(
        x=x,
        process_sequence=self.params["layer_preprocess_sequence"],
        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
    return x, encoder_self_attention_scores

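# --- Illustrative sketch (not the codebase's implementation) ---
# The encoder above assumes `encoder_self_attention_bias` is a
# [batch_size, 1, 1, timesteps] tensor holding a large negative value
# ("FLOAT_MIN") at padding positions and 0 elsewhere, and that
# `attention_bias_to_padding` inverts it into a 0/1 padding mask for
# `PadRemover`. The helpers below are hypothetical stand-ins showing one
# way such tensors could be built from sequence lengths; they assume
# `import tensorflow as tf` as in the surrounding module.
def example_attention_bias_from_lengths(sequence_lengths, max_time, neg_inf=-1e9):
    # 1.0 at real tokens, 0.0 at padding positions.
    non_padding = tf.cast(tf.sequence_mask(sequence_lengths, max_time), tf.float32)
    # Large negative bias at padding, 0 elsewhere; the shape broadcasts over
    # the head and query dimensions of the attention logits.
    bias = neg_inf * (1.0 - non_padding)
    return tf.reshape(bias, [-1, 1, 1, max_time])

def example_bias_to_padding(attention_bias):
    # Inverse of the construction above: 1.0 where the bias marks padding.
    return tf.squeeze(
        tf.cast(tf.less(attention_bias, -1.0), tf.float32), axis=[1, 2])
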
def _transform(self, decoder_inputs, cache):
    """ Decodes one step.

    Args:
      decoder_inputs: The decoder input for this timestep, a Tensor with
        shape [batch_size, timesteps, d_model]. Note that when
        mode==INFER, timesteps=1.
      cache: A dict containing the decoding states at the previous
        timestep, the attention values and the attention length.

    Returns:
      A tuple `(outputs, decoder_self_attention_scores,
        encdec_attention_scores)`, where `outputs` is the transformed
        hidden state, [batch_size, timesteps, d_model], and the other two
        are lists with one attention score Tensor per layer.
    """
    # [batch_size, max_len_src, dim]
    encdec_attention_values = cache["memory"]
    # [batch_size, 1, 1, max_len_src]
    encdec_attention_bias = cache["memory_bias"]
    decoder_self_attention_scores = []
    encdec_attention_scores = []
    # decoder_self_attention_bias: [1, 1, max_len_trg, max_len_trg]
    decoder_self_attention_bias = attention_bias_lower_triangle(
        tf.shape(decoder_inputs)[1])
    x = dropout_wrapper(decoder_inputs,
                        self.params["layer_prepostprocess_dropout_keep_prob"])
    for layer in range(self.params["num_layers"]):
        layer_name = "layer_{}".format(layer)
        layer_cache = None if cache["decoding_states"] is None \
            else cache["decoding_states"][layer_name]
        selfatt_cache = None if layer_cache is None \
            else layer_cache["self_attention"]
        encdecatt_cache = None if layer_cache is None \
            else layer_cache["encdec_attention"]
        with tf.variable_scope("layer_%d" % layer):
            with tf.variable_scope("self_attention"):
                # self attention layer
                w_y, y = self._self_attention_layers[layer].build(
                    query=None,
                    memory=layer_preprocess(
                        x=x,
                        process_sequence=self.params["layer_preprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                    memory_bias=decoder_self_attention_bias,
                    cache=selfatt_cache)
                # [batch_size, num_heads, length_q, length_k]
                decoder_self_attention_scores.append(w_y)
                # apply dropout, layer norm, residual
                x = layer_postprocessing(
                    x=y,
                    previous_x=x,
                    process_sequence=self.params["layer_postprocess_sequence"],
                    dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
            with tf.variable_scope("encdec_attention"):
                # encoder-decoder attention
                w_y, y = self._encdec_attention_layers[layer].build(
                    query=layer_preprocess(
                        x=x,
                        process_sequence=self.params["layer_preprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                    memory=encdec_attention_values,
                    memory_bias=encdec_attention_bias,
                    cache=encdecatt_cache)
                # [batch_size, num_heads, length_q, length_k]
                encdec_attention_scores.append(w_y)
                # apply dropout, layer norm, residual
                x = layer_postprocessing(
                    x=y,
                    previous_x=x,
                    process_sequence=self.params["layer_postprocess_sequence"],
                    dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
            with tf.variable_scope("ffn"):
                y = transformer_ffn_layer(
                    x=layer_preprocess(
                        x=x,
                        process_sequence=self.params["layer_preprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                    filter_size=self.params["num_filter_units"],
                    output_size=self.params["num_hidden_units"],
                    pad_remover=None,
                    dropout_relu_keep_prob=self.params["dropout_relu_keep_prob"])
                # apply dropout, layer norm, residual
                x = layer_postprocessing(
                    x=y,
                    previous_x=x,
                    process_sequence=self.params["layer_postprocess_sequence"],
                    dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
    x = layer_preprocess(
        x=x,
        process_sequence=self.params["layer_preprocess_sequence"],
        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
    return x, decoder_self_attention_scores, encdec_attention_scores

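# --- Illustrative sketch (not the codebase's attention_bias_lower_triangle) ---
# All decoder variants below rely on a causal ("lower triangle") bias of
# shape [1, 1, length, length]: query position i receives a large negative
# bias for every key position j > i, so it can only attend to itself and
# earlier positions. The helper below is a hypothetical stand-in that shows
# the idea.
def example_lower_triangle_bias(length, neg_inf=-1e9):
    # Row i has ones in columns 0..i (lower-triangular, diagonal included).
    lower_triangle = tf.cast(
        tf.sequence_mask(tf.range(1, length + 1), length), tf.float32)
    bias = neg_inf * (1.0 - lower_triangle)
    return tf.reshape(bias, [1, 1, length, length])
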
def _transform(self, decoder_inputs, decoding_params):
    """ Decodes one step.

    Args:
      decoder_inputs: The decoder input for this timestep, an instance of
        `tf.Tensor`, [batch_size, timesteps, d_model].
      decoding_params: The same as the `decoding_params` returned by the
        `prepare()` function.

    Returns:
      A Tensor, the transformed hidden state of TransformerDecoder.
    """
    # [batch_size, max_len_src, dim]
    encdec_attention_values = decoding_params[0]
    # [batch_size, ]
    # encdec_attention_length = decoding_params[1]
    # [batch_size, 1, 1, max_len_src]
    encdec_attention_bias = decoding_params[2]
    # decoder_self_attention_bias: [1, 1, max_len_trg, max_len_trg]
    decoder_self_attention_bias = attention_bias_lower_triangle(
        tf.shape(decoder_inputs)[1])
    x = dropout_wrapper(decoder_inputs,
                        self.params["layer_prepostprocess_dropout_keep_prob"])
    for layer in range(self.params["num_layers"]):
        with tf.variable_scope("layer_%d" % layer):
            with tf.variable_scope("self_attention"):
                # self attention layer
                w_y, y = multihead_attention_layer(
                    params=self.params["selfattention.params"],
                    mode=self.mode,
                    query_antecedent=None,
                    memory_antecedent=layer_preprocess(
                        x=x,
                        process_sequence=self.params["layer_preprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                    memory_bias=decoder_self_attention_bias)
                # apply dropout, layer norm, residual
                x = layer_postprocessing(
                    x=y,
                    previous_x=x,
                    process_sequence=self.params["layer_postprocess_sequence"],
                    dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
            with tf.variable_scope("encdec_attention"):
                # encoder-decoder attention
                w_y, y = multihead_attention_layer(
                    params=self.params["attention.params"],
                    mode=self.mode,
                    query_antecedent=layer_preprocess(
                        x=x,
                        process_sequence=self.params["layer_preprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                    memory_antecedent=encdec_attention_values,
                    memory_bias=encdec_attention_bias)
                # apply dropout, layer norm, residual
                x = layer_postprocessing(
                    x=y,
                    previous_x=x,
                    process_sequence=self.params["layer_postprocess_sequence"],
                    dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
            with tf.variable_scope("ffn"):
                y = transformer_ffn_layer(
                    x=layer_preprocess(
                        x=x,
                        process_sequence=self.params["layer_preprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                    filter_size=self.params["num_filter_units"],
                    output_size=self.params["num_hidden_units"],
                    pad_remover=None,
                    dropout_relu_keep_prob=self.params["dropout_relu_keep_prob"])
                # apply dropout, layer norm, residual
                x = layer_postprocessing(
                    x=y,
                    previous_x=x,
                    process_sequence=self.params["layer_postprocess_sequence"],
                    dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
    return layer_preprocess(
        x=x,
        process_sequence=self.params["layer_preprocess_sequence"],
        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])

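# --- Illustrative sketch (assumed semantics, not the codebase's code) ---
# Every sub-layer above is wrapped by layer_preprocess/layer_postprocessing,
# driven by the "layer_preprocess_sequence"/"layer_postprocess_sequence"
# params. A common tensor2tensor-style convention, assumed here and possibly
# differing from this codebase's actual implementation, is that each
# character in the sequence selects one step: "n" = layer norm,
# "d" = dropout, "a" = add the residual (`previous_x`), so a typical
# setting is preprocess="n" and postprocess="da". The helper below is a
# hypothetical stand-in for that convention (TF 1.x API).
def example_layer_prepostprocess(x, previous_x, sequence, dropout_keep_prob):
    for step in sequence:
        if step == "n":
            x = tf.contrib.layers.layer_norm(x)            # layer normalization
        elif step == "d":
            x = tf.nn.dropout(x, keep_prob=dropout_keep_prob)
        elif step == "a":
            x = x + previous_x                             # residual connection
    return x
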
def _transform(self, decoder_inputs, cache, pad_remover=None):
    """ Decodes one step.

    Args:
      decoder_inputs: The decoder input for this timestep, a Tensor with
        shape [batch_size, timesteps, d_model]. Note that when
        mode==INFER, timesteps=1.
      cache: A dict containing the decoding states at the previous
        timestep, the attention values and the attention length.
      pad_remover: An expert_utils.PadRemover object tracking the padding
        positions. If provided, the padding is removed before applying
        the convolution and restored afterward. This can give a
        significant speedup (according to Google's tensor2tensor code).

    Returns:
      A tuple `(outputs, decoder_self_attention_scores,
        encdec_attention_scores)`, where `outputs` is the transformed
        hidden state, [batch_size, timesteps, d_model], and the other two
        are lists with one attention score Tensor per layer.
    """
    # [batch_size, max_len_src, dim]
    encdec_attention_values = cache["memory"]
    # [batch_size, 1, 1, max_len_src]
    encdec_attention_bias = cache["memory_bias"]
    decoder_self_attention_scores = []
    encdec_attention_scores = []
    # decoder_self_attention_bias: [1, 1, max_len_trg, max_len_trg]
    decoder_self_attention_bias = attention_bias_lower_triangle(
        tf.shape(decoder_inputs)[1])
    x = dropout_wrapper(decoder_inputs,
                        self.params["layer_prepostprocess_dropout_keep_prob"])
    for layer in range(self.params["num_layers"]):
        layer_name = "layer_{}".format(layer)
        layer_cache = None if cache["decoding_states"] is None \
            else cache["decoding_states"][layer_name]
        selfatt_cache = None if layer_cache is None \
            else layer_cache["self_attention"]
        encdecatt_cache = None if layer_cache is None \
            else layer_cache["encdec_attention"]
        with tf.variable_scope("layer_%d" % layer):
            with tf.variable_scope("self_attention"):
                # self attention layer
                w_y, y = self._self_attention_layers[layer].build(
                    query=None,
                    memory=layer_preprocess(
                        x=x,
                        process_sequence=self.params["layer_preprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                    memory_bias=decoder_self_attention_bias,
                    cache=selfatt_cache)
                # [batch_size, num_heads, length_q, length_k]
                decoder_self_attention_scores.append(w_y)
                # apply dropout, layer norm, residual
                x = layer_postprocessing(
                    x=y,
                    previous_x=x,
                    process_sequence=self.params["layer_postprocess_sequence"],
                    dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
            with tf.variable_scope("encdec_attention"):
                # encoder-decoder attention
                w_y, y = self._encdec_attention_layers[layer].build(
                    query=layer_preprocess(
                        x=x,
                        process_sequence=self.params["layer_preprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                    memory=encdec_attention_values,
                    memory_bias=encdec_attention_bias,
                    cache=encdecatt_cache)
                # [batch_size, num_heads, length_q, length_k]
                encdec_attention_scores.append(w_y)
                # apply dropout, layer norm, residual
                x = layer_postprocessing(
                    x=y,
                    previous_x=x,
                    process_sequence=self.params["layer_postprocess_sequence"],
                    dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
            with tf.variable_scope("ffn"):
                y = transformer_ffn_layer(
                    x=layer_preprocess(
                        x=x,
                        process_sequence=self.params["layer_preprocess_sequence"],
                        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"]),
                    filter_size=self.params["num_filter_units"],
                    output_size=self.params["num_hidden_units"],
                    pad_remover=pad_remover,
                    dropout_relu_keep_prob=self.params["dropout_relu_keep_prob"])
                # apply dropout, layer norm, residual
                x = layer_postprocessing(
                    x=y,
                    previous_x=x,
                    process_sequence=self.params["layer_postprocess_sequence"],
                    dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
    x = layer_preprocess(
        x=x,
        process_sequence=self.params["layer_preprocess_sequence"],
        dropout_keep_prob=self.params["layer_prepostprocess_dropout_keep_prob"])
    return x, decoder_self_attention_scores, encdec_attention_scores

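# --- Illustrative sketch (assumed layout, not the codebase's prepare() code) ---
# The cache-based decoders above only read the following structure:
# "memory"/"memory_bias" hold the encoder outputs and their attention bias,
# and "decoding_states" holds one sub-dict per layer with separate entries
# for the self-attention and encoder-decoder attention caches. The exact
# contents of each per-layer entry (e.g. accumulated keys/values) are an
# assumption here and depend on the attention layers' build() method.
def example_init_cache(memory, memory_bias, num_layers):
    return {
        "memory": memory,            # [batch_size, max_len_src, dim]
        "memory_bias": memory_bias,  # [batch_size, 1, 1, max_len_src]
        "decoding_states": {
            "layer_{}".format(layer): {
                "self_attention": {},    # filled with keys/values as decoding proceeds
                "encdec_attention": {},  # e.g. pre-projected memory keys/values
            } for layer in range(num_layers)
        },
    }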