def _transform(self, inputs, encoder_self_attention_bias, **kwargs):
    """ Runs the inputs through every encoder layer.

    Each layer is a self attention sub-layer followed by a feed-forward
    sub-layer; the input of every sub-layer goes through
    `layer_preprocess` and its output through `layer_postprocessing`.

    Args:
        inputs: A Tensor, [batch_size, timesteps, d_model]
        encoder_self_attention_bias: A Tensor, FLOAT_MIN
          for padding, 0 for non-padding, [batch_size, 1, 1, timesteps].
        **kwargs: Not used by this method.

    Returns: A tuple `(x, encoder_self_attention_scores)`: `x` is the
      transformed hidden state of TransformerEncoder,
      [batch_size, timesteps, d_model]; the second element is a list
      with, per layer, the first value returned by that layer's
      self attention `build()`.
    """

    def _pre(tensor):
        # Params are looked up lazily, at call time, on purpose.
        return layer_preprocess(
            x=tensor,
            process_sequence=self.params["layer_preprocess_sequence"],
            dropout_keep_prob=self.params[
                "layer_prepostprocess_dropout_keep_prob"])

    def _post(sublayer_output, residual):
        # apply dropout, layer norm, residual
        return layer_postprocessing(
            x=sublayer_output,
            previous_x=residual,
            process_sequence=self.params["layer_postprocess_sequence"],
            dropout_keep_prob=self.params[
                "layer_prepostprocess_dropout_keep_prob"])

    padding = attention_bias_to_padding(encoder_self_attention_bias)
    ffn_pad_remover = PadRemover(padding)
    hidden = dropout_wrapper(
        inputs, self.params["layer_prepostprocess_dropout_keep_prob"])
    all_scores = []
    # NOTE(review): scope nesting was reconstructed from a flattened
    # source; confirm post-processing belongs inside each sub-layer scope.
    for layer_idx in range(self.params["num_layers"]):
        with tf.variable_scope("layer_%d" % layer_idx):
            with tf.variable_scope("self_attention"):
                # self attention layer
                scores, attended = self._self_attention_layers[layer_idx].build(
                    query=None,
                    memory=_pre(hidden),
                    memory_bias=encoder_self_attention_bias)
                all_scores.append(scores)
                hidden = _post(attended, hidden)
            with tf.variable_scope("ffn"):
                ffn_output = transformer_ffn_layer(
                    x=_pre(hidden),
                    filter_size=self.params["num_filter_units"],
                    output_size=self.params["num_hidden_units"],
                    pad_remover=ffn_pad_remover,
                    dropout_relu_keep_prob=self.params[
                        "dropout_relu_keep_prob"])
                hidden = _post(ffn_output, hidden)
    # The stack output is pre-processed one final time before returning.
    hidden = _pre(hidden)
    return hidden, all_scores
def att_fn(self, query, keys, bias=None):
    """ Computes attention scores of one query over a set of keys.

    The query is broadcast over the keys axis, added to the keys,
    squashed with tanh, weighted by the `v_att` variable and summed over
    the channel axis. The resulting logits (plus `bias`, if given) are
    normalized with `advanced_softmax` and passed through dropout.

    Args:
        query: Attention query tensor with shape
          [batch_size, channels_query]
        keys: Attention keys tensor with shape
          [batch_size, num_of_keys, channels_key]
        bias: The bias tensor for attention keys; added to the
          logits when it is not None.

    Returns: A Tensor, [batch_size, num_of_keys]
    """
    projection = tf.get_variable(
        "v_att", shape=[self.params["num_units"]], dtype=tf.float32)
    # [batch_size, 1, channels_query] so it broadcasts against the keys.
    query_per_key = tf.expand_dims(query, 1)
    activations = tf.tanh(keys + query_per_key)
    logits = tf.reduce_sum(projection * activations, [2])
    if bias is not None:
        logits += bias
    return dropout_wrapper(
        advanced_softmax(logits),
        self.params["dropout_attention_keep_prob"])
def dot_product_attention(q, k, bias=None, dropout_keep_prob=1.0):
    """ Computes attention weights from queries and keys.

    Args:
        q: A query Tensor with shape [..., length_q, depth].
        k: A keys Tensor with shape [..., length_k, depth].
        bias: An optional bias Tensor added to the logits, so it must be
          broadcastable to [..., length_q, length_k].
        dropout_keep_prob: A float scalar, forwarded to
          `dropout_wrapper` as `keep_prob`.

    Returns: The attention scores Tensor with shape
      [..., length_q, length_k].
    """
    with tf.variable_scope("dot_product_attention", values=[q, k]):
        # q . k^T -> [..., length_q, length_k]
        raw_logits = tf.matmul(q, k, transpose_b=True)
        logits = raw_logits if bias is None else raw_logits + bias
        # dropout the attention links for each of the heads
        return dropout_wrapper(
            advanced_softmax(logits), keep_prob=dropout_keep_prob)
def _dot_product_attention(self, q, k, bias):
    """ Computes attention weights from queries and keys.

    Args:
        q: A query Tensor with shape
          [batch_size, num_heads, length_q, depth / num_heads].
        k: A keys Tensor with shape
          [batch_size, num_heads, length_k, depth / num_heads].
        bias: A bias Tensor added to the logits (skipped when None), so
          it must be broadcastable to
          [batch_size, num_heads, length_q, length_k].

    Returns: The attention scores Tensor with shape
      [batch_size, num_heads, length_q, length_k].
    """
    with tf.variable_scope("dot_product_attention", values=[q, k]):
        # q . k^T -> [batch_size, num_heads, length_q, length_k]
        raw_logits = tf.matmul(q, k, transpose_b=True)
        logits = raw_logits if bias is None else raw_logits + bias
        normalized = algebra_ops.advanced_softmax(logits)
        # dropout the attention links for each of the heads
        return dropout_wrapper(
            normalized, keep_prob=self._dropout_attention_keep_prob)
def att_fn(self, query, keys, bias=None):
    """ Computes attention scores of one query over a set of keys.

    logits = sum over channels of `v_att * tanh(keys + query)`, with the
    query expanded along the keys axis. `bias` is added to the logits
    when given; the result goes through `advanced_softmax` and dropout.

    Args:
        query: Attention query tensor with shape
          [batch_size, channels_query]
        keys: Attention keys tensor with shape
          [batch_size, num_of_keys, channels_key]
        bias: The bias tensor for attention keys (optional).

    Returns: A Tensor, [batch_size, num_of_keys]
    """
    weight_vector = tf.get_variable(
        "v_att", shape=[self.params["num_units"]], dtype=tf.float32)
    # Insert a keys axis into the query so it broadcasts over all keys.
    hidden = tf.tanh(keys + tf.expand_dims(query, 1))
    # Reduce the channel axis (axis 2) to get one logit per key.
    logits = tf.reduce_sum(weight_vector * hidden, [2])
    if bias is not None:
        logits += bias
    scores = advanced_softmax(logits)
    return dropout_wrapper(
        scores, self.params["dropout_attention_keep_prob"])
def _transform(self, decoder_inputs, cache):
    """ Decodes one step.

    Every layer runs self attention, encoder-decoder attention and a
    feed-forward sub-layer; the input of each sub-layer goes through
    `layer_preprocess` and its output through `layer_postprocessing`.

    Args:
        decoder_inputs: The decoder input for this timestep,
          A Tensor, with shape [batch_size, timesteps, dmodel].
          Note that when mode==INFER, timesteps=1.
        cache: A dict containing decoding states at previous
          timestep, attention values and attention length. This method
          reads the keys "memory", "memory_bias" and "decoding_states".

    Returns: A tuple `(x, decoder_self_attention_scores,
      encdec_attention_scores)`: the transformed Tensor and two lists
      holding, per layer, the first value returned by the self attention
      and the encoder-decoder attention `build()` calls.
    """

    def _pre(tensor):
        # Params are looked up lazily, at call time, on purpose.
        return layer_preprocess(
            x=tensor,
            process_sequence=self.params["layer_preprocess_sequence"],
            dropout_keep_prob=self.params[
                "layer_prepostprocess_dropout_keep_prob"])

    def _post(sublayer_output, residual):
        # apply dropout, layer norm, residual
        return layer_postprocessing(
            x=sublayer_output,
            previous_x=residual,
            process_sequence=self.params["layer_postprocess_sequence"],
            dropout_keep_prob=self.params[
                "layer_prepostprocess_dropout_keep_prob"])

    # [batch_size, max_len_src, dim]
    memory = cache["memory"]
    # [batch_size, 1, 1, max_len_src]
    memory_bias = cache["memory_bias"]
    self_scores = []
    encdec_scores = []
    # causal_bias: [1, 1, max_len_trg, max_len_trg]
    causal_bias = attention_bias_lower_triangle(
        tf.shape(decoder_inputs)[1])
    hidden = dropout_wrapper(
        decoder_inputs,
        self.params["layer_prepostprocess_dropout_keep_prob"])
    # NOTE(review): scope nesting was reconstructed from a flattened
    # source; confirm post-processing belongs inside each sub-layer scope.
    for layer_idx in range(self.params["num_layers"]):
        layer_name = "layer_{}".format(layer_idx)
        # Resolve the per-layer caches; everything is None without states.
        decoding_states = cache["decoding_states"]
        layer_cache = None
        if decoding_states is not None:
            layer_cache = decoding_states[layer_name]
        if layer_cache is None:
            selfatt_cache = None
            encdecatt_cache = None
        else:
            selfatt_cache = layer_cache["self_attention"]
            encdecatt_cache = layer_cache["encdec_attention"]
        with tf.variable_scope("layer_%d" % layer_idx):
            with tf.variable_scope("self_attention"):
                # self attention layer
                scores, attended = self._self_attention_layers[layer_idx].build(
                    query=None,
                    memory=_pre(hidden),
                    memory_bias=causal_bias,
                    cache=selfatt_cache)
                # [batch_size, num_heads, length_q, length_k]
                self_scores.append(scores)
                hidden = _post(attended, hidden)
            with tf.variable_scope("encdec_attention"):
                # encoder-decoder attention
                scores, attended = self._encdec_attention_layers[layer_idx].build(
                    query=_pre(hidden),
                    memory=memory,
                    memory_bias=memory_bias,
                    cache=encdecatt_cache)
                # [batch_size, num_heads, length_q, length_k]
                encdec_scores.append(scores)
                hidden = _post(attended, hidden)
            with tf.variable_scope("ffn"):
                ffn_output = transformer_ffn_layer(
                    x=_pre(hidden),
                    filter_size=self.params["num_filter_units"],
                    output_size=self.params["num_hidden_units"],
                    pad_remover=None,
                    dropout_relu_keep_prob=self.params[
                        "dropout_relu_keep_prob"])
                hidden = _post(ffn_output, hidden)
    # The stack output is pre-processed one final time before returning.
    hidden = _pre(hidden)
    return hidden, self_scores, encdec_scores
def _transform(self, decoder_inputs, decoding_params):
    """ Decodes one step.

    Every layer runs self attention, encoder-decoder attention and a
    feed-forward sub-layer; the input of each sub-layer goes through
    `layer_preprocess` and its output through `layer_postprocessing`.

    Args:
        decoder_inputs: The decoder input for this timestep, an
          instance of `tf.Tensor`, [batch_size, timesteps, dmodel].
        decoding_params: The same as `decoding_params` returned
          from `prepare()` function. Element 0 is used as the attention
          values and element 2 as the attention bias; element 1 (the
          attention length, [batch_size, ]) is not read here.

    Returns: A Tensor, the transformed hidden state of
      TransformerDecoder.
    """

    def _pre(tensor):
        # Params are looked up lazily, at call time, on purpose.
        return layer_preprocess(
            x=tensor,
            process_sequence=self.params["layer_preprocess_sequence"],
            dropout_keep_prob=self.params[
                "layer_prepostprocess_dropout_keep_prob"])

    def _post(sublayer_output, residual):
        # apply dropout, layer norm, residual
        return layer_postprocessing(
            x=sublayer_output,
            previous_x=residual,
            process_sequence=self.params["layer_postprocess_sequence"],
            dropout_keep_prob=self.params[
                "layer_prepostprocess_dropout_keep_prob"])

    # [batch_size, max_len_src, dim]
    memory = decoding_params[0]
    # [batch_size, 1, 1, max_len_src]
    memory_bias = decoding_params[2]
    # causal_bias: [1, 1, max_len_trg, max_len_trg]
    causal_bias = attention_bias_lower_triangle(
        tf.shape(decoder_inputs)[1])
    hidden = dropout_wrapper(
        decoder_inputs,
        self.params["layer_prepostprocess_dropout_keep_prob"])
    # NOTE(review): scope nesting was reconstructed from a flattened
    # source; confirm post-processing belongs inside each sub-layer scope.
    for layer_idx in range(self.params["num_layers"]):
        with tf.variable_scope("layer_%d" % layer_idx):
            with tf.variable_scope("self_attention"):
                # self attention layer; the attention weights are unused
                _, attended = multihead_attention_layer(
                    params=self.params["selfattention.params"],
                    mode=self.mode,
                    query_antecedent=None,
                    memory_antecedent=_pre(hidden),
                    memory_bias=causal_bias)
                hidden = _post(attended, hidden)
            with tf.variable_scope("encdec_attention"):
                # encoder-decoder attention; weights are unused as well
                _, attended = multihead_attention_layer(
                    params=self.params["attention.params"],
                    mode=self.mode,
                    query_antecedent=_pre(hidden),
                    memory_antecedent=memory,
                    memory_bias=memory_bias)
                hidden = _post(attended, hidden)
            with tf.variable_scope("ffn"):
                ffn_output = transformer_ffn_layer(
                    x=_pre(hidden),
                    filter_size=self.params["num_filter_units"],
                    output_size=self.params["num_hidden_units"],
                    pad_remover=None,
                    dropout_relu_keep_prob=self.params[
                        "dropout_relu_keep_prob"])
                hidden = _post(ffn_output, hidden)
    # The stack output is pre-processed one final time before returning.
    return _pre(hidden)
def _transform(self, decoder_inputs, cache, pad_remover=None):
    """ Decodes one step.

    Every layer runs self attention, encoder-decoder attention and a
    feed-forward sub-layer; the input of each sub-layer goes through
    `layer_preprocess` and its output through `layer_postprocessing`.

    Args:
        decoder_inputs: The decoder input for this timestep,
          A Tensor, with shape [batch_size, timesteps, dmodel].
          Note that when mode==INFER, timesteps=1.
        cache: A dict containing decoding states at previous
          timestep, attention values and attention length. This method
          reads the keys "memory", "memory_bias" and "decoding_states".
        pad_remover: An expert_utils.PadRemover object tracking the
          padding positions. If provided, the padding is removed before
          applying the convolution, and restored afterward. This can give
          a significant speedup (says Google's tensor2tensor code).
          It is handed unchanged to `transformer_ffn_layer`.

    Returns: A tuple `(x, decoder_self_attention_scores,
      encdec_attention_scores)`: the transformed Tensor and two lists
      holding, per layer, the first value returned by the self attention
      and the encoder-decoder attention `build()` calls.
    """

    def _pre(tensor):
        # Params are looked up lazily, at call time, on purpose.
        return layer_preprocess(
            x=tensor,
            process_sequence=self.params["layer_preprocess_sequence"],
            dropout_keep_prob=self.params[
                "layer_prepostprocess_dropout_keep_prob"])

    def _post(sublayer_output, residual):
        # apply dropout, layer norm, residual
        return layer_postprocessing(
            x=sublayer_output,
            previous_x=residual,
            process_sequence=self.params["layer_postprocess_sequence"],
            dropout_keep_prob=self.params[
                "layer_prepostprocess_dropout_keep_prob"])

    # [batch_size, max_len_src, dim]
    memory = cache["memory"]
    # [batch_size, 1, 1, max_len_src]
    memory_bias = cache["memory_bias"]
    self_scores = []
    encdec_scores = []
    # causal_bias: [1, 1, max_len_trg, max_len_trg]
    causal_bias = attention_bias_lower_triangle(
        tf.shape(decoder_inputs)[1])
    hidden = dropout_wrapper(
        decoder_inputs,
        self.params["layer_prepostprocess_dropout_keep_prob"])
    # NOTE(review): scope nesting was reconstructed from a flattened
    # source; confirm post-processing belongs inside each sub-layer scope.
    for layer_idx in range(self.params["num_layers"]):
        layer_name = "layer_{}".format(layer_idx)
        # Resolve the per-layer caches; everything is None without states.
        decoding_states = cache["decoding_states"]
        layer_cache = None
        if decoding_states is not None:
            layer_cache = decoding_states[layer_name]
        if layer_cache is None:
            selfatt_cache = None
            encdecatt_cache = None
        else:
            selfatt_cache = layer_cache["self_attention"]
            encdecatt_cache = layer_cache["encdec_attention"]
        with tf.variable_scope("layer_%d" % layer_idx):
            with tf.variable_scope("self_attention"):
                # self attention layer
                scores, attended = self._self_attention_layers[layer_idx].build(
                    query=None,
                    memory=_pre(hidden),
                    memory_bias=causal_bias,
                    cache=selfatt_cache)
                # [batch_size, num_heads, length_q, length_k]
                self_scores.append(scores)
                hidden = _post(attended, hidden)
            with tf.variable_scope("encdec_attention"):
                # encoder-decoder attention
                scores, attended = self._encdec_attention_layers[layer_idx].build(
                    query=_pre(hidden),
                    memory=memory,
                    memory_bias=memory_bias,
                    cache=encdecatt_cache)
                # [batch_size, num_heads, length_q, length_k]
                encdec_scores.append(scores)
                hidden = _post(attended, hidden)
            with tf.variable_scope("ffn"):
                ffn_output = transformer_ffn_layer(
                    x=_pre(hidden),
                    filter_size=self.params["num_filter_units"],
                    output_size=self.params["num_hidden_units"],
                    pad_remover=pad_remover,
                    dropout_relu_keep_prob=self.params[
                        "dropout_relu_keep_prob"])
                hidden = _post(ffn_output, hidden)
    # The stack output is pre-processed one final time before returning.
    hidden = _pre(hidden)
    return hidden, self_scores, encdec_scores