def test_padded_decode(self):
  """Test padded decoding with a pre-allocated cache."""
  num_heads, head_size = 2, 2
  from_seq_length = 4
  batch_size = 3
  # TPU decoding should pre-allocate the entire sequence.
  init_decode_length = from_seq_length

  # Directly tests the keras layer.
  cache = _create_cache(batch_size, init_decode_length, num_heads, head_size)
  layer = attention.CachedAttention(num_heads=num_heads, key_size=head_size)

  # Generate data for the input (non-mask) tensors.
  from_data = tf.zeros((batch_size, from_seq_length, 8), dtype=np.float32)
  decode_loop_step = 2
  mask_data = np.random.randint(
      2,
      size=(batch_size, from_seq_length, from_seq_length),
      dtype=np.int32)
  # Invoke the layer directly, as Keras cannot consume these inputs correctly.
  masked_output_data, cache = layer([from_data, from_data],
                                    mask_data,
                                    cache,
                                    decode_loop_step=decode_loop_step)
  self.assertEqual(masked_output_data.shape, (3, 4, 8))
  self.assertEqual(cache["value"].shape, (3, 4, 2, 2))
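
# The test above exercises the TPU path, where the cache is pre-allocated to
# the full sequence length and each step writes into position
# `decode_loop_step`. A minimal, hedged sketch of that write pattern as a
# standalone demo (illustrative only; the layer's actual cache update is not
# shown in this section): scatter the new entry with a one-hot mask over the
# time axis.
import tensorflow as tf

seq_length, decode_loop_step = 4, 2
cache_value = tf.zeros([1, seq_length, 2, 2])  # [batch, seq, heads, head_size]
new_value = tf.ones([1, 1, 2, 2])              # this step's key/value slice
onehot = tf.reshape(
    tf.one_hot(decode_loop_step, seq_length), [1, seq_length, 1, 1])
cache_value = cache_value * (1.0 - onehot) + new_value * onehot
print(cache_value[0, :, 0, 0].numpy())         # [0. 0. 1. 0.]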
def test_masked_attention(self):
  """Test with a mask tensor."""
  num_heads, head_size = 2, 2
  # Create a 3-dimensional input (the first dimension is implicit).
  from_seq_length = 4
  batch_size = 3
  # GPU/CPU case.
  init_decode_length = 0

  # Directly tests the keras layer.
  cache = _create_cache(batch_size, init_decode_length, num_heads, head_size)
  layer = attention.CachedAttention(num_heads=num_heads, key_size=head_size)

  # Generate data for the input (non-mask) tensors.
  from_data = tf.zeros((batch_size, from_seq_length, 8), dtype=np.float32)
  # Invoke the data with a random set of mask data. This should mask at least
  # one element.
  mask_data = np.random.randint(
      2, size=(batch_size, from_seq_length, from_seq_length))
  masked_output_data, cache = layer([from_data, from_data], mask_data, cache)
  self.assertEqual(masked_output_data.shape, (3, 4, 8))
  self.assertEqual(cache["value"].shape, (3, 4, 2, 2))

  # Tests inputs without cache.
  masked_output_data, cache = layer([from_data, from_data, mask_data])
  self.assertEqual(masked_output_data.shape, (3, 4, 8))
  self.assertIsNone(cache)
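
# Both tests rely on a `_create_cache` helper that is not shown in this
# section. A minimal sketch consistent with the shape assertions above
# (cache["value"].shape == (batch, seq, num_heads, head_size)); the real
# helper may differ in details:
def _create_cache(batch_size, init_decode_length, num_heads, head_size):
  return {
      "key":
          tf.zeros([batch_size, init_decode_length, num_heads, head_size],
                   dtype=tf.float32),
      "value":
          tf.zeros([batch_size, init_decode_length, num_heads, head_size],
                   dtype=tf.float32),
  }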
def build(self, input_shape):
  # Self attention.
  self.self_attention = attention.CachedAttention(
      num_heads=self.num_attention_heads,
      key_size=self.attention_head_size,
      dropout=self.attention_probs_dropout_prob,
      kernel_initializer=self._kernel_initializer,
      name="self_attention")
  self.self_attention_output_dense = dense_einsum.DenseEinsum(
      output_shape=self.hidden_size,
      num_summed_dimensions=2,
      kernel_initializer=self._kernel_initializer,
      bias_initializer=self._bias_initializer,
      name="self_attention_output")
  self.self_attention_dropout = tf.keras.layers.Dropout(
      rate=self.hidden_dropout_prob)
  self.self_attention_layer_norm = tf.keras.layers.LayerNormalization(
      name="self_attention_layer_norm", axis=-1, epsilon=1e-12)
  # Encoder-decoder attention.
  self.encdec_attention = self._cross_attention_cls(
      num_heads=self.num_attention_heads,
      key_size=self.attention_head_size,
      dropout=self.attention_probs_dropout_prob,
      output_shape=self.hidden_size,
      kernel_initializer=self._kernel_initializer,
      name="attention/encdec")
  self.encdec_attention_dropout = tf.keras.layers.Dropout(
      rate=self.hidden_dropout_prob)
  self.encdec_attention_layer_norm = tf.keras.layers.LayerNormalization(
      name="attention/encdec_output_layer_norm", axis=-1, epsilon=1e-12)
  # Feed-forward projection.
  self.intermediate_dense = dense_einsum.DenseEinsum(
      output_shape=self.intermediate_size,
      activation=None,
      kernel_initializer=self._kernel_initializer,
      bias_initializer=self._bias_initializer,
      name="intermediate")
  self.intermediate_activation_layer = tf.keras.layers.Activation(
      self.intermediate_activation)
  self.output_dense = dense_einsum.DenseEinsum(
      output_shape=self.hidden_size,
      kernel_initializer=self._kernel_initializer,
      bias_initializer=self._bias_initializer,
      name="output")
  self.output_dropout = tf.keras.layers.Dropout(rate=self.hidden_dropout_prob)
  self.output_layer_norm = tf.keras.layers.LayerNormalization(
      name="output_layer_norm", axis=-1, epsilon=1e-12)
  super(TransformerDecoderLayer, self).build(input_shape)
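
# A hedged sketch of the sublayer pattern build() sets up, assuming the
# standard BERT-style post-norm residual wiring (the layer's actual call()
# is not shown in this section): sublayer output -> dropout -> residual
# add -> layer norm, repeated for self-attention, cross-attention, and the
# feed-forward projection.
import tensorflow as tf

def _residual_block(x, sublayer_output, dropout, layer_norm):
  # Dropout on the sublayer output, residual connection, then post-norm.
  return layer_norm(x + dropout(sublayer_output))

x = tf.zeros([3, 4, 8])
drop = tf.keras.layers.Dropout(0.1)
norm = tf.keras.layers.LayerNormalization(axis=-1, epsilon=1e-12)
print(_residual_block(x, x, drop, norm).shape)  # (3, 4, 8)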
def build(self, input_shape):
  target_tensor_shape = tf.TensorShape(input_shape[0])
  if len(target_tensor_shape.as_list()) != 3:
    raise ValueError("TransformerLayer expects a three-dimensional input of "
                     "shape [batch, sequence, width].")
  hidden_size = target_tensor_shape[2]
  if hidden_size % self.num_attention_heads != 0:
    raise ValueError(
        "The hidden size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, self.num_attention_heads))
  self.attention_head_size = int(hidden_size / self.num_attention_heads)
  common_kwargs = dict(
      bias_initializer=self._bias_initializer,
      kernel_regularizer=self._kernel_regularizer,
      bias_regularizer=self._bias_regularizer,
      activity_regularizer=self._activity_regularizer,
      kernel_constraint=self._kernel_constraint,
      bias_constraint=self._bias_constraint)
  # Self attention.
  self.self_attention = attention.CachedAttention(
      num_heads=self.num_attention_heads,
      key_dim=self.attention_head_size,
      dropout=self.attention_dropout_rate,
      use_bias=self._use_bias,
      kernel_initializer=self._attention_initializer,
      name="self_attention",
      **common_kwargs)
  self.self_attention_output_dense = tf.keras.layers.experimental.EinsumDense(
      "abc,cd->abd",
      output_shape=(None, hidden_size),
      bias_axes="d",
      kernel_initializer=self._kernel_initializer,
      name="output",
      **common_kwargs)
  self.self_attention_dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
  self.self_attention_layer_norm = tf.keras.layers.LayerNormalization(
      name="self_attention_layer_norm", axis=-1, epsilon=self._norm_epsilon)
  # Encoder-decoder attention.
  self.encdec_attention = self._cross_attention_cls(
      num_heads=self.num_attention_heads,
      key_dim=self.attention_head_size,
      dropout=self.attention_dropout_rate,
      output_shape=hidden_size,
      use_bias=self._use_bias,
      kernel_initializer=self._attention_initializer,
      name="attention/encdec",
      **common_kwargs)
  self.encdec_attention_dropout = tf.keras.layers.Dropout(
      rate=self.dropout_rate)
  self.encdec_attention_layer_norm = tf.keras.layers.LayerNormalization(
      name="attention/encdec_output_layer_norm",
      axis=-1,
      epsilon=self._norm_epsilon)
  # Feed-forward projection.
  self.intermediate_dense = tf.keras.layers.experimental.EinsumDense(
      "abc,cd->abd",
      output_shape=(None, self.intermediate_size),
      bias_axes="d",
      kernel_initializer=self._kernel_initializer,
      name="intermediate",
      **common_kwargs)
  self.intermediate_activation_layer = tf.keras.layers.Activation(
      self.intermediate_activation)
  self._intermediate_dropout_layer = tf.keras.layers.Dropout(
      rate=self._intermediate_dropout)
  self.output_dense = tf.keras.layers.experimental.EinsumDense(
      "abc,cd->abd",
      output_shape=(None, hidden_size),
      bias_axes="d",
      kernel_initializer=self._kernel_initializer,
      name="output",
      **common_kwargs)
  self.output_dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
  self.output_layer_norm = tf.keras.layers.LayerNormalization(
      name="output_layer_norm", axis=-1, epsilon=self._norm_epsilon)
  super().build(input_shape)
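
# What the "abc,cd->abd" EinsumDense layers above compute: the batch (a) and
# sequence (b) dimensions pass through, while the feature dimension (c) is
# contracted against a (c, d) kernel. A standalone demo (the input shape
# matches the tests above; the projection width 16 is arbitrary):
import tensorflow as tf

dense = tf.keras.layers.experimental.EinsumDense(
    "abc,cd->abd", output_shape=(None, 16), bias_axes="d")
x = tf.zeros([3, 4, 8])
print(dense(x).shape)  # (3, 4, 16)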
def build(self, input_shape):
  target_tensor_shape = tf.TensorShape(input_shape[0])
  if len(target_tensor_shape) != 3:
    raise ValueError("TransformerLayer expects a three-dimensional input of "
                     "shape [batch, sequence, width].")
  hidden_size = target_tensor_shape[2]
  if hidden_size % self.num_attention_heads != 0:
    raise ValueError(
        "The hidden size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, self.num_attention_heads))
  self.attention_head_size = int(hidden_size / self.num_attention_heads)
  # Self attention.
  self.self_attention = attention.CachedAttention(
      num_heads=self.num_attention_heads,
      key_size=self.attention_head_size,
      dropout=self.attention_dropout_rate,
      kernel_initializer=self._kernel_initializer,
      bias_initializer=self._bias_initializer,
      kernel_regularizer=self._kernel_regularizer,
      bias_regularizer=self._bias_regularizer,
      activity_regularizer=self._activity_regularizer,
      kernel_constraint=self._kernel_constraint,
      bias_constraint=self._bias_constraint,
      name="self_attention")
  self.self_attention_output_dense = dense_einsum.DenseEinsum(
      output_shape=hidden_size,
      num_summed_dimensions=2,
      kernel_initializer=self._kernel_initializer,
      bias_initializer=self._bias_initializer,
      kernel_regularizer=self._kernel_regularizer,
      bias_regularizer=self._bias_regularizer,
      activity_regularizer=self._activity_regularizer,
      kernel_constraint=self._kernel_constraint,
      bias_constraint=self._bias_constraint,
      name="self_attention_output")
  self.self_attention_dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
  self.self_attention_layer_norm = tf.keras.layers.LayerNormalization(
      name="self_attention_layer_norm", axis=-1, epsilon=1e-12)
  # Encoder-decoder attention.
  self.encdec_attention = self._cross_attention_cls(
      num_heads=self.num_attention_heads,
      key_size=self.attention_head_size,
      dropout=self.attention_dropout_rate,
      output_shape=hidden_size,
      kernel_initializer=self._kernel_initializer,
      bias_initializer=self._bias_initializer,
      kernel_regularizer=self._kernel_regularizer,
      bias_regularizer=self._bias_regularizer,
      activity_regularizer=self._activity_regularizer,
      kernel_constraint=self._kernel_constraint,
      bias_constraint=self._bias_constraint,
      name="attention/encdec")
  self.encdec_attention_dropout = tf.keras.layers.Dropout(
      rate=self.dropout_rate)
  self.encdec_attention_layer_norm = tf.keras.layers.LayerNormalization(
      name="attention/encdec_output_layer_norm", axis=-1, epsilon=1e-12)
  # Feed-forward projection.
  self.intermediate_dense = dense_einsum.DenseEinsum(
      output_shape=self.intermediate_size,
      activation=None,
      kernel_initializer=self._kernel_initializer,
      bias_initializer=self._bias_initializer,
      kernel_regularizer=self._kernel_regularizer,
      bias_regularizer=self._bias_regularizer,
      activity_regularizer=self._activity_regularizer,
      kernel_constraint=self._kernel_constraint,
      bias_constraint=self._bias_constraint,
      name="intermediate")
  self.intermediate_activation_layer = tf.keras.layers.Activation(
      self.intermediate_activation)
  self.output_dense = dense_einsum.DenseEinsum(
      output_shape=hidden_size,
      kernel_initializer=self._kernel_initializer,
      bias_initializer=self._bias_initializer,
      kernel_regularizer=self._kernel_regularizer,
      bias_regularizer=self._bias_regularizer,
      activity_regularizer=self._activity_regularizer,
      kernel_constraint=self._kernel_constraint,
      bias_constraint=self._bias_constraint,
      name="output")
  self.output_dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
  self.output_layer_norm = tf.keras.layers.LayerNormalization(
      name="output_layer_norm", axis=-1, epsilon=1e-12)
  super(TransformerDecoderLayer, self).build(input_shape)
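
# The DenseEinsum layers above come from the model garden's dense_einsum
# module; with num_summed_dimensions=2, the self-attention output projection
# contracts the trailing (heads, head_size) pair back down to hidden_size.
# A hedged stand-in using a raw einsum (the kernel shape here is inferred
# from the test shapes, not taken from dense_einsum itself):
import tensorflow as tf

attention_output = tf.zeros([3, 4, 2, 2])  # [batch, seq, heads, head_size]
kernel = tf.zeros([2, 2, 8])               # [heads, head_size, hidden_size]
projected = tf.einsum("abcd,cde->abe", attention_output, kernel)
print(projected.shape)                     # (3, 4, 8)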