def test_high_dim_attention(self, q_dims, v_dims, mask_dims, attention_axes):
  """Test with a mask tensor over high-dimensional inputs."""
  test_layer = attention.MultiHeadAttention(
      num_heads=2, key_size=2, attention_axes=attention_axes)
  batch_size, hidden_size = 3, 8

  # Generate data for the input (non-mask) tensors.
  query_shape = [batch_size] + q_dims + [hidden_size]
  value_shape = [batch_size] + v_dims + [hidden_size]
  mask_shape = [batch_size] + mask_dims
  query = 10 * np.random.random_sample(query_shape)
  value = 10 * np.random.random_sample(value_shape)

  # Invoke the layer with a random set of mask data. This should mask at
  # least one element.
  mask_data = np.random.randint(2, size=mask_shape).astype("bool")
  output = test_layer(query=query, value=value, attention_mask=mask_data)

  # Invoke the same data, but with a null mask (where no elements are
  # masked).
  null_mask_data = np.ones(mask_shape)
  unmasked_output = test_layer(
      query=query, value=value, attention_mask=null_mask_data)

  # Because one output is masked and one is not, they should not be the same.
  self.assertNotAllClose(output, unmasked_output)
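# A minimal standalone sketch of the multi-axis attention the parameterized
# test above exercises. The dims here are hypothetical stand-ins for the
# test's (elided) q_dims/v_dims/attention_axes parameters, and it uses the
# upstream tf.keras.layers.MultiHeadAttention, whose `key_dim` corresponds
# to `key_size` above.
import numpy as np
import tensorflow as tf

layer = tf.keras.layers.MultiHeadAttention(
    num_heads=2, key_dim=2, attention_axes=(1, 2))
query = np.random.random_sample((3, 3, 4, 8)).astype("float32")
value = np.random.random_sample((3, 3, 2, 8)).astype("float32")
output = layer(query=query, value=value)
print(output.shape)  # (3, 3, 4, 8): attention runs jointly over axes 1 and 2.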
def test_non_masked_self_attention(self):
  """Test with one input (self-attention) and no mask tensor."""
  test_layer = attention.MultiHeadAttention(num_heads=12, key_size=64)
  # Create a 3-dimensional input (the first dimension is implicit).
  query = tf.keras.Input(shape=(40, 80))
  output = test_layer([query, query])
  self.assertEqual(output.shape.as_list(), [None, 40, 80])
def test_masked_attention(self):
  """Test with a mask tensor."""
  test_layer = attention.MultiHeadAttention(num_heads=2, head_size=2)
  # Create a 3-dimensional input (the first dimension is implicit).
  from_tensor = tf.keras.Input(shape=(4, 8))
  to_tensor = tf.keras.Input(shape=(2, 8))
  mask_tensor = tf.keras.Input(shape=(4, 2))
  output = test_layer([from_tensor, to_tensor, mask_tensor])

  # Create a model containing the test layer.
  model = tf.keras.Model([from_tensor, to_tensor, mask_tensor], output)

  # Generate data for the input (non-mask) tensors.
  from_data = 10 * np.random.random_sample((3, 4, 8))
  to_data = 10 * np.random.random_sample((3, 2, 8))

  # Invoke the model with a random set of mask data. This should mask at
  # least one element.
  mask_data = np.random.randint(2, size=(3, 4, 2))
  masked_output_data = model.predict([from_data, to_data, mask_data])

  # Invoke the same data, but with a null mask (where no elements are
  # masked).
  null_mask_data = np.ones((3, 4, 2))
  unmasked_output_data = model.predict([from_data, to_data, null_mask_data])

  # Because one output is masked and one is not, they should not be the same.
  self.assertNotAllClose(masked_output_data, unmasked_output_data)
def test_non_masked_attention(self):
  """Test that the attention layer can be created without a mask tensor."""
  test_layer = attention.MultiHeadAttention(num_heads=12, head_size=64)
  # Create a 3-dimensional input (the first dimension is implicit).
  from_tensor = tf.keras.Input(shape=(40, 80))
  to_tensor = tf.keras.Input(shape=(20, 80))
  output = test_layer([from_tensor, to_tensor])
  self.assertEqual(output.shape.as_list(), [None, 40, 12, 64])
def test_masked_attention(self, use_bias):
  """Test with a mask tensor."""
  test_layer = attention.MultiHeadAttention(
      num_heads=2, key_size=2, use_bias=use_bias)
  # Create a 3-dimensional input (the first dimension is implicit).
  batch_size = 3
  query = tf.keras.Input(shape=(4, 8))
  value = tf.keras.Input(shape=(2, 8))
  mask_tensor = tf.keras.Input(shape=(4, 2))
  output = test_layer(query=query, value=value, attention_mask=mask_tensor)

  # Create a model containing the test layer.
  model = tf.keras.Model([query, value, mask_tensor], output)

  # Generate data for the input (non-mask) tensors.
  from_data = 10 * np.random.random_sample((batch_size, 4, 8))
  to_data = 10 * np.random.random_sample((batch_size, 2, 8))

  # Invoke the model with a random set of mask data. This should mask at
  # least one element.
  mask_data = np.random.randint(2, size=(batch_size, 4, 2))
  masked_output_data = model.predict([from_data, to_data, mask_data])

  # Invoke the same data, but with a null mask (where no elements are
  # masked).
  null_mask_data = np.ones((batch_size, 4, 2))
  unmasked_output_data = model.predict([from_data, to_data, null_mask_data])

  # Because one output is masked and one is not, they should not be the same.
  self.assertNotAllClose(masked_output_data, unmasked_output_data)

  # Test the layer with three inputs: Q, K, V.
  key = tf.keras.Input(shape=(2, 8))
  output = test_layer(query, value=value, key=key, attention_mask=mask_tensor)
  model = tf.keras.Model([query, value, key, mask_tensor], output)

  masked_output_data = model.predict([from_data, to_data, to_data, mask_data])
  unmasked_output_data = model.predict(
      [from_data, to_data, to_data, null_mask_data])
  # Because one output is masked and one is not, they should not be the same.
  self.assertNotAllClose(masked_output_data, unmasked_output_data)

  if use_bias:
    self.assertLen(test_layer._query_dense.trainable_variables, 2)
    self.assertLen(test_layer._output_dense.trainable_variables, 2)
  else:
    self.assertLen(test_layer._query_dense.trainable_variables, 1)
    self.assertLen(test_layer._output_dense.trainable_variables, 1)
def test_attention_scores(self):
  """Test attention outputs with coefficients."""
  test_layer = attention.MultiHeadAttention(
      num_heads=12, key_size=64, return_attention_scores=True)
  # Create a 3-dimensional input (the first dimension is implicit).
  query = tf.keras.Input(shape=(40, 80))
  output, coef = test_layer([query, query])
  self.assertEqual(output.shape.as_list(), [None, 40, 80])
  self.assertEqual(coef.shape.as_list(), [None, 12, 40, 40])
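# For reference, the coefficient shape asserted above follows the pattern
# [batch, num_heads, query_length, key_length]. A quick numeric check with
# the upstream Keras layer (where `key_dim` corresponds to `key_size`, and
# return_attention_scores is a call argument rather than a constructor one):
import numpy as np
import tensorflow as tf

layer = tf.keras.layers.MultiHeadAttention(num_heads=12, key_dim=64)
x = np.random.random_sample((3, 40, 80)).astype("float32")
out, scores = layer(x, x, return_attention_scores=True)
print(out.shape, scores.shape)  # (3, 40, 80) (3, 12, 40, 40)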
def test_initializer(self):
  """Test with a specified initializer."""
  test_layer = attention.MultiHeadAttention(
      num_heads=12,
      key_size=64,
      kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
  # Create a 3-dimensional input (the first dimension is implicit).
  query = tf.keras.Input(shape=(40, 80))
  output = test_layer([query, query])
  self.assertEqual(output.shape.as_list(), [None, 40, 80])
def test_non_masked_attention(self, value_size, output_shape, output_dims):
  """Test that the attention layer can be created without a mask tensor."""
  test_layer = attention.MultiHeadAttention(
      num_heads=12,
      key_size=64,
      value_size=value_size,
      output_shape=output_shape)
  # Create a 3-dimensional input (the first dimension is implicit).
  query = tf.keras.Input(shape=(40, 80))
  value = tf.keras.Input(shape=(20, 80))
  output = test_layer([query, value])
  self.assertEqual(output.shape.as_list(), [None] + output_dims)
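# A small sketch of how `output_shape` drives the asserted output dims, using
# hypothetical parameter values (the real value_size/output_shape/output_dims
# sets come from the test's elided parameterization) and the upstream Keras
# layer, where `value_dim`/`output_shape` correspond to `value_size`/
# `output_shape` above:
import numpy as np
import tensorflow as tf

layer = tf.keras.layers.MultiHeadAttention(
    num_heads=12, key_dim=64, output_shape=40)
query = np.random.random_sample((3, 40, 80)).astype("float32")
value = np.random.random_sample((3, 20, 80)).astype("float32")
print(layer(query, value).shape)  # (3, 40, 40): last dim follows output_shape.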
def build(self, input_shape):
  input_tensor = input_shape[0] if len(input_shape) == 2 else input_shape
  input_tensor_shape = tf.TensorShape(input_tensor)
  if len(input_tensor_shape) != 3:
    raise ValueError("TransformerLayer expects a three-dimensional input of "
                     "shape [batch, sequence, width].")
  batch_size, sequence_length, hidden_size = input_tensor_shape

  if len(input_shape) == 2:
    mask_tensor_shape = tf.TensorShape(input_shape[1])
    expected_mask_tensor_shape = tf.TensorShape(
        [batch_size, sequence_length, sequence_length])
    if not expected_mask_tensor_shape.is_compatible_with(mask_tensor_shape):
      raise ValueError("When passing a mask tensor to TransformerLayer, the "
                       "mask tensor must be of shape [batch, "
                       "sequence_length, sequence_length] (here %s). Got a "
                       "mask tensor of shape %s." %
                       (expected_mask_tensor_shape, mask_tensor_shape))

  if hidden_size % self._num_heads != 0:
    raise ValueError(
        "The input size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, self._num_heads))
  self._attention_head_size = int(hidden_size // self._num_heads)
  common_kwargs = dict(
      kernel_initializer=self._kernel_initializer,
      bias_initializer=self._bias_initializer,
      kernel_regularizer=self._kernel_regularizer,
      bias_regularizer=self._bias_regularizer,
      activity_regularizer=self._activity_regularizer,
      kernel_constraint=self._kernel_constraint,
      bias_constraint=self._bias_constraint)
  self._attention_layer = attention.MultiHeadAttention(
      num_heads=self._num_heads,
      key_size=self._attention_head_size,
      dropout=self._attention_dropout_rate,
      name="self_attention",
      **common_kwargs)
  # pylint: disable=protected-access
  # Temporary handling for checkpoint-compatible changes.
  self._attention_layer._build_from_signature(
      query=input_tensor_shape, value=input_tensor_shape)
  self._attention_output_dense = self._attention_layer._output_dense
def build(self, input_shape):
  input_tensor = input_shape[0] if len(input_shape) == 2 else input_shape
  input_tensor_shape = tf.TensorShape(input_tensor)
  if len(input_tensor_shape) != 3:
    raise ValueError("TransformerLayer expects a three-dimensional input of "
                     "shape [batch, sequence, width].")
  batch_size, sequence_length, hidden_size = input_tensor_shape

  if len(input_shape) == 2:
    mask_tensor_shape = tf.TensorShape(input_shape[1])
    expected_mask_tensor_shape = tf.TensorShape(
        [batch_size, sequence_length, sequence_length])
    if not expected_mask_tensor_shape.is_compatible_with(mask_tensor_shape):
      raise ValueError("When passing a mask tensor to TransformerLayer, the "
                       "mask tensor must be of shape [batch, "
                       "sequence_length, sequence_length] (here %s). Got a "
                       "mask tensor of shape %s." %
                       (expected_mask_tensor_shape, mask_tensor_shape))

  if hidden_size % self._num_heads != 0:
    raise ValueError(
        "The input size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, self._num_heads))
  self._attention_head_size = int(hidden_size // self._num_heads)

  self._attention_layer = attention.MultiHeadAttention(
      num_heads=self._num_heads,
      key_size=self._attention_head_size,
      dropout=self._attention_dropout_rate,
      kernel_initializer=self._kernel_initializer,
      bias_initializer=self._bias_initializer,
      kernel_regularizer=self._kernel_regularizer,
      bias_regularizer=self._bias_regularizer,
      activity_regularizer=self._activity_regularizer,
      kernel_constraint=self._kernel_constraint,
      bias_constraint=self._bias_constraint,
      name="self_attention")
  self._attention_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
  if self._use_layer_norm:
    # Use float32 in layernorm for numeric stability. It is probably safe in
    # mixed_float16, but we haven't validated this yet.
    self._attention_layer_norm = tf.keras.layers.LayerNormalization(
        name="self_attention_layer_norm",
        axis=-1,
        epsilon=1e-12,
        dtype=tf.float32)
  self._intermediate_dense = dense_einsum.DenseEinsum(
      output_shape=self._intermediate_size,
      activation=None,
      kernel_initializer=self._kernel_initializer,
      bias_initializer=self._bias_initializer,
      kernel_regularizer=self._kernel_regularizer,
      bias_regularizer=self._bias_regularizer,
      activity_regularizer=self._activity_regularizer,
      kernel_constraint=self._kernel_constraint,
      bias_constraint=self._bias_constraint,
      name="intermediate")
  self._intermediate_activation_layer = tf.keras.layers.Activation(
      self._intermediate_activation)
  self._output_dense = dense_einsum.DenseEinsum(
      output_shape=hidden_size,
      kernel_initializer=self._kernel_initializer,
      bias_initializer=self._bias_initializer,
      kernel_regularizer=self._kernel_regularizer,
      bias_regularizer=self._bias_regularizer,
      activity_regularizer=self._activity_regularizer,
      kernel_constraint=self._kernel_constraint,
      bias_constraint=self._bias_constraint,
      name="output")
  self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
  if self._use_layer_norm:
    # Use float32 in layernorm for numeric stability.
    self._output_layer_norm = tf.keras.layers.LayerNormalization(
        name="output_layer_norm", axis=-1, epsilon=1e-12, dtype=tf.float32)
  self._rezero_a = self.add_weight(
      name="rezero_alpha",
      initializer=tf.keras.initializers.Zeros(),
      trainable=True,
      dtype=tf.float32)

  super(ReZeroTransformer, self).build(input_shape)
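# The call path is not shown in this snippet, but the `rezero_alpha` weight
# created above implies the ReZero residual pattern: each sublayer output is
# scaled by a learned scalar, initialized to zero, before the residual add.
# A minimal sketch, with `attention_output` as a hypothetical name:
#
#   attention_output = self._attention_dropout(attention_output)
#   attention_output = input_tensor + self._rezero_a * attention_output
#
# Because alpha starts at zero, the block is the identity function at
# initialization, which is why the layer norms can be made optional here
# (self._use_layer_norm).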
def build(self, input_shape):
  input_tensor = input_shape[0] if len(input_shape) == 2 else input_shape
  input_tensor_shape = tf.TensorShape(input_tensor)
  if len(input_tensor_shape) != 3:
    raise ValueError("TransformerLayer expects a three-dimensional input of "
                     "shape [batch, sequence, width].")
  batch_size, sequence_length, hidden_size = input_tensor_shape

  if len(input_shape) == 2:
    mask_tensor_shape = tf.TensorShape(input_shape[1])
    expected_mask_tensor_shape = tf.TensorShape(
        [batch_size, sequence_length, sequence_length])
    if not expected_mask_tensor_shape.is_compatible_with(mask_tensor_shape):
      raise ValueError("When passing a mask tensor to TransformerLayer, the "
                       "mask tensor must be of shape [batch, "
                       "sequence_length, sequence_length] (here %s). Got a "
                       "mask tensor of shape %s." %
                       (expected_mask_tensor_shape, mask_tensor_shape))

  if hidden_size % self._num_heads != 0:
    raise ValueError(
        "The input size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, self._num_heads))
  self._attention_head_size = int(hidden_size // self._num_heads)
  common_kwargs = dict(
      bias_initializer=self._bias_initializer,
      kernel_regularizer=self._kernel_regularizer,
      bias_regularizer=self._bias_regularizer,
      activity_regularizer=self._activity_regularizer,
      kernel_constraint=self._kernel_constraint,
      bias_constraint=self._bias_constraint)
  self._attention_layer = attention.MultiHeadAttention(
      num_heads=self._num_heads,
      key_size=self._attention_head_size,
      dropout=self._attention_dropout_rate,
      use_bias=self._use_bias,
      kernel_initializer=self._attention_initializer,
      name="self_attention",
      **common_kwargs)
  self._attention_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
  # Use float32 in layernorm for numeric stability.
  # It is probably safe in mixed_float16, but we haven't validated this yet.
  self._attention_layer_norm = tf.keras.layers.LayerNormalization(
      name="self_attention_layer_norm",
      axis=-1,
      epsilon=self._norm_epsilon,
      dtype=tf.float32)
  self._intermediate_dense = tf.keras.layers.experimental.EinsumDense(
      "abc,cd->abd",
      output_shape=(None, self._intermediate_size),
      bias_axes="d",
      kernel_initializer=self._kernel_initializer,
      name="intermediate",
      **common_kwargs)
  policy = tf.keras.mixed_precision.experimental.global_policy()
  if policy.name == "mixed_bfloat16":
    # bfloat16 causes BERT with the LAMB optimizer to not converge as well,
    # so we use float32.
    # TODO(b/154538392): Investigate this.
    policy = tf.float32
  self._intermediate_activation_layer = tf.keras.layers.Activation(
      self._intermediate_activation, dtype=policy)
  self._intermediate_dropout_layer = tf.keras.layers.Dropout(
      rate=self._intermediate_dropout)
  self._output_dense = tf.keras.layers.experimental.EinsumDense(
      "abc,cd->abd",
      output_shape=(None, hidden_size),
      bias_axes="d",
      name="output",
      kernel_initializer=self._kernel_initializer,
      **common_kwargs)
  self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
  # Use float32 in layernorm for numeric stability.
  self._output_layer_norm = tf.keras.layers.LayerNormalization(
      name="output_layer_norm",
      axis=-1,
      epsilon=self._norm_epsilon,
      dtype=tf.float32)

  super(Transformer, self).build(input_shape)
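# For reference, the einsum equation "abc,cd->abd" shared by both EinsumDense
# layers above is just a per-position dense projection over the last axis:
import numpy as np

inputs = np.ones((3, 5, 8))  # "abc": [batch, sequence, hidden]
kernel = np.ones((8, 16))    # "cd":  [hidden, units]
outputs = np.einsum("abc,cd->abd", inputs, kernel)
print(outputs.shape)  # (3, 5, 16), same as applying Dense(16) per position.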
def build(self, input_shape):
  # Input + mask.
  input_tensor = input_shape[0] if len(input_shape) == 2 else input_shape
  input_tensor_shape = tf.TensorShape(input_tensor)
  if len(input_tensor_shape) != 3:
    raise ValueError("TransformerLayer expects a three-dimensional input of "
                     "shape [batch, sequence, width].")
  batch_size, sequence_length, hidden_size = input_tensor_shape

  if len(input_shape) == 2:
    mask_tensor_shape = tf.TensorShape(input_shape[1])
    expected_mask_tensor_shape = tf.TensorShape(
        [batch_size, sequence_length, sequence_length])
    if not expected_mask_tensor_shape.is_compatible_with(mask_tensor_shape):
      raise ValueError("When passing a mask tensor to TransformerLayer, the "
                       "mask tensor must be of shape [batch, "
                       "sequence_length, sequence_length] (here %s). Got a "
                       "mask tensor of shape %s." %
                       (expected_mask_tensor_shape, mask_tensor_shape))

  # Multi-head attention: the hidden size must split evenly across heads.
  if hidden_size % self._num_heads != 0:
    raise ValueError(
        "The input size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, self._num_heads))
  self._attention_head_size = int(hidden_size // self._num_heads)

  # The attention layer.
  self._attention_layer = attention.MultiHeadAttention(
      num_heads=self._num_heads,
      key_size=self._attention_head_size,
      dropout=self._attention_dropout_rate,
      kernel_initializer=self._kernel_initializer,
      bias_initializer=self._bias_initializer,
      kernel_regularizer=self._kernel_regularizer,
      bias_regularizer=self._bias_regularizer,
      activity_regularizer=self._activity_regularizer,
      kernel_constraint=self._kernel_constraint,
      bias_constraint=self._bias_constraint,
      name="self_attention")
  # pylint: disable=protected-access
  # Build the attention layer by hand so its input shapes are fixed.
  self._attention_layer.build([input_tensor_shape] * 3)
  self._attention_output_dense = self._attention_layer._output_dense
  # pylint: enable=protected-access
  self._attention_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
  # Use float32 in layernorm for numeric stability.
  # It is probably safe in mixed_float16, but we haven't validated this yet.
  self._attention_layer_norm = tf.keras.layers.LayerNormalization(
      name="self_attention_layer_norm",
      axis=-1,
      epsilon=1e-12,
      dtype=tf.float32)

  # The intermediate (feed-forward) layer.
  self._intermediate_dense = dense_einsum.DenseEinsum(
      output_shape=self._intermediate_size,
      activation=None,
      kernel_initializer=self._kernel_initializer,
      bias_initializer=self._bias_initializer,
      kernel_regularizer=self._kernel_regularizer,
      bias_regularizer=self._bias_regularizer,
      activity_regularizer=self._activity_regularizer,
      kernel_constraint=self._kernel_constraint,
      bias_constraint=self._bias_constraint,
      name="intermediate")
  policy = tf.keras.mixed_precision.experimental.global_policy()
  if policy.name == "mixed_bfloat16":
    # bfloat16 causes BERT with the LAMB optimizer to not converge as well,
    # so we use float32.
    # TODO(b/154538392): Investigate this.
    policy = tf.float32
  # Activation for the intermediate layer.
  self._intermediate_activation_layer = tf.keras.layers.Activation(
      self._intermediate_activation, dtype=policy)

  # The output projection layer.
  self._output_dense = dense_einsum.DenseEinsum(
      output_shape=hidden_size,
      kernel_initializer=self._kernel_initializer,
      bias_initializer=self._bias_initializer,
      kernel_regularizer=self._kernel_regularizer,
      bias_regularizer=self._bias_regularizer,
      activity_regularizer=self._activity_regularizer,
      kernel_constraint=self._kernel_constraint,
      bias_constraint=self._bias_constraint,
      name="output")
  self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
  # Use float32 in layernorm for numeric stability.
  self._output_layer_norm = tf.keras.layers.LayerNormalization(
      name="output_layer_norm", axis=-1, epsilon=1e-12, dtype=tf.float32)

  super(Transformer, self).build(input_shape)
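# A minimal usage sketch for a build() with this [input, mask] signature. The
# constructor arguments are hypothetical (they are not shown in the snippets
# above), but the input shapes follow the checks in build():
#
#   layer = Transformer(...)                 # constructor args elided here
#   data = tf.keras.Input(shape=(128, 512))  # [batch, sequence, width]
#   mask = tf.keras.Input(shape=(128, 128))  # [batch, sequence, sequence]
#   output = layer([data, mask])             # same shape as `data`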