def _build_attention(self, qkv_rank):
  """Builds multi-head dot-product attention computations.

  This function builds attributes necessary for `_compute_attention` to
  customize attention computation to replace the default dot-product
  attention.

  Args:
    qkv_rank: the rank of query, key, value tensors.
  """
  if self._attention_axes is None:
    # Default: attend over every axis between batch and (heads, channels).
    self._attention_axes = tuple(range(1, qkv_rank - 2))
  else:
    self._attention_axes = tuple(self._attention_axes)
  # Derive the einsum equations used to score and to combine attention.
  (self._dot_product_equation, self._combine_equation,
   attn_scores_rank) = _build_attention_equation(
       qkv_rank, attn_axes=self._attention_axes)
  # Softmax normalizes over the trailing attention axes of the score tensor.
  norm_axes = tuple(
      range(attn_scores_rank - len(self._attention_axes), attn_scores_rank))
  self._masked_softmax = masked_softmax.MaskedSoftmax(
      mask_expansion_axes=[1], normalization_axes=norm_axes)
  self._dropout_layer = tf.keras.layers.Dropout(rate=self._dropout)
def __init__(self,
             num_heads,
             key_size,
             value_size=None,
             dropout_rate=0.0,
             use_bias=True,
             output_shape=None,
             kernel_initializer="glorot_uniform",
             bias_initializer="zeros",
             kernel_regularizer=None,
             bias_regularizer=None,
             activity_regularizer=None,
             kernel_constraint=None,
             bias_constraint=None,
             **kwargs):
  """Initializes the multi-head attention layer.

  Args:
    num_heads: Number of attention heads.
    key_size: Size of each attention head for query and key.
    value_size: Size of each attention head for value; defaults to `key_size`.
    dropout_rate: Dropout probability applied to the attention scores.
    use_bias: Whether the dense projections use bias terms.
    output_shape: Optional expected shape of the output tensor, besides the
      batch and sequence dims.
    kernel_initializer: Initializer for dense-layer kernels.
    bias_initializer: Initializer for dense-layer biases.
    kernel_regularizer: Regularizer for dense-layer kernels.
    bias_regularizer: Regularizer for dense-layer biases.
    activity_regularizer: Regularizer for dense-layer activity.
    kernel_constraint: Constraint for dense-layer kernels.
    bias_constraint: Constraint for dense-layer biases.
    **kwargs: Forwarded to the base `tf.keras.layers.Layer`.
  """
  super(MultiHeadAttention, self).__init__(**kwargs)
  self._num_heads = num_heads
  self._key_size = key_size
  self._value_size = value_size if value_size else key_size
  self._dropout_rate = dropout_rate
  self._use_bias = use_bias
  self._output_shape = output_shape
  self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
  self._bias_initializer = tf.keras.initializers.get(bias_initializer)
  self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
  self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
  # Bug fix: `activity_regularizer` was accepted but silently dropped —
  # neither stored nor forwarded to the base class. Store it so sub-layers
  # and `get_config` can use it, matching the other regularizers.
  self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
  self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
  self._bias_constraint = tf.keras.constraints.get(bias_constraint)
  self._masked_softmax = masked_softmax.MaskedSoftmax(mask_expansion_axes=[1])
  self._dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
def __init__(self,
             num_heads,
             head_size,
             dropout_rate=0.0,
             kernel_initializer="glorot_uniform",
             bias_initializer="zeros",
             kernel_regularizer=None,
             bias_regularizer=None,
             activity_regularizer=None,
             kernel_constraint=None,
             bias_constraint=None,
             **kwargs):
  """Initializes the multi-head attention layer and its projections.

  Args:
    num_heads: Number of attention heads.
    head_size: Size of each attention head.
    dropout_rate: Dropout probability applied to the attention scores.
    kernel_initializer: Initializer for dense-layer kernels.
    bias_initializer: Initializer for dense-layer biases.
    kernel_regularizer: Regularizer for dense-layer kernels.
    bias_regularizer: Regularizer for dense-layer biases.
    activity_regularizer: Regularizer for dense-layer activity.
    kernel_constraint: Constraint for dense-layer kernels.
    bias_constraint: Constraint for dense-layer biases.
    **kwargs: Forwarded to the base `tf.keras.layers.Layer`.
  """
  super(MultiHeadAttention, self).__init__(**kwargs)
  self._num_heads = num_heads
  self._head_size = head_size
  self._dropout_rate = dropout_rate
  self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
  self._bias_initializer = tf.keras.initializers.get(bias_initializer)
  self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
  self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
  # Bug fix: `self._activity_regularizer` was read by the DenseEinsum calls
  # below but never assigned, which raised AttributeError on construction.
  self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
  self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
  self._bias_constraint = tf.keras.constraints.get(bias_constraint)

  # The query/key/value projections share everything except their names.
  common_kwargs = dict(
      output_shape=(self._num_heads, self._head_size),
      kernel_initializer=self._kernel_initializer,
      bias_initializer=self._bias_initializer,
      kernel_regularizer=self._kernel_regularizer,
      bias_regularizer=self._bias_regularizer,
      activity_regularizer=self._activity_regularizer,
      kernel_constraint=self._kernel_constraint,
      bias_constraint=self._bias_constraint)
  self._query_dense = dense_einsum.DenseEinsum(name="query", **common_kwargs)
  self._key_dense = dense_einsum.DenseEinsum(name="key", **common_kwargs)
  self._value_dense = dense_einsum.DenseEinsum(name="value", **common_kwargs)
  self._masked_softmax = masked_softmax.MaskedSoftmax(mask_expansion_axes=[1])
  self._dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
def test_serialize_deserialize(self):
  """Round-trips a MaskedSoftmax layer through get_config/from_config."""
  original = masked_softmax.MaskedSoftmax(
      mask_expansion_axes=[1], normalization_axes=[6, 7])
  restored = masked_softmax.MaskedSoftmax.from_config(original.get_config())
  # A successful round-trip reproduces the configuration exactly.
  self.assertAllEqual(original.get_config(), restored.get_config())
def test_masked_softmax_with_none_mask(self):
  """With a None mask the layer should behave as a plain softmax."""
  layer = masked_softmax.MaskedSoftmax()
  scores_in = tf.keras.Input(shape=(4, 8))
  model = tf.keras.Model(scores_in, layer(scores_in, None))
  scores = 10 * np.random.random_sample((3, 4, 8))
  result = model.predict(scores)
  self.assertAllClose(tf.nn.softmax(scores), result)
def test_masked_softmax(self):
  """Masked positions come out zero; unmasked positions come out positive."""
  layer = masked_softmax.MaskedSoftmax()
  scores_in = tf.keras.Input(shape=(4, 8))
  mask_in = tf.keras.Input(shape=(4, 8))
  model = tf.keras.Model([scores_in, mask_in], layer(scores_in, mask_in))
  scores = 10 * np.random.random_sample((3, 4, 8))
  mask = np.random.randint(2, size=(3, 4, 8))
  probs = model.predict([scores, mask])
  # Output is strictly positive exactly where the mask is nonzero.
  self.assertAllEqual(np.greater(mask, 0), np.greater(probs, 0))
def test_softmax_with_axes_expansion(self):
  """Verifies a rank-deficient mask is broadcast over the expansion axis.

  A (batch, 8) mask is expanded at axis 1 to cover the (batch, 4, 8) scores;
  the output must be positive exactly where the expanded mask is nonzero.
  """
  test_layer = masked_softmax.MaskedSoftmax(mask_expansion_axes=[1])
  input_tensor = tf.keras.Input(shape=(4, 8))
  # Bug fix: `shape=(8)` is the int 8, not a tuple — Keras `Input` expects an
  # iterable of dims, so use the 1-tuple `(8,)`.
  mask_tensor = tf.keras.Input(shape=(8,))
  output = test_layer(input_tensor, mask_tensor)
  model = tf.keras.Model([input_tensor, mask_tensor], output)

  input_data = 10 * np.random.random_sample((3, 4, 8))
  mask_data = np.random.randint(2, size=(3, 8))
  output_data = model.predict([input_data, mask_data])
  # Mirror the layer's expansion in numpy to build the expected zero pattern.
  expanded_mask = np.expand_dims(mask_data, axis=1) * np.ones_like(input_data)
  expected_zeros = np.greater(expanded_mask, 0)
  is_zeros = np.greater(output_data, 0)
  self.assertAllEqual(expected_zeros, is_zeros)
def test_masked_softmax_high_dims(self):
  """Checks masking on high-rank inputs with a low-rank mask."""
  layer = masked_softmax.MaskedSoftmax(
      mask_expansion_axes=[1], normalization_axes=[6, 7])
  input_shape = [2, 3, 4, 5, 6, 7, 8]
  mask_shape = [5, 6, 7, 8]
  scores_in = tf.keras.Input(shape=input_shape)
  mask_in = tf.keras.Input(shape=mask_shape)
  model = tf.keras.Model([scores_in, mask_in], layer(scores_in, mask_in))

  scores = 10 * np.random.random_sample([3] + input_shape)
  mask = np.random.randint(2, size=[3] + mask_shape)
  probs = model.predict([scores, mask])
  # Expand the mask three times at axis 1 so it lines up with the scores,
  # then broadcast it to the full score shape.
  expanded = mask
  for _ in range(3):
    expanded = np.expand_dims(expanded, axis=1)
  expanded = expanded * np.ones_like(scores)
  self.assertAllEqual(np.greater(expanded, 0), np.greater(probs, 0))
def _build_attention(self, rank):
  """Builds attention machinery, then installs an axis-2 mask expansion."""
  # Parent sets up the default attention attributes first.
  super()._build_attention(rank)  # pytype: disable=attribute-error  # typed-keras
  # Override the default softmax so the mask is expanded at axis 2 instead.
  self._masked_softmax = masked_softmax.MaskedSoftmax(mask_expansion_axes=[2])
def _build_attention(self, rank):
  """Creates the masked-softmax and dropout sublayers used for attention."""
  # Mask is expanded at axis 1; normalization runs over axis 2.
  self._masked_softmax = masked_softmax.MaskedSoftmax(
      mask_expansion_axes=[1], normalization_axes=[2])
  self._dropout_layer = tf.keras.layers.Dropout(rate=self._dropout)
def build_attention(self, rank):
  """Builds base attention, then swaps in an axis-2 mask expansion."""
  # Reuse the parent's setup before replacing the softmax.
  super(MultiChannelAttention, self).build_attention(rank)
  self._masked_softmax = masked_softmax.MaskedSoftmax(mask_expansion_axes=[2])
def build(self, input_shape):
  """Builds the parent layer, then overrides the masked softmax."""
  # Parent build first, so all default sublayers exist before the override.
  super(MultiChannelAttention, self).build(input_shape)
  self._masked_softmax = masked_softmax.MaskedSoftmax(mask_expansion_axes=[2])