def __init__(self,
             num_heads,
             key_size,
             value_size=None,
             dropout_rate=0.0,
             use_bias=True,
             output_shape=None,
             kernel_initializer="glorot_uniform",
             bias_initializer="zeros",
             kernel_regularizer=None,
             bias_regularizer=None,
             activity_regularizer=None,
             kernel_constraint=None,
             bias_constraint=None,
             **kwargs):
  super(MultiHeadAttention, self).__init__(**kwargs)
  self._num_heads = num_heads
  self._key_size = key_size
  # Value projections default to the same width as the key projections.
  self._value_size = value_size if value_size else key_size
  self._dropout_rate = dropout_rate
  self._use_bias = use_bias
  self._output_shape = output_shape
  self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
  self._bias_initializer = tf.keras.initializers.get(bias_initializer)
  self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
  self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
  # Resolve and store the activity regularizer so the constructor argument is
  # not silently dropped.
  self._activity_regularizer = tf.keras.regularizers.get(
      activity_regularizer)
  self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
  self._bias_constraint = tf.keras.constraints.get(bias_constraint)
  # Expand the attention mask on axis 1 so it broadcasts across heads.
  self._masked_softmax = masked_softmax.MaskedSoftmax(mask_expansion_axes=[1])
  self._dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
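# Usage sketch (added for illustration; not part of the original source). It
# assumes the full MultiHeadAttention class, including build/call methods not
# shown here, is available from this module; the argument values below are
# placeholders only:
#
#   attention_layer = MultiHeadAttention(
#       num_heads=8,
#       key_size=64,
#       dropout_rate=0.1)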
def test_masked_softmax_with_none_mask(self):
  test_layer = masked_softmax.MaskedSoftmax()
  input_tensor = tf.keras.Input(shape=(4, 8))
  output = test_layer([input_tensor, None])
  model = tf.keras.Model(input_tensor, output)

  input_data = 10 * np.random.random_sample((3, 4, 8))
  output_data = model.predict(input_data)
  expected_data = tf.nn.softmax(input_data)
  self.assertAllClose(expected_data, output_data)
def test_masked_softmax(self):
  test_layer = masked_softmax.MaskedSoftmax()
  input_tensor = tf.keras.Input(shape=(4, 8))
  mask_tensor = tf.keras.Input(shape=(4, 8))
  output = test_layer([input_tensor, mask_tensor])
  model = tf.keras.Model([input_tensor, mask_tensor], output)

  input_data = 10 * np.random.random_sample((3, 4, 8))
  mask_data = np.random.randint(2, size=(3, 4, 8))
  output_data = model.predict([input_data, mask_data])
  expected_zeros = np.greater(mask_data, 0)
  is_zeros = np.greater(output_data, 0)
  self.assertAllEqual(expected_zeros, is_zeros)
def test_softmax_with_axes_expansion(self):
  test_layer = masked_softmax.MaskedSoftmax(mask_expansion_axes=[1])
  input_tensor = tf.keras.Input(shape=(4, 8))
  # The mask omits the second-to-last dimension; the layer is expected to
  # expand it on axis 1 before applying it.
  mask_tensor = tf.keras.Input(shape=(8,))
  output = test_layer([input_tensor, mask_tensor])
  model = tf.keras.Model([input_tensor, mask_tensor], output)

  input_data = 10 * np.random.random_sample((3, 4, 8))
  mask_data = np.random.randint(2, size=(3, 8))
  output_data = model.predict([input_data, mask_data])
  expanded_mask = np.expand_dims(mask_data, axis=1) * np.ones_like(input_data)
  expected_zeros = np.greater(expanded_mask, 0)
  is_zeros = np.greater(output_data, 0)
  self.assertAllEqual(expected_zeros, is_zeros)
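# Reference sketch (added for illustration; not part of the original source).
# One plausible way to realize the behavior the three tests above check, using
# plain TF ops; the large negative additive constant and the exact expansion
# logic are assumptions, and the real MaskedSoftmax layer may differ. Assumes
# `import tensorflow as tf`, as in the surrounding modules.
def _masked_softmax_sketch(scores, mask=None, mask_expansion_axes=None):
  if mask is not None:
    mask = tf.cast(mask, scores.dtype)
    if mask_expansion_axes is not None:
      for axis in mask_expansion_axes:
        # Insert singleton axes so the mask broadcasts against `scores`.
        mask = tf.expand_dims(mask, axis=axis)
    # Positions with mask == 0 receive a large negative bias, so their weight
    # after the softmax is numerically zero.
    scores += (1.0 - mask) * -1e9
  return tf.nn.softmax(scores)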
def __init__(self,
             num_heads,
             key_size,
             dropout_rate=0.0,
             output_shape=None,
             kernel_initializer="glorot_uniform",
             bias_initializer="zeros",
             kernel_regularizer=None,
             bias_regularizer=None,
             activity_regularizer=None,
             kernel_constraint=None,
             bias_constraint=None,
             **kwargs):
  super(TalkingHeadsAttention, self).__init__(**kwargs)
  self._num_heads = num_heads
  self._key_size = key_size
  self._dropout_rate = dropout_rate
  self._output_shape = output_shape
  self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
  self._bias_initializer = tf.keras.initializers.get(bias_initializer)
  self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
  self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
  # The projection layers below reference the activity regularizer, so it
  # must be resolved and stored like the other regularizers.
  self._activity_regularizer = tf.keras.regularizers.get(
      activity_regularizer)
  self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
  self._bias_constraint = tf.keras.constraints.get(bias_constraint)
  # Per-head query/key/value projections, each mapping the hidden dimension
  # to (num_heads, key_size).
  self._query_dense = dense_einsum.DenseEinsum(
      output_shape=(self._num_heads, self._key_size),
      kernel_initializer=self._kernel_initializer,
      bias_initializer=self._bias_initializer,
      kernel_regularizer=self._kernel_regularizer,
      bias_regularizer=self._bias_regularizer,
      activity_regularizer=self._activity_regularizer,
      kernel_constraint=self._kernel_constraint,
      bias_constraint=self._bias_constraint,
      name="query")
  self._key_dense = dense_einsum.DenseEinsum(
      output_shape=(self._num_heads, self._key_size),
      kernel_initializer=self._kernel_initializer,
      bias_initializer=self._bias_initializer,
      kernel_regularizer=self._kernel_regularizer,
      bias_regularizer=self._bias_regularizer,
      activity_regularizer=self._activity_regularizer,
      kernel_constraint=self._kernel_constraint,
      bias_constraint=self._bias_constraint,
      name="key")
  self._value_dense = dense_einsum.DenseEinsum(
      output_shape=(self._num_heads, self._key_size),
      kernel_initializer=self._kernel_initializer,
      bias_initializer=self._bias_initializer,
      kernel_regularizer=self._kernel_regularizer,
      bias_regularizer=self._bias_regularizer,
      activity_regularizer=self._activity_regularizer,
      kernel_constraint=self._kernel_constraint,
      bias_constraint=self._bias_constraint,
      name="value")
  self._masked_softmax = masked_softmax.MaskedSoftmax(mask_expansion_axes=[1])
  self._dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
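# Shape sketch (added for illustration; not part of the original source). The
# DenseEinsum projections above are assumed to map [batch, seq_length, hidden]
# inputs to [batch, seq_length, num_heads, key_size]; a raw einsum with an
# assumed kernel shape illustrates the contraction, while the real layer also
# owns the kernel/bias variables and applies the configured initializers,
# regularizers, and constraints. Assumes `import tensorflow as tf`.
def _per_head_projection_sketch(inputs, kernel):
  # inputs: [batch, seq_length, hidden]
  # kernel: [hidden, num_heads, key_size]
  # result: [batch, seq_length, num_heads, key_size]
  return tf.einsum("abc,cde->abde", inputs, kernel)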