def __init__(self, num_heads, num_units, dropout=0.1, return_attention=False, **kwargs):
    """Initializes this layer.

    Args:
      num_heads: The number of attention heads.
      num_units: The number of hidden units.
      dropout: The probability to drop units from the inputs.
      return_attention: If ``True``, also return the attention weights of the
        first head.
      kwargs: Additional layer arguments.
    """
    super(MultiHeadAttention, self).__init__(**kwargs)
    if num_units % num_heads != 0:
        raise ValueError(
            "Multi head attention requires that num_units is a"
            " multiple of %s" % num_heads)
    self.num_heads = num_heads
    self.num_units = num_units
    self.linear_queries = common.Dense(num_units)
    self.linear_keys = common.Dense(num_units)
    self.linear_values = common.Dense(num_units)
    self.linear_output = common.Dense(num_units)
    self.dropout = dropout
    self.return_attention = return_attention
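
# Hedged usage sketch (constructor parameters taken from above): num_units must
# divide evenly across the heads, otherwise the constructor raises ValueError.
attention = MultiHeadAttention(num_heads=8, num_units=512)  # 512 / 8 = 64 units per head
# MultiHeadAttention(num_heads=8, num_units=500)  # ValueError: 500 is not a multiple of 8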
def __init__(self,
             num_heads,
             num_units,
             dropout=0.1,
             return_attention=False,
             maximum_relative_position=None,
             **kwargs):
    """Initializes this layer.

    Args:
      num_heads: The number of attention heads.
      num_units: The number of hidden units.
      dropout: The probability to drop units from the inputs.
      return_attention: If ``True``, also return the attention weights of the
        first head.
      maximum_relative_position: Maximum relative position representation
        (from https://arxiv.org/abs/1803.02155).
      kwargs: Additional layer arguments.
    """
    super(MultiHeadAttention, self).__init__(**kwargs)
    if num_units % num_heads != 0:
        raise ValueError(
            "Multi head attention requires that num_units is a"
            " multiple of %s" % num_heads)
    self.num_heads = num_heads
    self.num_units_per_head = num_units // num_heads
    self.linear_queries = common.Dense(num_units)
    self.linear_keys = common.Dense(num_units)
    self.linear_values = common.Dense(num_units)
    self.linear_output = common.Dense(num_units)
    self.dropout = dropout
    self.return_attention = return_attention
    self.maximum_relative_position = maximum_relative_position
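
# Hedged sketch: the same constructor with relative position representations
# (https://arxiv.org/abs/1803.02155) clipped at a maximum distance of 20.
relative_attention = MultiHeadAttention(
    num_heads=8, num_units=512, dropout=0.1, maximum_relative_position=20
)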
def testDense(self, weight_shape, input_shape, transpose):
    weight = tf.zeros(weight_shape)
    layer = common.Dense(10, weight=weight, transpose=transpose)
    x = tf.ones(input_shape)
    y = layer(x)
    self.assertEqual(layer.kernel.experimental_ref(), weight.experimental_ref())
    self.assertEqual(self.evaluate(tf.reduce_sum(y)), 0)
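
# Hedged sketch of the behavior exercised by the test above: common.Dense can
# reuse an existing tensor as its kernel (optionally transposed), e.g. to tie
# the output projection to an embedding table. Shapes below are illustrative.
import tensorflow as tf

embedding = tf.Variable(tf.random.normal([32000, 512]))  # [vocab_size, depth]
output_layer = common.Dense(32000, weight=embedding, transpose=True)
logits = output_layer(tf.random.normal([4, 512]))  # shape [4, 32000]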
def __init__(self,
             num_heads,
             num_units,
             dropout=0.1,
             return_attention=False,
             maximum_relative_position=None,
             attention_span=None,
             num_attended_heads=1,
             **kwargs):
    """Initializes this layer.

    Args:
      num_heads: The number of attention heads.
      num_units: The number of hidden units.
      dropout: The probability to drop units from the inputs.
      return_attention: If ``True``, also return the attention weights of the
        first head.
      maximum_relative_position: Maximum relative position representation
        (from https://arxiv.org/abs/1803.02155).
      attention_span: Maximum relative position to attend to
        (from https://arxiv.org/abs/1904.03107).
      num_attended_heads: The number of heads to attend to. Defaults to 1 since
        each head only attends to itself in the vanilla Transformer. Increase
        to an odd number smaller than `num_heads` to also model head
        interaction (from https://arxiv.org/abs/1904.03107).
      kwargs: Additional layer arguments.
    """
    super(MultiHeadAttention, self).__init__(**kwargs)
    if num_units % num_heads != 0:
        raise ValueError(
            "Multi head attention requires that num_units is a"
            " multiple of %s" % num_heads)
    self.num_heads = num_heads
    self.num_units_per_head = num_units // num_heads
    self.linear_queries = common.Dense(num_units)
    self.linear_keys = common.Dense(num_units)
    self.linear_values = common.Dense(num_units)
    self.linear_output = common.Dense(num_units)
    self.dropout = dropout
    self.return_attention = return_attention
    self.maximum_relative_position = maximum_relative_position
    self.attention_span = attention_span
    if num_attended_heads % 2 == 0:
        raise ValueError(
            "num_attended_heads must be odd to guarantee symmetry.")
    self.num_attended_heads = num_attended_heads
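
# Hedged sketch (parameter names from the docstring above): limit attention to
# a window of 16 relative positions and let every head also attend to its two
# neighboring heads; num_attended_heads must be odd.
local_attention = MultiHeadAttention(
    num_heads=8,
    num_units=512,
    attention_span=16,
    num_attended_heads=3,
)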
def __init__(
    self, inner_dim, output_dim, dropout=0.1, activation=tf.nn.relu, **kwargs
):
    """Initializes this layer.

    Args:
      inner_dim: The number of units of the inner linear transformation.
      output_dim: The number of units of the output linear transformation.
      dropout: The probability to drop units from the activation output.
      activation: The activation function to apply between the two linear
        transformations.
      kwargs: Additional layer arguments.
    """
    super().__init__(**kwargs)
    self.inner = common.Dense(inner_dim, activation=activation)
    self.outer = common.Dense(output_dim)
    self.dropout = dropout
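
# Hedged sketch, assuming this constructor belongs to the Transformer's
# position-wise feed-forward layer (class name below is assumed): a "base"
# configuration expands a 512-dimensional model to an inner dimension of 2048.
ffn = FeedForwardNetwork(inner_dim=2048, output_dim=512, dropout=0.1)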
def _add_attention(cell):
    # Produce Luong-style attentional hidden states.
    attention_layer = common.Dense(
        cell.output_size, use_bias=False, activation=attention_layer_activation
    )
    wrapper = tfa.seq2seq.AttentionWrapper(
        cell, self.attention_mechanism, attention_layer=attention_layer
    )
    return wrapper
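
# Hedged sketch of the same wiring outside the helper, assuming a Luong
# attention mechanism from TensorFlow Addons; all names below are illustrative
# and do not come from the surrounding class.
import tensorflow as tf
import tensorflow_addons as tfa

memory = tf.random.normal([4, 20, 512])  # encoder outputs [batch, time, depth]
attention_mechanism = tfa.seq2seq.LuongAttention(512, memory=memory)
cell = tf.keras.layers.LSTMCell(512)
attention_layer = common.Dense(cell.output_size, use_bias=False, activation=tf.math.tanh)
attentive_cell = tfa.seq2seq.AttentionWrapper(
    cell, attention_mechanism, attention_layer=attention_layer
)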
def initialize(self, vocab_size=None, output_layer=None):
    """Initializes the decoder configuration.

    Args:
      vocab_size: The target vocabulary size.
      output_layer: The output layer to use.

    Raises:
      ValueError: if neither :obj:`vocab_size` nor :obj:`output_layer` is set.
    """
    if output_layer is not None:
        self.output_layer = output_layer
    else:
        if vocab_size is None:
            raise ValueError("One of vocab_size and output_layer must be set")
        self.output_layer = common.Dense(vocab_size)
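
# Hedged usage sketch ("decoder" and "embedding" are assumed names): either let
# the decoder build a Dense projection from the vocabulary size, or pass a
# pre-built output layer, e.g. one tied to the target embedding table.
decoder.initialize(vocab_size=32000)
# decoder.initialize(output_layer=common.Dense(32000, weight=embedding, transpose=True))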