    def __init__(self,
                 num_units,
                 num_heads,
                 ffn_inner_dim,
                 dropout=0.1,
                 attention_dropout=0.1,
                 relu_dropout=0.1,
                 **kwargs):
        """Initializes the layer.

        Args:
          num_units: The number of hidden units.
          num_heads: The number of heads in the multi-head attention.
          ffn_inner_dim: The number of units of the inner linear transformation
            in the feed forward layer.
          dropout: The probability to drop units from the outputs.
          attention_dropout: The probability to drop units from the attention.
          relu_dropout: The probability to drop units from the ReLU activation in
            the feed forward layer.
          **kwargs: Additional layer arguments.
        """
        super(_SelfAttentionEncoderLayer, self).__init__(**kwargs)
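        # Self-attention sub-layer: multi-head attention wrapped so that the input
        # is normalized, the output passes through dropout, and a residual
        # connection is added (see the LayerWrapper arguments below).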
        self.self_attention = transformer.MultiHeadAttention(
            num_heads, num_units, dropout=attention_dropout)
        self.self_attention = common.LayerWrapper(self.self_attention,
                                                  normalize_input=True,
                                                  output_dropout=dropout,
                                                  residual_connection=True)
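        # Feed-forward sub-layer: the position-wise feed-forward network receives
        # the same normalize-input / output-dropout / residual wrapping.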
        self.ffn = transformer.FeedForwardNetwork(ffn_inner_dim,
                                                  num_units,
                                                  dropout=relu_dropout)
        self.ffn = common.LayerWrapper(self.ffn,
                                       normalize_input=True,
                                       output_dropout=dropout,
                                       residual_connection=True)
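For reference, a minimal usage sketch (not part of the snippet above); it assumes the OpenNMT-tf transformer/common modules and tensorflow are imported, and the hyperparameter values are illustrative only.

# Hypothetical instantiation of the encoder layer with Transformer-base style
# hyperparameters (illustrative values, not taken from the original code).
encoder_layer = _SelfAttentionEncoderLayer(
    num_units=512, num_heads=8, ffn_inner_dim=2048, dropout=0.1)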
Example 2
    def __init__(self,
                 num_units,
                 num_heads,
                 ffn_inner_dim,
                 num_sources=1,
                 dropout=0.1,
                 attention_dropout=0.1,
                 ffn_dropout=0.1,
                 ffn_activation=tf.nn.relu,
                 **kwargs):
        """Initializes the layer.

        Args:
          num_units: The number of hidden units.
          num_heads: The number of heads in the multi-head attention.
          ffn_inner_dim: The number of units of the inner linear transformation
            in the feed forward layer.
          num_sources: The number of source contexts.
          dropout: The probability to drop units from the outputs.
          attention_dropout: The probability to drop units from the attention.
          ffn_dropout: The probability to drop units from the activation output in
            the feed forward layer.
          ffn_activation: The activation function to apply between the two linear
            transformations of the feed forward layer.
          **kwargs: Additional layer arguments.
        """
        super(_SelfAttentionDecoderLayer, self).__init__(**kwargs)
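        # Sub-layer 0: masked multi-head self-attention over the target sequence,
        # wrapped in a TransformerLayerWrapper (residual block with dropout).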
        self.self_attention = transformer.MultiHeadAttention(
            num_heads,
            num_units,
            dropout=attention_dropout,
            name="masked_multi_head_attention")
        self.self_attention = transformer.TransformerLayerWrapper(
            self.self_attention, dropout, name="sub_layer_0")
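        # One cross-attention sub-layer per source context; attention weights are
        # returned only in the single-source case (return_attention below).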
        self.attention = []
        for i in range(num_sources):
            attention = transformer.MultiHeadAttention(
                num_heads,
                num_units,
                dropout=attention_dropout,
                return_attention=num_sources == 1,
                name="multi_head_attention")
            attention = transformer.TransformerLayerWrapper(
                attention, dropout, name="sub_layer_%d" % (i + 1))
            self.attention.append(attention)
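        # Final sub-layer: the feed-forward network, numbered after all attention
        # sub-layers (sub_layer_<num_sources + 1>).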
        self.ffn = transformer.FeedForwardNetwork(ffn_inner_dim,
                                                  num_units,
                                                  dropout=ffn_dropout,
                                                  activation=ffn_activation,
                                                  name="feed_forward")
        self.ffn = transformer.TransformerLayerWrapper(self.ffn,
                                                       dropout,
                                                       name="sub_layer_%d" %
                                                       (num_sources + 1))
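A minimal usage sketch along the same lines (hypothetical, illustrative values only); with num_sources=1 the layer builds three wrapped sub-layers named sub_layer_0 (masked self-attention), sub_layer_1 (cross-attention) and sub_layer_2 (feed forward).

# Hypothetical instantiation of the decoder layer (illustrative values only).
decoder_layer = _SelfAttentionDecoderLayer(
    num_units=512, num_heads=8, ffn_inner_dim=2048,
    num_sources=1, dropout=0.1, ffn_activation=tf.nn.relu)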
Example 3
    def testFeedForwardNetwork(self):
        ffn = transformer.FeedForwardNetwork(20, 10)
        x = tf.random.uniform([4, 5, 10])
        y = ffn(x)
        self.assertEqual(y.shape, x.shape)
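A hedged follow-up check (not part of the original test; the test name is hypothetical): widening the inner projection should not change the output shape, since FeedForwardNetwork projects back to the requested output dimension.

    def testFeedForwardNetworkInnerDim(self):
        # Hypothetical follow-up test: a wider inner projection (64 instead of 20)
        # must still map the last dimension back to 10, preserving the input shape.
        ffn = transformer.FeedForwardNetwork(64, 10, dropout=0.1)
        x = tf.random.uniform([4, 5, 10])
        self.assertEqual(ffn(x).shape, x.shape)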