# Assumed imports; the snippet omits them. LayerNorm, MultiHeadAttention, and
# DenseStack are custom layers assumed to live in the surrounding package.
import tensorflow as tf
from tensorflow.keras.layers import Dropout


class TransformerEncoderBlock(tf.keras.Model):  # class name and base assumed; the snippet omits the class line
    def __init__(self,
                 n_heads,
                 filter_size,
                 hidden_size,
                 dropout=None) -> None:
        super().__init__()
        # Self-attention sublayer: LayerNorm followed by multi-head attention.
        self.self_norm = LayerNorm()
        self.self_attention = MultiHeadAttention(n_heads)

        self.norm_target = LayerNorm()

        # Position-wise feed-forward sublayer (class defined below).
        self.feed_forward = TransformerFeedForward(filter_size, hidden_size,
                                                   dropout)
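
    # A hedged sketch (not in the source) of the forward pass this constructor
    # implies: pre-norm residual wiring, with LayerNorm applied before each
    # sublayer. The MultiHeadAttention call signature is assumed.
    def call(self, inputs, mask=None):
        # Self-attention sublayer with a residual connection.
        norm_inputs = self.self_norm(inputs)
        res_attention = inputs + self.self_attention(norm_inputs, mask=mask)
        # Feed-forward sublayer with a residual connection.
        return res_attention + self.feed_forward(self.norm_target(res_attention))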


class TransformerFeedForward(tf.keras.Model):
    def __init__(self, filter_size, hidden_size, dropout=None) -> None:
        super().__init__()
        self.norm = LayerNorm()
        # Two dense layers (filter_size units, then hidden_size units) with no
        # activation on the output layer.
        self.feed_forward = DenseStack([filter_size, hidden_size],
                                       output_activation=None)
        self.dropout = Dropout(0 if dropout is None else dropout)
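

# A minimal smoke test, assuming tf.keras semantics. The hyperparameters below
# (8 heads, filter_size=2048, hidden_size=512, dropout=0.1) follow the original
# Transformer base configuration and are illustrative only.
if __name__ == "__main__":
    block = TransformerEncoderBlock(n_heads=8, filter_size=2048,
                                    hidden_size=512, dropout=0.1)
    ffn = TransformerFeedForward(filter_size=2048, hidden_size=512, dropout=0.1)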