def __init__(self, n_heads, filter_size, hidden_size, dropout=None) -> None:
    super().__init__()
    # Layer norm and multi-head self-attention over the input sequence.
    self.self_norm = LayerNorm()
    self.self_attention = MultiHeadAttention(n_heads)
    # Second layer norm, followed by the position-wise feed-forward sub-block.
    self.norm_target = LayerNorm()
    self.feed_forward = TransformerFeedForward(filter_size, hidden_size, dropout)
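The constructor above only registers sub-modules; the forward computation is not shown. As a rough illustration of how such a pre-norm block is typically wired, here is a minimal self-contained PyTorch sketch. Everything in it is an assumption: PreNormAttentionBlock is a hypothetical name, nn.MultiheadAttention stands in for MultiHeadAttention, a small nn.Sequential MLP stands in for TransformerFeedForward (whose real constructor follows), and the residual connections are the standard Transformer wiring rather than anything confirmed by this code.

import torch
from torch import nn

class PreNormAttentionBlock(nn.Module):
    """Hypothetical pre-norm wiring mirroring the constructor above."""

    def __init__(self, n_heads: int, filter_size: int, hidden_size: int,
                 dropout: float = 0.0) -> None:
        super().__init__()
        self.self_norm = nn.LayerNorm(hidden_size)
        self.self_attention = nn.MultiheadAttention(
            hidden_size, n_heads, dropout=dropout, batch_first=True)
        self.norm_target = nn.LayerNorm(hidden_size)
        # Stand-in for TransformerFeedForward (defined below).
        self.feed_forward = nn.Sequential(
            nn.Linear(hidden_size, filter_size),
            nn.ReLU(),
            nn.Linear(filter_size, hidden_size),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Pre-norm self-attention with a residual connection (assumed wiring).
        normed = self.self_norm(x)
        attn_out, _ = self.self_attention(normed, normed, normed)
        x = x + attn_out
        # Pre-norm feed-forward with a residual connection (assumed wiring).
        return x + self.feed_forward(self.norm_target(x))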
def __init__(self, filter_size, hidden_size, dropout) -> None:
    super(TransformerFeedForward, self).__init__()
    # Layer norm applied to the sub-block input.
    self.norm = LayerNorm()
    # Two dense layers (filter_size, then hidden_size) with no activation on the output.
    self.feed_forward = DenseStack([filter_size, hidden_size], output_activation=None)
    # A dropout rate of None is treated as 0, i.e. dropout disabled.
    self.dropout = Dropout(0 if dropout is None else dropout)
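Again the forward pass is not shown. Given DenseStack([filter_size, hidden_size], output_activation=None), the sub-block is presumably a two-layer position-wise MLP applied to the normalized input, with dropout and (in standard Transformer fashion) a residual connection. A self-contained PyTorch re-sketch under those assumptions, with the hypothetical name FeedForwardSketch and nn.Sequential standing in for DenseStack:

import torch
from torch import nn

class FeedForwardSketch(nn.Module):
    """Hypothetical PyTorch equivalent of the TransformerFeedForward constructor above."""

    def __init__(self, filter_size: int, hidden_size: int, dropout=None) -> None:
        super().__init__()
        self.norm = nn.LayerNorm(hidden_size)
        # Assumed DenseStack behavior: hidden_size -> filter_size -> hidden_size,
        # ReLU in between, no activation on the final layer.
        self.feed_forward = nn.Sequential(
            nn.Linear(hidden_size, filter_size),
            nn.ReLU(),
            nn.Linear(filter_size, hidden_size),
        )
        self.dropout = nn.Dropout(0.0 if dropout is None else dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Pre-norm feed-forward with dropout and a residual connection (assumed).
        return x + self.dropout(self.feed_forward(self.norm(x)))

# Quick shape check with hypothetical sizes (batch=2, seq_len=16, hidden=512).
ffn = FeedForwardSketch(filter_size=2048, hidden_size=512, dropout=0.1)
assert ffn(torch.randn(2, 16, 512)).shape == (2, 16, 512)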