def __init__(self, d_model, heads, dropout=0.1):
    super(EncoderLayer, self).__init__()
    self.norm = Norm(d_model)
    self.dropout = nn.Dropout(dropout)
    self.attention_layer = MultiHeadedSelfAttention(heads, d_model, dropout=dropout)
    self.ffnn_layer = FeedForward(d_model, dropout=dropout)
def __init__(self, d_model, heads, dropout=0.1):
    super().__init__()
    self.norm_1 = Norm(d_model)
    self.norm_2 = Norm(d_model)
    self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
    self.ff = FeedForward(d_model, dropout=dropout)
    self.dropout_1 = nn.Dropout(dropout)
    self.dropout_2 = nn.Dropout(dropout)
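These constructors only register the encoder layer's sub-modules; a forward pass is still needed to wire them together. The original forward is not shown here, so the following is a minimal sketch, assuming the common pre-norm residual pattern and using the attribute names from the second constructor (norm_1, attn, ff); the mask argument is likewise an assumption.

# Hypothetical forward for the encoder layer above (a sketch, not the original code).
# Assumes Norm, MultiHeadAttention and FeedForward are the modules built in __init__.
def forward(self, x, mask):
    x2 = self.norm_1(x)                                  # normalise before self-attention
    x = x + self.dropout_1(self.attn(x2, x2, x2, mask))  # residual connection around attention
    x2 = self.norm_2(x)                                  # normalise before the feed-forward block
    x = x + self.dropout_2(self.ff(x2))                  # residual connection around feed-forward
    return x

With pre-norm wiring like this, each sub-layer sees a normalised input while the residual stream itself stays un-normalised, which tends to make deep stacks easier to train.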
def __init__(self, d_model, heads, dropout=0.1):
    super(DecoderLayer, self).__init__()
    self.norm = Norm(d_model)
    self.dropout = nn.Dropout(dropout)
    # in the decoder, the self-attention layer is only allowed to attend
    # to earlier positions in the output sequence; this is different from
    # its encoder counterpart
    self.attention_layer = MultiHeadedSelfAttention(heads, d_model, dropout=dropout)
    self.encoder_decoder_attention_layer = MultiHeadedSelfAttention(heads, d_model, dropout=dropout)
    self.ffnn_layer = FeedForward(d_model, dropout=dropout)
def __init__(self, d_model, heads, decoder_extra_layers, dropout=0.1):
    super().__init__()
    self.decoder_extra_layers = decoder_extra_layers
    self.norm_1 = Norm(d_model)
    self.norm_2 = Norm(d_model)
    self.norm_3 = Norm(d_model)
    self.dropout_1 = nn.Dropout(dropout)
    self.dropout_2 = nn.Dropout(dropout)
    self.dropout_3 = nn.Dropout(dropout)
    self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
    self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
    self.ff = FeedForward(d_model, dropout=dropout)
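The decoder layer has three sub-layers instead of two: masked self-attention, encoder-decoder attention over the encoder's outputs, and the feed-forward block. As above, the original forward is not shown, so this is a sketch under the same pre-norm assumption, using the attribute names from the second constructor; the argument names e_outputs, src_mask and trg_mask are assumptions, and how decoder_extra_layers is consumed is not visible from the constructor alone.

# Hypothetical forward for the decoder layer above (a sketch, not the original code).
# e_outputs are the encoder's outputs; trg_mask hides future target positions,
# src_mask hides source padding.
def forward(self, x, e_outputs, src_mask, trg_mask):
    x2 = self.norm_1(x)
    x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))                # masked self-attention
    x2 = self.norm_2(x)
    x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))  # encoder-decoder attention
    x2 = self.norm_3(x)
    x = x + self.dropout_3(self.ff(x2))                                      # position-wise feed-forward
    return x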
def __init__(self, d_model, dropout=0.1):
    super().__init__()
    self.ff = FeedForward(d_model, dropout=dropout)
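This last constructor keeps only the feed-forward sub-module, i.e. a layer variant with no attention at all. Since it registers neither a norm nor a dropout of its own, the most conservative reading of its forward pass is a plain application of the block; anything beyond that (a residual connection, say) would be a guess.

# Hypothetical forward for the feed-forward-only layer (a sketch, not the original code):
def forward(self, x):
    return self.ff(x)  # no attention, no residual: just the position-wise feed-forward block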