def __init__(self, model_dimension, dropout_percentage, number_of_heads, feedforward_dimension, number_of_layers):
    """
    Creates number_of_layers copies of DecoderLayer.

    Args:
        model_dimension: model dimension, generally the same as the embedding_dimension
            of VocabEmbedding or FeatureEmbedding
        dropout_percentage: dropout percentage for the residual connections
        number_of_heads: number of heads for multi-headed attention
        feedforward_dimension: units of the feed-forward layer, generally 2048
        number_of_layers: number of decoder layers
    """
    super(Decoder, self).__init__()
    self.dec_layers = clone(
        DecoderLayer(model_dimension, dropout_percentage, number_of_heads, feedforward_dimension),
        number_of_layers)
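
# `clone` is referenced above but not defined in this excerpt; a minimal sketch,
# assuming it simply deep-copies a module n times into an nn.ModuleList so that
# every layer gets its own independent parameters:
import copy
import torch.nn as nn

def clone(module, n):
    # deepcopy (rather than reusing the same instance) keeps the weights of
    # each copy independent
    return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])
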
def __init__(self, model_dimension, dropout_percentage, number_of_heads, feedforward_dimension):
    """
    Creates 2 copies of ResidualConnection, a multi-headed attention module with
    number_of_heads heads, and a fully-connected layer of shape
    (model_dimension, feedforward_dimension).

    Args:
        model_dimension: model dimension, generally the same as the embedding_dimension
            of VocabEmbedding or FeatureEmbedding
        dropout_percentage: dropout percentage for the residual connections
        number_of_heads: number of heads for multi-headed attention
        feedforward_dimension: units of the feed-forward layer, generally 2048
    """
    super(EncoderLayer, self).__init__()
    self.res_layers = clone(ResidualConnection(model_dimension, dropout_percentage), 2)
    self.self_att = MultiheadedAttention(model_dimension, number_of_heads)
    self.feed_forward = PositionwiseFeedForward(model_dimension, feedforward_dimension)
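
# ResidualConnection is referenced above but not defined in this excerpt; a
# minimal sketch, assuming the usual pre-norm residual wrapper (LayerNorm,
# sublayer, dropout, then the skip connection) found in standard Transformer
# implementations:
import torch.nn as nn

class ResidualConnection(nn.Module):
    def __init__(self, model_dimension, dropout_percentage):
        super(ResidualConnection, self).__init__()
        self.norm = nn.LayerNorm(model_dimension)
        self.dropout = nn.Dropout(dropout_percentage)

    def forward(self, x, sublayer):
        # x + dropout(sublayer(norm(x))): this skip connection is what the
        # dropout_percentage argument regularises
        return x + self.dropout(sublayer(self.norm(x)))
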
def __init__(self, model_dimension, number_of_heads):
    """
    Creates 4 linear layers that act as the weights for Query, Key, Value and the
    concatenated multi-head attention output. Splitting model_dimension across
    number_of_heads heads gives attention multiple "representation subspaces" to
    focus on; the number of subspaces equals number_of_heads. To reduce
    computational cost, the weights (from the linear layers) are shared across all
    heads rather than giving each head its own projection.

    Args:
        model_dimension: model dimension
        number_of_heads: number of heads in multi-headed attention
    """
    super(MultiheadedAttention, self).__init__()
    assert model_dimension % number_of_heads == 0
    self.model_dimension = model_dimension
    self.number_of_heads = number_of_heads
    self.d_k = model_dimension // number_of_heads
    self.linears = clone(nn.Linear(model_dimension, model_dimension), 4)  # bias True??
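
# The forward pass is not part of this excerpt; a minimal sketch of how the four
# linear layers and the d_k split are typically used (standard scaled dot-product
# attention, assuming inputs of shape (batch, seq_len, model_dimension)). Inside
# the class above, `linears`, `number_of_heads` and `d_k` would come from self.
import math
import torch

def multihead_attention_sketch(query, key, value, linears, number_of_heads, d_k):
    batch_size = query.size(0)
    # project with the first three linears, then split model_dimension into
    # number_of_heads subspaces of size d_k: (batch, heads, seq_len, d_k)
    q, k, v = [
        lin(x).view(batch_size, -1, number_of_heads, d_k).transpose(1, 2)
        for lin, x in zip(linears[:3], (query, key, value))
    ]
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    attention = scores.softmax(dim=-1)
    out = torch.matmul(attention, v)
    # concatenate the heads back together and apply the fourth linear
    out = out.transpose(1, 2).contiguous().view(batch_size, -1, number_of_heads * d_k)
    return linears[3](out)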