def __init__(self,
             hidden_size: int = 512,
             ff_size: int = 2048,
             num_layers: int = 8,
             num_heads: int = 4,
             dropout: float = 0.1,
             emb_dropout: float = 0.1,
             freeze: bool = False,
             dont_minus_one: bool = True,
             shared_layers=None,
             **kwargs):
    """
    Initializes the Transformer.

    :param hidden_size: hidden size and size of embeddings
    :param ff_size: position-wise feed-forward layer size
        (typically this is 4*hidden_size)
    :param num_layers: number of layers
    :param num_heads: number of heads for multi-headed attention
    :param dropout: dropout probability for Transformer layers
    :param emb_dropout: dropout applied to the input (word embeddings)
    :param freeze: freeze the parameters of the encoder during training
    :param dont_minus_one: if True, build all num_layers layers;
        if False, build num_layers - 1
    :param shared_layers: encoder layers shared with another encoder;
        one fresh, unshared layer is appended on top of them
    :param kwargs:
    """
    super().__init__()

    if shared_layers is not None:
        # reuse the shared layers (same modules, shared parameters)
        # and add one unshared layer on top
        self.layers = nn.ModuleList(shared_layers)
        self.layers.append(
            TransformerEncoderLayer(size=hidden_size, ff_size=ff_size,
                                    num_heads=num_heads, dropout=dropout))
    else:
        # build all (num_layers) layers
        self.layers = nn.ModuleList([
            TransformerEncoderLayer(size=hidden_size, ff_size=ff_size,
                                    num_heads=num_heads, dropout=dropout)
            for _ in range(num_layers if dont_minus_one else num_layers - 1)
        ])

    # `multi_encoder` is an optional flag read from **kwargs
    multi_encoder = kwargs.get("multi_encoder", False)
    self.top_off = not (dont_minus_one and multi_encoder)

    self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)
    self.pe = PositionalEncoding(hidden_size)
    self.emb_dropout = nn.Dropout(p=emb_dropout)
    self._output_size = hidden_size

    if freeze:
        freeze_params(self)
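A minimal usage sketch of the `shared_layers` path, assuming the class above is named `TransformerEncoder` and the variable names are illustrative: the second encoder reuses the first encoder's layer modules (so their parameters are shared) and gets one fresh layer appended on top.

# Hypothetical usage sketch: two encoders sharing all but the top layer.
base = TransformerEncoder(hidden_size=512, num_layers=6)

# `aux` wraps the same 6 layer modules in a new ModuleList and appends
# one unshared layer, so it ends up with 7 layers in total.
aux = TransformerEncoder(hidden_size=512, shared_layers=base.layers)

assert len(aux.layers) == len(base.layers) + 1
# the shared layers are the very same module objects
assert aux.layers[0] is base.layers[0]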
def __init__(self,
             hidden_size: int = 512,
             ff_size: int = 2048,
             num_layers: int = 8,
             num_heads: int = 4,
             dropout: float = 0.1,
             active_layers: list = None,
             emb_dropout: float = 0.1,
             freeze: bool = False,
             **kwargs):
    """
    Initializes the Transformer.

    :param hidden_size: hidden size and size of embeddings
    :param ff_size: position-wise feed-forward layer size
        (typically this is 4*hidden_size)
    :param num_layers: number of layers
    :param num_heads: number of heads for multi-headed attention
    :param dropout: dropout probability for Transformer layers
    :param active_layers: if given, build one encoder layer per entry
        instead of num_layers layers
    :param emb_dropout: dropout applied to the input (word embeddings)
    :param freeze: freeze the parameters of the encoder during training
    :param kwargs:
    """
    super().__init__()

    # build all (num_layers) layers, or only the active ones
    if active_layers:
        self.layers = nn.ModuleList([
            TransformerEncoderLayer(size=hidden_size, ff_size=ff_size,
                                    num_heads=num_heads, dropout=dropout)
            for _ in active_layers])
    else:
        self.layers = nn.ModuleList([
            TransformerEncoderLayer(size=hidden_size, ff_size=ff_size,
                                    num_heads=num_heads, dropout=dropout)
            for _ in range(num_layers)])

    self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)
    self.pe = PositionalEncoding(hidden_size)
    self.emb_dropout = nn.Dropout(p=emb_dropout)
    self._output_size = hidden_size

    if freeze:
        freeze_params(self)
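A minimal usage sketch, assuming the class above is named `TransformerEncoder`. Note that only the length of `active_layers` matters here: the constructor builds one fresh layer per entry and never reads the entries' values, so it does not select specific pre-existing layers.

# Hypothetical usage sketch: `active_layers` controls only the layer count.
enc = TransformerEncoder(hidden_size=512, num_layers=8,
                         active_layers=[0, 2, 5])
assert len(enc.layers) == 3

# Without `active_layers`, all num_layers layers are built.
full = TransformerEncoder(hidden_size=512, num_layers=8)
assert len(full.layers) == 8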
def __init__(self,
             hidden_size: int = 512,
             ff_size: int = 2048,
             num_layers: int = 8,
             num_heads: int = 4,
             dropout: float = 0.1,
             emb_dropout: float = 0.1,
             freeze: bool = False,
             attn_func: str = "softmax",
             attn_alpha: float = 1.5,
             pe: bool = True,
             **kwargs):
    """
    Initializes the Transformer.

    :param hidden_size: hidden size and size of embeddings
    :param ff_size: position-wise feed-forward layer size
        (typically this is 4*hidden_size)
    :param num_layers: number of layers
    :param num_heads: number of heads for multi-headed attention
    :param dropout: dropout probability for Transformer layers
    :param emb_dropout: dropout applied to the input (word embeddings)
    :param freeze: freeze the parameters of the encoder during training
    :param attn_func: attention transformation, "softmax" by default
    :param attn_alpha: alpha parameter passed to the attention function
    :param pe: whether to apply positional encoding to the input
    :param kwargs:
    """
    super().__init__()

    self.layers = nn.ModuleList([
        TransformerEncoderLayer(size=hidden_size, ff_size=ff_size,
                                num_heads=num_heads, dropout=dropout,
                                attn_func=attn_func, attn_alpha=attn_alpha)
        for _ in range(num_layers)
    ])

    self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)
    self.pe = PositionalEncoding(hidden_size) if pe else None
    self.emb_dropout = nn.Dropout(p=emb_dropout)
    self._output_size = hidden_size

    if freeze:
        freeze_params(self)
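A minimal usage sketch, assuming the class above is named `TransformerEncoder`. The constructor only forwards `attn_func` and `attn_alpha` to each `TransformerEncoderLayer`; the `"entmax15"` value below is an assumption and would have to be supported by the layer's attention implementation.

# Hypothetical usage sketch: swap the attention transformation and
# disable positional encoding.
enc = TransformerEncoder(hidden_size=512, num_layers=6,
                         attn_func="entmax15",  # assumed supported value
                         attn_alpha=1.5,
                         pe=False)
assert enc.pe is None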
def __init__(self,
             hidden_size: int = 512,
             ff_size: int = 2048,
             num_layers: int = 8,
             num_heads: int = 4,
             dropout: float = 0.1,
             freeze: bool = False,
             **kwargs):
    """
    Initializes the Transformer.

    :param hidden_size: hidden size and size of embeddings
    :param ff_size: position-wise feed-forward layer size
        (typically this is 4*hidden_size)
    :param num_layers: number of layers
    :param num_heads: number of heads for multi-headed attention
    :param dropout: dropout probability
    :param freeze: freeze the parameters of the encoder during training
    :param kwargs:
    """
    super().__init__()

    # build all (num_layers) layers, each assembled from explicit
    # attention and feed-forward sub-modules
    layers = []
    for _ in range(num_layers):
        layer = TransformerEncoderLayer(
            hidden_size,
            MultiHeadedAttention(num_heads, hidden_size, dropout),
            PositionwiseFeedForward(hidden_size, ff_size, dropout),
            dropout)
        layers.append(layer)

    self.layers = nn.ModuleList(layers)
    self.norm = nn.LayerNorm(hidden_size)
    self.pe = PositionalEncoding(hidden_size, dropout=dropout)
    self._output_size = hidden_size

    if freeze:
        freeze_params(self)
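A minimal usage sketch for this variant, assuming the class above is named `TransformerEncoder` and that `freeze_params` disables gradients on every parameter (its conventional behavior, not shown in this snippet).

# Hypothetical usage sketch: build a small frozen encoder.
enc = TransformerEncoder(hidden_size=256, ff_size=1024,
                         num_layers=4, num_heads=8, freeze=True)

# assumes freeze_params(self) sets requires_grad=False on all parameters
assert all(not p.requires_grad for p in enc.parameters())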