Example no. 1
    def __init__(self,
                 hidden_size: int = 512,
                 ff_size: int = 2048,
                 num_layers: int = 8,
                 num_heads: int = 4,
                 dropout: float = 0.1,
                 emb_dropout: float = 0.1,
                 freeze: bool = False,
                 dont_minus_one: bool = True,
                 shared_layers=None,
                 **kwargs):
        """
        Initializes the Transformer.
        :param hidden_size: hidden size and size of embeddings
        :param ff_size: position-wise feed-forward layer size.
          (Typically this is 2*hidden_size.)
        :param num_layers: number of layers
        :param num_heads: number of heads for multi-headed attention
        :param dropout: dropout probability for Transformer layers
        :param emb_dropout: Is applied to the input (word embeddings).
        :param freeze: freeze the parameters of the encoder during training
        :param kwargs:
        """
        super(TransformerEncoder, self).__init__()

        # build all (num_layers) layers, or reuse the shared layers and
        # append one fresh layer on top
        if shared_layers is not None:
            self.layers = nn.ModuleList(shared_layers)
            self.layers.append(
                TransformerEncoderLayer(size=hidden_size,
                                        ff_size=ff_size,
                                        num_heads=num_heads,
                                        dropout=dropout))
        else:
            self.layers = nn.ModuleList([
                TransformerEncoderLayer(size=hidden_size,
                                        ff_size=ff_size,
                                        num_heads=num_heads,
                                        dropout=dropout)
                for _ in range(num_layers if dont_minus_one else num_layers -
                               1)
            ])
        # `multi_encoder` is assumed to arrive via **kwargs (it is not a named
        # parameter); checking locals() for it can never succeed inside __init__
        multi_encoder = kwargs.get("multi_encoder", False)
        self.top_off = not (dont_minus_one and multi_encoder)
        self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)
        self.pe = PositionalEncoding(hidden_size)
        self.emb_dropout = nn.Dropout(p=emb_dropout)

        self._output_size = hidden_size

        if freeze:
            freeze_params(self)
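
The shared_layers branch above relies on nn.ModuleList reusing the module objects it is given, so the parameters of those layers stay shared with the encoder they came from, while only the appended layer is new. A minimal, self-contained sketch of that behaviour, with plain nn.Linear modules standing in for TransformerEncoderLayer (the stand-in is an assumption for illustration only):

import torch.nn as nn

# Stand-in layers; in the encoder above these would be TransformerEncoderLayer.
base_layers = nn.ModuleList([nn.Linear(512, 512) for _ in range(3)])

# Reuse the existing layers and append one fresh layer on top,
# mirroring the shared_layers branch of __init__.
stacked = nn.ModuleList(base_layers)
stacked.append(nn.Linear(512, 512))

# The first three entries are the very same modules, so their weights
# are shared; only the appended layer has new parameters.
assert stacked[0] is base_layers[0]
print(len(base_layers), len(stacked))  # 3 4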
Example no. 2
    def __init__(self,
                 hidden_size: int = 512,
                 ff_size: int = 2048,
                 num_layers: int = 8,
                 num_heads: int = 4,
                 dropout: float = 0.1,
                 active_layers: list = None,
                 emb_dropout: float = 0.1,
                 freeze: bool = False,
                 **kwargs):
        """
        Initializes the Transformer.
        :param hidden_size: hidden size and size of embeddings
        :param ff_size: position-wise feed-forward layer size.
          (Typically this is 2*hidden_size.)
        :param num_layers: number of layers
        :param num_heads: number of heads for multi-headed attention
        :param dropout: dropout probability for Transformer layers
        :param emb_dropout: Is applied to the input (word embeddings).
        :param freeze: freeze the parameters of the encoder during training
        :param kwargs:
        """
        super().__init__()

        # build all (num_layers) layers, or one layer per entry in
        # active_layers when it is given
        if active_layers:
            self.layers = nn.ModuleList([
                TransformerEncoderLayer(size=hidden_size, ff_size=ff_size,
                                        num_heads=num_heads, dropout=dropout)
                for _ in active_layers])
        else:
            self.layers = nn.ModuleList([
                TransformerEncoderLayer(size=hidden_size, ff_size=ff_size,
                                        num_heads=num_heads, dropout=dropout)
                for _ in range(num_layers)])

        self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)
        self.pe = PositionalEncoding(hidden_size)
        self.emb_dropout = nn.Dropout(p=emb_dropout)

        self._output_size = hidden_size

        if freeze:
            freeze_params(self)
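
The signature fix from active_layers: list = [] to a None default sidesteps Python's shared-mutable-default pitfall: a default list is created once when the function is defined and reused across every call. The list is not mutated in this __init__, so the bug was only latent, but the sentinel pattern removes the risk. A small, self-contained sketch with hypothetical function names, for illustration only:

def build_buggy(layer_ids=[]):      # default list is created once...
    layer_ids.append("new")         # ...and mutated on every call
    return layer_ids

def build_safe(layer_ids=None):     # None sentinel: fresh list per call
    layer_ids = [] if layer_ids is None else list(layer_ids)
    layer_ids.append("new")
    return layer_ids

print(build_buggy())  # ['new']
print(build_buggy())  # ['new', 'new']  <- state leaked from the first call
print(build_safe())   # ['new']
print(build_safe())   # ['new']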
Example no. 3
    def __init__(self,
                 hidden_size: int = 512,
                 ff_size: int = 2048,
                 num_layers: int = 8,
                 num_heads: int = 4,
                 dropout: float = 0.1,
                 emb_dropout: float = 0.1,
                 freeze: bool = False,
                 attn_func: str = "softmax",
                 attn_alpha: float = 1.5,
                 pe: bool = True,
                 **kwargs):
        """
        Initializes the Transformer.
        :param hidden_size: hidden size and size of embeddings
        :param ff_size: position-wise feed-forward layer size.
          (Typically this is 2*hidden_size.)
        :param num_layers: number of layers
        :param num_heads: number of heads for multi-headed attention
        :param dropout: dropout probability for Transformer layers
        :param emb_dropout: Is applied to the input (word embeddings).
        :param freeze: freeze the parameters of the encoder during training
        :param kwargs:
        """
        super(TransformerEncoder, self).__init__()

        self.layers = nn.ModuleList([
            TransformerEncoderLayer(size=hidden_size,
                                    ff_size=ff_size,
                                    num_heads=num_heads,
                                    dropout=dropout,
                                    attn_func=attn_func,
                                    attn_alpha=attn_alpha)
            for _ in range(num_layers)
        ])

        self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)
        self.pe = PositionalEncoding(hidden_size) if pe else None
        self.emb_dropout = nn.Dropout(p=emb_dropout)

        self._output_size = hidden_size

        if freeze:
            freeze_params(self)
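
The attn_func / attn_alpha pair suggests that each layer selects how attention scores are normalized; how TransformerEncoderLayer consumes the two values is not shown in this excerpt. Below is only a hedged sketch of one plausible dispatch, mapping the string to a callable over the last dimension: torch's softmax for the default, and entmax_bisect from the third-party entmax package when sparse attention is requested (the package choice is an assumption, not something the source states):

from functools import partial

import torch
import torch.nn.functional as F


def get_attn_func(attn_func: str = "softmax", attn_alpha: float = 1.5):
    """Map an attention-function name to a callable over the last dim."""
    if attn_func == "softmax":
        return partial(F.softmax, dim=-1)
    if attn_func == "entmax":
        # Optional dependency; alpha=1.5 gives 1.5-entmax, alpha=2 sparsemax.
        from entmax import entmax_bisect
        return partial(entmax_bisect, alpha=attn_alpha, dim=-1)
    raise ValueError(f"unknown attn_func: {attn_func}")


scores = torch.randn(2, 4, 4)             # (batch, queries, keys) logits
probs = get_attn_func("softmax")(scores)  # each row sums to 1
print(probs.sum(dim=-1))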
Example no. 4
    def __init__(self,
                 hidden_size: int = 512,
                 ff_size: int = 2048,
                 num_layers: int = 8,
                 num_heads: int = 4,
                 dropout: float = 0.1,
                 freeze: bool = False,
                 **kwargs):
        """
        Initializes the Transformer encoder.
        :param hidden_size: hidden size and size of embeddings
        :param ff_size: position-wise feed-forward layer size.
          (Typically this is 4*hidden_size.)
        :param num_layers: number of layers
        :param num_heads: number of heads for multi-headed attention
        :param dropout: dropout probability
        :param freeze: freeze the parameters of the encoder during training
        :param kwargs:
        """
        super(TransformerEncoder, self).__init__()

        # build all (num_layers) layers
        layers = []
        for _ in range(num_layers):
            layer = TransformerEncoderLayer(
                hidden_size,
                MultiHeadedAttention(num_heads, hidden_size, dropout),
                PositionwiseFeedForward(hidden_size, ff_size, dropout),
                dropout)
            layers.append(layer)

        self.layers = nn.ModuleList(layers)
        self.norm = nn.LayerNorm(hidden_size)
        self.pe = PositionalEncoding(hidden_size, dropout=dropout)
        self._output_size = hidden_size

        if freeze:
            freeze_params(self)
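
The forward pass is not part of this excerpt; the layout of __init__ (layer stack, final LayerNorm, positional encoding) suggests the usual "positional encoding -> layer stack -> final norm" flow. The sketch below reproduces only that shape, using PyTorch's built-in nn.TransformerEncoderLayer as a stand-in for the custom layer and omitting positional encoding and masking for brevity; it is an illustration of the pattern, not the encoder's actual forward method:

import torch
import torch.nn as nn


class TinyEncoder(nn.Module):
    """Illustrative stand-in: a stack of encoder layers plus a final LayerNorm."""

    def __init__(self, hidden_size=512, ff_size=2048, num_layers=2,
                 num_heads=4, dropout=0.1):
        super().__init__()
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=hidden_size, nhead=num_heads,
                                       dim_feedforward=ff_size,
                                       dropout=dropout, batch_first=True)
            for _ in range(num_layers)])
        self.norm = nn.LayerNorm(hidden_size)

    def forward(self, x, src_key_padding_mask=None):
        for layer in self.layers:
            x = layer(x, src_key_padding_mask=src_key_padding_mask)
        return self.norm(x)


enc = TinyEncoder()
out = enc(torch.randn(2, 7, 512))  # (batch, src_len, hidden_size)
print(out.shape)                   # torch.Size([2, 7, 512])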