def __init__(self,
             hidden_size: int = 512,
             emb_size: int = 512,
             kernel_size: int = 5,
             num_layers: int = 15,
             dropout: float = 0.1,
             emb_dropout: float = 0.1,
             freeze: bool = False,
             **kwargs):
    """
    Initializes the ConvSeq2Seq Encoder.

    :param hidden_size: hidden size of the convolutional layers
    :param emb_size: size of the embeddings
    :param kernel_size: width of the convolution kernels
    :param num_layers: number of layers
    :param dropout: dropout probability for the encoder layers
    :param emb_dropout: Is applied to the input (word embeddings).
    :param freeze: freeze the parameters of the encoder during training
    :param kwargs:
    """
    super(ConvSeq2SeqEncoder, self).__init__()

    # build all (num_layers) layers
    self.layers = nn.ModuleList([
        ConvSeq2SeqEncoderLayer(hidden_size=hidden_size,
                                kernel_size=kernel_size,
                                dropout=dropout)
        for _ in range(num_layers)
    ])

    self.absPE = AbsolutePositionalEncoding(emb_size)
    self.emb2hidden = nn.Linear(emb_size, hidden_size)
    self.emb_dropout = nn.Dropout(p=emb_dropout)

    if freeze:
        freeze_params(self)
def __init__(self,
             embedding_dim: int = 64,
             scale: bool = False,
             vocab_size: int = 0,
             padding_idx: int = 1,
             freeze: bool = False,
             **kwargs):
    """
    Create new embeddings for the vocabulary.
    Use scaling for the Transformer.

    :param embedding_dim: size of the embedding vectors
    :param scale: scale the embeddings by sqrt(embedding_dim)
    :param vocab_size: number of entries in the vocabulary
    :param padding_idx: index of the padding token
    :param freeze: freeze the embeddings during training
    """
    super(Embeddings, self).__init__()

    self.embedding_dim = embedding_dim
    self.scale = scale
    self.vocab_size = vocab_size
    self.lut = nn.Embedding(vocab_size, self.embedding_dim,
                            padding_idx=padding_idx)

    if freeze:
        freeze_params(self)
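# --- Usage sketch (illustrative, not part of the class above) ---------------
# The "scale" flag conventionally means multiplying looked-up vectors by
# sqrt(embedding_dim), as in the original Transformer; the forward pass is not
# shown in this section, so the snippet below only mirrors that convention
# with plain torch calls. All values are dummy examples.
import math
import torch
import torch.nn as nn

lut = nn.Embedding(num_embeddings=100, embedding_dim=64, padding_idx=1)
token_ids = torch.tensor([[5, 7, 1]])            # (batch, time); 1 = <pad>
vectors = lut(token_ids)                         # (batch, time, 64)
scaled = vectors * math.sqrt(64)                 # what scale=True usually does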
def __init__(self,
             num_layers: int = 4,
             num_heads: int = 8,
             hidden_size: int = 512,
             ff_size: int = 2048,
             dropout: float = 0.1,
             emb_dropout: float = 0.1,
             vocab_size: int = 1,
             freeze: bool = False,
             self_attn_func: str = "softmax",
             src_attn_func: str = "softmax",
             self_attn_alpha: float = 1.5,
             src_attn_alpha: float = 1.5,
             gen_func: str = "softmax",
             gen_alpha: float = 1.5,
             output_bias: bool = False,
             **kwargs):
    """
    Initialize a Transformer decoder.

    :param num_layers: number of Transformer layers
    :param num_heads: number of heads for each layer
    :param hidden_size: hidden size
    :param ff_size: position-wise feed-forward size
    :param dropout: dropout probability (1-keep)
    :param emb_dropout: dropout probability for embeddings
    :param vocab_size: size of the output vocabulary
    :param freeze: set to True to keep all decoder parameters fixed
    :param kwargs:
    """
    super(TransformerDecoder, self).__init__(
        hidden_size, vocab_size, emb_dropout,
        gen_func=gen_func, gen_alpha=gen_alpha, output_bias=output_bias)

    # create num_layers decoder layers and put them in a list
    self.layers = nn.ModuleList([
        TransformerDecoderLayer(size=hidden_size,
                                ff_size=ff_size,
                                num_heads=num_heads,
                                dropout=dropout,
                                self_attn_func=self_attn_func,
                                self_attn_alpha=self_attn_alpha,
                                src_attn_func=src_attn_func,
                                src_attn_alpha=src_attn_alpha)
        for _ in range(num_layers)
    ])

    self.pe = PositionalEncoding(hidden_size)
    self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)

    if freeze:
        freeze_params(self)
def __init__(self,
             hidden_size: int = 512,
             ff_size: int = 2048,
             num_layers: int = 8,
             num_heads: int = 4,
             dropout: float = 0.1,
             emb_dropout: float = 0.1,
             freeze: bool = False,
             dont_minus_one: bool = True,
             shared_layers: nn.ModuleList = None,
             **kwargs):
    """
    Initializes the Transformer.

    :param hidden_size: hidden size and size of embeddings
    :param ff_size: position-wise feed-forward layer size.
      (Typically this is 2*hidden_size.)
    :param num_layers: number of layers
    :param num_heads: number of heads for multi-headed attention
    :param dropout: dropout probability for Transformer layers
    :param emb_dropout: Is applied to the input (word embeddings).
    :param freeze: freeze the parameters of the encoder during training
    :param dont_minus_one: build num_layers layers instead of num_layers - 1
    :param shared_layers: optionally reuse layers from another encoder
    :param kwargs:
    """
    super(TransformerEncoder, self).__init__()

    # build all (num_layers) layers
    if shared_layers is not None:
        # reuse the shared layers and add one fresh layer on top
        self.layers = nn.ModuleList([layer for layer in shared_layers])
        self.layers.append(
            TransformerEncoderLayer(size=hidden_size, ff_size=ff_size,
                                    num_heads=num_heads, dropout=dropout))
    else:
        self.layers = nn.ModuleList([
            TransformerEncoderLayer(size=hidden_size, ff_size=ff_size,
                                    num_heads=num_heads, dropout=dropout)
            for _ in range(num_layers if dont_minus_one else num_layers - 1)
        ])

    # `top_off` marks whether an extra layer still has to be stacked on top
    # (only relevant in multi-encoder setups). The original check used
    # `'multi_encoder' in locals()`, which can never be true inside this
    # function, so the flag is assumed to arrive via kwargs instead.
    multi_encoder = kwargs.get("multi_encoder", False)
    self.top_off = not (dont_minus_one and multi_encoder)

    self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)
    self.pe = PositionalEncoding(hidden_size)
    self.emb_dropout = nn.Dropout(p=emb_dropout)

    self._output_size = hidden_size

    if freeze:
        freeze_params(self)
def __init__(self,
             num_layers: int = 4,
             num_heads: int = 8,
             hidden_size: int = 512,
             ff_size: int = 2048,
             dropout: float = 0.1,
             freeze: bool = False,
             self_attn_func: str = "softmax",
             src_attn_func: str = "softmax",
             self_attn_alpha: float = 1.5,
             src_attn_alpha: float = 1.5,
             merge: str = "serial",  # for multi-encoder models
             gate_func: str = "softmax",
             gate_alpha: float = 1.5,
             **kwargs):
    """
    Initialize a Transformer decoder.

    :param num_layers: number of Transformer layers
    :param num_heads: number of heads for each layer
    :param hidden_size: hidden size
    :param ff_size: position-wise feed-forward size
    :param dropout: dropout probability (1-keep)
    :param emb_dropout: dropout probability for embeddings (via kwargs)
    :param freeze: set to True to keep all decoder parameters fixed
    :param merge: how to combine the encoders in multi-encoder models
    :param kwargs: passed to generic Decoder constructor
    """
    super(TransformerDecoder, self).__init__(hidden_size, **kwargs)

    self.layers = nn.ModuleList([
        self.layer_module(size=hidden_size,
                          ff_size=ff_size,
                          num_heads=num_heads,
                          dropout=dropout,
                          self_attn_func=self_attn_func,
                          self_attn_alpha=self_attn_alpha,
                          src_attn_func=src_attn_func,
                          src_attn_alpha=src_attn_alpha,
                          merge=merge,
                          gate_func=gate_func,
                          gate_alpha=gate_alpha)
        for _ in range(num_layers)
    ])

    self.pe = PositionalEncoding(hidden_size)
    self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)

    if freeze:
        freeze_params(self)
def __init__(self,
             embed_file: str,
             vocab: Vocabulary,
             embedding_dim: int = 64,
             scale: bool = False,
             vocab_size: int = 0,
             padding_idx: int = 1,
             freeze: bool = True,
             **kwargs):
    super(PretrainedEmbeddings, self).__init__(
        embedding_dim, scale, vocab_size, padding_idx, freeze, **kwargs)

    # overwrite lut with embeddings from embed_file
    self.load_embeddings_from_file(embed_file, vocab)

    if freeze:
        freeze_params(self)
def __init__(self,
             num_layers: int = 4,
             num_heads: int = 8,
             hidden_size: int = 512,
             ff_size: int = 2048,
             dropout: float = 0.1,
             emb_dropout: float = 0.1,
             vocab_size: int = 1,
             freeze: bool = False,
             **kwargs):
    """
    Initialize a Transformer decoder.

    :param num_layers: number of Transformer layers
    :param num_heads: number of heads for each layer
    :param hidden_size: hidden size
    :param ff_size: position-wise feed-forward size
    :param dropout: dropout probability (1-keep)
    :param emb_dropout: dropout probability for embeddings
    :param vocab_size: size of the output vocabulary
    :param freeze: set to True to keep all decoder parameters fixed
    :param kwargs:
    """
    super().__init__()

    self._hidden_size = hidden_size
    self._output_size = vocab_size

    # create num_layers decoder layers and put them in a list
    self.layers = nn.ModuleList([
        TransformerDecoderLayer(size=hidden_size,
                                ff_size=ff_size,
                                num_heads=num_heads,
                                dropout=dropout)
        for _ in range(num_layers)
    ])

    self.pe = PositionalEncoding(hidden_size)
    self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)
    self.emb_dropout = nn.Dropout(p=emb_dropout)

    # note: this projects to hidden_size, not vocab_size; the projection to
    # the vocabulary is expected to happen elsewhere (e.g. tied embeddings)
    self.output_layer = nn.Linear(hidden_size, hidden_size, bias=False)
    # self.output_layer = self.layers[-1].feed_forward.pwff_layer[-2]

    if freeze:
        freeze_params(self)
def __init__(self,
             num_layers: int = 4,
             num_heads: int = 8,
             hidden_size: int = 512,
             ff_size: int = 2048,
             dropout: float = 0.1,
             vocab_size: int = 1,
             freeze: bool = False,
             **kwargs):
    """
    Initialize a Transformer decoder.

    :param num_layers: number of Transformer layers
    :param num_heads: number of heads for each layer
    :param hidden_size: hidden size
    :param ff_size: position-wise feed-forward size
    :param dropout: dropout probability (1-keep)
    :param vocab_size: size of the output vocabulary
    :param freeze: set to True to keep all decoder parameters fixed
    :param kwargs:
    """
    super(TransformerDecoder, self).__init__()

    # build all (num_layers) layers
    layers = []
    for _ in range(num_layers):
        layer = TransformerDecoderLayer(
            hidden_size,
            MultiHeadedAttention(num_heads, hidden_size, dropout),
            MultiHeadedAttention(num_heads, hidden_size, dropout),
            PositionwiseFeedForward(hidden_size, ff_size, dropout),
            dropout)
        layers.append(layer)

    self.layers = nn.ModuleList(layers)
    self.norm = nn.LayerNorm(hidden_size)
    self.pe = PositionalEncoding(hidden_size, dropout=dropout)

    self._hidden_size = hidden_size
    self._output_size = vocab_size
    self.output_layer = nn.Linear(hidden_size, vocab_size, bias=False)

    if freeze:
        freeze_params(self)
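# --- Illustrative helper (not copied from this repository) ------------------
# A Transformer decoder layer consumes a causal ("subsequent") mask so that
# position i can only attend to positions <= i. This is the standard
# construction with plain torch; the decoder's own forward() is not shown in
# this section.
import torch

def subsequent_mask(size: int) -> torch.Tensor:
    # (1, size, size) mask, True where attention is allowed
    return torch.tril(torch.ones(size, size)).bool().unsqueeze(0)

# subsequent_mask(4)[0] ->
# tensor([[ True, False, False, False],
#         [ True,  True, False, False],
#         [ True,  True,  True, False],
#         [ True,  True,  True,  True]])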
def __init__(self,
             num_layers: int = 4,
             hidden_size: int = 512,
             emb_size: int = 512,
             kernel_size: int = 5,
             dropout: float = 0.1,
             emb_dropout: float = 0.1,
             vocab_size: int = 1,
             freeze: bool = False,
             use_multi_head: bool = False,
             num_heads: int = 8,
             **kwargs):
    """
    Initialize a ConvSeq2Seq decoder.

    :param num_layers: number of decoder layers
    :param hidden_size: hidden size
    :param emb_size: size of the embeddings
    :param kernel_size: width of the convolution kernels
    :param dropout: dropout probability (1-keep)
    :param emb_dropout: dropout probability for embeddings
    :param vocab_size: size of the output vocabulary
    :param freeze: set to True to keep all decoder parameters fixed
    :param use_multi_head: use multi-headed instead of single-headed attention
    :param num_heads: number of heads (if use_multi_head is set)
    :param kwargs:
    """
    super(ConvSeq2SeqDecoder, self).__init__()

    self._hidden_size = hidden_size
    self._output_size = vocab_size

    # create num_layers decoder layers and put them in a list
    self.layers = nn.ModuleList([
        ConvSeq2SeqDecoderLayer(hidden_size=hidden_size,
                                embedding_size=emb_size,
                                kernel_size=kernel_size,
                                use_multi_head=use_multi_head,
                                num_heads=num_heads,
                                dropout=dropout)
        for _ in range(num_layers)
    ])

    self.absPE = AbsolutePositionalEncoding(emb_size)
    self.emb2hidden = nn.Linear(emb_size, hidden_size)
    self.emb_dropout = nn.Dropout(p=emb_dropout)
    self.output_layer = nn.Linear(hidden_size, vocab_size, bias=False)

    if freeze:
        freeze_params(self)
def __init__(self, rnn_type: str = "gru", hidden_size: int = 1, emb_size: int = 1, num_layers: int = 1, dropout: float = 0., emb_dropout: float = 0., bidirectional: bool = True, freeze: bool = False, enforce_sorted: bool = True, **kwargs) -> None: """ Create a new recurrent encoder. :param rnn_type: RNN type: `gru` or `lstm`. :param hidden_size: Size of each RNN. :param emb_size: Size of the word embeddings. :param num_layers: Number of encoder RNN layers. :param dropout: Is applied between RNN layers. :param emb_dropout: Is applied to the RNN input (word embeddings). :param bidirectional: Use a bi-directional RNN. :param freeze: freeze the parameters of the encoder during training :param kwargs: """ self._enforce_sorted = enforce_sorted super(RecurrentEncoder, self).__init__() self.emb_dropout = nn.Dropout(p=emb_dropout, inplace=False) self.emb_size = emb_size rnn = nn.GRU if rnn_type == "gru" else nn.LSTM self.rnn = rnn(emb_size, hidden_size, num_layers, batch_first=True, bidirectional=bidirectional, dropout=dropout if num_layers > 1 else 0.) self._output_size = 2 * hidden_size if bidirectional else hidden_size if freeze: freeze_params(self)
def __init__(self,
             hidden_size: int = 512,
             ff_size: int = 2048,
             num_layers: int = 8,
             num_heads: int = 4,
             dropout: float = 0.1,
             active_layers: list = None,
             emb_dropout: float = 0.1,
             freeze: bool = False,
             **kwargs):
    """
    Initializes the Transformer.

    :param hidden_size: hidden size and size of embeddings
    :param ff_size: position-wise feed-forward layer size.
      (Typically this is 2*hidden_size.)
    :param num_layers: number of layers
    :param num_heads: number of heads for multi-headed attention
    :param dropout: dropout probability for Transformer layers
    :param active_layers: if given, build one layer per entry in this list
        instead of num_layers layers
    :param emb_dropout: Is applied to the input (word embeddings).
    :param freeze: freeze the parameters of the encoder during training
    :param kwargs:
    """
    super().__init__()

    # build all (num_layers) layers, or only the active ones
    # (a None default avoids the mutable-default-argument pitfall of `= []`)
    if active_layers:
        self.layers = nn.ModuleList([
            TransformerEncoderLayer(size=hidden_size, ff_size=ff_size,
                                    num_heads=num_heads, dropout=dropout)
            for _ in active_layers
        ])
    else:
        self.layers = nn.ModuleList([
            TransformerEncoderLayer(size=hidden_size, ff_size=ff_size,
                                    num_heads=num_heads, dropout=dropout)
            for _ in range(num_layers)
        ])

    self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)
    self.pe = PositionalEncoding(hidden_size)
    self.emb_dropout = nn.Dropout(p=emb_dropout)

    self._output_size = hidden_size

    if freeze:
        freeze_params(self)
def __init__(self,
             hidden_size: int = 512,
             ff_size: int = 2048,
             num_layers: int = 8,
             num_heads: int = 4,
             dropout: float = 0.1,
             emb_dropout: float = 0.1,
             freeze: bool = False,
             attn_func: str = "softmax",
             attn_alpha: float = 1.5,
             pe: bool = True,
             **kwargs):
    """
    Initializes the Transformer.

    :param hidden_size: hidden size and size of embeddings
    :param ff_size: position-wise feed-forward layer size.
      (Typically this is 2*hidden_size.)
    :param num_layers: number of layers
    :param num_heads: number of heads for multi-headed attention
    :param dropout: dropout probability for Transformer layers
    :param emb_dropout: Is applied to the input (word embeddings).
    :param freeze: freeze the parameters of the encoder during training
    :param attn_func: attention transformation used in the encoder layers
    :param attn_alpha: alpha parameter for the attention transformation
    :param pe: if False, do not add positional encoding
    :param kwargs:
    """
    super(TransformerEncoder, self).__init__()

    self.layers = nn.ModuleList([
        TransformerEncoderLayer(size=hidden_size, ff_size=ff_size,
                                num_heads=num_heads, dropout=dropout,
                                attn_func=attn_func,
                                attn_alpha=attn_alpha)
        for _ in range(num_layers)
    ])

    self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)
    self.pe = PositionalEncoding(hidden_size) if pe else None
    self.emb_dropout = nn.Dropout(p=emb_dropout)

    self._output_size = hidden_size

    if freeze:
        freeze_params(self)
def __init__(self, rnn_type: str = "gru", hidden_size: int = 1, emb_size: int = 1, num_layers: int = 1, dropout: float = 0., bidirectional: bool = True, freeze: bool = False, **kwargs) -> None: """ Create a new recurrent encoder. :param rnn_type: :param hidden_size: :param emb_size: :param num_layers: :param dropout: :param bidirectional: :param freeze: freeze the parameters of the encoder during training :param kwargs: """ super(RecurrentEncoder, self).__init__() self.rnn_input_dropout = torch.nn.Dropout(p=dropout, inplace=False) self.type = rnn_type self.emb_size = emb_size rnn = nn.GRU if rnn_type == "gru" else nn.LSTM self.rnn = rnn(emb_size, hidden_size, num_layers, batch_first=True, bidirectional=bidirectional, dropout=dropout if num_layers > 1 else 0.) self._output_size = 2 * hidden_size if bidirectional else hidden_size if freeze: freeze_params(self)
def __init__(self,
             embedding_dim: int = 64,
             scale: bool = False,
             vocab_size: int = 0,
             padding_idx: int = 1,
             freeze: bool = False,
             from_pretrained: bool = False,
             pretrained_path: str = "",
             check_embedding: bool = False,
             **kwargs):
    """
    Create new embeddings for the vocabulary.
    Use scaling for the Transformer.

    :param embedding_dim: size of the embedding vectors
    :param scale: scale the embeddings by sqrt(embedding_dim)
    :param vocab_size: number of entries in the vocabulary
    :param padding_idx: index of the padding token
    :param freeze: freeze the embeddings during training
    :param from_pretrained: load the embedding weights from pretrained_path
    :param pretrained_path: path to a saved weight tensor (torch.load-able)
    :param check_embedding: stored for later embedding checks (not used here)
    """
    super(Embeddings, self).__init__()

    self.embedding_dim = embedding_dim
    self.scale = scale
    self.vocab_size = vocab_size
    self.from_pretrained = from_pretrained
    self.check_embedding = check_embedding

    if from_pretrained:
        print("using pretrained model")
        self.weight = torch.load(pretrained_path)
        print(f"loaded embeddings size {self.weight.shape}")
        # nn.Embedding.from_pretrained freezes the weights by default
        self.lut = nn.Embedding.from_pretrained(self.weight)
    else:
        self.lut = nn.Embedding(vocab_size, self.embedding_dim,
                                padding_idx=padding_idx)

    if freeze:
        freeze_params(self)
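# --- Usage sketch (illustrative, dummy values) -------------------------------
# nn.Embedding.from_pretrained, as used above, freezes the loaded weights by
# default; pass freeze=False to keep them trainable.
import torch
import torch.nn as nn

weight = torch.randn(100, 64)                      # (vocab_size, embedding_dim)
frozen_lut = nn.Embedding.from_pretrained(weight)  # weight.requires_grad == False
trainable_lut = nn.Embedding.from_pretrained(weight, freeze=False)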
def __init__(self,
             hidden_size: int = 512,
             ff_size: int = 2048,
             num_layers: int = 8,
             num_heads: int = 4,
             dropout: float = 0.1,
             freeze: bool = False,
             **kwargs):
    """
    Initializes the Transformer.

    :param hidden_size: hidden size and size of embeddings
    :param ff_size: position-wise feed-forward layer size.
      (Typically this is 2*hidden_size.)
    :param num_layers: number of layers
    :param num_heads: number of heads for multi-headed attention
    :param dropout: dropout probability
    :param freeze: freeze the parameters of the encoder during training
    :param kwargs:
    """
    super(TransformerEncoder, self).__init__()

    # build all (num_layers) layers
    layers = []
    for _ in range(num_layers):
        layer = TransformerEncoderLayer(
            hidden_size,
            MultiHeadedAttention(num_heads, hidden_size, dropout),
            PositionwiseFeedForward(hidden_size, ff_size, dropout),
            dropout)
        layers.append(layer)

    self.layers = nn.ModuleList(layers)
    self.norm = nn.LayerNorm(hidden_size)
    self.pe = PositionalEncoding(hidden_size, dropout=dropout)

    self._output_size = hidden_size

    if freeze:
        freeze_params(self)
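# --- Usage sketch (illustrative hyperparameters) -----------------------------
# Constructing the encoder variant above with base-Transformer-style settings;
# the values are an example configuration, not defaults mandated by this code.
encoder = TransformerEncoder(hidden_size=512, ff_size=2048,
                             num_layers=6, num_heads=8,
                             dropout=0.1)
num_params = sum(p.numel() for p in encoder.parameters())
print(f"encoder parameters: {num_params:,}")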
def __init__(self, rnn_type: str = "gru", emb_size: int = 0, hidden_size: int = 0, encoder_output_sizes: dict = None, attention: str = "bahdanau", attn_merge: str = "concat", num_layers: int = 1, dropout: float = 0., hidden_dropout: float = 0., init_hidden: str = "bridge", input_feeding: bool = True, freeze: bool = False, attn_func: str = "softmax", attn_alpha: float = 1.5, gate_func: str = "softmax", gate_alpha: float = 1.5, **kwargs) -> None: """ Todo: document the unique challenges of making an RNN decoder that attends over multiple encoders """ super(MultiHeadRecurrentDecoder, self).__init__(hidden_size, **kwargs) self.hidden_dropout = nn.Dropout(p=hidden_dropout) self.head_names = sorted(encoder_output_sizes) self.emb_size = emb_size rnn = nn.GRU if rnn_type == "gru" else nn.LSTM self.input_feeding = input_feeding input_size = emb_size + hidden_size if input_feeding else emb_size # the decoder RNN self.rnn = rnn(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0.) # combined output sizes of all encoders # this quantity matters for concat attention merging # it also matters if you have a bridge for init_hidden encoder_output_size = sum(encoder_output_sizes.values()) assert attention in ["bahdanau", "luong"], \ "Unknown attention mechanism: %s. Use 'bahdanau' or 'luong'." if attention == "bahdanau": attn_mechanism = partial(MultiAttention, query_size=hidden_size) else: attn_mechanism = MultiAttention self.attention = attn_mechanism(attn_type=attention, head_names=self.head_names, key_sizes=encoder_output_sizes, hidden_size=hidden_size, attn_func=attn_func, attn_alpha=attn_alpha, attn_merge=attn_merge, gate_func=gate_func, gate_alpha=gate_alpha) # to initialize from the final encoder state of last layer assert init_hidden == "bridge", \ "only use bridge with multi-encoder models" self.bridge_layer = nn.Sequential( nn.Linear(encoder_output_size, hidden_size, bias=True), nn.Tanh()) if freeze: freeze_params(self)
def __init__(self, rnn_type: str = "gru", hidden_size: int = 1, emb_size: int = 1, num_layers: int = 1, dropout: float = 0., bidirectional: bool = True, freeze: bool = False, activation: str = "relu", last_activation: str = "None", layer_norm: bool = False, emb_norm: bool = False, same_weights: bool = False, **kwargs) -> None: """ Create a new recurrent encoder. :param rnn_type: :param hidden_size: :param emb_size: :param num_layers: :param dropout: :param bidirectional: :param freeze: freeze the parameters of the encoder during training :param kwargs: """ super(SpeechRecurrentEncoder, self).__init__() self.rnn_input_dropout = torch.nn.Dropout(p=dropout, inplace=False) self.type = rnn_type self.emb_size = emb_size self.lila1 = nn.Linear(emb_size, hidden_size) self.lila2 = nn.Linear(hidden_size, hidden_size) self.same_weights = same_weights if not self.same_weights: self.lila3 = nn.Linear(hidden_size, hidden_size) self.lila4 = nn.Linear(hidden_size, hidden_size) self.activation = activation self.last_activation = last_activation self.conv1 = nn.Sequential( nn.Conv1d(hidden_size, hidden_size, kernel_size=3, stride=1, padding=1), nn.ReLU(), nn.MaxPool1d(kernel_size=2, stride=2, padding=0)) self.conv2 = nn.Sequential( nn.Conv1d(hidden_size, hidden_size, kernel_size=3, stride=1, padding=1), nn.ReLU(), nn.MaxPool1d(kernel_size=2, stride=2, padding=0)) self.layer_norm = layer_norm self.emb_norm = emb_norm if self.layer_norm: self.norm1 = nn.LayerNorm(hidden_size) self.norm2 = nn.LayerNorm(hidden_size) self.norm_out = nn.LayerNorm(2 * hidden_size if bidirectional else hidden_size) if self.emb_norm: self.norm_emb = nn.LayerNorm(emb_size) rnn = nn.GRU if rnn_type == "gru" else nn.LSTM self.rnn = rnn( hidden_size, hidden_size, num_layers, batch_first=True, bidirectional=bidirectional, dropout=dropout if num_layers > 1 else 0.) self._output_size = 2 * hidden_size if bidirectional else hidden_size if freeze: freeze_params(self)
def __init__(self, rnn_type: str = "gru", emb_size: int = 0, hidden_size: int = 0, encoder: Encoder = None, attention: str = "bahdanau", num_layers: int = 1, vocab_size: int = 0, dropout: float = 0., emb_dropout: float = 0., hidden_dropout: float = 0., init_hidden: str = "bridge", input_feeding: bool = True, freeze: bool = False, **kwargs) -> None: """ Create a recurrent decoder with attention. :param rnn_type: rnn type, valid options: "lstm", "gru" :param emb_size: target embedding size :param hidden_size: size of the RNN :param encoder: encoder connected to this decoder :param attention: type of attention, valid options: "bahdanau", "luong" :param num_layers: number of recurrent layers :param vocab_size: target vocabulary size :param hidden_dropout: Is applied to the input to the attentional layer. :param dropout: Is applied between RNN layers. :param emb_dropout: Is applied to the RNN input (word embeddings). :param init_hidden: If "bridge" (default), the decoder hidden states are initialized from a projection of the last encoder state, if "zeros" they are initialized with zeros, if "last" they are identical to the last encoder state (only if they have the same size) :param input_feeding: Use Luong's input feeding. :param freeze: Freeze the parameters of the decoder during training. :param kwargs: """ super().__init__() self.emb_dropout = torch.nn.Dropout(p=emb_dropout, inplace=False) self.type = rnn_type self.hidden_dropout = torch.nn.Dropout(p=hidden_dropout, inplace=False) self.hidden_size = hidden_size self.emb_size = emb_size rnn = nn.GRU if rnn_type == "gru" else nn.LSTM self.input_feeding = input_feeding if self.input_feeding: # Luong-style # combine embedded prev word +attention vector before feeding to rnn self.rnn_input_size = emb_size + hidden_size else: # just feed prev word embedding self.rnn_input_size = emb_size # the decoder RNN self.rnn = rnn(self.rnn_input_size, hidden_size, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0.) # combine output with context vector before output layer (Luong-style) self.att_vector_layer = nn.Linear(hidden_size + encoder.output_size, hidden_size, bias=True) self.output_layer = nn.Linear(hidden_size, vocab_size, bias=False) self._output_size = vocab_size if attention == "bahdanau": self.attention = BahdanauAttention(hidden_size=hidden_size, key_size=encoder.output_size, query_size=hidden_size) elif attention == "luong": self.attention = LuongAttention(hidden_size=hidden_size, key_size=encoder.output_size) else: raise ConfigurationError("Unknown attention mechanism: %s. " "Valid options: 'bahdanau', 'luong'." % attention) self.num_layers = num_layers self.hidden_size = hidden_size # to initialize from the final encoder state of last layer self.init_hidden_option = init_hidden if self.init_hidden_option == "bridge": self.bridge_layer = nn.Linear(encoder.output_size, hidden_size, bias=True) elif self.init_hidden_option == "last": if encoder.output_size != self.hidden_size: if encoder.output_size != 2 * self.hidden_size: # bidirectional raise ConfigurationError( "For initializing the decoder state with the " "last encoder state, their sizes have to match " "(encoder: {} vs. decoder: {})".format( encoder.output_size, self.hidden_size)) if freeze: freeze_params(self)
def __init__(self, type: str = "gru", emb_size: int = 0, hidden_size: int = 0, encoder: Encoder = None, attention: str = "bahdanau", num_layers: int = 0, vocab_size: int = 0, dropout: float = 0., hidden_dropout: float = 0., bridge: bool = False, input_feeding: bool = True, freeze: bool = False, **kwargs): """ Create a recurrent decoder. If `bridge` is True, the decoder hidden states are initialized from a projection of the encoder states, else they are initialized with zeros. :param type: :param emb_size: :param hidden_size: :param encoder: :param attention: :param num_layers: :param vocab_size: :param dropout: :param hidden_dropout: :param bridge: :param input_feeding: :param freeze: freeze the parameters of the decoder during training :param kwargs: """ super(RecurrentDecoder, self).__init__() self.rnn_input_dropout = torch.nn.Dropout(p=dropout, inplace=False) self.type = type self.hidden_dropout = torch.nn.Dropout(p=hidden_dropout, inplace=False) self.hidden_size = hidden_size rnn = nn.GRU if type == "gru" else nn.LSTM self.input_feeding = input_feeding if self.input_feeding: # Luong-style # combine embedded prev word +attention vector before feeding to rnn self.rnn_input_size = emb_size + hidden_size else: # just feed prev word embedding self.rnn_input_size = emb_size # the decoder RNN self.rnn = rnn(self.rnn_input_size, hidden_size, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0.) # combine output with context vector before output layer (Luong-style) self.att_vector_layer = nn.Linear( hidden_size + encoder.output_size, hidden_size, bias=True) self.output_layer = nn.Linear(hidden_size, vocab_size, bias=False) self.output_size = vocab_size if attention == "bahdanau": self.attention = BahdanauAttention(hidden_size=hidden_size, key_size=encoder.output_size, query_size=hidden_size) elif attention == "luong": self.attention = LuongAttention(hidden_size=hidden_size, key_size=encoder.output_size) else: raise ValueError("Unknown attention mechanism: %s" % attention) self.num_layers = num_layers self.hidden_size = hidden_size # to initialize from the final encoder state of last layer self.bridge = bridge if self.bridge: self.bridge_layer = nn.Linear( encoder.output_size, hidden_size, bias=True) if freeze: freeze_params(self)
def __init__(self,
             src_vocab: Vocabulary,
             trg_vocab: Vocabulary,
             embedding_dim: int = 300,  # or 30
             scale: bool = False,
             vocab_size: int = 0,
             padding_idx: int = 1,
             freeze: bool = False,
             **kwargs):
    """
    Create new embeddings for the vocabulary from pretrained fastText vectors.
    Use scaling for the Transformer.

    :param embedding_dim: size of the embedding vectors (30 or 300)
    :param scale: scale the embeddings by sqrt(embedding_dim)
    :param vocab_size: unused; the size is taken from the vocabularies
    :param padding_idx: unused; the padding index is taken from trg_vocab
    :param freeze: freeze the embeddings during training
    """
    super().__init__()
    self.scale = scale

    # TODO add support for other languages
    # fasttext.util.download_model('de', if_exists='ignore')

    np_embedding = f"ft_deen_{embedding_dim}.np"
    if not os.path.isfile(np_embedding):
        # conditional import to avoid loading fasttext when possible,
        # because fasttext has an unmet dependency on gpu nodes
        import fasttext.util
        src_ft = fasttext.load_model(f'cc.de.{embedding_dim}.bin')
        trg_ft = fasttext.load_model(f'cc.en.{embedding_dim}.bin')

        # Create smaller embeddings, to test on reverse
        # fasttext.util.reduce_model(src_ft, 30)
        # src_ft.save_model('cc.en.30.bin')

        self.embedding_dim = src_ft.get_dimension()

        vectors = []
        for i, word in tqdm(enumerate(src_vocab.itos), desc="adding src vecs"):
            vectors.append(src_ft.get_word_vector(word))
        for i, word in tqdm(enumerate(trg_vocab.itos), desc="adding trg vecs"):
            vectors.append(trg_ft.get_word_vector(word))

        embedding_matrix = np.vstack(vectors)
        with open(np_embedding, "wb") as np_file:
            pickle.dump(embedding_matrix, np_file)
        print(f"Saved joint fasttext embedding matrix as np matrix "
              f"at {np_embedding}")
    else:
        print("Loading saved embedding ...")
        with open(np_embedding, "rb") as np_file:
            embedding_matrix = pickle.load(np_file)
        print("Loaded saved embedding.")

    self.embedding_dim = embedding_matrix.shape[-1]
    self.lut = nn.Embedding(
        len(src_vocab) + len(trg_vocab),
        self.embedding_dim,
        padding_idx=trg_vocab.stoi[PAD_TOKEN])
    self.lut.weight = nn.Parameter(
        data=torch.from_numpy(embedding_matrix).float())
    assert self.lut.weight is not None
    # assert False, self.lut.weight.shape

    # always freeze pretrained embeddings
    freeze_params(self)
def __init__(self, rnn_type: str = "gru", emb_size: int = 0, hidden_size: int = 0, encoder_output_size: int = 0, attention: str = "bahdanau", num_layers: int = 1, vocab_size: int = 0, dropout: float = 0., emb_dropout: float = 0., hidden_dropout: float = 0., init_hidden: str = "bridge", input_feeding: bool = True, freeze: bool = False, attn_func: str = "softmax", attn_alpha: float = 1.5, gen_func: str = "softmax", gen_alpha: float = 1.5, output_bias: bool = False, multi_source: bool = False, head_names: list = None, attn_merge: str = "gate", gate_func: str = "softmax", gate_alpha: float = 1.5, **kwargs) -> None: """ Create a recurrent decoder with attention. :param rnn_type: rnn type, valid options: "lstm", "gru" :param emb_size: target embedding size :param hidden_size: size of the RNN :param encoder_output_size: :param attention: type of attention, valid options: "bahdanau", "luong" :param num_layers: number of recurrent layers :param vocab_size: target vocabulary size :param hidden_dropout: applied to the input to the attentional layer. :param dropout: Is applied between RNN layers. :param emb_dropout: Is applied to the RNN input (word embeddings). :param init_hidden: If "bridge" (default), the decoder hidden states are initialized from a projection of the last encoder state, if "zeros" they are initialized with zeros, if "last" they are identical to the last encoder state (only if they have the same size) :param input_feeding: Use Luong's input feeding. :param freeze: Freeze the parameters of the decoder during training. :param kwargs: """ super(RecurrentDecoder, self).__init__(hidden_size, vocab_size, emb_dropout, gen_func=gen_func, gen_alpha=gen_alpha, output_bias=output_bias) self.multi_source = multi_source self.hidden_dropout = nn.Dropout(p=hidden_dropout) self.emb_size = emb_size rnn = nn.GRU if rnn_type == "gru" else nn.LSTM self.input_feeding = input_feeding input_size = emb_size + hidden_size if input_feeding else emb_size # the decoder RNN self.rnn = rnn(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0.) # combine output with context vector before output layer (Luong-style) self.att_vector_layer = nn.Linear(hidden_size + encoder_output_size, hidden_size, bias=True) assert attention in ["bahdanau", "luong"], \ "Unknown attention mechanism: %s. Use 'bahdanau' or 'luong'." if multi_source: attn_mechanism = partial(MultiAttention, attn_type=attention, head_names=head_names, attn_merge=attn_merge, gate_func=gate_func, gate_alpha=gate_alpha) elif attention == "luong": attn_mechanism = LuongAttention else: attn_mechanism = BahdanauAttention if attention == "bahdanau": attn_mechanism = partial(attn_mechanism, query_size=hidden_size) self.attention = attn_mechanism(hidden_size=hidden_size, key_size=encoder_output_size, attn_func=attn_func, attn_alpha=attn_alpha) # init_hidden: "bridge", "zero", "last", or a dictionary describing # an arbitrary-layered MLP assert isinstance(init_hidden, dict) or isinstance(init_hidden, str), \ ''' Specify either a shortcut name ("bridge", "zero", "last") or a dictionary containing a configuration for the bridge layer. 
''' if init_hidden == "zero": self.bridge_layer = None # easy-peasy else: if init_hidden == "last": # not actually clear to me if this is necessary assert encoder_output_size in {hidden_size, 2 * hidden_size}, \ "Mismatched hidden sizes (enc: {}, dec: {})".format( encoder_output_size, hidden_size ) if isinstance(init_hidden, str): bridge = init_hidden == "bridge" # 'bridge' and 'last' are shortcuts to specific special cases init_hidden = { "num_layers": 1 if bridge else 0, "activation": "tanh" if bridge else "none", "merge": "cat" } if init_hidden["merge"] == "cat": n_heads = len(head_names) if head_names is not None else 1 bridge_in_size = encoder_output_size * n_heads # for cat else: bridge_in_size = encoder_output_size self.bridge_layer = Bridge(bridge_in_size, hidden_size, lstm=isinstance(self.rnn, nn.LSTM), decoder_layers=self.num_layers, **init_hidden) if freeze: freeze_params(self)
def __init__(self, rnn_type: str = "gru", emb_size: int = 0, hidden_size: int = 0, encoder_output_size: int = 0, attention: str = "bahdanau", num_layers: int = 1, dropout: float = 0., hidden_dropout: float = 0., init_hidden: str = "bridge", input_feeding: bool = True, freeze: bool = False, attn_func: str = "softmax", attn_alpha: float = 1.5, **kwargs) -> None: """ Create a recurrent decoder with attention. :param rnn_type: rnn type, valid options: "lstm", "gru" :param emb_size: :param hidden_size: :param attention: type of attention, valid options: "bahdanau", "luong" :param num_layers: :param hidden_dropout: applied to the input to the attentional layer. :param dropout: applied between RNN layers. :param emb_dropout: applied to the RNN input (word embeddings). :param init_hidden: If "bridge" (default), the decoder hidden states are initialized from a projection of the last encoder state, if "zeros" they are initialized with zeros, if "last" they are identical to the last encoder state (only if they have the same size) :param input_feeding: Use Luong's input feeding. :param freeze: Freeze the parameters of the decoder during training. :param kwargs: passed to generic Decoder constructor """ super(RecurrentDecoder, self).__init__(hidden_size, **kwargs) self.hidden_dropout = nn.Dropout(p=hidden_dropout) self.emb_size = emb_size rnn = nn.GRU if rnn_type == "gru" else nn.LSTM self.input_feeding = input_feeding input_size = emb_size + hidden_size if input_feeding else emb_size # the decoder RNN self.rnn = rnn(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0.) # combine output with context vector before output layer (Luong-style) self.att_vector_layer = nn.Linear(hidden_size + encoder_output_size, hidden_size, bias=True) assert attention in ["bahdanau", "luong"], \ "Unknown attention mechanism: %s. Use 'bahdanau' or 'luong'." if attention == "bahdanau": attn_mechanism = partial(BahdanauAttention, query_size=hidden_size) else: attn_mechanism = LuongAttention self.attention = attn_mechanism(hidden_size=hidden_size, key_size=encoder_output_size, attn_func=attn_func, attn_alpha=attn_alpha) # to initialize from the final encoder state of last layer assert init_hidden in ["bridge", "zero", "last"] self.init_hidden_option = init_hidden if init_hidden == "bridge": self.bridge_layer = nn.Sequential( nn.Linear(encoder_output_size, hidden_size, bias=True), nn.Tanh()) else: self.bridge_layer = None if init_hidden == "last": out_size = encoder_output_size assert out_size in (hidden_size, 2 * hidden_size), \ "Mismatched hidden sizes (encoder: {}, decoder: {})".format( encoder_output_size, hidden_size ) if freeze: freeze_params(self)