def _context_attention(self):
    with tf.name_scope('context_attention'):
        context_attention = BahdanauAttention(
            self._context,
            memory_len=self._history_size,
            mask_value=1e-18)
        self._encoder_state_with_context = context_attention(
            self._encoder_state)
def __init__(self, tar_vocab_size, batch_sz, name="gru", embedding_dim=300,
             learn_embedding=True, embedding_matrix=None, dec_units=128,
             max_length=None):
    """Initialize attention based decoder architecture.

    Arguments:
        tar_vocab_size {int} -- Size of the target sequence vocabulary.
        batch_sz {int} -- Batch size.

    Keyword Arguments:
        name {str} -- Name of the recurrent layer. Choices: ['lstm', 'gru'] (default: {"gru"})
        embedding_dim {int} -- Size of the token embedding. (default: {300})
        learn_embedding {bool} -- Boolean flag indicating whether to learn the
            embedding or use a pre-trained one. (default: {True})
        embedding_matrix {numpy.ndarray} -- Pre-trained embedding matrix, used
            when learn_embedding is False. (default: {None})
        dec_units {int} -- Number of decoder units. (default: {128})
        max_length {int} -- Maximum length of the target sequence, passed to
            the embedding layer as input_length. (default: {None})

    Raises:
        ValueError: Raised when a wrong model name is passed.
    """
    super().__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units

    if learn_embedding:
        # Learn the embedding as part of the network
        self.embedding = keras.layers.Embedding(input_dim=tar_vocab_size,
                                                output_dim=embedding_dim,
                                                input_length=max_length,
                                                mask_zero=True)
    else:
        # Use pre-trained embeddings such as GloVe
        self.embedding = keras.layers.Embedding(input_dim=tar_vocab_size,
                                                output_dim=embedding_dim,
                                                weights=[embedding_matrix],
                                                trainable=False,
                                                input_length=max_length)

    # Attention layer
    self.attention = BahdanauAttention(self.dec_units)

    # Decoder
    if name == "lstm":
        self.decoder_layer = keras.layers.LSTM(
            self.dec_units,
            return_sequences=True,
            return_state=False,
            recurrent_initializer="glorot_uniform")
    elif name == "gru":
        self.decoder_layer = keras.layers.GRU(
            self.dec_units,
            return_sequences=True,
            return_state=False,
            recurrent_initializer="glorot_uniform")
    else:
        raise ValueError("Wrong decoder type passed! {}".format(name))

    # Dense layer
    self.fc = keras.layers.Dense(tar_vocab_size)
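# Usage sketch: the class that owns the __init__ above is not named in this
# snippet, so "AttentionDecoder" and all argument values below are illustrative
# assumptions, not taken from the source.
decoder = AttentionDecoder(tar_vocab_size=8000, batch_sz=64, name="gru",
                           embedding_dim=300, learn_embedding=True,
                           dec_units=128, max_length=40)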
def __init__(self, n_layers, n_word_src, n_word_dst, n_units):
    super(NStepEncDec, self).__init__()
    with self.init_scope():
        # Source and target token embeddings
        self.embed_src = L.EmbedID(n_word_src, n_units)
        self.embed_dst = L.EmbedID(n_word_dst, n_units)
        # Bidirectional LSTM encoder and LSTM decoder wrapped with Bahdanau attention
        self.encoder = L.NStepBiLSTM(n_layers, n_units, n_units, 0.1)
        self.decoder = L.NStepLSTM(n_layers, n_units, n_units, 0.1)
        self.attention_mechanism = BahdanauAttention(n_units)
        self.decoder_with_attn = AttentionWrapper(self.decoder,
                                                  self.attention_mechanism)
        # Output projection to the target vocabulary
        self.fc = L.Linear(n_units, n_word_dst)
    self.n_layers = n_layers
    self.n_units = n_units
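# Usage sketch (illustrative, not from the source): the vocabulary sizes and
# unit count below are assumed values for constructing an NStepEncDec instance.
model = NStepEncDec(n_layers=2, n_word_src=40000, n_word_dst=40000, n_units=512)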
def __init__(self, embedding_dim, units, vocab_size):
    super(RNN_Decoder, self).__init__()
    self.units = units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.units,  # Dimension of the output space.
                                   return_sequences=True,  # Return the full output sequence.
                                   return_state=True,  # Also return the last hidden state.
                                   recurrent_initializer='glorot_uniform')
    self.fc1 = tf.keras.layers.Dense(self.units)
    self.fc2 = tf.keras.layers.Dense(vocab_size)
    self.attention = BahdanauAttention(self.units)
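# Usage sketch: only __init__ is shown above, so the decoder's call signature is
# not assumed here; the constructor arguments are illustrative values only.
decoder = RNN_Decoder(embedding_dim=256, units=512, vocab_size=5000)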
def _decoder(self):
    with tf.variable_scope('decoder'):
        self._initialize_decoder_params()

        attention_cell = MultiAttentionWrapper(
            tf.contrib.rnn.GRUCell(self._state_size),
            BahdanauAttention(self._encoder_outputs,
                              memory_len=self._input_seq_len),
            BahdanauAttention(self._context,
                              memory_len=self._history_size,
                              mask_value=1e-18))

        cell = tf.contrib.rnn.MultiRNNCell(
            [
                # Only the first cell has attention
                attention_cell,
                # Other cells
                tf.contrib.rnn.GRUCell(self._state_size),
                tf.contrib.rnn.GRUCell(self._state_size)
            ],
            state_is_tuple=True)

        decoder_outputs_ta, _, _ = tf.nn.raw_rnn(cell, self._decoder_loop_fn)
        decoder_outputs = decoder_outputs_ta.stack()
        tf.summary.histogram('decoder_outputs', decoder_outputs)

        num_steps, batch_size, decoder_output_size = tf.unstack(
            tf.shape(decoder_outputs))
        # Project the decoder outputs to vocabulary-sized logits, then take the
        # argmax to obtain the predicted token ids.
        self._decoder_logits = tf.reshape(
            self._output_projection_layer(
                tf.reshape(decoder_outputs, [-1, decoder_output_size])),
            [num_steps, batch_size, self._embeddings_shape[0]])
        self.decoder_embedding_ids = tf.cast(
            tf.argmax(self._decoder_logits, 2), tf.int32)
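# Loss sketch (assumption, not from the source): a cross-entropy objective over
# the time-major logits produced above. "_decoder_targets" is a hypothetical
# attribute holding the gold token ids with shape [num_steps, batch_size].
def _decoder_loss(self):
    with tf.variable_scope('decoder_loss'):
        self._loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self._decoder_targets, logits=self._decoder_logits))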
def __init__(
    self,
    rnn_type: str = "gru",
    emb_size: int = 0,
    hidden_size: int = 0,
    encoder: Encoder = None,
    attention: str = "bahdanau",
    num_layers: int = 1,
    vocab_size: int = 0,
    dropout: float = 0.0,
    emb_dropout: float = 0.0,
    hidden_dropout: float = 0.0,
    init_hidden: str = "bridge",
    input_feeding: bool = True,
    freeze: bool = False,
    **kwargs
) -> None:
    """
    Create a recurrent decoder with attention.

    :param rnn_type: rnn type, valid options: "lstm", "gru"
    :param emb_size: target embedding size
    :param hidden_size: size of the RNN
    :param encoder: encoder connected to this decoder
    :param attention: type of attention, valid options: "bahdanau", "luong"
    :param num_layers: number of recurrent layers
    :param vocab_size: target vocabulary size
    :param hidden_dropout: is applied to the input of the attentional layer
    :param dropout: is applied between RNN layers
    :param emb_dropout: is applied to the RNN input (word embeddings)
    :param init_hidden: if "bridge" (default), the decoder hidden states are
        initialized from a projection of the last encoder state; if "zeros",
        they are initialized with zeros; if "last", they are identical to the
        last encoder state (only if they have the same size)
    :param input_feeding: use Luong's input feeding
    :param freeze: freeze the parameters of the decoder during training
    :param kwargs:
    """
    super(RecurrentDecoder, self).__init__()

    self.emb_dropout = torch.nn.Dropout(p=emb_dropout, inplace=False)
    self.type = rnn_type
    self.hidden_dropout = torch.nn.Dropout(p=hidden_dropout, inplace=False)
    self.hidden_size = hidden_size
    self.emb_size = emb_size

    rnn = nn.GRU if rnn_type == "gru" else nn.LSTM

    self.input_feeding = input_feeding
    if self.input_feeding:  # Luong-style
        # combine embedded prev word + attention vector before feeding to rnn
        self.rnn_input_size = emb_size + hidden_size
    else:
        # just feed prev word embedding
        self.rnn_input_size = emb_size

    # the decoder RNN
    self.rnn = rnn(
        self.rnn_input_size,
        hidden_size,
        num_layers,
        batch_first=True,
        dropout=dropout if num_layers > 1 else 0.0,
    )

    # combine output with context vector before output layer (Luong-style)
    self.att_vector_layer = nn.Linear(
        hidden_size + encoder.output_size, hidden_size, bias=True
    )

    self.output_layer = nn.Linear(hidden_size, vocab_size, bias=False)
    self._output_size = vocab_size

    if attention == "bahdanau":
        self.attention = BahdanauAttention(
            hidden_size=hidden_size,
            key_size=encoder.output_size,
            query_size=hidden_size,
        )
    elif attention == "luong":
        self.attention = LuongAttention(
            hidden_size=hidden_size, key_size=encoder.output_size
        )
    else:
        raise ValueError(
            "Unknown attention mechanism: %s. "
            "Valid options: 'bahdanau', 'luong'." % attention
        )

    self.num_layers = num_layers
    self.hidden_size = hidden_size

    # to initialize from the final encoder state of the last layer
    self.init_hidden_option = init_hidden
    if self.init_hidden_option == "bridge":
        self.bridge_layer = nn.Linear(encoder.output_size, hidden_size, bias=True)
    elif self.init_hidden_option == "last":
        if encoder.output_size != self.hidden_size:
            if encoder.output_size != 2 * self.hidden_size:  # bidirectional
                raise ValueError(
                    "For initializing the decoder state with the "
                    "last encoder state, their sizes have to match "
                    "(encoder: {} vs. decoder: {})".format(
                        encoder.output_size, self.hidden_size
                    )
                )

    if freeze:
        freeze_params(self)
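# Usage sketch: "encoder" is assumed to be an already constructed Encoder
# instance exposing .output_size; the argument values below are illustrative
# assumptions, not taken from the source.
decoder = RecurrentDecoder(rnn_type="gru", emb_size=256, hidden_size=512,
                           encoder=encoder, attention="bahdanau", num_layers=1,
                           vocab_size=10000, dropout=0.1, emb_dropout=0.1,
                           hidden_dropout=0.1, init_hidden="bridge")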