def call(self, enc_output, dec_hidden, enc_state, enc_inp, enc_extended_inp,
         dec_inp, batch_oov_len):
    predictions = []
    attentions = []
    p_gens = []
    # initial context vector from the encoder outputs and the initial decoder state
    context_vector, _ = self.attention(dec_hidden, enc_output)
    for t in range(dec_inp.shape[1]):
        # one decoding step; this decoder also returns the updated context
        # vector and the attention weights for the step
        dec_x, pred, dec_hidden, context_vector, attn = self.decoder(
            tf.expand_dims(dec_inp[:, t], 1),
            [dec_hidden, enc_state],
            enc_output,
            context_vector)
        # recompute attention on the new decoder state (result currently unused)
        context_vector1, attn1 = self.attention(dec_hidden, enc_output)
        p_gen = self.pointer(context_vector, dec_hidden,
                             tf.squeeze(dec_x, axis=1))
        predictions.append(pred)
        attentions.append(attn)
        p_gens.append(p_gen)
    final_dists = _calc_final_dist(enc_extended_inp, predictions, attentions,
                                   p_gens, batch_oov_len,
                                   self.params["vocab_size"],
                                   self.params["batch_size"])
    if self.params["mode"] == "train":
        # predictions shape: (batch_size, dec_len, vocab_size),
        # with dec_len = 1 in pred mode
        return tf.stack(final_dists, 1), dec_hidden
    else:
        return (tf.stack(final_dists, 1), dec_hidden, context_vector,
                tf.stack(attentions, 1), tf.stack(p_gens, 1))
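# The step above calls `self.pointer(context_vector, dec_hidden, dec_x)` to get
# the generation probability p_gen. A minimal sketch of what such a layer might
# look like, assuming the usual pointer-generator formulation
# p_gen = sigmoid(w_c·c + w_s·s + w_x·x); the layer and weight names here are
# illustrative, not taken from the original code.
import tensorflow as tf

class Pointer(tf.keras.layers.Layer):
    def __init__(self):
        super().__init__()
        self.w_c = tf.keras.layers.Dense(1)  # projects the context vector
        self.w_s = tf.keras.layers.Dense(1)  # projects the decoder hidden state
        self.w_x = tf.keras.layers.Dense(1)  # projects the decoder input

    def call(self, context_vector, dec_hidden, dec_inp):
        # shape: (batch_size, 1), the generation probability for this step
        return tf.nn.sigmoid(self.w_c(context_vector) +
                             self.w_s(dec_hidden) +
                             self.w_x(dec_inp))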
def call(self, enc_output, dec_hidden, enc_inp, enc_extended_inp, dec_inp,
         batch_oov_len):
    predictions = []
    attentions = []
    p_gens = []
    context_vector, _ = self.attention(dec_hidden, enc_output)
    for t in range(dec_inp.shape[1]):
        dec_x, pred, dec_hidden = self.decoder(
            tf.expand_dims(dec_inp[:, t], 1), dec_hidden, enc_output,
            context_vector)
        context_vector, attn = self.attention(dec_hidden, enc_output)
        p_gen = self.pointer(context_vector, dec_hidden,
                             tf.squeeze(dec_x, axis=1))
        predictions.append(pred)
        attentions.append(attn)
        p_gens.append(p_gen)
    final_dists = _calc_final_dist(enc_extended_inp, predictions, attentions,
                                   p_gens, batch_oov_len,
                                   self.params["vocab_size"],
                                   self.params["batch_size"])
    # predictions shape: (batch_size, dec_len, vocab_size),
    # with dec_len = 1 in pred mode
    return tf.stack(final_dists, 1), dec_hidden
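# All of these call methods delegate the pointer/generator mixing to a helper
# `_calc_final_dist`. Below is a minimal sketch of such a helper in the style
# of the pointer-generator paper (See et al., 2017). It is an assumption about
# the behaviour, not the project's actual implementation, which may differ in
# details such as dtype handling or extra hyperparameter arguments.
import tensorflow as tf

def _calc_final_dist(enc_extended_inp, vocab_dists, attn_dists, p_gens,
                     batch_oov_len, vocab_size, batch_size):
    """Combine per-step vocabulary and attention (copy) distributions."""
    # weight the generation and copy distributions by p_gen and (1 - p_gen)
    vocab_dists = [p_gen * dist for p_gen, dist in zip(p_gens, vocab_dists)]
    attn_dists = [(1 - p_gen) * dist for p_gen, dist in zip(p_gens, attn_dists)]

    # extend each vocabulary distribution with zeros for the in-article OOVs
    extended_size = vocab_size + batch_oov_len
    extra_zeros = tf.zeros([batch_size, batch_oov_len])
    vocab_dists_extended = [tf.concat([dist, extra_zeros], axis=-1)
                            for dist in vocab_dists]

    # scatter the copy probabilities onto the extended vocabulary, indexed by
    # the encoder input ids (which include the temporary OOV ids)
    enc_extended_inp = tf.cast(enc_extended_inp, tf.int32)
    attn_len = tf.shape(enc_extended_inp)[1]
    batch_nums = tf.tile(tf.expand_dims(tf.range(batch_size), 1),
                         [1, attn_len])                      # (batch, enc_len)
    indices = tf.stack((batch_nums, enc_extended_inp), axis=2)
    attn_dists_projected = [
        tf.scatter_nd(indices, dist, [batch_size, extended_size])
        for dist in attn_dists]

    # the final distribution for each step is the sum of the two parts
    return [gen_dist + copy_dist for gen_dist, copy_dist
            in zip(vocab_dists_extended, attn_dists_projected)]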
def call(self, inp, extended_inp, max_oov_len, tar, training, enc_padding_mask,
         look_ahead_mask, dec_padding_mask):
    embed_x = self.embedding(inp)
    embed_dec = self.embedding(tar)
    # (batch_size, inp_seq_len, d_model)
    enc_output = self.encoder(embed_x, training, enc_padding_mask)
    # dec_output.shape == (batch_size, tar_seq_len, d_model)
    dec_output, attention_weights, p_gens = self.decoder(
        embed_dec, enc_output, training, look_ahead_mask, dec_padding_mask)
    output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)
    output = tf.nn.softmax(output)  # (batch_size, tar_seq_len, vocab_size)
    # output = tf.concat([output, tf.zeros((tf.shape(output)[0], tf.shape(output)[1], max_oov_len))],
    #                    axis=-1)  # (batch_size, tar_seq_len, vocab_size + max_oov_len)

    # encoder-decoder attention of the last decoder layer
    # (batch_size, num_heads, tar_seq_len, inp_seq_len)
    attn_dists = attention_weights['decoder_layer{}_block2'.format(self.num_layers)]
    # average over the heads -> (batch_size, tar_seq_len, inp_seq_len)
    attn_dists = tf.reduce_sum(attn_dists, axis=1) / self.num_heads
    final_dists = _calc_final_dist(extended_inp,
                                   tf.unstack(output, axis=1),
                                   tf.unstack(attn_dists, axis=1),
                                   tf.unstack(p_gens, axis=1),
                                   max_oov_len, self.vocab_size,
                                   self.batch_size)
    final_output = tf.stack(final_dists, axis=1)
    return final_output, attention_weights
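# The head-averaging step above (reduce_sum over the heads axis divided by
# num_heads) is simply a mean over the heads. A quick self-contained check
# with dummy shapes (batch=2, heads=8, tar_len=5, inp_len=7):
import tensorflow as tf

attn = tf.random.uniform((2, 8, 5, 7))
avg_a = tf.reduce_sum(attn, axis=1) / 8
avg_b = tf.reduce_mean(attn, axis=1)
# both are (batch_size, tar_seq_len, inp_seq_len) and numerically identical
tf.debugging.assert_near(avg_a, avg_b)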
def call(self, enc_output, dec_hidden, enc_inp, enc_extended_inp, dec_inp,
         batch_oov_len, cov_vec, stats=None):
    predictions = []
    attentions = []
    p_gens = []
    if self.params["coverage"]:
        cov_features = self.coverage(cov_vec)
    else:
        cov_features = None
    context_vector, _ = self.attention(dec_hidden, enc_output, cov_features)
    for t in range(dec_inp.shape[1]):
        dec_x, pred, dec_hidden = self.decoder(
            tf.expand_dims(dec_inp[:, t], 1), dec_hidden, enc_output,
            context_vector, stats)
        if self.params["coverage"]:
            cov_features = self.coverage(cov_vec)
        else:
            cov_features = None
        context_vector, attn = self.attention(dec_hidden, enc_output,
                                              cov_features)
        p_gen = self.pointer(context_vector, dec_hidden,
                             tf.squeeze(dec_x, axis=1))
        if self.params["coverage"]:
            # accumulate this step's attention into the coverage vector
            cov_vec += attn
        attn = tf.squeeze(attn, axis=-1)
        predictions.append(pred)
        attentions.append(attn)
        p_gens.append(p_gen)
    final_dists = _calc_final_dist(enc_extended_inp, predictions, attentions,
                                   p_gens, batch_oov_len,
                                   self.params["vocab_size"],
                                   self.params["batch_size"])
    # final_dists shape: (batch_size, dec_len, vocab_size),
    # with dec_len = 1 in pred mode
    res = {}
    res["final_dists"] = tf.stack(final_dists, 1)
    res["dec_hidden"] = dec_hidden
    if self.params["coverage"] or self.params["mode"] != "train":
        res["cov_vec"] = cov_vec
        res["attn_weights"] = tf.stack(attentions, 1)
    if self.params["mode"] != "train":
        res["context"] = context_vector
        res["p_gens"] = tf.stack(p_gens, 1)
    return res
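# The coverage vector accumulated above (cov_vec += attn) is usually paired
# with a coverage loss, sum over steps of sum_i min(a_i^t, c_i^t), as in
# See et al. (2017). A minimal sketch under the assumption that `attentions`
# is (batch, dec_len, enc_len) and `dec_mask` is a float mask over non-padded
# decoder steps; the function name and arguments are illustrative, not the
# project's actual loss code.
import tensorflow as tf

def coverage_loss(attentions, dec_mask):
    # coverage before each step: cumulative attention from earlier steps
    coverage = tf.cumsum(attentions, axis=1, exclusive=True)
    # penalize attending again to already-covered source positions
    step_losses = tf.reduce_sum(tf.minimum(attentions, coverage), axis=2)
    # mask out padded decoder steps, then average over the batch
    step_losses *= dec_mask
    return tf.reduce_mean(tf.reduce_sum(step_losses, axis=1))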
def __call__(self, enc_outputs, enc_mask, enc_state, decoder_inputs,
             batch_max_oov_len=None, encoder_input_with_oov=None,
             cov_vec=None):
    """Attentional feedforward graph.

    This method is called once per batch during training, and max_dec_len
    times in decode mode.

    Args:
        enc_outputs: 3D tensor, encoder outputs,
            shape [batch_size, batch_max_enc_len, 2*hidden_size]
        enc_mask: 2D tensor, encoder sequence mask,
            shape [batch_size, batch_max_enc_len]
        enc_state: encoder last state, used to initialize the decoder state
        decoder_inputs: 3D tensor, decoder inputs,
            shape [batch_size, max_dec_len, embed_size]
        batch_max_oov_len: integer, maximum number of OOVs in the current
            batch (None if pointer_gen is False)
        encoder_input_with_oov: 2D tensor, encoder input with OOV ids,
            shape [batch_size, batch_max_enc_len]
        cov_vec: previous coverage vector (only if coverage is True)

    NB: batch_max_enc_len is None when the graph is built; it varies during
    the feedforward with the batch being processed and is the maximum
    sequence length of that batch.

    Returns:
        A dictionary with:
        output: list of max_dec_len 2D tensors of shape
            [batch_size, vocab_size + batch_max_oov_len (if pointer_gen)]
        last_context_vector: 2D tensor, shape [batch_size, 2*hidden_size],
            useful in decode mode
        dec_state: 2D tensor, decoder last state,
            shape [2, batch_size, hidden_size]
        p_gen: list of max_dec_len 1D tensors of length batch_size
            (only if pointer_gen is True)
        attention_vec: list of max_dec_len 2D tensors of shape
            [batch_size, batch_max_enc_len]
        coverage: 2D tensor, the final coverage vector
            (only if coverage is True)
    """
    if self.hpm["pointer_gen"]:
        # probability of generating (vs. pointing at) each word of the
        # sequences to be produced
        p_gens = []
    attn_dists = []  # attention distributions over the encoder sequence
    dec_state = enc_state  # init the decoder state with the encoder last state
    outputs = []  # final probability distributions (decoded sequence)
    # unstack the decoder inputs so we can enumerate over the time dimension
    dec_inp = tf.unstack(decoder_inputs)

    if self.hpm['decode_using_prev']:
        argmax_arr = []
        samples_arr = []
        argmax_logprob_arr = []
        samples_logprob_arr = []

    # nested function
    def attention(dec_state, cov_vec=None):
        """Attention mechanism.

        Args:
            dec_state: previous decoder state, shape [2, batch_size, hidden_size].
                For the first step, it is the encoder last state.
            cov_vec: previous coverage vector, shape [batch_size, <batch_max_enc_len>]
                (only if coverage is True, default None).

        Returns:
            attn_vec: 2D tensor, attention vector at time step t,
                shape [batch_size, <batch_max_enc_len>]
            context_vec: 2D tensor, shape [batch_size, 2*hidden_size]
            cov_vec: 2D tensor, shape [batch_size, <batch_max_enc_len>],
                the updated coverage vector
        """
        if self.hpm["coverage"]:
            with tf.variable_scope('coverage', reuse=tf.AUTO_REUSE):
                # additional parameters for the linear transformation of the
                # coverage vector
                w_c = tf.get_variable(
                    "w_c", [1, 1, 1, self.hpm['attn_hidden_size']])
                # The encoder max length is unknown and variable, so we cannot
                # simply apply a linear transformation as above; instead we use
                # a convolution layer, which transforms the coverage vector
                # exactly as a linear transformation would.
                cov_features = tf.expand_dims(
                    tf.expand_dims(cov_vec, axis=2), axis=2)
                cov_features = tf.nn.conv2d(
                    cov_features, w_c, [1, 1, 1, 1], "SAME")
                # e = v * tanh(w_h*h + w_s*s + w_c*c)  (the last term only if
                # coverage is True); attention weights over the encoder input
                # sequence, shape : [batch_size, <batch_max_enc_len>, 1]
                e = tf.nn.tanh(
                    self.w_h(enc_outputs) +
                    tf.expand_dims(self.w_s(dec_state.c), axis=1) +
                    tf.squeeze(cov_features, [2]))
        else:
            e = tf.nn.tanh(
                self.w_h(enc_outputs) +
                tf.expand_dims(self.w_s(dec_state.c), axis=1))

        e = self.v(e)
        # drop the last dimension, which equals 1
        e = tf.reshape(e, [e.get_shape().as_list()[0], -1])  # shape : [batch_size, <batch_max_enc_len>]
        # softmax over the attention weights to normalize them into the
        # attention vector
        attn_vec = tf.nn.softmax(e, axis=-1)
        # The input is padded with <PAD> tokens, so the attention weights over
        # those tokens are irrelevant: we apply the encoder input mask to drop
        # them, then re-normalize the remaining weights so they still form a
        # probability distribution.
        attn_vec = apply_mask_normalize(attn_vec, enc_mask)

        # context vector computation: weight the encoder outputs by the
        # attention weights (one weight per output vector, for a single sequence)
        weighted_enc_outputs = tf.multiply(
            enc_outputs, tf.expand_dims(attn_vec, axis=-1))
        # context vector at time step t, shape : [batch_size, 2*hidden_size]
        context_vec = tf.reduce_sum(weighted_enc_outputs, axis=1)
        if self.hpm['coverage']:
            cov_vec = cov_vec + attn_vec  # update the coverage
        return attn_vec, context_vec, cov_vec
    # end of nested function

    with tf.variable_scope('attention_decoder', reuse=tf.AUTO_REUSE):
        # compute the initial context vector
        _, context_vec, _ = attention(dec_state, cov_vec)
        timesteps = self.hpm['max_dec_len']
        decoder_input = dec_inp[0]
        a = 0
        if not self.hpm['decode_using_prev']:
            a = 1
        # loop over the decoder inputs (loops only once in decode mode)
        for i in range(a, timesteps):
            # concatenate the input (previous word) and the context vector at
            # timestep t; shape : [batch_size, embed_size + 2*hidden_size]
            new_dec_inp = tf.concat([decoder_input, context_vec], axis=-1)
            new_dec_inp = self.w_dec(new_dec_inp)  # shape : [batch_size, embed_size]

            # apply the LSTM decoder on the new input
            # dec_output shape : [1, batch_size, hidden_size]
            # dec_state shape : [2, batch_size, hidden_size]
            #   (2 for the cell state c and the last hidden output h)
            dec_output, dec_state = self.decoder(
                tf.expand_dims(new_dec_inp, axis=0), dec_state)

            # attention vector of the current step, context vector for the
            # next step; the coverage vector is updated as well
            attn_vec, context_vec, cov_vec = attention(dec_state, cov_vec)
            attn_dists.append(attn_vec)

            dec_output = tf.reshape(
                dec_output,
                [-1, dec_output.get_shape().as_list()[-1]])  # shape : [batch_size, hidden_size]
            dec_output = self.w_out(dec_output)  # shape : [batch_size, vocab_size]
            vocab_dist = dec_output
            if not self.hpm['pointer_gen']:
                # we do not apply the softmax yet because it is folded into
                # later ops such as the loss function
                outputs.append(vocab_dist)
            else:
                # with pointer_gen we need the softmax now because of the
                # scatter op with the attention distribution
                outputs.append(tf.nn.softmax(dec_output, axis=-1))
                state = tf.concat([dec_state.c, dec_state.h], axis=1)
                # p_gen computed from the concatenated state, the context
                # vector and the decoder input; shape : [batch_size, 1]
                p_gen = tf.nn.sigmoid(
                    self.w_c_reduce(context_vec) +
                    self.w_s_reduce(state) +
                    self.w_i_reduce(new_dec_inp))
                p_gens.append(p_gen)

            if self.hpm['pointer_gen']:
                # scatter op between the output distributions (over the
                # vocabulary) and the attention distributions
                outputs = _calc_final_dist(encoder_input_with_oov, outputs,
                                           attn_dists, p_gens,
                                           batch_max_oov_len, self.hpm)

            if not self.hpm['decode_using_prev']:
                decoder_input = dec_inp[i]
            else:
                batch_nums = tf.range(0, limit=self.hpm['batch_size'],
                                      dtype=tf.int64)
                argmax_seqs = []
                argmax_seqs_log_probs = []
                for x in outputs:
                    max_ids = tf.argmax(x, axis=-1)
                    indices = tf.stack((batch_nums, max_ids), axis=-1)
                    log_probs = tf.gather_nd(x, indices)
                    argmax_seqs.append(max_ids)
                    argmax_seqs_log_probs.append(log_probs)

                soft_outputs = tf.stack(outputs)
                if not self.hpm['pointer_gen']:
                    soft_outputs = tf.nn.softmax(soft_outputs)

                argmax_seqs = tf.stack(argmax_seqs)
                argmax_seqs_log_probs = tf.stack(argmax_seqs_log_probs)

                sampler = tf.distributions.Categorical(logits=soft_outputs)
                samples = sampler.sample()
                samples_log_probs = sampler.log_prob(samples)
                samples_log_probs = tf.identity(samples_log_probs)

                argmax_arr.append(argmax_seqs)
                argmax_logprob_arr.append(argmax_seqs_log_probs)
                samples_arr.append(samples)
                samples_logprob_arr.append(samples_log_probs)

                decoder_input = samples

    if self.hpm['decode_using_prev']:
        argmax_arr = tf.stack(argmax_arr)
        argmax_logprob_arr = tf.stack(argmax_logprob_arr)
        samples_arr = tf.stack(samples_arr)
        samples_logprob_arr = tf.stack(samples_logprob_arr)

    dic = {
        'output': outputs,
        'last_context_vector': context_vec,
        'dec_state': dec_state,
        'attention_vec': attn_dists
    }
    if self.hpm['pointer_gen']:
        dic['p_gen'] = p_gens
    if self.hpm['coverage']:
        dic['coverage'] = cov_vec
    if self.hpm['decode_using_prev']:
        dic.update({
            "argmax_seqs": argmax_arr,
            "argmax_log_probs": argmax_logprob_arr,
            "samples_seqs": samples_arr,
            "samples_log_probs": samples_logprob_arr
        })
    return dic
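# `apply_mask_normalize`, used in the nested attention function above, drops
# attention weights on <PAD> positions and re-normalizes the rest into a
# probability distribution. A minimal sketch of what it presumably does; the
# original implementation may differ, e.g. in how it guards against division
# by zero.
import tensorflow as tf

def apply_mask_normalize(attn_vec, enc_mask):
    # zero out the weights on padded encoder positions
    masked = attn_vec * tf.cast(enc_mask, attn_vec.dtype)
    # re-normalize each row so the remaining weights sum to 1
    return masked / (tf.reduce_sum(masked, axis=-1, keepdims=True) + 1e-10)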