Example #1
    def call(self, enc_output, dec_hidden, enc_state, enc_inp,
             enc_extended_inp, dec_inp, batch_oov_len):

        predictions = []
        attentions = []
        p_gens = []
        context_vector, _ = self.attention(dec_hidden, enc_output)
        for t in range(dec_inp.shape[1]):
            # the decoder also returns the updated context vector and the attention weights
            dec_x, pred, dec_hidden, context_vector, attn = self.decoder(
                tf.expand_dims(dec_inp[:, t], 1), [dec_hidden, enc_state],
                enc_output, context_vector)
            p_gen = self.pointer(context_vector, dec_hidden,
                                 tf.squeeze(dec_x, axis=1))

            predictions.append(pred)
            attentions.append(attn)
            p_gens.append(p_gen)
        final_dists = _calc_final_dist(enc_extended_inp, predictions,
                                       attentions, p_gens, batch_oov_len,
                                       self.params["vocab_size"],
                                       self.params["batch_size"])
        if self.params["mode"] == "train":
            return tf.stack(
                final_dists, 1
            ), dec_hidden  # predictions_shape = (batch_size, dec_len, vocab_size) with dec_len = 1 in pred mode
        else:
            return tf.stack(final_dists,
                            1), dec_hidden, context_vector, tf.stack(
                                attentions, 1), tf.stack(p_gens, 1)
Example #2
    def call(self, enc_output, dec_hidden, enc_inp, enc_extended_inp, dec_inp,
             batch_oov_len):

        predictions = []
        attentions = []
        p_gens = []
        context_vector, _ = self.attention(dec_hidden, enc_output)
        for t in range(dec_inp.shape[1]):
            dec_x, pred, dec_hidden = self.decoder(
                tf.expand_dims(dec_inp[:, t], 1), dec_hidden, enc_output,
                context_vector)
            context_vector, attn = self.attention(dec_hidden, enc_output)
            p_gen = self.pointer(context_vector, dec_hidden,
                                 tf.squeeze(dec_x, axis=1))

            predictions.append(pred)
            attentions.append(attn)
            p_gens.append(p_gen)
        final_dists = _calc_final_dist(enc_extended_inp, predictions,
                                       attentions, p_gens, batch_oov_len,
                                       self.params["vocab_size"],
                                       self.params["batch_size"])
        # predictions shape = (batch_size, dec_len, vocab_size), with dec_len = 1 in pred mode
        return tf.stack(final_dists, 1), dec_hidden
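Both pointer-decoder variants above hand the per-step vocabulary distributions, attention distributions, and p_gen values to _calc_final_dist, whose body is not shown here. As orientation only, here is a minimal sketch of what one step of that merge typically computes in a pointer-generator model (assumed rule: final = p_gen * P_vocab + (1 - p_gen) * P_copy; the function name, shapes, and toy values below are illustrative, not the original implementation):

import tensorflow as tf

def pointer_final_dist_step(vocab_dist, attn_dist, p_gen,
                            enc_extended_inp, batch_oov_len, vocab_size):
    # Assumed merge rule: final = p_gen * P_vocab + (1 - p_gen) * P_copy.
    # vocab_dist: (batch, vocab_size), attn_dist: (batch, enc_len),
    # p_gen: (batch, 1), enc_extended_inp: int ids, (batch, enc_len).
    batch_size = tf.shape(vocab_dist)[0]
    extra_zeros = tf.zeros((batch_size, batch_oov_len))
    # Vocabulary distribution, padded with zeros for the in-article OOV ids.
    vocab_dist_extended = p_gen * tf.concat([vocab_dist, extra_zeros], axis=-1)
    # Copy distribution: scatter the attention weights onto the extended vocabulary.
    attn_dist = (1.0 - p_gen) * attn_dist
    enc_len = tf.shape(enc_extended_inp)[1]
    batch_nums = tf.tile(tf.expand_dims(tf.range(batch_size), 1), [1, enc_len])
    indices = tf.stack([batch_nums, enc_extended_inp], axis=2)
    copy_dist = tf.scatter_nd(indices, attn_dist,
                              tf.stack([batch_size, vocab_size + batch_oov_len]))
    return vocab_dist_extended + copy_dist

# Toy usage with made-up sizes:
vocab_size, oov_len = 10, 2
vocab_dist = tf.nn.softmax(tf.random.uniform((3, vocab_size)))
attn_dist = tf.nn.softmax(tf.random.uniform((3, 5)))
p_gen = tf.fill((3, 1), 0.7)
enc_extended_inp = tf.constant([[1, 4, 4, 10, 11]] * 3)
final = pointer_final_dist_step(vocab_dist, attn_dist, p_gen,
                                enc_extended_inp, oov_len, vocab_size)
print(final.shape, float(tf.reduce_sum(final[0])))  # (3, 12) and ~1.0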
Example #3
    def call(self, inp, extended_inp, max_oov_len, tar, training,
             enc_padding_mask, look_ahead_mask, dec_padding_mask):

        embed_x = self.embedding(inp)
        embed_dec = self.embedding(tar)

        enc_output = self.encoder(
            embed_x, training,
            enc_padding_mask)  # (batch_size, inp_seq_len, d_model)

        # dec_output.shape == (batch_size, tar_seq_len, d_model)
        dec_output, attention_weights, p_gens = self.decoder(
            embed_dec, enc_output, training, look_ahead_mask, dec_padding_mask)

        output = self.final_layer(
            dec_output)  # (batch_size, tar_seq_len, target_vocab_size)
        output = tf.nn.softmax(output)  # (batch_size, tar_seq_len, vocab_size)
        #output = tf.concat([output, tf.zeros((tf.shape(output)[0], tf.shape(output)[1], max_oov_len))], axis=-1) # (batch_size, targ_seq_len, vocab_size+max_oov_len)

        # (batch_size, num_heads, targ_seq_len, inp_seq_len)
        attn_dists = attention_weights['decoder_layer{}_block2'.format(self.num_layers)]
        # (batch_size, targ_seq_len, inp_seq_len)
        attn_dists = tf.reduce_sum(attn_dists, axis=1) / self.num_heads

        final_dists = _calc_final_dist(extended_inp, tf.unstack(output,
                                                                axis=1),
                                       tf.unstack(attn_dists, axis=1),
                                       tf.unstack(p_gens, axis=1), max_oov_len,
                                       self.vocab_size, self.batch_size)
        final_output = tf.stack(final_dists, axis=1)

        return final_output, attention_weights
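The transformer-based variant above takes its copy distribution from attention_weights['decoder_layer{N}_block2'] (presumably the encoder-decoder attention of the last decoder layer) and averages it over the heads. A small self-contained check of that averaging step, with illustrative shapes only:

import tensorflow as tf

# The attention weights have shape (batch_size, num_heads, targ_seq_len, inp_seq_len),
# one softmax distribution over the input sequence per head.
num_heads = 4
attn = tf.nn.softmax(tf.random.uniform((2, num_heads, 5, 7)), axis=-1)

# Summing over the heads and dividing by num_heads averages the per-head distributions,
# so every row still sums to 1 and can be used directly as a copy distribution.
attn_dists = tf.reduce_sum(attn, axis=1) / num_heads

print(attn_dists.shape)                            # (2, 5, 7)
print(tf.reduce_sum(attn_dists, axis=-1).numpy())  # all ones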
Example #4
    def call(self,
             enc_output,
             dec_hidden,
             enc_inp,
             enc_extended_inp,
             dec_inp,
             batch_oov_len,
             cov_vec,
             stats=None):

        predictions = []
        attentions = []
        p_gens = []
        if self.params["coverage"]:
            cov_features = self.coverage(cov_vec)
        else:
            cov_features = None
        context_vector, _ = self.attention(dec_hidden, enc_output,
                                           cov_features)

        for t in range(dec_inp.shape[1]):
            dec_x, pred, dec_hidden = self.decoder(
                tf.expand_dims(dec_inp[:, t], 1), dec_hidden, enc_output,
                context_vector, stats)

            if self.params["coverage"]:
                cov_features = self.coverage(cov_vec)
            else:
                cov_features = None
            context_vector, attn = self.attention(dec_hidden, enc_output,
                                                  cov_features)
            p_gen = self.pointer(context_vector, dec_hidden,
                                 tf.squeeze(dec_x, axis=1))
            if self.params["coverage"]:
                cov_vec += attn

            attn = tf.squeeze(attn, axis=-1)
            predictions.append(pred)
            attentions.append(attn)
            p_gens.append(p_gen)

        final_dists = _calc_final_dist(enc_extended_inp, predictions,
                                       attentions, p_gens, batch_oov_len,
                                       self.params["vocab_size"],
                                       self.params["batch_size"])
        res = {}
        res["final_dists"] = tf.stack(final_dists, 1)
        res["dec_hidden"] = dec_hidden
        if self.params["coverage"] or self.params["mode"] != "train":
            res["cov_vec"] = cov_vec
            res["attn_weights"] = tf.stack(attentions, 1)
        if self.params["mode"] != "train":
            res["context"] = context_vector
            res["p_gens"] = tf.stack(p_gens, 1)
        return res  # predictions_shape = (batch_size, dec_len, vocab_size) with dec_len = 1 in pred mode
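Example #4 threads a coverage vector through decoding: cov_vec += attn accumulates the attention mass placed on each encoder position, and the accumulated vector is fed back into the attention computation. The example returns cov_vec and the stacked attention weights but does not show a coverage loss; a common companion in pointer-generator training (See et al., 2017) penalises re-attending to already-covered positions. A minimal sketch of that update and penalty, with illustrative shapes (not taken from the example):

import tensorflow as tf

# Stack of per-step attention distributions, shape (batch, dec_len, enc_len),
# e.g. what the "attn_weights" entry above holds in coverage mode.
attn_weights = tf.nn.softmax(tf.random.uniform((8, 6, 40)), axis=-1)

cov_vec = tf.zeros_like(attn_weights[:, 0, :])  # (batch, enc_len), starts at zero
cov_losses = []
for t in range(attn_weights.shape[1]):
    attn = attn_weights[:, t, :]
    # Penalise putting attention mass on positions that are already covered.
    cov_losses.append(tf.reduce_sum(tf.minimum(attn, cov_vec), axis=-1))
    cov_vec += attn  # same accumulation as `cov_vec += attn` in the example
coverage_loss = tf.reduce_mean(tf.stack(cov_losses))
print(float(coverage_loss))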
Example #5
    def __call__(self,
                 enc_outputs,
                 enc_mask,
                 enc_state,
                 decoder_inputs,
                 batch_max_oov_len=None,
                 encoder_input_with_oov=None,
                 cov_vec=None):
        """
        Attentional feedforward graph .
        We call this method once during training for each batch, and max_dec_len times for decode mode.
        
        Args:
            enc_outputs : 3D tensor, encoder outputs, shape : [batch_size, batch_max_enc_len, 2*hidden_size]
            enc_mask : 2D tensor, encoder sequence mask, shape : [batch_size, batch_max_enc_len]
            decoder_inputs: 3D tensor, decoder inputs, shape : [batch_size, max_dec_len, embed_size]
            batch_max_oov_len : Integer, Maximum number of oov for the current batch, (None if pointer_gen = False)
            encoder_input_with_oov : 2D tensor, encoder input with oovs ids, shape : [batch_size, batch_max_enc_len]
            
            !!! NB : batch_max_enc_len is None when we build graph, and vary during the feedforward with the current batch treated, 
                      it is the maximum length of sequences of the current batch
                      
        Returns : A dictionary
            output : list max_dec_en of 2D tensors of shape [batch_size, vocab_size + batch_max_oov_len (if pointer_gen)]
            last_context_vector : 2D tensor, shape : [batch_size, 2*hidden_size], this will be useful in the decode mode
            dec_state : 2D tensor, decoder last state, shape : [2, batch_size, hidden_size]
            p_gen : max_dec_len-many list of 1D tensors of length[batch_size] (only if pointer_gen is true)
            attention_vec : max_dec_len-many list of 2D tensors of shape [batch_size, batch_max_enc_len] (only if coverage is true)
    """

        if (self.hpm["pointer_gen"]):
            p_gens = [
            ]  # if pointer gen, we add an array to store the probability of each word in the sequences to be generated or pointed on

        attn_dists = [
        ]  # array to store the attention distributions over the enc seq
        dec_state = enc_state  # we init the decoder state with the encoder last state
        outputs = [
        ]  # array to store the final probability distributions (decoded sequence)
        dec_inp = tf.unstack(
            decoder_inputs
        )  # we unstack the decoder input to be able to enumerate over this tensor

        if self.hpm['decode_using_prev']:
            argmax_arr = []
            samples_arr = []
            argmax_logprob_arr = []
            samples_logprob_arr = []

        # nested function
        def attention(dec_state, cov_vec=None):
            """
          Attention mechanism
          
          Args:
              dec_state : previous state of the decoder. shape : [2, batch_size, hidden_size]. For the first step, it corresponds to the encoder last state
              cov_vec : only if coverage is True (default None).  shape : [batch_size, <batch_max_enc_len>]. The previous coverage vector.
              
          Returns:
              attn_vec : 2D tensor, the attention vector at time step t. shape : [batch_size, <batch_max_enc_len>]
              context_vector : 2D tensor, shape: [batch_size, 2*hidden_size]
              cov_vec : 2D tensor, shape : [batch_size, <batch_max_enc_len>], the current coverage vector
      """
            if (self.hpm["coverage"]):
                with tf.variable_scope('coverage', reuse=tf.AUTO_REUSE):
                    w_c = tf.get_variable(
                        "w_c", [1, 1, 1, self.hpm['attn_hidden_size']]
                    )  # we add additional parameters for the coverage vector linear transf.

                # The encoder max length is unknown and variable, so we cannot just apply a
                # linear transformation as above. Instead we use a convolution layer, which
                # transforms the coverage vector exactly as a simple linear transformation would.
                cov_features = tf.expand_dims(tf.expand_dims(cov_vec, axis=2), axis=2)
                cov_features = tf.nn.conv2d(cov_features, w_c, [1, 1, 1, 1], "SAME")

                # e = V * tanh(w_h*h + w_s*s + w_c*c)   (the last term only if coverage = True)
                # attention weights over the whole encoder input sequence
                # shape : [batch_size, <batch_max_enc_len>, 1]
                e = tf.nn.tanh(
                    self.w_h(enc_outputs) +
                    tf.expand_dims(self.w_s(dec_state.c), axis=1) +
                    tf.squeeze(cov_features, [2]))
            else:
                e = tf.nn.tanh(
                    self.w_h(enc_outputs) +
                    tf.expand_dims(self.w_s(dec_state.c), axis=1))
            e = self.v(e)

            # drop the last dimension (which equals 1) -> shape : [batch_size, <batch_max_enc_len>]
            e = tf.reshape(e, [e.get_shape().as_list()[0], -1])

            # softmax over the attention weights to normalize them into the attention vector
            attn_vec = tf.nn.softmax(e, axis=-1)
            # The input is padded with <PAD> tokens, so the attention weights over those tokens
            # are irrelevant: we apply the encoder input mask to drop them and then re-normalize
            # the remaining weights so they still form a probability distribution.
            attn_vec = apply_mask_normalize(attn_vec, enc_mask)

            # context vector computation:
            # weight each encoder output by its attention weight, then sum over the sequence
            weighted_enc_outputs = tf.multiply(enc_outputs,
                                               tf.expand_dims(attn_vec, axis=-1))
            # context vector at time step t, shape : [batch_size, 2*hidden_size]
            context_vec = tf.reduce_sum(weighted_enc_outputs, axis=1)

            if self.hpm['coverage']:
                cov_vec = cov_vec + attn_vec  # we update the coverage

            return attn_vec, context_vec, cov_vec
            # end of nested function

        with tf.variable_scope('attention_decoder', reuse=tf.AUTO_REUSE):
            # we compute the initial context vector
            _, context_vec, _ = attention(dec_state, cov_vec)
            timesteps = self.hpm['max_dec_len']
            decoder_input = dec_inp[0]
            a = 0
            if not self.hpm['decode_using_prev']:
                a = 1
            for i in range(a, timesteps):
                # for each item in the decoder inputs (this loops only once for decode mode)

                # concatenation of input (previous word) and context vector at timestep t
                new_dec_inp = tf.concat(
                    [decoder_input, context_vec],
                    axis=-1)  # shape : [batch_size, embed_size+2*hidden_size]
                new_dec_inp = self.w_dec(
                    new_dec_inp)  #shape : [batch_size, embed_size]

                # We apply the LSTM decoder on the new input
                dec_output, dec_state = self.decoder(
                    tf.expand_dims(new_dec_inp, axis=0), dec_state
                )  # dec_output shape : [1, batch_size, hidden_size]
                # dec_state shape : [2, batch_size, hidden_size] (2 for the state c and the last hidden output h)
                # attention vector of the current step, context vector for the next step
                # we update the coverage vector
                attn_vec, context_vec, cov_vec = attention(dec_state, cov_vec)
                attn_dists.append(attn_vec)

                dec_output = tf.reshape(
                    dec_output, [-1, dec_output.get_shape().as_list()[-1]
                                 ])  # shape : [batch_size, hidden_size]
                dec_output = self.w_out(
                    dec_output)  # shape : [batch_size, vocab_size]
                vocab_dist = dec_output

                if not self.hpm['pointer_gen']:
                    outputs.append(
                        vocab_dist
                    )  # we do not apply yet the softmax function because this function is integrated in some futures ops like the loss function
                else:
                    # if pointer_gen=True, we need to compute the softmax function because of the scatter op with the attention distribution
                    outputs.append(tf.nn.softmax(dec_output, axis=-1))
                    state = tf.concat([dec_state.c, dec_state.h], axis=1)

                    #p_gen computation with the current concatenated state, context vector and the decoder input
                    p_gen = tf.nn.sigmoid(
                        self.w_c_reduce(context_vec) + self.w_s_reduce(state) +
                        self.w_i_reduce(new_dec_inp)
                    )  # shape : [batch_size, 1]
                    p_gens.append(p_gen)

            if self.hpm['pointer_gen']:
                # we apply the scatter op between the output distibutions (over the vocabulary) with the attention distributions
                outputs = _calc_final_dist(encoder_input_with_oov, outputs,
                                           attn_dists, p_gens,
                                           batch_max_oov_len, self.hpm)

            if not self.hpm['decode_using_prev']:
                decoder_input = dec_inp[i]
            else:

                batch_nums = tf.range(0,
                                      limit=self.hpm['batch_size'],
                                      dtype=tf.int64)
                argmax_seqs = []
                argmax_seqs_log_probs = []
                for i, x in enumerate(outputs):
                    max_ids = tf.argmax(x, axis=-1)
                    indices = tf.stack((batch_nums, max_ids), axis=-1)
                    log_probs = tf.gather_nd(x, indices)
                    argmax_seqs.append(max_ids)
                    argmax_seqs_log_probs.append(log_probs)

                soft_outputs = tf.stack(outputs)
                if not self.hpm['pointer_gen']:
                    soft_outputs = tf.nn.softmax(soft_outputs)

                argmax_seqs = tf.stack(argmax_seqs)
                argmax_seqs_log_probs = tf.stack(argmax_seqs_log_probs)

                sampler = tf.distributions.Categorical(logits=soft_outputs)
                samples = sampler.sample()
                samples_log_probs = sampler.log_prob(samples)
                samples_log_probs = tf.identity(samples_log_probs)

                argmax_arr.append(argmax_seqs)
                argmax_logprob_arr.append(argmax_seqs_log_probs)
                samples_arr.append(samples)
                samples_logprob_arr.append(samples_log_probs)

                decoder_input = samples

        if self.hpm['decode_using_prev']:
            argmax_arr = tf.stack(argmax_arr)
            argmax_logprob_arr = tf.stack(argmax_logprob_arr)
            samples_arr = tf.stack(samples_arr)
            samples_logprob_arr = tf.stack(samples_logprob_arr)

        dic = {
            'output': outputs,
            'last_context_vector': context_vec,
            'dec_state': dec_state,
            'attention_vec': attn_dists
        }
        if (self.hpm['pointer_gen']):
            dic['p_gen'] = p_gens
        if (self.hpm['coverage']):
            dic['coverage'] = cov_vec

        if self.hpm['decode_using_prev']:
            dic.update({
                "argmax_seqs": argmax_arr,
                "argmax_log_probs": argmax_logprob_arr,
                "samples_seqs": samples_arr,
                "samples_log_probs": samples_logprob_arr
            })

        return dic
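Example #5's coverage branch uses a 1x1x1 tf.nn.conv2d because the encoder length is unknown at graph-build time, so a fixed dense layer cannot be applied to the coverage vector. A small standalone check (TF2 eager style, illustrative sizes; attn_hidden_size stands in for self.hpm['attn_hidden_size']) that this convolution is just a per-position linear map, independent of the encoder length:

import tensorflow as tf

batch_size, enc_len, attn_hidden_size = 4, 30, 16     # illustrative sizes
cov_vec = tf.random.uniform((batch_size, enc_len))    # [batch_size, <batch_max_enc_len>]
w_c = tf.random.normal((1, 1, 1, attn_hidden_size))   # same shape as the "w_c" variable above

# 1x1x1 convolution over the expanded coverage vector, exactly as in the example.
cov_features = tf.expand_dims(tf.expand_dims(cov_vec, axis=2), axis=2)
conv_out = tf.squeeze(tf.nn.conv2d(cov_features, w_c, [1, 1, 1, 1], "SAME"), [2])

# Equivalent per-position linear map: each coverage scalar times the same weight vector.
# Nothing here depends on enc_len, which is why the trick works for variable-length inputs.
dense_out = tf.expand_dims(cov_vec, -1) * tf.reshape(w_c, [attn_hidden_size])

print(conv_out.shape)                                      # (4, 30, 16)
print(float(tf.reduce_max(tf.abs(conv_out - dense_out))))  # ~0.0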