Example #1
    def decode_multi_head(self, encoded_rep, p_encode, masks, labels, config,
                          dropout):
        encoded_question, encoded_passage = encoded_rep

        # Self-attention over the passage: queries, keys, and values all
        # come from the encoded passage.
        enp = multihead_attention(queries=encoded_passage,
                                  keys=encoded_passage,
                                  values=encoded_passage,
                                  num_heads=config.head_num)
        # In TF1, tf.nn.dropout's second argument is the keep probability.
        enp = tf.nn.dropout(enp, dropout)

        logits = self.run_multi_head_answer_ptr([encoded_question, enp],
                                                p_encode, masks, labels,
                                                config)
        return logits
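The multihead_attention helper that these examples call is not reproduced on this page. For reference, below is a minimal sketch of a scaled dot-product multi-head attention layer with a compatible TF1-style keyword signature; the projection layers, masking constant, and residual connection are assumptions (layer normalization is omitted), not the original implementation.

import tensorflow as tf

def multihead_attention(queries, keys, values, key_masks=None, num_heads=8,
                        dropout_rate=0.0, training=True, causality=False,
                        scope="multihead_attention"):
    """Minimal scaled dot-product multi-head attention (TF1-style sketch).
    queries: (N, T_q, d_model); keys/values: (N, T_k, d_model).
    d_model must be divisible by num_heads."""
    d_model = queries.get_shape().as_list()[-1]
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # Linear projections for Q, K, V.
        Q = tf.layers.dense(queries, d_model)  # (N, T_q, d_model)
        K = tf.layers.dense(keys, d_model)     # (N, T_k, d_model)
        V = tf.layers.dense(values, d_model)   # (N, T_k, d_model)

        # Split heads and stack them along the batch axis.
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (h*N, T_q, d/h)
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)

        # Scaled dot-product attention scores.
        scores = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # (h*N, T_q, T_k)
        scores /= (d_model // num_heads) ** 0.5

        if key_masks is not None:
            # key_masks: (N, T_k) booleans, True where the key is padding.
            pad = tf.tile(tf.cast(key_masks, tf.float32), [num_heads, 1])
            scores += tf.expand_dims(pad, 1) * -1e9  # broadcast over T_q

        if causality:
            # Lower-triangular mask: position t attends only to positions <= t.
            tril = tf.linalg.band_part(tf.ones_like(scores[0]), -1, 0)
            scores += (1.0 - tf.expand_dims(tril, 0)) * -1e9

        weights = tf.nn.softmax(scores)
        weights = tf.layers.dropout(weights, dropout_rate, training=training)
        out = tf.matmul(weights, V_)  # (h*N, T_q, d_model/h)

        # Restore the head dimension and add a residual connection.
        out = tf.concat(tf.split(out, num_heads, axis=0), axis=2)
        out += queries
    return out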
Example #2
    def cns_encoder(self, cns_code, seq_len, reuse=False):
        with tf.variable_scope("cns_encoder"):
            if reuse:
                tf.get_variable_scope().reuse_variables()
            # src_masks
            src_masks = tf.math.equal(cns_code, 0)  # (N, T1)

            # embedding
            embedding_encoder = tf.get_variable(
                "embedding_encoder",
                [self.cns_vocab_size, self.cns_embedding_size])
            enc = tf.nn.embedding_lookup(embedding_encoder,
                                         cns_code)  # (N, T1, d_model)
            enc *= self.cns_embedding_size**0.5  # scale

            enc += positional_encoding(enc, self.font_len)
            # Note: training=True is hard-coded, so this dropout (and the
            # attention dropout below) stays active even at inference time.
            enc = tf.layers.dropout(enc, 0.3, training=True)

            # Blocks
            for i in range(self.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(queries=enc,
                                              keys=enc,
                                              values=enc,
                                              key_masks=src_masks,
                                              num_heads=self.num_heads,
                                              dropout_rate=0.3,
                                              training=True,
                                              causality=False)
                    # feed forward
                    enc = ff(enc,
                             num_units=[self.d_ff, self.cns_embedding_size])
        memory = enc
        return memory
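cns_encoder also relies on positional_encoding and ff, which are not shown on this page. Minimal sketches follow, assuming the standard sinusoidal position encoding and a two-layer position-wise feed-forward block; the residual connection is conventional and layer normalization is omitted for brevity.

import numpy as np
import tensorflow as tf

def positional_encoding(inputs, maxlen, scope="positional_encoding"):
    """Sinusoidal encodings: PE(pos, 2i) = sin(pos / 10000^(2i/d)),
    PE(pos, 2i+1) = cos(pos / 10000^(2i/d)). Returns (N, T, d_model),
    which the caller adds to the (scaled) embeddings."""
    d_model = inputs.get_shape().as_list()[-1]  # must be statically known
    N, T = tf.shape(inputs)[0], tf.shape(inputs)[1]
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        pos_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1])  # (N, T)
        # Precompute the table for every position up to maxlen.
        pe = np.array([[pos / np.power(10000, (i - i % 2) / d_model)
                        for i in range(d_model)] for pos in range(maxlen)])
        pe[:, 0::2] = np.sin(pe[:, 0::2])  # even dimensions
        pe[:, 1::2] = np.cos(pe[:, 1::2])  # odd dimensions
        pe = tf.convert_to_tensor(pe, tf.float32)  # (maxlen, d_model)
        return tf.nn.embedding_lookup(pe, pos_ind)

def ff(inputs, num_units, scope="positionwise_feedforward"):
    """Two dense layers applied at every position, plus a residual."""
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        outputs = tf.layers.dense(inputs, num_units[0], activation=tf.nn.relu)
        outputs = tf.layers.dense(outputs, num_units[1])
        outputs += inputs  # residual; layer norm omitted in this sketch
    return outputs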
Example #3
    def train_mode(self, vocab, encoder_dim, encoder_states, encoder_features,
                   passage_word_idx, passage_mask, init_state, decoder_inputs,
                   answer_batch, loss_weights, mode_gen, memory):
        '''
        encoder_dim: int-valued
        encoder_states: [batch_size, passage_len, encoder_dim].
        passage_word_idx: [batch_size, passage_len] int32
        passage_mask: [batch_size, passage_len] 0/1
        init_state: Tuple of [batch_size, gen_hidden_size]
        decoder_inputs: [batch_size, max_dec_steps].
        answer_batch: [batch_size, max_dec_steps]
        '''
        options = self.options

        input_shape = tf.shape(encoder_states)
        batch_size = input_shape[0]
        passage_len = input_shape[1]

        # map decoder inputs to word embeddings
        decoder_inputs = tf.unstack(decoder_inputs,
                                    axis=1)  # max_dec_steps * [batch_size]
        answer_batch_unstack = tf.unstack(answer_batch, axis=1)

        ## Get embedding for dec inputs
        decoder_inputs_emb = self.embedding_lookup(decoder_inputs)
        answer_batch_emb = self.embedding_lookup(answer_batch_unstack)
        ## Position encoding
        decoder_inputs_emb += positional_encoding(decoder_inputs_emb,
                                                  options.max_answer_len)
        answer_batch_emb += positional_encoding(answer_batch_emb,
                                                options.max_answer_len)
        ## Position encoding

        ## Dropout
        decoder_inputs_emb = tf.nn.dropout(decoder_inputs_emb,
                                           (1 - options.dropout_rate))
        answer_batch_emb = tf.nn.dropout(answer_batch_emb,
                                         (1 - options.dropout_rate))
        ## Dropout

        # Blocks
        for i in range(options.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i),
                                   reuse=tf.AUTO_REUSE):
                # Masked self-attention: causality=True, so each position
                # attends only to earlier positions
                dec = multihead_attention(queries=decoder_inputs_emb,
                                          keys=decoder_inputs_emb,
                                          values=decoder_inputs_emb,
                                          num_heads=options.num_heads,
                                          dropout_rate=options.dropout_rate,
                                          training=True,
                                          causality=True,
                                          scope="self_attention")

                # Vanilla attention
                dec = multihead_attention(queries=dec,
                                          keys=memory,
                                          values=memory,
                                          num_heads=options.num_heads,
                                          dropout_rate=options.dropout_rate,
                                          training=True,
                                          causality=False,
                                          scope="vanilla_attention")
                ### Feed Forward
                dec = ff(dec, num_units=[options.d_ff, options.d_model])

        # initialize the decoder loop state
        state_t_1 = init_state
        context_t_1 = tf.zeros([batch_size, encoder_dim])
        coverage_t_1 = None

        # store variables from each time-step
        coverages = []
        attn_dists = []
        p_gens = []
        vocab_scores = []
        sampled_words = []
        self.encoder_features = encoder_features
        with variable_scope.variable_scope("attention_decoder"):
            # Get the weight vectors v and W_c (W_c is for coverage)
            v = variable_scope.get_variable("v", [options.attention_vec_size])
            v = tf.expand_dims(tf.expand_dims(v, axis=0), axis=0)
            w_c = None
            if options.use_coverage:
                with variable_scope.variable_scope("coverage"):
                    w_c = variable_scope.get_variable(
                        "w_c", [options.attention_vec_size])
                    w_c = tf.expand_dims(tf.expand_dims(w_c, axis=0), axis=0)

            # For each step, dec_input => lstm_output => vocab_score
            for i in range(options.max_answer_len):
                # word_t is the embedded decoder input at step i, shape
                # [batch_size, embed_dim]. Under teacher forcing it must come
                # from decoder_inputs, not from the previously sampled word.
                word_t = decoder_inputs_emb[i]

                if i > 0:
                    variable_scope.get_variable_scope().reuse_variables()

                (state_t, context_t, coverage_t, attn_dist_t,
                 p_gen_t, output_t) = self.one_step_decoder(
                     state_t_1, context_t_1, coverage_t_1, word_t,
                     encoder_states, self.encoder_features, passage_word_idx,
                     passage_mask, v, w_c, vocab)
                coverages.append(coverage_t)
                attn_dists.append(attn_dist_t)
                p_gens.append(p_gen_t)
                vocab_scores.append(output_t)  # The vocabulary distributions.

                state_t_1 = state_t
                context_t_1 = context_t
                coverage_t_1 = coverage_t

                if mode_gen == 'greedy':
                    wordidx_t = tf.argmax(output_t, 1)  # [batch_size]
                    wordidx_t = tf.reshape(wordidx_t, [-1])  # [batch_size]
                elif mode_gen == 'sample':
                    log_score_t = tf.log(output_t)  # [batch_size, vsize]
                    wordidx_t = tf.multinomial(log_score_t,
                                               1)  # [batch_size, 1]
                    wordidx_t = tf.reshape(wordidx_t, [-1])  # [batch_size]
                elif mode_gen in ('ce_train', 'loss', 'transformer'):
                    # teacher forcing: feed the gold token for the next step
                    wordidx_t = answer_batch_unstack[i]
                else:
                    assert False, 'unknown generating mode %s' % mode_gen
                sampled_words.append(wordidx_t)

        if len(sampled_words) != 0:
            sampled_words = tf.stack(sampled_words,
                                     axis=1)  # [batch_size, max_dec_steps]

        vocab_scores = tf.stack(vocab_scores,
                                axis=1)  # [batch_size, max_dec_steps, vocab]
        # calculating loss
        self._loss = None
        if mode_gen in ('ce_train', 'loss', 'transformer'):
            xent = CE_loss(vocab_scores, answer_batch,
                           loss_weights)  # [batch_size]
            if mode_gen == 'loss':
                xent *= self.placeholders.reward  # multiply with rewards
            self._loss = tf.reduce_mean(xent)
            # Calculate coverage loss from the attention distributions
            if options.use_coverage:
                with tf.variable_scope('coverage_loss'):
                    self._coverage_loss = _coverage_loss(
                        attn_dists, loss_weights)
                self._loss = self._loss + options.cov_loss_wt * self._coverage_loss

        # accuracy is calculated only in modes where the true answer is given
        if mode_gen in ('ce_train', 'loss', 'transformer'):
            accuracy = _mask_and_accuracy(vocab_scores, answer_batch,
                                          loss_weights)
            return accuracy, self._loss, sampled_words
        else:
            return None, self._loss, sampled_words
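CE_loss is called above but not shown. A minimal sketch of a masked sequence cross-entropy with the same call shape follows; it assumes vocab_scores holds post-softmax distributions (the tf.log in the 'sample' branch suggests this) and that loss_weights zeroes out padding steps.

import tensorflow as tf

def CE_loss(vocab_scores, answers, loss_weights, eps=1e-10):
    """vocab_scores: (batch, steps, vsize) post-softmax distributions.
    answers: (batch, steps) int32 gold token ids.
    loss_weights: (batch, steps) 1.0 for real tokens, 0.0 for padding.
    Returns a per-example loss of shape (batch,)."""
    vsize = tf.shape(vocab_scores)[2]
    # Probability assigned to the gold token at each step.
    gold = tf.one_hot(answers, vsize, dtype=tf.float32)
    p_gold = tf.reduce_sum(vocab_scores * gold, axis=2)  # (batch, steps)
    step_loss = -tf.log(p_gold + eps) * loss_weights     # mask out padding
    # Average over the real (unmasked) steps of each example.
    return tf.reduce_sum(step_loss, axis=1) / tf.maximum(
        tf.reduce_sum(loss_weights, axis=1), 1.0)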
Example #4
    def encode(self, is_training=True):
        options = self.options

        # ======word representation layer======
        in_passage_repres = []
        input_dim = 0
        if options.with_word and self.word_vocab is not None:
            word_vec_trainable = True
            cur_device = '/gpu:0'
            if options.fix_word_vec:
                word_vec_trainable = False
                cur_device = '/cpu:0'
            with tf.variable_scope("embedding"), tf.device(cur_device):
                self.word_embedding = tf.get_variable(
                    "word_embedding",
                    trainable=word_vec_trainable,
                    initializer=tf.constant(self.word_vocab.word_vecs),
                    dtype=tf.float32)

            in_passage_word_repres = tf.nn.embedding_lookup(
                self.word_embedding, self.in_passage_words)

            ## Position encoding
            in_passage_word_repres += positional_encoding(
                in_passage_word_repres, options.max_answer_len)
            ## Position encoding

            # [batch_size, passage_len, word_dim]
            in_passage_repres.append(in_passage_word_repres)

            input_shape = tf.shape(self.in_passage_words)
            batch_size = input_shape[0]
            passage_len = input_shape[1]
            input_dim += self.word_vocab.word_dim

        if options.with_char and self.char_vocab is not None:
            input_shape = tf.shape(self.in_passage_chars)
            batch_size = input_shape[0]
            passage_len = input_shape[1]
            p_char_len = input_shape[2]
            char_dim = self.char_vocab.word_dim
            self.char_embedding = tf.get_variable(
                "char_embedding",
                initializer=tf.constant(self.char_vocab.word_vecs),
                dtype=tf.float32)
            in_passage_char_repres = tf.nn.embedding_lookup(
                self.char_embedding, self.in_passage_chars
            )  # [batch_size, passage_len, p_char_len, char_dim]
            in_passage_char_repres = tf.reshape(
                in_passage_char_repres, shape=[-1, p_char_len, char_dim])
            passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1])
            with tf.variable_scope('char_lstm'):
                # lstm cell
                char_lstm_cell = tf.contrib.rnn.BasicLSTMCell(
                    options.char_lstm_dim)
                # dropout
                if is_training:
                    char_lstm_cell = tf.contrib.rnn.DropoutWrapper(
                        char_lstm_cell,
                        output_keep_prob=(1 - options.dropout_rate))
                char_lstm_cell = tf.contrib.rnn.MultiRNNCell([char_lstm_cell])
                # passage representation
                passage_char_outputs = tf.nn.dynamic_rnn(
                    char_lstm_cell,
                    in_passage_char_repres,
                    sequence_length=passage_char_lengths,
                    dtype=tf.float32)[0]
                # [batch_size*passage_len, p_char_len, char_lstm_dim]
                passage_char_outputs = collect_final_step_lstm(
                    passage_char_outputs, passage_char_lengths - 1)
                passage_char_outputs = tf.reshape(
                    passage_char_outputs,
                    [batch_size, passage_len, options.char_lstm_dim])

            in_passage_repres.append(passage_char_outputs)
            input_dim += options.char_lstm_dim

        in_passage_repres = tf.concat(in_passage_repres,
                                      2)  # [batch_size, passage_len, dim]

        if options.compress_input:  # compress input word vector into smaller vectors
            w_compress = tf.get_variable(
                "w_compress_input", [input_dim, options.compress_input_dim],
                dtype=tf.float32)
            b_compress = tf.get_variable("b_compress_input",
                                         [options.compress_input_dim],
                                         dtype=tf.float32)

            in_passage_repres = tf.reshape(in_passage_repres, [-1, input_dim])
            in_passage_repres = tf.matmul(in_passage_repres,
                                          w_compress) + b_compress
            in_passage_repres = tf.tanh(in_passage_repres)
            in_passage_repres = tf.reshape(
                in_passage_repres,
                [batch_size, passage_len, options.compress_input_dim])
            input_dim = options.compress_input_dim

        if is_training:
            in_passage_repres = tf.nn.dropout(in_passage_repres,
                                              (1 - options.dropout_rate))
        else:
            in_passage_repres = tf.multiply(in_passage_repres,
                                            (1 - options.dropout_rate))

        passage_mask = tf.sequence_mask(
            self.passage_lengths, passage_len,
            dtype=tf.float32)  # [batch_size, passage_len]

        ## Blocks
        for i in range(options.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i),
                                   reuse=tf.AUTO_REUSE):
                # self-attention: queries come from the word-level
                # representations, keys/values from the full passage
                # representations (word + char)
                enc = multihead_attention(queries=in_passage_word_repres,
                                          keys=in_passage_repres,
                                          values=in_passage_repres,
                                          num_heads=options.num_heads,
                                          dropout_rate=options.dropout_rate,
                                          training=is_training,
                                          causality=False)
                # feed forward
                enc = ff(enc, num_units=[options.d_ff, options.d_model])
        ## Blocks
        memory = enc
        # sequential context matching
        passage_forward = None
        passage_backward = None
        all_passage_representation = []
        passage_dim = 0
        with_lstm = True
        if with_lstm:
            with tf.variable_scope('biLSTM'):
                cur_in_passage_repres = enc
                for i in range(options.context_layer_num):
                    with tf.variable_scope('layer-{}'.format(i)):
                        with tf.variable_scope('context_represent'):
                            # parameters
                            context_lstm_cell_fw = tf.contrib.rnn.LSTMCell(
                                options.context_lstm_dim)
                            context_lstm_cell_bw = tf.contrib.rnn.LSTMCell(
                                options.context_lstm_dim)
                            if is_training:
                                context_lstm_cell_fw = tf.contrib.rnn.DropoutWrapper(
                                    context_lstm_cell_fw,
                                    output_keep_prob=(1 -
                                                      options.dropout_rate))
                                context_lstm_cell_bw = tf.contrib.rnn.DropoutWrapper(
                                    context_lstm_cell_bw,
                                    output_keep_prob=(1 -
                                                      options.dropout_rate))

                            # passage representation
                            ((passage_context_representation_fw,
                              passage_context_representation_bw),
                             (passage_forward, passage_backward
                              )) = tf.nn.bidirectional_dynamic_rnn(
                                  context_lstm_cell_fw,
                                  context_lstm_cell_bw,
                                  cur_in_passage_repres,
                                  dtype=tf.float32,
                                  sequence_length=self.passage_lengths
                              )  # [batch_size, passage_len, context_lstm_dim]
                            if options.direction == 'forward':
                                # [batch_size, passage_len, context_lstm_dim]
                                cur_in_passage_repres = passage_context_representation_fw
                                passage_dim += options.context_lstm_dim
                            elif options.direction == 'backward':
                                # [batch_size, passage_len, context_lstm_dim]
                                cur_in_passage_repres = passage_context_representation_bw
                                passage_dim += options.context_lstm_dim
                            elif options.direction == 'bidir':
                                # [batch_size, passage_len, 2*context_lstm_dim]
                                cur_in_passage_repres = tf.concat([
                                    passage_context_representation_fw,
                                    passage_context_representation_bw
                                ], 2)
                                passage_dim += 2 * options.context_lstm_dim
                            else:
                                assert False, 'unknown direction %s' % options.direction
                            all_passage_representation.append(
                                cur_in_passage_repres)

        all_passage_representation = tf.concat(
            all_passage_representation,
            2)  # [batch_size, passage_len, passage_dim]

        if is_training:
            all_passage_representation = tf.nn.dropout(
                all_passage_representation, (1 - options.dropout_rate))
        else:
            all_passage_representation = tf.multiply(
                all_passage_representation, (1 - options.dropout_rate))

        # ======Highway layer======
        if options.with_match_highway:
            with tf.variable_scope("context_highway"):
                all_passage_representation = match_utils.multi_highway_layer(
                    all_passage_representation, passage_dim,
                    options.highway_layer_num)

        all_passage_representation = all_passage_representation * tf.expand_dims(
            passage_mask, axis=-1)

        # initial state for the LSTM decoder
        with tf.variable_scope('initial_state_for_decoder'):
            # Define weights and biases to reduce the cell and reduce the state
            w_reduce_c = tf.get_variable(
                'w_reduce_c',
                [2 * options.context_lstm_dim, options.gen_hidden_size],
                dtype=tf.float32)
            w_reduce_h = tf.get_variable(
                'w_reduce_h',
                [2 * options.context_lstm_dim, options.gen_hidden_size],
                dtype=tf.float32)
            bias_reduce_c = tf.get_variable('bias_reduce_c',
                                            [options.gen_hidden_size],
                                            dtype=tf.float32)
            bias_reduce_h = tf.get_variable('bias_reduce_h',
                                            [options.gen_hidden_size],
                                            dtype=tf.float32)

            old_c = tf.concat(values=[passage_forward.c, passage_backward.c],
                              axis=1)
            old_h = tf.concat(values=[passage_forward.h, passage_backward.h],
                              axis=1)
            new_c = tf.nn.tanh(tf.matmul(old_c, w_reduce_c) + bias_reduce_c)
            new_h = tf.nn.tanh(tf.matmul(old_h, w_reduce_h) + bias_reduce_h)

            init_state = tf.contrib.rnn.LSTMStateTuple(new_c, new_h)
        # Alternative: start the decoder from a zero state instead:
        # new_c = tf.zeros([batch_size, options.gen_hidden_size])
        # new_h = tf.zeros([batch_size, options.gen_hidden_size])
        # init_state = LSTMStateTuple(new_c, new_h)
        return (passage_dim, all_passage_representation, init_state, memory)
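collect_final_step_lstm in Example #4 gathers each sequence's output at its last valid timestep (the caller already passes passage_char_lengths - 1, i.e. zero-based indices). Below is a minimal sketch using tf.gather_nd; the exact original signature is an assumption.

import tensorflow as tf

def collect_final_step_lstm(lstm_outputs, positions):
    """lstm_outputs: (batch, time, dim); positions: (batch,) zero-based
    indices of the last valid timestep. Returns (batch, dim)."""
    batch_size = tf.shape(lstm_outputs)[0]
    indices = tf.stack(
        [tf.range(batch_size), tf.cast(positions, tf.int32)], axis=1)
    return tf.gather_nd(lstm_outputs, indices)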