Example #1
    def _build_graph(self):
        self.context_word = tf.placeholder(tf.int32, [None, None])
        self.context_char = tf.placeholder(tf.int32, [None, None, None])
        self.context_len = tf.placeholder(tf.int32, [None])
        self.question_word = tf.placeholder(tf.int32, [None, None])
        self.question_char = tf.placeholder(tf.int32, [None, None, None])
        self.question_len = tf.placeholder(tf.int32, [None])
        self.answer_start = tf.placeholder(tf.int32, [None])
        self.answer_end = tf.placeholder(tf.int32, [None])
        self.training = tf.placeholder(tf.bool, [])

        self.question_tokens = tf.placeholder(tf.string, [None, None])
        self.context_tokens = tf.placeholder(tf.string, [None, None])
        if self.enable_na_answer:
            self.na = tf.placeholder(tf.int32, [None])
        # 1. Word encoding
        word_embedding = Embedding(pretrained_embedding=self.pretrained_word_embedding,
                                   embedding_shape=(len(self.vocab.get_word_vocab()) + 1, self.word_embedding_size),
                                   trainable=self.word_embedding_trainable)
        char_embedding = Embedding(embedding_shape=(len(self.vocab.get_char_vocab()) + 1, self.char_embedding_size),
                                   trainable=True, init_scale=0.2)


        # 1.1 Embedding
        context_word_repr = word_embedding(self.context_word)
        context_char_repr = char_embedding(self.context_char)
        question_word_repr = word_embedding(self.question_word)
        question_char_repr = char_embedding(self.question_char)

        # 1.2 Char convolution
        dropout = Dropout(self.keep_prob)
        conv1d = Conv1DAndMaxPooling(self.char_conv_filters, self.char_conv_kernel_size)
        context_char_repr = dropout(conv1d(context_char_repr), self.training)
        question_char_repr = dropout(conv1d(question_char_repr), self.training)

        # ELMo embedding
        if self.use_elmo:
            elmo_emb = ElmoEmbedding(local_path=self.elmo_local_path)
            context_elmo_repr = elmo_emb(self.context_tokens, self.context_len)
            context_elmo_repr = dropout(context_elmo_repr, self.training)
            question_elmo_repr = elmo_emb(self.question_tokens, self.question_len)
            question_elmo_repr = dropout(question_elmo_repr, self.training)
        # Concatenate word and char representations
        context_repr = tf.concat([context_word_repr, context_char_repr], axis=-1)
        question_repr = tf.concat([question_word_repr, question_char_repr], axis=-1)
        if self.use_elmo:
            context_repr = tf.concat([context_repr, context_elmo_repr], axis=-1)
            question_repr = tf.concat([question_repr, question_elmo_repr], axis=-1)

        # 1.3 Highway network
        highway1 = Highway()
        highway2 = Highway()
        context_repr = highway2(highway1(context_repr))
        question_repr = highway2(highway1(question_repr))

        # 2. Phrase encoding
        phrase_lstm = CudnnBiLSTM(self.rnn_hidden_size)
        context_repr, _ = phrase_lstm(dropout(context_repr, self.training), self.context_len)
        question_repr, _ = phrase_lstm(dropout(question_repr, self.training), self.question_len)

        # 3. Bi-Attention
        bi_attention = BiAttention(TriLinear())
        c2q, q2c = bi_attention(context_repr, question_repr, self.context_len, self.question_len)

        # 4. Modeling layer
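        # BiDAF-style merge: context, context-to-question attention, and their
        # element-wise interactions ([B, context_len, 8 * rnn_hidden_size]).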
        final_merged_context = tf.concat([context_repr, c2q, context_repr * c2q, context_repr * q2c], axis=-1)
        modeling_lstm1 = CudnnBiLSTM(self.rnn_hidden_size)
        modeling_lstm2 = CudnnBiLSTM(self.rnn_hidden_size)
        modeled_context1, _ = modeling_lstm1(dropout(final_merged_context, self.training), self.context_len)
        modeled_context2, _ = modeling_lstm2(dropout(modeled_context1, self.training), self.context_len)
        modeled_context = modeled_context1 + modeled_context2

        # 5. Start prediction
        start_pred_layer = tf.keras.layers.Dense(1, use_bias=False)
        start_logits = start_pred_layer(
            dropout(tf.concat([final_merged_context, modeled_context], axis=-1), self.training))
        start_logits = tf.squeeze(start_logits, axis=-1)
        self.start_prob = masked_softmax(start_logits, self.context_len)

        # 6. End prediction
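        # Summarize the modeled context under the start distribution, then broadcast
        # that summary over all positions so the end predictor can condition on it.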
        start_repr = weighted_sum(modeled_context, self.start_prob)
        tiled_start_repr = tf.tile(tf.expand_dims(start_repr, axis=1), [1, tf.shape(modeled_context)[1], 1])
        end_lstm = CudnnBiLSTM(self.rnn_hidden_size)
        encoded_end_repr, _ = end_lstm(dropout(tf.concat(
            [final_merged_context, modeled_context, tiled_start_repr, modeled_context * tiled_start_repr], axis=-1),
            self.training),
            self.context_len)
        end_pred_layer = tf.keras.layers.Dense(1, use_bias=False)
        end_logits = end_pred_layer(dropout(tf.concat(
            [final_merged_context, encoded_end_repr], axis=-1), self.training))
        end_logits = tf.squeeze(end_logits, axis=-1)
        self.end_prob = masked_softmax(end_logits, self.context_len)

        # 7. Loss and input/output dict
        if self.enable_na_answer:
            self.na_bias = tf.get_variable("na_bias", shape=[1], dtype=tf.float32)
            self.na_bias_tiled = tf.tile(tf.reshape(self.na_bias, [1, 1]), [tf.shape(self.context_word)[0], 1])
            self.concat_start_na_logits = tf.concat([self.na_bias_tiled, start_logits], axis=-1)
            concat_start_na_prob = masked_softmax(self.concat_start_na_logits, self.context_len + 1)
            self.na_prob = tf.squeeze(tf.slice(concat_start_na_prob, [0, 0], [-1, 1]), axis=1)
            self.start_prob = tf.slice(concat_start_na_prob, [0, 1], [-1, -1])
            self.concat_end_na_logits = tf.concat([self.na_bias_tiled, end_logits], axis=-1)
            concat_end_na_prob = masked_softmax(self.concat_end_na_logits, self.context_len + 1)
            self.na_prob2 = tf.squeeze(tf.slice(concat_end_na_prob, [0, 0], [-1, 1]), axis=1)
            self.end_prob = tf.slice(concat_end_na_prob, [0, 1], [-1, -1])

            # Build one-hot span labels, zero them out for no-answer examples,
            # and prepend the no-answer indicator as an extra position 0.
            max_len = tf.reduce_max(self.context_len)
            na = tf.cast(tf.expand_dims(self.na, axis=-1), tf.float32)
            start_label = (1.0 - na) * tf.cast(tf.one_hot(self.answer_start, max_len), tf.float32)
            start_na_label = tf.concat([na, start_label], axis=-1)
            self.start_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=mask_logits(self.concat_start_na_logits, self.context_len + 1),
                labels=start_na_label))
            end_label = (1.0 - na) * tf.cast(tf.one_hot(self.answer_end, max_len), tf.float32)
            end_na_label = tf.concat([na, end_label], axis=-1)
            self.end_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=mask_logits(self.concat_end_na_logits, self.context_len + 1),
                labels=end_na_label))
        else:

            self.start_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=mask_logits(start_logits, self.context_len),
                                                               labels=self.answer_start))
            self.end_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=mask_logits(end_logits, self.context_len),
                                                               labels=self.answer_end))
        self.loss = self.start_loss + self.end_loss
        global_step = tf.train.get_or_create_global_step()
        input_dict = {
            "context_word": self.context_word,
            "context_char": self.context_char,
            "context_len": self.context_len,
            "question_word": self.question_word,
            "question_char": self.question_char,
            "question_len": self.question_len,
            "answer_start": self.answer_start,
            "answer_end": self.answer_end,
            "training": self.training
        }
        if self.use_elmo:
            input_dict['context_tokens'] = self.context_tokens
            input_dict['question_tokens'] = self.question_tokens
        if self.enable_na_answer:
            input_dict["is_impossible"] = self.na
        self.input_placeholder_dict = OrderedDict(input_dict)

        output_dict = {
            "start_prob": self.start_prob,
            "end_prob": self.end_prob
        }
        if self.enable_na_answer:
            output_dict['na_prob'] = self.na_prob*self.na_prob2

        self.output_variable_dict = OrderedDict(output_dict)

        # 8. Metrics and summary
        with tf.variable_scope("train_metrics"):
            self.train_metrics = {
                'loss': tf.metrics.mean(self.loss)
            }

        self.train_update_metrics = tf.group(*[op for _, op in self.train_metrics.values()])
        metric_variables = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="train_metrics")
        self.train_metric_init_op = tf.variables_initializer(metric_variables)

        with tf.variable_scope("eval_metrics"):
            self.eval_metrics = {
                'loss': tf.metrics.mean(self.loss)
            }

        self.eval_update_metrics = tf.group(*[op for _, op in self.eval_metrics.values()])
        metric_variables = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="eval_metrics")
        self.eval_metric_init_op = tf.variables_initializer(metric_variables)

        tf.summary.scalar('loss', self.loss)
        self.summary_op = tf.summary.merge_all()
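The prediction layers above rely on three small helpers from the surrounding toolkit (masked_softmax, mask_logits, weighted_sum) whose definitions are not part of this listing. The sketch below is one plausible, minimal implementation, assuming padded batches and a large negative masking constant; the toolkit's own versions may differ in detail.

import tensorflow as tf

VERY_NEGATIVE_NUMBER = -1e30  # assumed masking constant


def mask_logits(logits, sequence_len):
    # Push logits at padded positions toward -inf so softmax ignores them.
    mask = tf.sequence_mask(sequence_len, tf.shape(logits)[1], dtype=tf.float32)
    return logits + (1.0 - mask) * VERY_NEGATIVE_NUMBER


def masked_softmax(logits, sequence_len):
    # Softmax restricted to the valid (unpadded) positions of each sequence.
    return tf.nn.softmax(mask_logits(logits, sequence_len), axis=-1)


def weighted_sum(sequence, prob):
    # [B, T, D] pooled with weights [B, T] -> [B, D], e.g. the modeled context
    # pooled with the start-position distribution.
    return tf.reduce_sum(sequence * tf.expand_dims(prob, axis=-1), axis=1)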
Example #2
    def _build_graph(self):
        # build input
        self.training = tf.placeholder(tf.bool, [])
        self.context_word = tf.placeholder(tf.int32, [None, None])
        self.slice_context_len = tf.where(self.training, 400, 1000)
        self.slice_question_len = tf.where(self.training, 50, 100)
        self.context_char = tf.placeholder(tf.int32, [None, None, None])
        batch_size = tf.shape(self.context_char)[0]
        original_context_len = tf.shape(self.context_char)[1]
        max_context_word_len = tf.shape(self.context_char)[2]

        slice_context_word = tf.slice(self.context_word, [0, 0],
                                      [batch_size, tf.minimum(self.slice_context_len, original_context_len)])
        slice_context_char = tf.slice(self.context_char, [0, 0, 0],
                                      [batch_size, tf.minimum(self.slice_context_len, original_context_len),
                                       max_context_word_len])
        self.context_len = tf.placeholder(tf.int32, [None])
        self.question_word = tf.placeholder(tf.int32, [None, None])
        self.question_char = tf.placeholder(tf.int32, [None, None, None])
        original_question_len, max_question_word_len = tf.shape(self.question_char)[1], tf.shape(self.question_char)[2]
        slice_question_word = tf.slice(self.question_word, [0, 0],
                                       [batch_size, tf.minimum(self.slice_question_len, original_question_len)])
        slice_question_char = tf.slice(self.question_char, [0, 0, 0],
                                       [batch_size, tf.minimum(self.slice_question_len, original_question_len),
                                        max_question_word_len])
        self.question_len = tf.placeholder(tf.int32, [None])
        self.answer_start = tf.placeholder(tf.int32, [None])
        self.answer_end = tf.placeholder(tf.int32, [None])

        max_context_len = tf.shape(slice_context_word)[1]
        max_question_len = tf.shape(slice_question_word)[1]
        slice_context_len = tf.clip_by_value(self.context_len, 0, max_context_len)
        slice_question_len = tf.clip_by_value(self.question_len, 0, max_question_len)
        context_mask = (tf.sequence_mask(slice_context_len, max_context_len, dtype=tf.float32) - 1) * 100
        question_mask = (tf.sequence_mask(slice_question_len, max_question_len, dtype=tf.float32) - 1) * 100
        divisors = tf.pow(tf.constant([10000.0] * (self.filters // 2), dtype=tf.float32),
                          tf.range(0, self.filters, 2, dtype=tf.float32) / self.filters)
        quotients = tf.cast(tf.expand_dims(tf.range(0, max_context_len), -1), tf.float32) / tf.expand_dims(divisors, 0)
        position_repr = tf.concat([tf.sin(quotients), tf.cos(quotients)], -1)  # CL*F
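        # position_repr[pos, i] = sin(pos / 10000^(2i/F)) for the first F/2 columns
        # and cos(pos / 10000^(2i/F)) for the remaining columns (F = self.filters).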
        # 1. Word encoding
        word_embedding = Embedding(pretrained_embedding=self.pretrained_word_embedding,
                                   embedding_shape=(len(self.vocab.get_word_vocab()) + 1, self.word_embedding_size),
                                   trainable=self.word_embedding_trainable)
        char_embedding = Embedding(embedding_shape=(len(self.vocab.get_char_vocab()) + 1, self.char_embedding_size),
                                   trainable=True, init_scale=0.2)
        dropout = Dropout(self.keep_prob)
        # 1.1 Embedding
        context_word_embedding = Dropout(0.9)(word_embedding(slice_context_word), self.training)  # B*CL*WD
        context_char_embedding = Dropout(0.95)(char_embedding(slice_context_char), self.training)  # B*CL*WL*CD
        question_word_embedding = Dropout(0.9)(word_embedding(slice_question_word), self.training)  # B*QL*WD
        question_char_embedding = Dropout(0.95)(char_embedding(slice_question_char), self.training)  # B*QL*WL*CD

        char_cnn = Conv1DAndMaxPooling(self.char_filters, self.kernel_size1, padding='same', activation=tf.nn.relu)
        embedding_dense = tf.keras.layers.Dense(self.filters)
        highway = Highway(gate_activation=tf.nn.relu, trans_activation=tf.nn.tanh,
                          hidden_units=self.filters, keep_prob=self.keep_prob)

        context_char_repr = tf.reshape(char_cnn(context_char_embedding),
                                       [-1, max_context_len, self.char_filters])  # B*CL*CF
        context_repr = highway(dropout(embedding_dense(tf.concat([context_word_embedding, context_char_repr], -1)),
                                       self.training), self.training)  # B*CL*F

        question_char_repr = tf.reshape(char_cnn(question_char_embedding),
                                        [-1, max_question_len, self.char_filters])  # B*QL*CF
        question_repr = highway(dropout(embedding_dense(tf.concat([question_word_embedding, question_char_repr], -1)),
                                        self.training), self.training)  # B*QL*F

        # 1.2 Embedding Encoder
        embedding_encoder = EncoderBlock(self.kernel_size1, self.filters, self.conv_layers1, self.heads, self.keep_prob)
        embedding_context = tf.contrib.layers.layer_norm(
            embedding_encoder(context_repr + position_repr, self.training, context_mask),
            begin_norm_axis=-1)  # B*CL*F
        embedding_question = tf.contrib.layers.layer_norm(
            embedding_encoder(question_repr + position_repr[:max_question_len], self.training, question_mask),
            begin_norm_axis=-1)  # B*QL*F

        # 1.3 co-attention
        co_attention_context = tf.keras.layers.Dense(1)(embedding_context)  # B*CL*1
        co_attention_question = tf.keras.layers.Dense(1)(embedding_question)  # B*QL*1
        cq = tf.matmul(tf.expand_dims(tf.transpose(embedding_context, [0, 2, 1]), -1),
                       tf.expand_dims(tf.transpose(embedding_question, [0, 2, 1]), -2))  # B*F*CL*QL
        cq_score = (tf.keras.layers.Dense(1)(tf.transpose(cq, [0, 2, 3, 1]))[:, :, :, 0]
                    + co_attention_context + tf.transpose(co_attention_question, [0, 2, 1]))  # B*CL*QL
        question_similarity = tf.nn.softmax(cq_score + tf.expand_dims(question_mask, -2), 2)  # B*CL*QL
        context_similarity = tf.nn.softmax(cq_score + tf.expand_dims(context_mask, -1), 1)  # B*CL*QL
        cqa = tf.matmul(question_similarity, embedding_question)  # B*CL*F
        qca = tf.matmul(question_similarity, tf.matmul(context_similarity, embedding_context, transpose_a=True))  # B*CL*F
        co_attention_output = dropout(
            tf.keras.layers.Dense(self.filters)(
                tf.concat([embedding_context, cqa, embedding_context * cqa, embedding_context * qca], -1)),
            self.training)  # B*CL*F

        # 1.4 Model Encoder
        model_encoder_blocks = [EncoderBlock(self.kernel_size2, self.filters, self.conv_layers2, self.heads, self.keep_prob)
                                for _ in range(self.model_encoder_layers)]
        m0 = co_attention_output
        for model_encoder_block in model_encoder_blocks:
            m0 = model_encoder_block(m0 + position_repr, self.training, context_mask)
        m1 = m0
        for model_encoder_block in model_encoder_blocks:
            m1 = model_encoder_block(m1 + position_repr, self.training, context_mask)
        m2 = m1
        for model_encoder_block in model_encoder_blocks:
            m2 = model_encoder_block(m2 + position_repr, self.training, context_mask)
        norm_m0 = tf.contrib.layers.layer_norm(m0, begin_norm_axis=-1)
        norm_m1 = tf.contrib.layers.layer_norm(m1, begin_norm_axis=-1)
        norm_m2 = tf.contrib.layers.layer_norm(m2, begin_norm_axis=-1)
        # Predict start position
        start_logits = tf.keras.layers.Dense(1)(tf.concat([norm_m0, norm_m1], -1))[:, :, 0] + context_mask
        self.start_prob = tf.nn.softmax(start_logits, -1)
        self.start_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
            logits=start_logits, labels=tf.one_hot(self.answer_start, max_context_len)))

        # Predict end position
        end_logits = tf.keras.layers.Dense(1)(tf.concat([norm_m0, norm_m2], -1))[:, :, 0] + context_mask
        self.end_prob = tf.nn.softmax(end_logits, -1)
        self.end_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
            logits=end_logits, labels=tf.one_hot(self.answer_end, max_context_len)))

        self.loss = self.start_loss + self.end_loss
        self.global_step = tf.train.get_or_create_global_step()
        input_dict = {
            "context_word": self.context_word,
            "context_char": self.context_char,
            "context_len": self.context_len,
            "question_word": self.question_word,
            "question_char": self.question_char,
            "question_len": self.question_len,
            "answer_start": self.answer_start,
            "answer_end": self.answer_end,
            "training": self.training
        }


        self.input_placeholder_dict = OrderedDict(input_dict)

        self.output_variable_dict = OrderedDict({
            "start_prob": self.start_prob,
            "end_prob": self.end_prob
        })

        # 8. Metrics and summary
        with tf.variable_scope("train_metrics"):
            self.train_metrics = {
                'loss': tf.metrics.mean(self.loss)
            }

        self.train_update_metrics = tf.group(*[op for _, op in self.train_metrics.values()])
        metric_variables = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="train_metrics")
        self.train_metric_init_op = tf.variables_initializer(metric_variables)

        with tf.variable_scope("eval_metrics"):
            self.eval_metrics = {
                'loss': tf.metrics.mean(self.loss)
            }

        self.eval_update_metrics = tf.group(*[op for _, op in self.eval_metrics.values()])
        metric_variables = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="eval_metrics")
        self.eval_metric_init_op = tf.variables_initializer(metric_variables)

        tf.summary.scalar('loss', self.loss)
        self.summary_op = tf.summary.merge_all()
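For reference, the inline divisors/quotients computation above is a sinusoidal position encoding. Below is a minimal standalone rewrite for illustration only, with length and filters standing in for max_context_len and self.filters; it is not part of the original model class.

import tensorflow as tf


def position_encoding(length, filters):
    # Returns [length, filters]: sin terms fill the first filters/2 columns and
    # cos terms the remaining columns, matching the inline computation above.
    divisors = tf.pow(tf.constant([10000.0] * (filters // 2), dtype=tf.float32),
                      tf.range(0, filters, 2, dtype=tf.float32) / filters)
    quotients = tf.cast(tf.expand_dims(tf.range(0, length), -1), tf.float32) / tf.expand_dims(divisors, 0)
    return tf.concat([tf.sin(quotients), tf.cos(quotients)], axis=-1)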
Example #3
    def _build_graph(self):
        self.context_word = tf.placeholder(tf.int32, [None, None])
        self.context_len = tf.placeholder(tf.int32, [None])

        self.context_char = tf.placeholder(tf.int32, [None, None, None])
        self.context_word_len = tf.placeholder(tf.int32, [None, None])

        self.question_word = tf.placeholder(tf.int32, [None, None])
        self.question_len = tf.placeholder(tf.int32, [None])

        self.question_char = tf.placeholder(tf.int32, [None, None, None])
        self.question_word_len = tf.placeholder(tf.int32, [None, None])

        self.answer_start = tf.placeholder(tf.int32, [None])
        self.answer_end = tf.placeholder(tf.int32, [None])
        self.abstractive_answer_mask = tf.placeholder(tf.int32, [None, self.abstractive_answer_num])
        self.training = tf.placeholder(tf.bool, [])
        
        self.question_tokens = tf.placeholder(tf.string, [None, None])
        self.context_tokens = tf.placeholder(tf.string,[None,None])

        # 1. Word encoding
        word_embedding = Embedding(pretrained_embedding=self.pretrained_word_embedding,
                                   embedding_shape=(len(self.vocab.get_word_vocab()) + 1, self.word_embedding_size),
                                   trainable=self.word_embedding_trainable)
        char_embedding = Embedding(embedding_shape=(len(self.vocab.get_char_vocab()) + 1, self.char_embedding_size), trainable=True, init_scale=0.05)

        # 1.1 Embedding
        dropout = Dropout(self.keep_prob)
        context_word_repr = word_embedding(self.context_word)
        context_char_repr = char_embedding(self.context_char)
        question_word_repr = word_embedding(self.question_word)
        question_char_repr = char_embedding(self.question_char)
        if self.use_elmo:
            elmo_emb = ElmoEmbedding(local_path=self.elmo_local_path)
            context_elmo_repr = elmo_emb(self.context_tokens,self.context_len)
            context_elmo_repr = dropout(context_elmo_repr,self.training)
            question_elmo_repr = elmo_emb(self.question_tokens,self.question_len)
            question_elmo_repr = dropout(question_elmo_repr,self.training)

        # 1.2 Char convolution
        conv1d = Conv1DAndMaxPooling(self.char_conv_filters, self.char_conv_kernel_size)
        if self.max_pooling_mask:
            question_char_repr = conv1d(dropout(question_char_repr, self.training), self.question_word_len)
            context_char_repr = conv1d(dropout(context_char_repr, self.training), self.context_word_len)
        else:
            question_char_repr = conv1d(dropout(question_char_repr, self.training))
            context_char_repr = conv1d(dropout(context_char_repr, self.training))

        # 2. Phrase encoding
        context_embs = [context_word_repr, context_char_repr]
        question_embs = [question_word_repr, question_char_repr]
        if self.use_elmo:
            context_embs.append(context_elmo_repr)
            question_embs.append(question_elmo_repr)

        context_repr = tf.concat(context_embs, axis=-1)
        question_repr = tf.concat(question_embs, axis=-1)

        variational_dropout = VariationalDropout(self.keep_prob)
        emb_enc_gru = CudnnBiGRU(self.rnn_hidden_size)

        context_repr = variational_dropout(context_repr, self.training)
        context_repr, _ = emb_enc_gru(context_repr, self.context_len)
        context_repr = variational_dropout(context_repr,  self.training)

        question_repr = variational_dropout(question_repr, self.training)
        question_repr, _ = emb_enc_gru(question_repr, self.question_len)
        question_repr = variational_dropout(question_repr, self.training)

        # 3. Bi-Attention
        bi_attention = BiAttention(TriLinear(bias=True, name="bi_attention_tri_linear"))
        c2q, q2c = bi_attention(context_repr, question_repr, self.context_len, self.question_len)
        context_repr = tf.concat([context_repr, c2q, context_repr * c2q, context_repr * q2c], axis=-1)

        # 4. Self-Attention layer
        dense1 = tf.keras.layers.Dense(self.rnn_hidden_size*2, use_bias=True, activation=tf.nn.relu)
        gru = CudnnBiGRU(self.rnn_hidden_size)
        dense2 = tf.keras.layers.Dense(self.rnn_hidden_size*2, use_bias=True, activation=tf.nn.relu)
        self_attention = SelfAttention(TriLinear(bias=True, name="self_attention_tri_linear"))

        inputs = dense1(context_repr)
        outputs = variational_dropout(inputs, self.training)
        outputs, _ = gru(outputs, self.context_len)
        outputs = variational_dropout(outputs, self.training)
        c2c = self_attention(outputs, self.context_len)
        outputs = tf.concat([c2c, outputs, c2c * outputs], axis=len(c2c.shape)-1)
        outputs = dense2(outputs)
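        # Residual connection: add the block input (dense1 output) back onto the
        # self-attended output before the final variational dropout.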
        context_repr = inputs + outputs
        context_repr = variational_dropout(context_repr, self.training)
        
        # 5. Modeling layer
        sum_max_encoding = SumMaxEncoder()
        context_modeling_gru1 = CudnnBiGRU(self.rnn_hidden_size)
        context_modeling_gru2 = CudnnBiGRU(self.rnn_hidden_size)
        question_modeling_gru1 = CudnnBiGRU(self.rnn_hidden_size)
        question_modeling_gru2 = CudnnBiGRU(self.rnn_hidden_size)
        self.max_context_len = tf.reduce_max(self.context_len)
        self.max_question_len = tf.reduce_max(self.question_len)

        modeled_context1, _ = context_modeling_gru1(context_repr, self.context_len)
        modeled_context2, _ = context_modeling_gru2(tf.concat([context_repr, modeled_context1], axis=2), self.context_len)
        encoded_context = sum_max_encoding(modeled_context1, self.context_len, self.max_context_len)
        modeled_question1, _ = question_modeling_gru1(question_repr, self.question_len)
        modeled_question2, _ = question_modeling_gru2(tf.concat([question_repr, modeled_question1], axis=2), self.question_len)
        encoded_question = sum_max_encoding(modeled_question2, self.question_len, self.max_question_len)
        
        # 6. Predictions
        start_dense = tf.keras.layers.Dense(1, activation=None)
        start_logits = tf.squeeze(start_dense(modeled_context1), axis=[2])
        start_logits = mask_logits(start_logits, self.context_len)

        end_dense = tf.keras.layers.Dense(1, activation=None)
        end_logits = tf.squeeze(end_dense(modeled_context2), axis=[2])
        end_logits = mask_logits(end_logits, self.context_len)

        abstractive_answer_logits = None
        if self.abstractive_answer_num != 0:
            abstractive_answer_logits = []
            for i in range(self.abstractive_answer_num):
                tri_linear = TriLinear(name="cls"+str(i))
                abstractive_answer_logits.append(tf.squeeze(tri_linear(encoded_context, encoded_question), axis=[2]))
            abstractive_answer_logits = tf.concat(abstractive_answer_logits, axis=-1)

        # 7. Loss and input/output dict
        seq_length = tf.shape(start_logits)[1]
        start_mask = tf.one_hot(self.answer_start, depth=seq_length, dtype=tf.float32) 
        end_mask = tf.one_hot(self.answer_end, depth=seq_length, dtype=tf.float32) 
        if self.abstractive_answer_num != 0:
            abstractive_answer_mask = tf.cast(self.abstractive_answer_mask, dtype=tf.float32)
            extractive_mask = 1. - tf.reduce_max(abstractive_answer_mask, axis=-1, keepdims=True)
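            # Examples tagged as abstractive get their span labels zeroed, so all
            # probability mass falls on the appended abstractive slots instead.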
            start_mask = extractive_mask * start_mask
            end_mask = extractive_mask * end_mask

            concated_start_masks = tf.concat([start_mask, abstractive_answer_mask], axis=1)
            concated_end_masks = tf.concat([end_mask, abstractive_answer_mask], axis=1)

            concated_start_logits = tf.concat([start_logits, abstractive_answer_logits], axis=1)
            concated_end_logits = tf.concat([end_logits, abstractive_answer_logits], axis=1)
        else:
            concated_start_masks = start_mask
            concated_end_masks = end_mask

            concated_start_logits = start_logits
            concated_end_logits = end_logits

        start_log_norm = tf.reduce_logsumexp(concated_start_logits, axis=1)
        start_log_score = tf.reduce_logsumexp(
            concated_start_logits + VERY_NEGATIVE_NUMBER * (1.0 - tf.cast(concated_start_masks, tf.float32)), axis=1)
        self.start_loss = tf.reduce_mean(-(start_log_score - start_log_norm))

        end_log_norm = tf.reduce_logsumexp(concated_end_logits, axis=1)
        end_log_score = tf.reduce_logsumexp(
            concated_end_logits + VERY_NEGATIVE_NUMBER * (1.0 - tf.cast(concated_end_masks, tf.float32)), axis=1)
        self.end_loss = tf.reduce_mean(-(end_log_score - end_log_norm))

        self.loss = self.start_loss + self.end_loss
        global_step = tf.train.get_or_create_global_step()

        self.input_placeholder_dict = OrderedDict({
            "context_word": self.context_word,
            "question_word": self.question_word,
            "context_char": self.context_char,
            "question_char": self.question_char,
            "context_len": self.context_len,
            "question_len": self.question_len,
            "answer_start": self.answer_start,
            "answer_end": self.answer_end,
            "training": self.training
        })
        if self.max_pooling_mask:
            self.input_placeholder_dict['context_word_len'] = self.context_word_len
            self.input_placeholder_dict['question_word_len'] = self.question_word_len
        if self.use_elmo:
            self.input_placeholder_dict['context_tokens'] = self.context_tokens
            self.input_placeholder_dict['question_tokens'] = self.question_tokens
        if self.abstractive_answer_num != 0:
            self.input_placeholder_dict["abstractive_answer_mask"] = self.abstractive_answer_mask

        self.output_variable_dict = OrderedDict({
            "start_logits": start_logits,
            "end_logits": end_logits,
        })
        if self.abstractive_answer_num != 0:
            self.output_variable_dict["abstractive_answer_logits"] = abstractive_answer_logits

        # 8. Metrics and summary
        with tf.variable_scope("train_metrics"):
            self.train_metrics = {
                'loss': tf.metrics.mean(self.loss)
            }

        self.train_update_metrics = tf.group(*[op for _, op in self.train_metrics.values()])
        metric_variables = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="train_metrics")
        self.train_metric_init_op = tf.variables_initializer(metric_variables)

        with tf.variable_scope("eval_metrics"):
            self.eval_metrics = {
                'loss': tf.metrics.mean(self.loss)
            }

        self.eval_update_metrics = tf.group(*[op for _, op in self.eval_metrics.values()])
        metric_variables = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="eval_metrics")
        self.eval_metric_init_op = tf.variables_initializer(metric_variables)

        tf.summary.scalar('loss', self.loss)
        self.summary_op = tf.summary.merge_all()
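The start/end losses above are a marginal (multi-position) log-likelihood: the penalty is the negative log of the total probability mass the model puts on the allowed positions, which covers both the extractive span and any appended abstractive slots. A minimal standalone sketch of that loss follows, mirroring the logsumexp computation above; VERY_NEGATIVE_NUMBER is assumed to be a large negative constant such as -1e30.

import tensorflow as tf

VERY_NEGATIVE_NUMBER = -1e30  # assumed value


def marginal_log_loss(logits, answer_mask):
    # logits: [B, T]; answer_mask: [B, T] with 1.0 at every allowed answer position.
    log_norm = tf.reduce_logsumexp(logits, axis=1)
    log_score = tf.reduce_logsumexp(
        logits + VERY_NEGATIVE_NUMBER * (1.0 - tf.cast(answer_mask, tf.float32)), axis=1)
    return tf.reduce_mean(-(log_score - log_norm))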
Example #4
    def _build_graph(self):
        self.context_word = tf.placeholder(tf.int32, [None, None])
        self.context_len = tf.placeholder(tf.int32, [None])

        self.context_char = tf.placeholder(tf.int32, [None, None, None])
        self.context_word_len = tf.placeholder(tf.int32, [None, None])

        self.question_word = tf.placeholder(tf.int32, [None, None])
        self.question_len = tf.placeholder(tf.int32, [None])

        self.question_char = tf.placeholder(tf.int32, [None, None, None])
        self.question_word_len = tf.placeholder(tf.int32, [None, None])

        self.answer_start = tf.placeholder(tf.int32, [None])
        self.answer_end = tf.placeholder(tf.int32, [None])
        self.training = tf.placeholder(tf.bool, [])

        # 1. Word encoding
        word_embedding = Embedding(
            pretrained_embedding=self.pretrained_word_embedding,
            embedding_shape=(len(self.vocab.get_word_vocab()) + 1,
                             self.word_embedding_size),
            trainable=self.word_embedding_trainable)
        char_embedding = Embedding(
            embedding_shape=(len(self.vocab.get_char_vocab()) + 1,
                             self.char_embedding_size),
            trainable=True,
            init_scale=0.05)

        # 1.1 Embedding
        context_word_repr = word_embedding(self.context_word)
        context_char_repr = char_embedding(self.context_char)
        question_word_repr = word_embedding(self.question_word)
        question_char_repr = char_embedding(self.question_char)

        self.context_word_repr = context_word_repr
        self.question_word_repr = question_word_repr
        self.context_char_repr = context_char_repr
        self.question_char_repr = question_char_repr

        # 1.2 Char convolution
        dropout = Dropout(self.keep_prob)
        conv1d = Conv1DAndMaxPooling(self.char_conv_filters,
                                     self.char_conv_kernel_size)
        question_char_repr = conv1d(dropout(question_char_repr, self.training),
                                    self.question_word_len)
        context_char_repr = conv1d(dropout(context_char_repr, self.training),
                                   self.context_word_len)

        # 2. Phrase encoding
        context_repr = tf.concat([context_word_repr, context_char_repr],
                                 axis=-1)
        question_repr = tf.concat([question_word_repr, question_char_repr],
                                  axis=-1)

        variational_dropout = VariationalDropout(self.keep_prob)
        emb_enc_gru = CudnnBiGRU(self.rnn_hidden_size)

        context_repr = variational_dropout(context_repr, self.training)
        context_repr, _ = emb_enc_gru(context_repr, self.context_len)
        context_repr = variational_dropout(context_repr, self.training)

        question_repr = variational_dropout(question_repr, self.training)
        question_repr, _ = emb_enc_gru(question_repr, self.question_len)
        question_repr = variational_dropout(question_repr, self.training)

        # 3. Bi-Attention
        bi_attention = BiAttention(
            TriLinear(bias=True, name="bi_attention_tri_linear"))
        c2q, q2c = bi_attention(context_repr, question_repr, self.context_len,
                                self.question_len)
        context_repr = tf.concat(
            [context_repr, c2q, context_repr * c2q, context_repr * q2c],
            axis=-1)

        # 4. Self-Attention layer
        dense1 = tf.keras.layers.Dense(self.rnn_hidden_size * 2,
                                       use_bias=True,
                                       activation=tf.nn.relu)
        gru = CudnnBiGRU(self.rnn_hidden_size)
        dense2 = tf.keras.layers.Dense(self.rnn_hidden_size * 2,
                                       use_bias=True,
                                       activation=tf.nn.relu)
        self_attention = SelfAttention(
            TriLinear(bias=True, name="self_attention_tri_linear"))

        inputs = dense1(context_repr)
        outputs = variational_dropout(inputs, self.training)
        outputs, _ = gru(outputs, self.context_len)
        outputs = variational_dropout(outputs, self.training)
        c2c = self_attention(outputs, self.context_len)
        outputs = tf.concat([c2c, outputs, c2c * outputs],
                            axis=len(c2c.shape) - 1)
        outputs = dense2(outputs)
        context_repr = inputs + outputs
        context_repr = variational_dropout(context_repr, self.training)

        # 5. Modeling layer
        sum_max_encoding = SumMaxEncoder()
        context_modeling_gru1 = CudnnBiGRU(self.rnn_hidden_size)
        context_modeling_gru2 = CudnnBiGRU(self.rnn_hidden_size)
        question_modeling_gru1 = CudnnBiGRU(self.rnn_hidden_size)
        question_modeling_gru2 = CudnnBiGRU(self.rnn_hidden_size)
        self.max_context_len = tf.reduce_max(self.context_len)
        self.max_question_len = tf.reduce_max(self.question_len)

        modeled_context1, _ = context_modeling_gru1(context_repr,
                                                    self.context_len)
        modeled_context2, _ = context_modeling_gru2(
            tf.concat([context_repr, modeled_context1], axis=2),
            self.context_len)
        encoded_context = sum_max_encoding(modeled_context1, self.context_len,
                                           self.max_context_len)
        modeled_question1, _ = question_modeling_gru1(question_repr,
                                                      self.question_len)
        modeled_question2, _ = question_modeling_gru2(
            tf.concat([question_repr, modeled_question1], axis=2),
            self.question_len)
        encoded_question = sum_max_encoding(modeled_question2,
                                            self.question_len,
                                            self.max_question_len)

        # 6. Predictions
        start_dense = tf.keras.layers.Dense(1, activation=None)
        start_logits = tf.squeeze(start_dense(modeled_context1), axis=[2])
        start_logits = mask_logits(start_logits, self.context_len)
        start_prob = tf.nn.softmax(start_logits)

        end_dense = tf.keras.layers.Dense(1, activation=None)
        end_logits = tf.squeeze(end_dense(modeled_context2), axis=[2])
        end_logits = mask_logits(end_logits, self.context_len)
        end_prob = tf.nn.softmax(end_logits)

        # 7. Loss and input/output dict
        self.start_loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=start_logits, labels=self.answer_start))
        self.end_loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=end_logits, labels=self.answer_end))

        self.loss = self.start_loss + self.end_loss
        global_step = tf.train.get_or_create_global_step()

        self.input_placeholder_dict = OrderedDict({
            "context_word": self.context_word,
            "context_len": self.context_len,
            "context_char": self.context_char,
            "context_word_len": self.context_word_len,
            "question_word": self.question_word,
            "question_len": self.question_len,
            "question_char": self.question_char,
            "question_word_len": self.question_word_len,
            "answer_start": self.answer_start,
            "answer_end": self.answer_end,
            "training": self.training
        })

        self.output_variable_dict = OrderedDict({
            "start_prob": start_prob,
            "end_prob": end_prob,
        })

        # 8. Metrics and summary
        with tf.variable_scope("train_metrics"):
            self.train_metrics = {'loss': tf.metrics.mean(self.loss)}

        self.train_update_metrics = tf.group(
            *[op for _, op in self.train_metrics.values()])
        metric_variables = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                             scope="train_metrics")
        self.train_metric_init_op = tf.variables_initializer(metric_variables)

        with tf.variable_scope("eval_metrics"):
            self.eval_metrics = {'loss': tf.metrics.mean(self.loss)}

        self.eval_update_metrics = tf.group(
            *[op for _, op in self.eval_metrics.values()])
        metric_variables = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                             scope="eval_metrics")
        self.eval_metric_init_op = tf.variables_initializer(metric_variables)

        tf.summary.scalar('loss', self.loss)
        self.summary_op = tf.summary.merge_all()
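Since every example exposes its placeholders through input_placeholder_dict and its predictions through output_variable_dict, feeding a trained model is mechanical. The snippet below is a hypothetical usage sketch; the model object, session, and batch layout are assumptions for illustration and not part of the original code.

import tensorflow as tf


def run_batch(sess, model, batch, training=False):
    # batch is assumed to be a dict keyed like model.input_placeholder_dict,
    # e.g. 'context_word', 'context_len', 'question_word', ...
    feed_dict = {placeholder: batch[name]
                 for name, placeholder in model.input_placeholder_dict.items()
                 if name in batch}
    feed_dict[model.training] = training
    return sess.run(dict(model.output_variable_dict), feed_dict=feed_dict)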