Example #1
    def create_model_graph(self, num_classes, word_vocab=None, char_vocab=None, is_training=True, global_step=None):
        options = self.options
        # ======word representation layer======
        in_question_repres = [] # word and char
        in_passage_repres = [] # word and char
        input_dim = 0
        if word_vocab is not None:
            word_vec_trainable = True
            cur_device = '/gpu:0'
            if options.fix_word_vec:
                word_vec_trainable = False
                cur_device = '/cpu:0'
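            # Placing fixed (non-trainable) word vectors on the CPU keeps the
            # large embedding matrix out of GPU memory.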
            with tf.device(cur_device):
                self.word_embedding = tf.get_variable("word_embedding", trainable=word_vec_trainable, 
                                                  initializer=tf.constant(word_vocab.word_vecs), dtype=tf.float32)

            in_question_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_question_words) # [batch_size, question_len, word_dim]
            in_passage_word_repres = tf.nn.embedding_lookup(self.word_embedding, self.in_passage_words) # [batch_size, passage_len, word_dim]
            in_question_repres.append(in_question_word_repres)
            in_passage_repres.append(in_passage_word_repres)

            input_shape = tf.shape(self.in_question_words)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(self.in_passage_words)
            passage_len = input_shape[1]
            input_dim += word_vocab.word_dim
            
        if options.with_char and char_vocab is not None:
            input_shape = tf.shape(self.in_question_chars)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            q_char_len = input_shape[2]
            input_shape = tf.shape(self.in_passage_chars)
            passage_len = input_shape[1]
            p_char_len = input_shape[2]
            char_dim = char_vocab.word_dim
            self.char_embedding = tf.get_variable("char_embedding", initializer=tf.constant(char_vocab.word_vecs), dtype=tf.float32)

            in_question_char_repres = tf.nn.embedding_lookup(self.char_embedding, self.in_question_chars) # [batch_size, question_len, q_char_len, char_dim]
            in_question_char_repres = tf.reshape(in_question_char_repres, shape=[-1, q_char_len, char_dim])
            question_char_lengths = tf.reshape(self.question_char_lengths, [-1])
            question_char_mask = tf.sequence_mask(question_char_lengths, q_char_len, dtype=tf.float32)  # [batch_size*question_len, q_char_len]
            in_question_char_repres = tf.multiply(in_question_char_repres, tf.expand_dims(question_char_mask, axis=-1))


            in_passage_char_repres = tf.nn.embedding_lookup(self.char_embedding, self.in_passage_chars) # [batch_size, passage_len, p_char_len, char_dim]
            in_passage_char_repres = tf.reshape(in_passage_char_repres, shape=[-1, p_char_len, char_dim])
            passage_char_lengths = tf.reshape(self.passage_char_lengths, [-1])
            passage_char_mask = tf.sequence_mask(passage_char_lengths, p_char_len, dtype=tf.float32)  # [batch_size*passage_len, p_char_len]
            in_passage_char_repres = tf.multiply(in_passage_char_repres, tf.expand_dims(passage_char_mask, axis=-1))
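
            # A single BiLSTM, shared between question and passage via
            # reuse=True below, encodes each masked character sequence into a
            # 2*char_lstm_dim character-level token embedding.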

            (question_char_outputs_fw, question_char_outputs_bw, _) = layer_utils.my_lstm_layer(in_question_char_repres, options.char_lstm_dim,
                    input_lengths=question_char_lengths,scope_name="char_lstm", reuse=False,
                    is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
            question_char_outputs_fw = layer_utils.collect_final_step_of_lstm(question_char_outputs_fw, question_char_lengths - 1)
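            # The backward LSTM reads the sequence in reverse, so its output
            # at time step 0 summarizes the whole character sequence.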
            question_char_outputs_bw = question_char_outputs_bw[:, 0, :]
            question_char_outputs = tf.concat(axis=1, values=[question_char_outputs_fw, question_char_outputs_bw])
            question_char_outputs = tf.reshape(question_char_outputs, [batch_size, question_len, 2*options.char_lstm_dim])

            (passage_char_outputs_fw, passage_char_outputs_bw, _) = layer_utils.my_lstm_layer(in_passage_char_repres, options.char_lstm_dim,
                    input_lengths=passage_char_lengths, scope_name="char_lstm", reuse=True,
                    is_training=is_training, dropout_rate=options.dropout_rate, use_cudnn=options.use_cudnn)
            passage_char_outputs_fw = layer_utils.collect_final_step_of_lstm(passage_char_outputs_fw, passage_char_lengths - 1)
            passage_char_outputs_bw = passage_char_outputs_bw[:, 0, :]
            passage_char_outputs = tf.concat(axis=1, values=[passage_char_outputs_fw, passage_char_outputs_bw])
            passage_char_outputs = tf.reshape(passage_char_outputs, [batch_size, passage_len, 2*options.char_lstm_dim])
                
            in_question_repres.append(question_char_outputs)
            in_passage_repres.append(passage_char_outputs)

            input_dim += 2*options.char_lstm_dim

        in_question_repres = tf.concat(axis=2, values=in_question_repres) # [batch_size, question_len, dim] # concat word and char
        in_passage_repres = tf.concat(axis=2, values=in_passage_repres) # [batch_size, passage_len, dim] # concat word and char

        if is_training:
            in_question_repres = tf.nn.dropout(in_question_repres, (1 - options.dropout_rate))
            in_passage_repres = tf.nn.dropout(in_passage_repres, (1 - options.dropout_rate))

        mask = tf.sequence_mask(self.passage_lengths, passage_len, dtype=tf.float32) # [batch_size, passage_len]
        question_mask = tf.sequence_mask(self.question_lengths, question_len, dtype=tf.float32) # [batch_size, question_len]

        # ======Highway layer======
        if options.with_highway:
            with tf.variable_scope("input_highway"):
                in_question_repres = match_utils.multi_highway_layer(in_question_repres, input_dim, options.highway_layer_num)
                tf.get_variable_scope().reuse_variables()
                in_passage_repres = match_utils.multi_highway_layer(in_passage_repres, input_dim, options.highway_layer_num)

        # in_question_repres = tf.multiply(in_question_repres, tf.expand_dims(question_mask, axis=-1))
        # in_passage_repres = tf.multiply(in_passage_repres, tf.expand_dims(mask, axis=-1))

        # ========Bilateral Matching=====
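        # bilateral_match_func matches the two sentences in both directions
        # (question against passage and passage against question) and pools
        # the matching vectors into one fixed-size representation.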
        (match_representation, match_dim) = match_utils.bilateral_match_func(in_question_repres, in_passage_repres,
                        self.question_lengths, self.passage_lengths, question_mask, mask, input_dim, is_training, options=options)

        #========Prediction Layer=========
        # match_dim = 4 * self.options.aggregation_lstm_dim
        w_0 = tf.get_variable("w_0", [match_dim, match_dim/2], dtype=tf.float32)
        b_0 = tf.get_variable("b_0", [match_dim/2], dtype=tf.float32)
        w_1 = tf.get_variable("w_1", [match_dim/2, num_classes],dtype=tf.float32)
        b_1 = tf.get_variable("b_1", [num_classes],dtype=tf.float32)

        # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate))
        logits = tf.matmul(match_representation, w_0) + b_0
        logits = tf.tanh(logits)
        if is_training: logits = tf.nn.dropout(logits, (1 - options.dropout_rate))
        logits = tf.matmul(logits, w_1) + b_1

        self.prob = tf.nn.softmax(logits)
        
        gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32)
        self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=gold_matrix))

        correct = tf.nn.in_top_k(logits, self.truth, 1)
        self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
        self.predictions = tf.argmax(self.prob, 1)

        if not is_training: return

        tvars = tf.trainable_variables()
        if self.options.lambda_l2>0.0:
            l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
            self.loss = self.loss + self.options.lambda_l2 * l2_loss

        if self.options.optimize_type == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.options.learning_rate)
        elif self.options.optimize_type == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate=self.options.learning_rate)

        grads = layer_utils.compute_gradients(self.loss, tvars)
        grads, _ = tf.clip_by_global_norm(grads, self.options.grad_clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)
        # self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        if self.options.with_moving_average:
            # Track the moving averages of all trainable variables.
            MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
            variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
            variables_averages_op = variable_averages.apply(tf.trainable_variables())
            train_ops = [self.train_op, variables_averages_op]
            self.train_op = tf.group(*train_ops)
    def create_siameseLSTM_model_graph(self,
                                       num_classes,
                                       word_vocab=None,
                                       char_vocab=None,
                                       is_training=True,
                                       global_step=None):
        """
        """
        options = self.options
        # ======word representation layer======
        in_question_repres = []
        in_passage_repres = []
        input_dim = 0
        if word_vocab is not None:
            word_vec_trainable = True
            cur_device = '/gpu:0'
            if options.fix_word_vec:
                word_vec_trainable = False
                cur_device = '/cpu:0'
            with tf.device(cur_device):
                self.embedding = tf.placeholder(
                    tf.float32, shape=word_vocab.word_vecs.shape)
                self.word_embedding = tf.get_variable(
                    "word_embedding",
                    trainable=word_vec_trainable,
                    initializer=self.embedding,
                    dtype=tf.float32)  # tf.constant(word_vocab.word_vecs)

            in_question_word_repres = tf.nn.embedding_lookup(
                self.word_embedding,
                self.in_question_words)  # [batch_size, question_len, word_dim]
            in_passage_word_repres = tf.nn.embedding_lookup(
                self.word_embedding,
                self.in_passage_words)  # [batch_size, passage_len, word_dim]
            in_question_repres.append(in_question_word_repres)
            in_passage_repres.append(in_passage_word_repres)

            input_shape = tf.shape(self.in_question_words)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(self.in_passage_words)
            passage_len = input_shape[1]
            input_dim += word_vocab.word_dim

        in_question_repres = tf.concat(
            axis=2,
            values=in_question_repres)  # [batch_size, question_len, dim]
        in_passage_repres = tf.concat(
            axis=2, values=in_passage_repres)  # [batch_size, passage_len, dim]

        if is_training:
            in_question_repres = tf.nn.dropout(in_question_repres,
                                               (1 - options.dropout_rate))
            in_passage_repres = tf.nn.dropout(in_passage_repres,
                                              (1 - options.dropout_rate))

        passage_mask = tf.sequence_mask(
            self.passage_lengths, passage_len,
            dtype=tf.float32)  # [batch_size, passage_len]
        question_mask = tf.sequence_mask(
            self.question_lengths, question_len,
            dtype=tf.float32)  # [batch_size, question_len]

        # ======Highway layer======
        if options.with_highway:
            with tf.variable_scope("input_highway"):
                in_question_repres = match_utils.multi_highway_layer(
                    in_question_repres, input_dim, options.highway_layer_num)
                tf.get_variable_scope().reuse_variables()
                in_passage_repres = match_utils.multi_highway_layer(
                    in_passage_repres, input_dim, options.highway_layer_num)

        # ======BiLSTM context layer======
        for i in range(options.context_layer_num):  # support multiple context layers
            with tf.variable_scope('bilstm-layer-{}'.format(i)):
                # contextual lstm for both passage and question
                in_question_repres = tf.multiply(
                    in_question_repres, tf.expand_dims(question_mask, axis=-1))
                (question_context_representation_fw,
                 question_context_representation_bw,
                 in_question_repres) = layer_utils.my_lstm_layer(
                     in_question_repres,
                     options.context_lstm_dim,
                     input_lengths=self.question_lengths,
                     scope_name="context_represent",
                     reuse=False,
                     is_training=is_training,
                     dropout_rate=options.dropout_rate,
                     use_cudnn=options.use_cudnn)

                # Encode the second sentence, using the same LSTM weights.
                tf.get_variable_scope().reuse_variables()
                in_passage_repres = tf.multiply(
                    in_passage_repres, tf.expand_dims(passage_mask, axis=-1))
                (passage_context_representation_fw,
                 passage_context_representation_bw,
                 in_passage_repres) = layer_utils.my_lstm_layer(
                     in_passage_repres,
                     options.context_lstm_dim,
                     input_lengths=self.passage_lengths,
                     scope_name="context_represent",
                     reuse=True,
                     is_training=is_training,
                     dropout_rate=options.dropout_rate,
                     use_cudnn=options.use_cudnn)

        if options.lstm_out_type == 'mean':
            question_context_representation_fw = layer_utils.collect_mean_step_of_lstm(
                question_context_representation_fw)
            question_context_representation_bw = layer_utils.collect_mean_step_of_lstm(
                question_context_representation_bw)
            passage_context_representation_fw = layer_utils.collect_mean_step_of_lstm(
                passage_context_representation_fw)
            passage_context_representation_bw = layer_utils.collect_mean_step_of_lstm(
                passage_context_representation_bw)
        elif options.lstm_out_type == 'end':
            question_context_representation_fw = layer_utils.collect_final_step_of_lstm(
                question_context_representation_fw, self.question_lengths - 1)
            question_context_representation_bw = question_context_representation_bw[:, 0, :]
            passage_context_representation_fw = layer_utils.collect_final_step_of_lstm(
                passage_context_representation_fw, self.passage_lengths - 1)
            passage_context_representation_bw = passage_context_representation_bw[:, 0, :]
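        # NOTE: for any other lstm_out_type the fw/bw representations keep
        # their per-time-step shape, and the concatenation below would not
        # yield a fixed-size sentence vector.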

        question_context_outputs = tf.concat(
            axis=1,
            values=[
                question_context_representation_fw,
                question_context_representation_bw
            ])
        passage_context_outputs = tf.concat(
            axis=1,
            values=[
                passage_context_representation_fw,
                passage_context_representation_bw
            ])

        (match_representation, match_dim) = match_utils.siameseLSTM_match_func(
            question_context_outputs, passage_context_outputs,
            options.context_lstm_dim)

        #========Prediction Layer=========
        w_0 = tf.get_variable("w_0", [match_dim, int(match_dim / 2)],
                              dtype=tf.float32)
        b_0 = tf.get_variable("b_0", [int(match_dim / 2)], dtype=tf.float32)
        w_1 = tf.get_variable("w_1", [int(match_dim / 2), num_classes],
                              dtype=tf.float32)
        b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

        # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate))
        logits = tf.matmul(match_representation, w_0) + b_0
        logits = tf.nn.relu(logits)
        if is_training:
            logits = tf.nn.dropout(logits, (1 - options.dropout_rate))
        logits = tf.matmul(logits, w_1) + b_1

        self.prob = tf.nn.softmax(logits)
        self.predictions = tf.argmax(self.prob, 1)

        gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32)
        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                    labels=gold_matrix))

        correct = tf.nn.in_top_k(logits, self.truth, 1)
        self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))

        if not is_training: return

        tvars = tf.trainable_variables()
        if self.options.lambda_l1 > 0.0:
            l1_loss = tf.add_n([
                tf.contrib.layers.l1_regularizer(self.options.lambda_l1)(v)
                for v in tvars if v.get_shape().ndims > 1
            ])
            self.loss = self.loss + l1_loss
        if self.options.lambda_l2 > 0.0:
            # l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
            l2_loss = tf.add_n([
                tf.contrib.layers.l2_regularizer(self.options.lambda_l2)(v)
                for v in tvars if v.get_shape().ndims > 1
            ])
            self.loss = self.loss + l2_loss

        if self.options.optimize_type == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(
                learning_rate=self.options.learning_rate)
        elif self.options.optimize_type == 'adam':
            optimizer = tf.train.AdamOptimizer(
                learning_rate=self.options.learning_rate)

        grads = layer_utils.compute_gradients(self.loss, tvars)
        grads, _ = tf.clip_by_global_norm(grads, self.options.grad_clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars),
                                                  global_step=global_step)
        # self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        if self.options.with_moving_average:
            # Track the moving averages of all trainable variables.
            MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
            variable_averages = tf.train.ExponentialMovingAverage(
                MOVING_AVERAGE_DECAY, global_step)
            variables_averages_op = variable_averages.apply(
                tf.trainable_variables())
            train_ops = [self.train_op, variables_averages_op]
            self.train_op = tf.group(*train_ops)
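
Note on the placeholder-backed embedding in create_siameseLSTM_model_graph: because word_embedding is initialized from self.embedding (a tf.placeholder) rather than a tf.constant, the pretrained vectors are never baked into the graph, but they must be fed when the variable initializer runs. A minimal usage sketch, assuming the model object exposes the placeholder as model.embedding:

    # Feed the pretrained matrix once, at variable-initialization time.
    sess.run(tf.global_variables_initializer(),
             feed_dict={model.embedding: word_vocab.word_vecs})
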
Example #3
    def create_model_graph(self,
                           num_classes,
                           word_vocab=None,
                           char_vocab=None,
                           lemma_vocab=None,
                           is_training=True,
                           global_step=None):
        options = self.options
        # ======word representation layer======
        with tf.variable_scope("Input_Embedding_Layer"):
            if word_vocab is not None:
                word_vec_trainable = True
                cur_device = '/gpu:0'
                if options.fix_word_vec:
                    word_vec_trainable = False
                    cur_device = '/cpu:0'
                with tf.device(cur_device):
                    self.word_embedding = tf.get_variable(
                        "word_embedding",
                        trainable=word_vec_trainable,
                        initializer=tf.constant(word_vocab.word_vecs),
                        dtype=tf.float32)

                    # self.kg_embedding = tf.get_variable("kg", trainable=True, regularizer=regularizer,
                    #                                     initializer=tf.constant(lemma_vocab.word_vecs), dtype=tf.float32)
                    # NOTE: the original code referenced an undefined name
                    # `initializer`; a Glorot uniform initializer is assumed here.
                    self.kg_embedding = tf.get_variable(
                        "kg",
                        shape=(lemma_vocab.word_vecs.shape[0], options.kg_dim),
                        initializer=tf.glorot_uniform_initializer(),
                        trainable=True,
                        dtype=tf.float32)

            c_emb = tf.nn.embedding_lookup(self.word_embedding,
                                           self.in_passage_words)
            q_emb = tf.nn.embedding_lookup(self.word_embedding,
                                           self.in_question_words)
            c_kg_emb = tf.nn.embedding_lookup(self.kg_embedding,
                                              self.in_passage_words_lemma)
            q_kg_emb = tf.nn.embedding_lookup(self.kg_embedding,
                                              self.in_question_words_lemma)

            if is_training:
                c_emb = tf.nn.dropout(c_emb, 1 - self.dropout)
                q_emb = tf.nn.dropout(q_emb, 1 - self.dropout)
                c_kg_emb = tf.nn.dropout(c_kg_emb, 1 - self.dropout)
                q_kg_emb = tf.nn.dropout(q_kg_emb, 1 - self.dropout)

            input_shape = tf.shape(self.in_question_words)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(self.in_passage_words)
            passage_len = input_shape[1]

            if options.with_char and char_vocab is not None:
                input_shape = tf.shape(self.in_question_chars)
                batch_size = input_shape[0]
                q_char_len = input_shape[2]
                input_shape = tf.shape(self.in_passage_chars)
                p_char_len = input_shape[2]
                char_dim = char_vocab.word_dim
                self.char_embedding = tf.get_variable(
                    "char_embedding",
                    initializer=tf.constant(char_vocab.word_vecs),
                    dtype=tf.float32)

                in_question_char_repres = tf.nn.embedding_lookup(
                    self.char_embedding, self.in_question_chars
                )  # [batch_size, question_len, q_char_len, char_dim]
                in_question_char_repres = tf.reshape(
                    in_question_char_repres, shape=[-1, q_char_len, char_dim])
                question_char_lengths = tf.reshape(self.question_char_lengths,
                                                   [-1])
                question_char_mask = tf.sequence_mask(
                    question_char_lengths, q_char_len,
                    dtype=tf.float32)  # [batch_size*question_len, q_char_len]
                in_question_char_repres = tf.multiply(
                    in_question_char_repres,
                    tf.expand_dims(question_char_mask, axis=-1))

                in_passage_char_repres = tf.nn.embedding_lookup(
                    self.char_embedding, self.in_passage_chars
                )  # [batch_size, passage_len, p_char_len, char_dim]
                in_passage_char_repres = tf.reshape(
                    in_passage_char_repres, shape=[-1, p_char_len, char_dim])
                passage_char_lengths = tf.reshape(self.passage_char_lengths,
                                                  [-1])
                passage_char_mask = tf.sequence_mask(
                    passage_char_lengths, p_char_len,
                    dtype=tf.float32)  # [batch_size*passage_len, p_char_len]
                in_passage_char_repres = tf.multiply(
                    in_passage_char_repres,
                    tf.expand_dims(passage_char_mask, axis=-1))

                question_char_outputs = conv(in_question_char_repres,
                                             self.options.char_lstm_dim,
                                             bias=True,
                                             activation=tf.nn.tanh,
                                             kernel_size=5,
                                             name="char_conv",
                                             reuse=False)
                question_char_outputs = tf.reduce_max(question_char_outputs,
                                                      axis=1)
                question_char_outputs = tf.reshape(
                    question_char_outputs,
                    [batch_size, question_len, options.char_lstm_dim])

                passage_char_outputs = conv(in_passage_char_repres,
                                            self.options.char_lstm_dim,
                                            bias=True,
                                            activation=tf.nn.tanh,
                                            kernel_size=5,
                                            name="char_conv",
                                            reuse=True)

                passage_char_outputs = tf.reduce_max(passage_char_outputs,
                                                     axis=1)
                passage_char_outputs = tf.reshape(
                    passage_char_outputs,
                    [batch_size, passage_len, options.char_lstm_dim])

                c_emb = tf.concat([c_emb, passage_char_outputs], axis=2)
                q_emb = tf.concat([q_emb, question_char_outputs], axis=2)

            c_mask = tf.sequence_mask(
                self.passage_lengths, passage_len,
                dtype=tf.float32)  # [batch_size, passage_len]
            q_mask = tf.sequence_mask(
                self.question_lengths, question_len,
                dtype=tf.float32)  # [batch_size, question_len]

        with tf.variable_scope("Embedding_Encoder_Layer"):
            q_emb = tf.multiply(q_emb, tf.expand_dims(q_mask, axis=-1))
            c_emb = tf.multiply(c_emb, tf.expand_dims(c_mask, axis=-1))

            q_kg_emb = tf.multiply(
                q_kg_emb, tf.expand_dims(tf.cast(q_mask, tf.float32), axis=-1))
            c_kg_emb = tf.multiply(
                c_kg_emb, tf.expand_dims(tf.cast(c_mask, tf.float32), axis=-1))

            (q_fw, q_bw, q) = layer_utils.my_lstm_layer(
                q_emb,
                options.context_lstm_dim,
                input_lengths=self.question_lengths,
                scope_name="context_represent",
                reuse=False,
                is_training=is_training,
                dropout_rate=self.dropout,
                use_cudnn=options.use_cudnn)

            (c_fw, c_bw,
             c) = layer_utils.my_lstm_layer(c_emb,
                                            options.context_lstm_dim,
                                            input_lengths=self.passage_lengths,
                                            scope_name="context_represent",
                                            reuse=True,
                                            is_training=is_training,
                                            dropout_rate=self.dropout,
                                            use_cudnn=options.use_cudnn)
            q = tf.multiply(q, tf.expand_dims(q_mask, axis=-1))
            c = tf.multiply(c, tf.expand_dims(c_mask, axis=-1))
            if is_training:
                q = tf.nn.dropout(q, 1 - self.dropout)
                c = tf.nn.dropout(c, 1 - self.dropout)
        with tf.variable_scope('co-att', reuse=tf.AUTO_REUSE):

            s = tf.einsum("abd,acd->abc", c, q)
            # cRq, loss = Complex(c_kg_emb, q_kg_emb, c_mask, q_mask, options.kg_dim, options.relation_dim, loss_type='factorization')
            # cRq, loss, r = Analogy(c_kg_emb, q_kg_emb, c_mask, q_mask, options.scalar_dim,
            #                     options.kg_dim, options.relation_dim, loss_type='factorization')
            # cRq, loss = DisMult(c_kg_emb, q_kg_emb, c_mask, q_mask, options.kg_dim, options.relation_dim, loss_type='factorization')
            cRq, r = Rescal(c_kg_emb, q_kg_emb, c_mask, q_mask, options.kg_dim,
                            options.relation_dim)

            # if is_training:
            v = tf.get_variable("v", [1, 1, 1, options.relation_dim],
                                dtype=tf.float32)
            score = tf.reduce_sum(cRq * v, axis=-1)
            s = s + options.lamda1 * score
            s = mask_relevancy_matrix(s, q_mask, c_mask)
            s_q = tf.nn.softmax(s, axis=1)
            self.v = v
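            # s[b, i, j] scores passage token i against question token j:
            # softmax over axis 1 (passage positions) gives attention weights
            # for each question token; softmax over axis 2 (question
            # positions) gives attention weights for each passage token.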

            q2c = tf.einsum("abd,abc->acd", c, s_q)
            q2c_kg = tf.einsum("abd,abc->acd", c_kg_emb, s_q)
            q2c_kg_r = tf.einsum("abcr,abc->acr", cRq, s_q)
            s_c = tf.nn.softmax(s, axis=2)
            c2q = tf.einsum("abd,acb->acd", q, s_c)
            c2q_kg = tf.einsum("abd,acb->acd", q_kg_emb, s_c)
            c2q_kg_r = tf.einsum("abcr,abc->abr", cRq, s_c)

        with tf.variable_scope("Model_Encoder_Layer"):
            passage_inputs = tf.concat(
                [c2q, c, c2q * c, c - c2q, c_kg_emb, c2q_kg, c2q_kg_r], axis=2)
            question_inputs = tf.concat(
                [q2c, q, q2c * q, q - q2c, q_kg_emb, q2c_kg, q2c_kg_r], axis=2)
            passage_inputs = tf.layers.dense(inputs=passage_inputs,
                                             units=2 *
                                             options.context_lstm_dim,
                                             activation=tf.nn.relu,
                                             use_bias=True,
                                             name='pro',
                                             reuse=False)
            question_inputs = tf.layers.dense(inputs=question_inputs,
                                              units=2 *
                                              options.context_lstm_dim,
                                              activation=tf.nn.relu,
                                              use_bias=True,
                                              name='pro',
                                              reuse=True)
            question_inputs = tf.multiply(question_inputs,
                                          tf.expand_dims(q_mask, axis=-1))
            passage_inputs = tf.multiply(passage_inputs,
                                         tf.expand_dims(c_mask, axis=-1))

            (fw_rep, bw_rep,
             cur_aggregation_representation) = layer_utils.my_lstm_layer(
                 question_inputs,
                 options.aggregation_lstm_dim,
                 input_lengths=self.question_lengths,
                 scope_name='aggregate_layer',
                 reuse=False,
                 is_training=is_training,
                 dropout_rate=self.dropout,
                 use_cudnn=options.use_cudnn)

            question_inputs = cur_aggregation_representation
            # question_outputs_vec = tf.concat([fw_rep, bw_rep], axis=1)
            (fw_rep, bw_rep,
             cur_aggregation_representation) = layer_utils.my_lstm_layer(
                 passage_inputs,
                 options.aggregation_lstm_dim,
                 input_lengths=self.passage_lengths,
                 scope_name='aggregate_layer',
                 reuse=True,
                 is_training=is_training,
                 dropout_rate=self.dropout,
                 use_cudnn=options.use_cudnn)

            passage_inputs = cur_aggregation_representation

            question_inputs = tf.multiply(question_inputs,
                                          tf.expand_dims(q_mask, axis=-1))
            passage_inputs = tf.multiply(passage_inputs,
                                         tf.expand_dims(c_mask, axis=-1))

            if is_training:
                question_inputs = tf.nn.dropout(question_inputs,
                                                1 - self.dropout)
                passage_inputs = tf.nn.dropout(passage_inputs,
                                               1 - self.dropout)

            passage_outputs_mean = tf.div(
                tf.reduce_sum(passage_inputs, 1),
                tf.expand_dims(tf.cast(self.passage_lengths, tf.float32), -1))
            question_outputs_mean = tf.div(
                tf.reduce_sum(question_inputs, 1),
                tf.expand_dims(tf.cast(self.question_lengths, tf.float32), -1))
            passage_outputs_max = tf.reduce_max(passage_inputs, axis=1)
            question_outputs_max = tf.reduce_max(question_inputs, axis=1)
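            # Each sequence is summarized three ways: length-normalized mean
            # pooling, max pooling, and the KG-aware soft attention below;
            # the three vectors are concatenated into the final representation.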

            passage_outputs_att = soft_attention_with_kg(passage_inputs,
                                                         c_kg_emb,
                                                         c2q_kg_r,
                                                         c_mask,
                                                         options.att_dim,
                                                         scope="soft_att",
                                                         reuse=False)
            question_outputs_att = soft_attention_with_kg(question_inputs,
                                                          q_kg_emb,
                                                          q2c_kg_r,
                                                          q_mask,
                                                          options.att_dim,
                                                          scope="soft_att",
                                                          reuse=True)

            question_outputs = tf.concat([
                question_outputs_max, question_outputs_mean,
                question_outputs_att
            ],
                                         axis=1)
            passage_outputs = tf.concat([
                passage_outputs_max, passage_outputs_mean, passage_outputs_att
            ],
                                        axis=1)

            match_representation = tf.concat(
                axis=1, values=[question_outputs, passage_outputs])
        # ========Prediction Layer=========
        match_dim = int(match_representation.shape[1])
        w_0 = tf.get_variable("w_0", [match_dim, match_dim / 2],
                              dtype=tf.float32)
        b_0 = tf.get_variable("b_0", [match_dim / 2], dtype=tf.float32)
        w_1 = tf.get_variable("w_1", [match_dim / 2, num_classes],
                              dtype=tf.float32)
        b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

        if is_training:
            match_representation = tf.nn.dropout(match_representation,
                                                 (1 - self.dropout))
        logits = tf.matmul(match_representation, w_0) + b_0
        logits = tf.nn.relu(logits)
        if is_training: logits = tf.nn.dropout(logits, (1 - self.dropout))
        logits = tf.matmul(logits, w_1) + b_1

        self.prob = tf.nn.softmax(logits)
        self.predictions = tf.argmax(self.prob, 1)
        gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32)
        correct = tf.nn.in_top_k(logits, self.truth, 1)
        self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
        if not is_training: return

        if options.loss_type == 'logistic':
            matrix = self.matrix * 2 - 1
            matrix = mask_relevancy_4dmatrix(matrix, q_mask, c_mask)
            score = -1 * tf.log(tf.nn.sigmoid(matrix * cRq))
        else:
            score = self.matrix - cRq
            score = 0.5 * score * score  # squared error; 0.5 avoids Python 2 integer division

        score = mask_relevancy_4dmatrix(score, q_mask, c_mask)
        KGE_loss = tf.reduce_sum(score, axis=-1)

        self.loss = tf.reduce_sum(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                    labels=gold_matrix))
        self.loss = self.loss + options.lamda2 * tf.reduce_sum(
            tf.layers.flatten(KGE_loss))

        tvars = tf.trainable_variables()
        if self.options.lambda_l2 > 0.0:
            l2_loss = tf.add_n([
                tf.nn.l2_loss(v) for v in tvars
                if 'embedding' not in v.name
            ])
            self.loss = self.loss + self.options.lambda_l2 * l2_loss

        if self.options.optimize_type == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(
                learning_rate=self.options.learning_rate)
        elif self.options.optimize_type == 'adam':
            optimizer = tf.train.AdamOptimizer(
                learning_rate=self.options.learning_rate)
        elif self.options.optimize_type == 'adagard':  # sic: misspelled option key kept for config compatibility
            optimizer = tf.train.AdagradOptimizer(
                learning_rate=self.options.learning_rate)

        grads = layer_utils.compute_gradients(self.loss, tvars)
        grads, _ = tf.clip_by_global_norm(grads, self.options.grad_clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars),
                                                  global_step=global_step)
        # self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        if self.options.with_moving_average:
            # Track the moving averages of all trainable variables.
            MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
            variable_averages = tf.train.ExponentialMovingAverage(
                MOVING_AVERAGE_DECAY, global_step)
            variables_averages_op = variable_averages.apply(
                tf.trainable_variables())
            train_ops = [self.train_op, variables_averages_op]
            self.train_op = tf.group(*train_ops)
    def create_mpcnn_model_graph(self,
                                 num_classes,
                                 word_vocab=None,
                                 char_vocab=None,
                                 is_training=True,
                                 global_step=None):
        """
        """
        options = self.options
        # ======word representation layer======
        in_question_repres = []
        in_passage_repres = []
        input_dim = 0
        if word_vocab is not None:
            word_vec_trainable = True
            cur_device = '/gpu:0'
            if options.fix_word_vec:
                word_vec_trainable = False
                cur_device = '/cpu:0'
            with tf.device(cur_device):
                self.embedding = tf.placeholder(
                    tf.float32, shape=word_vocab.word_vecs.shape)
                self.word_embedding = tf.get_variable(
                    "word_embedding",
                    trainable=word_vec_trainable,
                    initializer=self.embedding,
                    dtype=tf.float32)  # tf.constant(word_vocab.word_vecs)

            in_question_word_repres = tf.nn.embedding_lookup(
                self.word_embedding,
                self.in_question_words)  # [batch_size, question_len, word_dim]
            in_passage_word_repres = tf.nn.embedding_lookup(
                self.word_embedding,
                self.in_passage_words)  # [batch_size, passage_len, word_dim]
            in_question_repres.append(in_question_word_repres)
            in_passage_repres.append(in_passage_word_repres)

            input_shape = tf.shape(self.in_question_words)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(self.in_passage_words)
            passage_len = input_shape[1]
            input_dim += word_vocab.word_dim

        in_question_repres = tf.concat(
            axis=2,
            values=in_question_repres)  # [batch_size, question_len, dim]
        in_passage_repres = tf.concat(
            axis=2, values=in_passage_repres)  # [batch_size, passage_len, dim]

        if is_training:
            in_question_repres = tf.nn.dropout(in_question_repres,
                                               (1 - options.dropout_rate))
            in_passage_repres = tf.nn.dropout(in_passage_repres,
                                              (1 - options.dropout_rate))

        mask = tf.sequence_mask(self.passage_lengths,
                                passage_len,
                                dtype=tf.float32)  # [batch_size, passage_len]
        question_mask = tf.sequence_mask(
            self.question_lengths, question_len,
            dtype=tf.float32)  # [batch_size, question_len]

        # ======Highway layer======
        if options.with_highway:
            with tf.variable_scope("input_highway"):
                in_question_repres = match_utils.multi_highway_layer(
                    in_question_repres, input_dim, options.highway_layer_num)
                tf.get_variable_scope().reuse_variables()
                in_passage_repres = match_utils.multi_highway_layer(
                    in_passage_repres, input_dim, options.highway_layer_num)

        in_question_repres = tf.expand_dims(
            in_question_repres, -1)  # [batch_size, question_len, word_dim, 1]
        in_passage_repres = tf.expand_dims(
            in_passage_repres, -1)  # [batch_size, passage_len, word_dim, 1]

        # ======Multi-perspective CNN Matching======
        filter_sizes = options.filter_sizes
        num_filters = options.num_filters
        poolings = [tf.reduce_max, tf.reduce_min,
                    tf.reduce_mean][:options.num_poolings]
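        # poolings keeps the first num_poolings of [max, min, mean]; each
        # pooling operator is applied on top of every convolutional filter
        # size, as in the MP-CNN matching scheme.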

        W1 = [
            tf.get_variable(
                "W1_%s" % i,
                initializer=tf.truncated_normal(
                    [filter_sizes[i], input_dim, 1, num_filters[0]],
                    stddev=0.1),
                dtype=tf.float32) for i in range(len(filter_sizes))
        ]
        b1 = [
            tf.get_variable("b1_%s" % i,
                            initializer=tf.constant(0.01,
                                                    shape=[num_filters[0]]),
                            dtype=tf.float32) for i in range(len(filter_sizes))
        ]

        W2 = [
            tf.get_variable(
                "W2_%s" % i,
                initializer=tf.truncated_normal(
                    [filter_sizes[i], input_dim, 1, num_filters[1]],
                    stddev=0.1),
                dtype=tf.float32) for i in range(len(filter_sizes) - 1)
        ]
        b2 = [
            tf.get_variable(
                "b2_%s" % i,
                initializer=tf.constant(0.01,
                                        shape=[num_filters[1], input_dim]),
                dtype=tf.float32) for i in range(len(filter_sizes) - 1)
        ]

        sent1_blockA = layer_utils.build_block_A(
            in_question_repres, filter_sizes, poolings, W1, b1, is_training
        )  # len(poolings) * len(filter_sizes) * [batch_size, 1, num_filters_A]
        sent2_blockA = layer_utils.build_block_A(
            in_passage_repres, filter_sizes, poolings, W1, b1, is_training
        )  # len(poolings) * len(filter_sizes) * [batch_size, 1, num_filters_A]

        sent1_blockB = layer_utils.build_block_B(
            in_question_repres, filter_sizes, poolings, W2, b2, is_training
        )  # (len(poolings))-1 * (len(filter_sizes)-1) * [batch_size, embed_size, num_filters_B]
        sent2_blockB = layer_utils.build_block_B(
            in_passage_repres, filter_sizes, poolings, W2, b2, is_training
        )  # (len(poolings))-1 * (len(filter_sizes)-1) * [batch_size, embed_size, num_filters_B]
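        # Block A pools over whole feature maps (one vector per filter size
        # and pooling operator); block B pools per embedding dimension,
        # giving finer-grained comparison units for the matching function.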

        (match_representation, match_dim) = match_utils.mpcnn_match_func(
            sent1_blockA, sent2_blockA, sent1_blockB, sent2_blockB, poolings,
            filter_sizes, num_filters)

        #========Prediction Layer=========
        w_0 = tf.get_variable("w_0", [match_dim, int(match_dim / 2)],
                              dtype=tf.float32)
        b_0 = tf.get_variable("b_0", [int(match_dim / 2)], dtype=tf.float32)
        w_1 = tf.get_variable("w_1", [int(match_dim / 2), num_classes],
                              dtype=tf.float32)
        b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

        # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate))
        logits = tf.matmul(match_representation, w_0) + b_0
        logits = tf.nn.relu(logits)
        if is_training:
            logits = tf.nn.dropout(logits, (1 - options.dropout_rate))
        logits = tf.matmul(logits, w_1) + b_1

        self.prob = tf.nn.softmax(logits)
        self.predictions = tf.argmax(self.prob, 1)

        gold_matrix = tf.one_hot(self.truth, num_classes, dtype=tf.float32)
        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                    labels=gold_matrix))

        correct = tf.nn.in_top_k(logits, self.truth, 1)
        self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))

        if not is_training: return

        if options.with_f1_metric:
            # acc, acc_op = tf.metrics.accuracy(labels=self.truth, predictions=self.predictions)
            precision, pre_op = tf.metrics.precision(
                labels=self.truth, predictions=self.predictions)
            recall, rec_op = tf.metrics.recall(labels=self.truth,
                                               predictions=self.predictions)
            f1 = 2 * precision * recall / (precision + recall + 1e-6)
            self.loss = self.loss - 0.1 * tf.reduce_mean(f1)
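            # NOTE: tf.metrics.precision/recall return (value, update_op)
            # pairs backed by local variables; the values only change when
            # pre_op/rec_op are run and the local variables are initialized.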

        tvars = tf.trainable_variables()
        if self.options.lambda_l1 > 0.0:
            l1_loss = tf.add_n([
                tf.contrib.layers.l1_regularizer(self.options.lambda_l1)(v)
                for v in tvars if v.get_shape().ndims > 1
            ])
            self.loss = self.loss + l1_loss
        if self.options.lambda_l2 > 0.0:
            # l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
            l2_loss = tf.add_n([
                tf.contrib.layers.l2_regularizer(self.options.lambda_l2)(v)
                for v in tvars if v.get_shape().ndims > 1
            ])
            self.loss = self.loss + l2_loss

        if self.options.optimize_type == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(
                learning_rate=self.options.learning_rate)
        elif self.options.optimize_type == 'adam':
            optimizer = tf.train.AdamOptimizer(
                learning_rate=self.options.learning_rate)

        grads = layer_utils.compute_gradients(self.loss, tvars)
        grads, _ = tf.clip_by_global_norm(grads, self.options.grad_clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars),
                                                  global_step=global_step)
        # self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        if self.options.with_moving_average:
            # Track the moving averages of all trainable variables.
            MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
            variable_averages = tf.train.ExponentialMovingAverage(
                MOVING_AVERAGE_DECAY, global_step)
            variables_averages_op = variable_averages.apply(
                tf.trainable_variables())
            train_ops = [self.train_op, variables_averages_op]
            self.train_op = tf.group(*train_ops)
    def _build(self, in_passage_words, passage_lengths, in_question_words_soft,
               question_lengths, truth):
        """ truth: a int in [0 .. num_classes] indicating entailment
        """
        num_classes = self.num_classes
        word_vocab = self.word_vocab
        is_training = self.is_training
        global_step = self.global_step
        options = self.options
        # ======word representation layer======
        in_question_repres = []
        in_passage_repres = []
        input_dim = 0
        if word_vocab is not None:
            word_vec_trainable = True
            cur_device = '/gpu:0'
            if options.fix_word_vec:
                word_vec_trainable = False
                cur_device = '/cpu:0'
            with tf.device(cur_device):
                self.word_embedding = tf.get_variable(
                    "word_embedding",
                    trainable=word_vec_trainable,
                    initializer=tf.constant(word_vocab.word_vecs),
                    dtype=tf.float32)

            #in_question_word_repres = tf.nn.embedding_lookup(self.word_embedding, in_question_words_soft) # [batch_size, question_len, word_dim]
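            # in_question_words_soft holds a distribution over the vocabulary
            # at each position; the soft embedding is the expected word vector
            # under that distribution (a weighted sum of embedding rows).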
            in_question_word_repres = tx.utils.soft_sequence_embedding(
                self.word_embedding, in_question_words_soft)
            in_passage_word_repres = tf.nn.embedding_lookup(
                self.word_embedding,
                in_passage_words)  # [batch_size, passage_len, word_dim]
            in_question_repres.append(in_question_word_repres)
            in_passage_repres.append(in_passage_word_repres)

            input_shape = tf.shape(in_question_words_soft)
            batch_size = input_shape[0]
            question_len = input_shape[1]
            input_shape = tf.shape(in_passage_words)
            passage_len = input_shape[1]
            input_dim += word_vocab.word_dim

        in_question_repres = tf.concat(
            axis=2,
            values=in_question_repres)  # [batch_size, question_len, dim]
        in_passage_repres = tf.concat(
            axis=2, values=in_passage_repres)  # [batch_size, passage_len, dim]

        if is_training:
            in_question_repres = tf.nn.dropout(in_question_repres,
                                               (1 - options.dropout_rate))
            in_passage_repres = tf.nn.dropout(in_passage_repres,
                                              (1 - options.dropout_rate))

        mask = tf.sequence_mask(passage_lengths, passage_len,
                                dtype=tf.float32)  # [batch_size, passage_len]
        question_mask = tf.sequence_mask(
            question_lengths, question_len,
            dtype=tf.float32)  # [batch_size, question_len]

        # ======Highway layer======
        if options.with_highway:
            with tf.variable_scope("input_highway"):
                in_question_repres = match_utils.multi_highway_layer(
                    in_question_repres, input_dim, options.highway_layer_num)
                tf.get_variable_scope().reuse_variables()
                in_passage_repres = match_utils.multi_highway_layer(
                    in_passage_repres, input_dim, options.highway_layer_num)

        # in_question_repres = tf.multiply(in_question_repres, tf.expand_dims(question_mask, axis=-1))
        # in_passage_repres = tf.multiply(in_passage_repres, tf.expand_dims(mask, axis=-1))

        # ========Bilateral Matching=====
        (match_representation,
         match_dim) = match_utils.bilateral_match_func(in_question_repres,
                                                       in_passage_repres,
                                                       question_lengths,
                                                       passage_lengths,
                                                       question_mask,
                                                       mask,
                                                       input_dim,
                                                       is_training,
                                                       options=options)

        #========Prediction Layer=========
        # match_dim = 4 * self.options.aggregation_lstm_dim
        w_0 = tf.get_variable("w_0", [match_dim, match_dim / 2],
                              dtype=tf.float32)
        b_0 = tf.get_variable("b_0", [match_dim / 2], dtype=tf.float32)
        w_1 = tf.get_variable("w_1", [match_dim / 2, num_classes],
                              dtype=tf.float32)
        b_1 = tf.get_variable("b_1", [num_classes], dtype=tf.float32)

        # if is_training: match_representation = tf.nn.dropout(match_representation, (1 - options.dropout_rate))
        logits = tf.matmul(match_representation, w_0) + b_0
        logits = tf.tanh(logits)
        if is_training:
            logits = tf.nn.dropout(logits, (1 - options.dropout_rate))
        logits = tf.matmul(logits, w_1) + b_1

        self.prob = tf.nn.softmax(logits)

        gold_matrix = tf.one_hot(truth, num_classes, dtype=tf.float32)
        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                    labels=gold_matrix))

        correct = tf.nn.in_top_k(logits, truth, 1)
        self.eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
        self.predictions = tf.argmax(self.prob, 1)

        if is_training:
            tvars = tf.trainable_variables()
            if self.options.lambda_l2 > 0.0:
                l2_loss = tf.add_n([
                    tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1
                ])
                self.loss = self.loss + self.options.lambda_l2 * l2_loss

            if self.options.optimize_type == 'adadelta':
                optimizer = tf.train.AdadeltaOptimizer(
                    learning_rate=self.options.learning_rate)
            elif self.options.optimize_type == 'adam':
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=self.options.learning_rate)

            grads = layer_utils.compute_gradients(self.loss, tvars)
            grads, _ = tf.clip_by_global_norm(grads, self.options.grad_clipper)
            self.train_op = optimizer.apply_gradients(zip(grads, tvars),
                                                      global_step=global_step)
            # self.train_op = optimizer.apply_gradients(zip(grads, tvars))

            if self.options.with_moving_average:
                # Track the moving averages of all trainable variables.
                MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
                variable_averages = tf.train.ExponentialMovingAverage(
                    MOVING_AVERAGE_DECAY, global_step)
                variables_averages_op = variable_averages.apply(
                    tf.trainable_variables())
                train_ops = [self.train_op, variables_averages_op]
                self.train_op = tf.group(*train_ops)

        return {
            "logits": logits,
            "prob": self.prob,
            "loss": self.loss,
            "correct": correct,
            "eval_correct": self.eval_correct,
            "predictions": self.predictions,
        }
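
For reference, every call above that collects the last LSTM state passes lengths - 1 to layer_utils.collect_final_step_of_lstm, i.e. the index of each sequence's final valid time step. A minimal sketch of such a helper, assuming the usual gather_nd pattern (the real layer_utils implementation may differ):

    def collect_final_step_of_lstm(lstm_representation, lengths):
        # lstm_representation: [batch_size, time_steps, dim]
        # lengths: [batch_size], already offset to the last valid index
        batch_nums = tf.range(tf.shape(lstm_representation)[0])  # [batch_size]
        indices = tf.stack([batch_nums, lengths], axis=1)  # [batch_size, 2]
        return tf.gather_nd(lstm_representation, indices)  # [batch_size, dim]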