Example #1
0
    def _answer_layer(self, doc, candidates):
        """doc: [N, dim]; candidates: [N, num_candidates, dim]"""
        candidates = nn.highway_network(candidates, self.config.highway_layers, True, is_train=self.is_train)
        candidates = tf.layers.dense(candidates, doc.get_shape().as_list()[-1])
        doc = tf.reshape(doc, [tf.shape(doc)[0], 1, tf.shape(doc)[1]])
        logit = tf.matmul(doc, candidates, transpose_b=True)  # [N, 1, num_candidates]
        return tf.reshape(logit, [tf.shape(logit)[0], tf.shape(logit)[2]])
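
For reference, a minimal NumPy sketch (illustrative shapes and names, not part of the example above) of what the final matmul computes: expanding doc to [N, 1, dim] and multiplying by the transposed candidates is simply a per-candidate dot product.

import numpy as np

N, num_candidates, dim = 2, 4, 8
doc = np.random.randn(N, dim).astype(np.float32)                         # [N, dim]
candidates = np.random.randn(N, num_candidates, dim).astype(np.float32)  # [N, num_candidates, dim]

# Equivalent of reshaping doc to [N, 1, dim] and matmul(..., transpose_b=True):
logits_matmul = np.matmul(doc[:, None, :], candidates.transpose(0, 2, 1))[:, 0, :]
logits_dot = np.einsum('nd,ncd->nc', doc, candidates)                    # per-candidate dot product
assert np.allclose(logits_matmul, logits_dot)
print(logits_dot.shape)                                                  # (2, 4)
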
Example #2
0
    def _build_forward(self):
        config = self.config
        N, M, JX, JQ, VW, VC, d, W = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
            config.max_word_size
        JX = tf.shape(self.x)[2]
        JQ = tf.shape(self.q)[1]
        M = tf.shape(self.x)[1]
        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

        with tf.variable_scope("emb"):

            if config.use_word_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    if config.mode == 'train':
                        word_emb_mat = tf.get_variable(
                            "word_emb_mat",
                            dtype='float',
                            shape=[VW, dw],
                            initializer=get_initializer(config.emb_mat))
                    else:
                        word_emb_mat = tf.get_variable("word_emb_mat",
                                                       shape=[VW, dw],
                                                       dtype='float')
                    if config.use_glove_for_unk:
                        word_emb_mat = tf.concat(
                            0, [word_emb_mat, self.new_emb_mat])
                    print(word_emb_mat.get_shape().as_list())

                with tf.name_scope("word"):
                    Ax = tf.nn.embedding_lookup(word_emb_mat,
                                                self.x)  # [N, M, JX, d]
                    Aq = tf.nn.embedding_lookup(word_emb_mat,
                                                self.q)  # [N, JQ, d]
                    self.tensor_dict['x'] = Ax
                    self.tensor_dict['q'] = Aq

                xx = Ax
                qq = Aq

        # highway network
        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)
                tf.get_variable_scope().reuse_variables()
                qq = highway_network(qq,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)

        cell = BasicLSTMCell(d, state_is_tuple=True)
        d_cell = SwitchableDropoutWrapper(
            cell, self.is_train, input_keep_prob=config.input_keep_prob)
        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

        with tf.variable_scope("prepro"):
            (fw_u, bw_u), ((_, fw_u_f), (_,
                                         bw_u_f)) = bidirectional_dynamic_rnn(
                                             d_cell,
                                             d_cell,
                                             qq,
                                             q_len,
                                             dtype='float',
                                             scope='u1')  # [N, J, d], [N, d]
            u = tf.concat(2, [fw_u, bw_u])
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, xx, x_len, dtype='float',
                    scope='u1')  # [N, M, JX, 2d]
                h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
            else:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, xx, x_len, dtype='float',
                    scope='h1')  # [N, M, JX, 2d]
                h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
            self.tensor_dict['u'] = u
            self.tensor_dict['h'] = h

        with tf.variable_scope("main"):
            if config.dynamic_att:
                p0 = h
                u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                               [N * M, JQ, 2 * d])
                q_mask = tf.reshape(
                    tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                    [N * M, JQ])
                first_cell = AttentionCell(
                    cell,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
            else:
                p0, p1 = attention_layer(config,
                                         self.is_train,
                                         h,
                                         u,
                                         h_mask=self.x_mask,
                                         u_mask=self.q_mask,
                                         scope="p0",
                                         tensor_dict=self.tensor_dict)
                first_cell = d_cell
        # with tf.variable_scope("activate"):
        #     p0 =  tf.nn.relu(_linear(tf.reshape(p0,[-1,1200]),300,bias=0.01,bias_start=0.0,scope='relu'))
        #     if config.share_lstm_weights:
        #         tf.get_variable_scope().reuse_variables()
        #         p1 =  tf.nn.relu(_linear(tf.reshape(p1,[-1,1200]),300,bias=0.01,bias_start=0.0,scope='relu'))
        with tf.variable_scope('two_lstm'):
            p0 = tf.reshape(p0, [N, 1, -1, 300])
            p1 = tf.reshape(p1, [N, 1, -1, 300])
            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
                first_cell, first_cell, p0, x_len, dtype='float',
                scope='g0')  # [N, M, JX, 2d]
            g0 = tf.concat(3, [fw_g0, bw_g0])
            q_len_new = tf.tile(tf.expand_dims(q_len, 1), [1, M])
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
                (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
                    first_cell,
                    first_cell,
                    p1,
                    q_len_new,
                    dtype='float',
                    scope='g0')  # [N, M, JX, 2d]
                g1 = tf.concat(3, [fw_g1, bw_g1])
        # with tf.variable_scope('two_lstm_1'):
        #     (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(first_cell, first_cell, g0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
        #     g2 = tf.concat(3, [fw_g2, bw_g2])
        #     q_len_new = tf.tile(tf.expand_dims(q_len,1),[1,M])
        #     if config.share_lstm_weights:
        #         tf.get_variable_scope().reuse_variables()
        #         (fw_g3, bw_g3), _ = bidirectional_dynamic_rnn(first_cell, first_cell, g1, q_len_new, dtype='float', scope='g0')  # [N, M, JX, 2d]
        #         g3 = tf.concat(3, [fw_g3, bw_g3])

            g0 = tf.reduce_sum(tf.reduce_max(g0, 2), 1)
            g1 = tf.reduce_sum(tf.reduce_max(g1, 2), 1)

        logits = _linear([g0, g1, tf.abs(tf.subtract(g0, g1)), g0 * g1],
                         2,
                         bias=0.01,
                         bias_start=0.0,
                         scope='logits1')

        flat_logits2 = tf.reshape(logits, [N, 2])

        yp = tf.nn.softmax(flat_logits2)  # [-1, M*JX]

        self.tensor_dict['g0'] = g0
        self.tensor_dict['g1'] = g1

        self.logits = flat_logits2

        self.yp = yp
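
For reference, a minimal NumPy sketch (illustrative names; _linear is assumed to concatenate its list argument before the projection, as in BiDAF-style codebases) of the pair-feature classifier at the end of this example: the two pooled vectors g0 and g1 are combined with their absolute difference and elementwise product, projected to two classes, and passed through a softmax.

import numpy as np

N, dim, num_classes = 2, 6, 2
g0 = np.random.randn(N, dim).astype(np.float32)   # pooled representation of sequence 0
g1 = np.random.randn(N, dim).astype(np.float32)   # pooled representation of sequence 1

features = np.concatenate([g0, g1, np.abs(g0 - g1), g0 * g1], axis=1)   # [N, 4*dim]
W = np.random.randn(4 * dim, num_classes).astype(np.float32)
b = np.full(num_classes, 0.01, dtype=np.float32)
logits = features @ W + b                                               # [N, 2], cf. _linear(...) above
probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)      # softmax, cf. yp
print(probs.shape)                                                      # (2, 2)
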
Example #3
0
    def _build_forward(self):
        config = self.config
        N, M, JX, JQ, VW, VC, d, W = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
            config.max_word_size
        print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ)
        JA = config.max_answer_length
        JX = tf.shape(self.x)[2]
        JQ = tf.shape(self.q)[1]
        M = tf.shape(self.x)[1]
        print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ)
        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

        with tf.variable_scope("emb"):
            # Char-CNN Embedding
            if config.use_char_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    char_emb_mat = tf.get_variable("char_emb_mat",
                                                   shape=[VC, dc],
                                                   dtype='float')

                with tf.variable_scope("char"):
                    Acx = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cx)  # [N, M, JX, W, dc]
                    Acq = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cq)  # [N, JQ, W, dc]
                    Acx = tf.reshape(Acx, [-1, JX, W, dc])
                    Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                    filter_sizes = list(
                        map(int, config.out_channel_dims.split(',')))
                    heights = list(map(int, config.filter_heights.split(',')))
                    assert sum(filter_sizes) == dco, (filter_sizes, dco)
                    with tf.variable_scope("conv"):
                        xx = multi_conv1d(Acx,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                        if config.share_cnn_weights:
                            tf.get_variable_scope().reuse_variables()
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="xx")
                        else:
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="qq")
                        xx = tf.reshape(xx, [-1, M, JX, dco])
                        qq = tf.reshape(qq, [-1, JQ, dco])

            # Word Embedding
            if config.use_word_emb:
                with tf.variable_scope("emb_var") as scope, tf.device(
                        "/cpu:0"):
                    if config.mode == 'train':
                        word_emb_mat = tf.get_variable(
                            "word_emb_mat",
                            dtype='float',
                            shape=[VW, dw],
                            initializer=get_initializer(config.emb_mat))
                    else:
                        word_emb_mat = tf.get_variable("word_emb_mat",
                                                       shape=[VW, dw],
                                                       dtype='float')
                    tf.get_variable_scope().reuse_variables()
                    self.word_emb_scope = scope
                    if config.use_glove_for_unk:
                        word_emb_mat = tf.concat(
                            [word_emb_mat, self.new_emb_mat], 0)

                with tf.name_scope("word"):
                    Ax = tf.nn.embedding_lookup(word_emb_mat,
                                                self.x)  # [N, M, JX, d]
                    Aq = tf.nn.embedding_lookup(word_emb_mat,
                                                self.q)  # [N, JQ, d]
                    self.tensor_dict['x'] = Ax
                    self.tensor_dict['q'] = Aq
                # Concat Char-CNN Embedding and Word Embedding
                if config.use_char_emb:
                    xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                    qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
                else:
                    xx = Ax
                    qq = Aq

            # exact match
            if config.use_exact_match:
                emx = tf.expand_dims(tf.cast(self.emx, tf.float32), -1)
                xx = tf.concat([xx, emx], 3)  # [N, M, JX, di+1]
                emq = tf.expand_dims(tf.cast(self.emq, tf.float32), -1)
                qq = tf.concat([qq, emq], 2)  # [N, JQ, di+1]

        # 2 layer highway network on Concat Embedding
        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)
                tf.get_variable_scope().reuse_variables()
                qq = highway_network(qq,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)

        self.tensor_dict['xx'] = xx
        self.tensor_dict['qq'] = qq

        # Bidirection-LSTM (3rd layer on paper)
        cell = GRUCell(d) if config.GRU else BasicLSTMCell(d,
                                                           state_is_tuple=True)
        d_cell = SwitchableDropoutWrapper(
            cell, self.is_train, input_keep_prob=config.input_keep_prob)
        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

        with tf.variable_scope("prepro"):
            (fw_u, bw_u), _ = bidirectional_dynamic_rnn(
                d_cell, d_cell, qq, q_len, dtype='float',
                scope='u1')  # [N, J, d], [N, d]
            u = tf.concat([fw_u, bw_u], 2)
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, xx, x_len, dtype='float',
                    scope='u1')  # [N, M, JX, 2d]
                h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
            else:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, xx, x_len, dtype='float',
                    scope='h1')  # [N, M, JX, 2d]
                h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
            self.tensor_dict['u'] = u
            self.tensor_dict['h'] = h

        # Attention Flow Layer (4th layer on paper)
        with tf.variable_scope("main"):
            if config.dynamic_att:
                p0 = h
                u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                               [N * M, JQ, 2 * d])
                q_mask = tf.reshape(
                    tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                    [N * M, JQ])
                first_cell = AttentionCell(
                    cell,
                    u,
                    size=d,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
            else:
                p0 = attention_layer(config,
                                     self.is_train,
                                     h,
                                     u,
                                     h_mask=self.x_mask,
                                     u_mask=self.q_mask,
                                     scope="p0",
                                     tensor_dict=self.tensor_dict)
                first_cell = d_cell

        # Modeling layer (5th layer on paper)
            tp0 = p0
            for layer_idx in range(config.LSTM_num_layers - 1):
                (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
                    first_cell,
                    first_cell,
                    p0,
                    x_len,
                    dtype='float',
                    scope="g_{}".format(layer_idx))  # [N, M, JX, 2d]
                p0 = tf.concat([fw_g0, bw_g0], 3)
            (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
                first_cell, first_cell, p0, x_len, dtype='float',
                scope='g1')  # [N, M, JX, 2d]
            g1 = tf.concat([fw_g1, bw_g1], 3)  # [N, M, JX, 2d]

        # Self match layer
        with tf.variable_scope("SelfMatch"):
            s0 = tf.reshape(g1, [N * M, JX, 2 * d])  # [N * M, JX, 2d]
            x_mask = tf.reshape(self.x_mask, [N * M, JX])
            first_cell = AttentionCell(cell,
                                       s0,
                                       size=d,
                                       mask=x_mask,
                                       is_train=self.is_train)
            (fw_s, bw_s), (fw_s_f, bw_s_f) = bidirectional_dynamic_rnn(
                first_cell, first_cell, s0, x_len, dtype='float',
                scope='s')  # [N, M, JX, 2d]
            s1 = tf.concat([fw_s, bw_s], 2)  # [N * M, JX, 2d], M == 1

            # prepare for PtrNet
            encoder_output = tf.expand_dims(s1, 1)  # [N, M, JX, 2d]
            encoder_output = tf.expand_dims(
                tf.cast(self.x_mask,
                        tf.float32), -1) * encoder_output  # [N, M, JX, 2d]

            if config.GRU:
                encoder_state_final = tf.concat((fw_s_f, bw_s_f),
                                                1,
                                                name='encoder_concat')
            else:
                if isinstance(fw_s_f, LSTMStateTuple):
                    encoder_state_c = tf.concat((fw_s_f.c, bw_s_f.c),
                                                1,
                                                name='encoder_concat_c')
                    encoder_state_h = tf.concat((fw_s_f.h, bw_s_f.h),
                                                1,
                                                name='encoder_concat_h')
                    encoder_state_final = LSTMStateTuple(c=encoder_state_c,
                                                         h=encoder_state_h)
                elif isinstance(fw_s_f, tf.Tensor):
                    encoder_state_final = tf.concat((fw_s_f, bw_s_f),
                                                    1,
                                                    name='encoder_concat')
                else:
                    encoder_state_final = None
                    tf.logging.error("encoder_state_final not set")

            print("encoder_state_final:", encoder_state_final)

        with tf.variable_scope("output"):
            # eos_symbol = config.eos_symbol
            # next_symbol = config.next_symbol

            tf.assert_equal(
                M,
                1)  # currently dynamic M is not supported, thus we assume M==1
            answer_string = tf.placeholder(
                shape=(N, 1, JA + 1), dtype=tf.int32,
                name='answer_string')  # [N, M, JA + 1]
            answer_string_mask = tf.placeholder(
                shape=(N, 1, JA + 1), dtype=tf.bool,
                name='answer_string_mask')  # [N, M, JA + 1]
            answer_string_length = tf.placeholder(
                shape=(N, 1),
                dtype=tf.int32,
                name='answer_string_length',
            )  # [N, M]
            self.tensor_dict['answer_string'] = answer_string
            self.tensor_dict['answer_string_mask'] = answer_string_mask
            self.tensor_dict['answer_string_length'] = answer_string_length
            self.answer_string = answer_string
            self.answer_string_mask = answer_string_mask
            self.answer_string_length = answer_string_length

            answer_string_flattened = tf.reshape(answer_string,
                                                 [N * M, JA + 1])
            self.answer_string_flattened = answer_string_flattened  # [N * M, JA+1]
            print("answer_string_flattened:", answer_string_flattened)

            answer_string_length_flattened = tf.reshape(
                answer_string_length, [N * M])
            self.answer_string_length_flattened = answer_string_length_flattened  # [N * M]
            print("answer_string_length_flattened:",
                  answer_string_length_flattened)

            decoder_cell = GRUCell(2 * d) if config.GRU else BasicLSTMCell(
                2 * d, state_is_tuple=True)

            with tf.variable_scope("Decoder"):
                decoder_train_logits = ptr_decoder(
                    decoder_cell,
                    tf.reshape(tp0, [N * M, JX, 2 * d]),  # [N * M, JX, 2d]
                    tf.reshape(encoder_output,
                               [N * M, JX, 2 * d]),  # [N * M, JX, 2d]
                    encoder_final_state=encoder_state_final,
                    max_encoder_length=config.sent_size_th,
                    decoder_output_length=
                    answer_string_length_flattened,  # [N * M]
                    batch_size=N,  # N * M (M=1)
                    attention_proj_dim=self.config.decoder_proj_dim,
                    scope='ptr_decoder'
                )  # [batch_size, dec_len*, enc_seq_len + 1]

                self.decoder_train_logits = decoder_train_logits
                print("decoder_train_logits:", decoder_train_logits)
                self.decoder_train_softmax = tf.nn.softmax(
                    self.decoder_train_logits)
                self.decoder_inference = tf.argmax(
                    decoder_train_logits, axis=2,
                    name='decoder_inference')  # [N, JA + 1]

            self.yp = tf.ones([N, M, JX], dtype=tf.int32) * -1
            self.yp2 = tf.ones([N, M, JX], dtype=tf.int32) * -1
Example #4
0
    def _build_forward(self):
        config = self.config
        N, M, JX, JQ, VW, VC, d, W = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
            config.max_word_size
        JX = tf.shape(self.x)[2]
        JQ = tf.shape(self.q)[1]
        M = tf.shape(self.x)[1]
        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

        with tf.variable_scope("emb"):
            if config.use_char_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    char_emb_mat = tf.get_variable("char_emb_mat",
                                                   shape=[VC, dc],
                                                   dtype='float')

                with tf.variable_scope("char"):
                    Acx = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cx)  # [N, M, JX, W, dc]
                    Acq = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cq)  # [N, JQ, W, dc]
                    Acx = tf.reshape(Acx, [-1, JX, W, dc])
                    Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                    filter_sizes = list(
                        map(int, config.out_channel_dims.split(',')))
                    heights = list(map(int, config.filter_heights.split(',')))
                    assert sum(filter_sizes) == dco, (filter_sizes, dco)
                    with tf.variable_scope("conv"):
                        xx = multi_conv1d(Acx,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                        if config.share_cnn_weights:
                            tf.get_variable_scope().reuse_variables()
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="xx")
                        else:
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="qq")
                        xx = tf.reshape(xx, [-1, M, JX, dco])
                        qq = tf.reshape(qq, [-1, JQ, dco])

            if config.use_word_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    if config.mode == 'train':
                        word_emb_mat = tf.get_variable(
                            "word_emb_mat",
                            dtype='float',
                            shape=[VW, dw],
                            initializer=tf.random_normal_initializer)
                    else:
                        word_emb_mat = tf.get_variable("word_emb_mat",
                                                       shape=[VW, dw],
                                                       dtype='float')
                    if config.use_glove_for_unk:
                        word_emb_mat = tf.concat(
                            axis=0, values=[word_emb_mat, self.new_emb_mat])

                with tf.name_scope("word"):
                    Ax = tf.nn.embedding_lookup(word_emb_mat,
                                                self.x)  # [N, M, JX, d]
                    Aq = tf.nn.embedding_lookup(word_emb_mat,
                                                self.q)  # [N, JQ, d]
                    self.tensor_dict['x'] = Ax
                    self.tensor_dict['q'] = Aq
                if config.use_char_emb:
                    xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                    qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
                else:
                    xx = Ax
                    qq = Aq

        # highway network
        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)
                tf.get_variable_scope().reuse_variables()
                qq = highway_network(qq,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)

        self.tensor_dict['xx'] = xx
        self.tensor_dict['qq'] = qq

        cell_fw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
        cell_bw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
        d_cell_fw = SwitchableDropoutWrapper(
            cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell_bw = SwitchableDropoutWrapper(
            cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell2_fw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
        cell2_bw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
        d_cell2_fw = SwitchableDropoutWrapper(
            cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell2_bw = SwitchableDropoutWrapper(
            cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell3_fw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
        cell3_bw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
        d_cell3_fw = SwitchableDropoutWrapper(
            cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell3_bw = SwitchableDropoutWrapper(
            cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell4_fw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
        cell4_bw = LSTMCell(d, state_is_tuple=True, name="basic_lstm_cell")
        d_cell4_fw = SwitchableDropoutWrapper(
            cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell4_bw = SwitchableDropoutWrapper(
            cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

        with tf.variable_scope("prepro"):
            (fw_u, bw_u), ((_, fw_u_f), (_,
                                         bw_u_f)) = bidirectional_dynamic_rnn(
                                             d_cell_fw,
                                             d_cell_bw,
                                             qq,
                                             q_len,
                                             dtype='float',
                                             scope='u1')  # [N, J, d], [N, d]
            u = tf.concat(axis=2, values=[fw_u, bw_u])
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, xx, x_len, dtype='float',
                    scope='u1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
            else:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, xx, x_len, dtype='float',
                    scope='h1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
            self.tensor_dict['u'] = u
            self.tensor_dict['h'] = h

        with tf.variable_scope("main"):
            if config.dynamic_att:
                p0 = h
                u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                               [N * M, JQ, 2 * d])
                q_mask = tf.reshape(
                    tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                    [N * M, JQ])
                first_cell_fw = AttentionCell(
                    cell2_fw,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
                first_cell_bw = AttentionCell(
                    cell2_bw,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
                second_cell_fw = AttentionCell(
                    cell3_fw,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
                second_cell_bw = AttentionCell(
                    cell3_bw,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
            else:
                p0 = attention_layer(config,
                                     self.is_train,
                                     h,
                                     u,
                                     h_mask=self.x_mask,
                                     u_mask=self.q_mask,
                                     scope="p0",
                                     tensor_dict=self.tensor_dict)
                first_cell_fw = d_cell2_fw
                second_cell_fw = d_cell3_fw
                first_cell_bw = d_cell2_bw
                second_cell_bw = d_cell3_bw

            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
                first_cell_fw,
                first_cell_bw,
                p0,
                x_len,
                dtype='float',
                scope='g0')  # [N, M, JX, 2d]
            g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
            (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
                second_cell_fw,
                second_cell_bw,
                g0,
                x_len,
                dtype='float',
                scope='g1')  # [N, M, JX, 2d]
            g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])

            logits = get_logits([g1, p0],
                                d,
                                True,
                                wd=config.wd,
                                input_keep_prob=config.input_keep_prob,
                                mask=self.x_mask,
                                is_train=self.is_train,
                                func=config.answer_func,
                                scope='logits1')
            a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                          tf.reshape(logits, [N, M * JX]))
            a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1),
                          [1, M, JX, 1])

            (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
                d_cell4_fw,
                d_cell4_bw,
                tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]),
                x_len,
                dtype='float',
                scope='g2')  # [N, M, JX, 2d]
            g2 = tf.concat(axis=3, values=[fw_g2, bw_g2])
            logits2 = get_logits([g2, p0],
                                 d,
                                 True,
                                 wd=config.wd,
                                 input_keep_prob=config.input_keep_prob,
                                 mask=self.x_mask,
                                 is_train=self.is_train,
                                 func=config.answer_func,
                                 scope='logits2')

            flat_logits = tf.reshape(logits, [-1, M * JX])
            flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
            flat_logits2 = tf.reshape(logits2, [-1, M * JX])
            flat_yp2 = tf.nn.softmax(flat_logits2)

            if config.na:
                na_bias = tf.get_variable("na_bias", shape=[], dtype='float')
                na_bias_tiled = tf.tile(tf.reshape(na_bias, [1, 1]),
                                        [N, 1])  # [N, 1]
                concat_flat_logits = tf.concat(
                    axis=1, values=[na_bias_tiled, flat_logits])
                concat_flat_yp = tf.nn.softmax(concat_flat_logits)
                na_prob = tf.squeeze(tf.slice(concat_flat_yp, [0, 0], [-1, 1]),
                                     [1])
                flat_yp = tf.slice(concat_flat_yp, [0, 1], [-1, -1])

                concat_flat_logits2 = tf.concat(
                    axis=1, values=[na_bias_tiled, flat_logits2])
                concat_flat_yp2 = tf.nn.softmax(concat_flat_logits2)
                na_prob2 = tf.squeeze(
                    tf.slice(concat_flat_yp2, [0, 0], [-1, 1]), [1])  # [N]
                flat_yp2 = tf.slice(concat_flat_yp2, [0, 1], [-1, -1])

                self.concat_logits = concat_flat_logits
                self.concat_logits2 = concat_flat_logits2
                self.na_prob = na_prob * na_prob2

            yp = tf.reshape(flat_yp, [-1, M, JX], name="yp")
            yp2 = tf.reshape(flat_yp2, [-1, M, JX], name="yp2")
            wyp = tf.nn.sigmoid(logits2, name="wyp")

            self.tensor_dict['g1'] = g1
            self.tensor_dict['g2'] = g2

            self.logits = flat_logits
            self.logits2 = flat_logits2
            self.yp = yp
            self.yp2 = yp2
            self.wyp = wyp
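
For reference, a minimal NumPy sketch (illustrative values) of the no-answer handling in the config.na branch above: a learned scalar bias is prepended to the flattened span logits, the softmax is taken over the extended vector, the first column is read off as the no-answer probability, and the remaining columns are kept as the span distribution (they sum to 1 - na_prob).

import numpy as np

N, M_JX = 2, 5
flat_logits = np.random.randn(N, M_JX).astype(np.float32)   # span logits, [N, M*JX]
na_bias = 0.3                                               # stands in for the learned scalar na_bias
na_bias_tiled = np.full((N, 1), na_bias, dtype=np.float32)  # [N, 1]

concat = np.concatenate([na_bias_tiled, flat_logits], axis=1)           # [N, 1 + M*JX]
concat_yp = np.exp(concat) / np.exp(concat).sum(axis=1, keepdims=True)  # softmax
na_prob = concat_yp[:, 0]                                               # [N]
flat_yp = concat_yp[:, 1:]                                              # [N, M*JX]
assert np.allclose(na_prob + flat_yp.sum(axis=1), 1.0)
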
Example #5
0
    def _build_forward(self):
        config = self.config

        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

        N, M, JX, JQ, VW, VC, d, W = \
          config.batch_size, config.max_num_sents, config.max_sent_size, \
          config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
          config.max_word_size

        JX = tf.shape(self.x)[2]
        JQ = tf.shape(self.q)[1]
        M = tf.shape(self.x)[1]
        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

        with tf.variable_scope("emb"):
            if config.use_char_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    char_emb_mat = tf.get_variable("char_emb_mat",
                                                   shape=[VC, dc],
                                                   dtype='float')

                with tf.variable_scope("char"):
                    Acx = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cx)  # [N, M, JX, W, dc]
                    Acq = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cq)  # [N, JQ, W, dc]
                    Acx = tf.reshape(Acx, [-1, JX, W, dc])
                    Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                    filter_sizes = list(
                        map(int, config.out_channel_dims.split(',')))
                    heights = list(map(int, config.filter_heights.split(',')))
                    assert sum(filter_sizes) == dco, (filter_sizes, dco)
                    with tf.variable_scope("conv"):
                        xx = multi_conv1d(Acx,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                        if config.share_cnn_weights:
                            tf.get_variable_scope().reuse_variables()
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="xx")
                        else:
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="qq")
                        xx = tf.reshape(xx, [-1, M, JX, dco])
                        qq = tf.reshape(qq, [-1, JQ, dco])

            if config.use_word_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    if config.mode == 'train':
                        word_emb_mat = tf.get_variable(
                            "word_emb_mat",
                            dtype='float',
                            shape=[VW, dw],
                            initializer=get_initializer(self.emb_mat))
                    else:
                        word_emb_mat = tf.get_variable("word_emb_mat",
                                                       shape=[VW, dw],
                                                       dtype='float')
                    if config.use_glove_for_unk:
                        word_emb_mat = tf.concat(
                            axis=0, values=[word_emb_mat, self.new_emb_mat])

                with tf.name_scope("word"):
                    Ax = tf.nn.embedding_lookup(word_emb_mat,
                                                self.x)  # [N, M, JX, d]
                    Aq = tf.nn.embedding_lookup(word_emb_mat,
                                                self.q)  # [N, JQ, d]
                    self.tensor_dict['x'] = Ax
                    self.tensor_dict['q'] = Aq

                if config.use_char_emb:
                    xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                    qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
                else:
                    xx = Ax
                    qq = Aq

        # highway network
        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train,
                                     input_keep_prob=config.highway_keep_prob)
                tf.get_variable_scope().reuse_variables()
                qq = highway_network(qq,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train,
                                     input_keep_prob=config.highway_keep_prob)

        self.tensor_dict['xx'] = xx
        self.tensor_dict['qq'] = qq

        with tf.variable_scope("prepro"):
            with tf.variable_scope('u1'):
                u, _ = bi_cudnn_rnn_encoder('lstm', config.hidden_size, 1,
                                            1 - config.input_keep_prob, qq,
                                            q_len, self.is_train)
                if config.reasoning_layer == 'snmn':
                    u_st = zhong_selfatt(u[:, ax, :, :],
                                         config.hidden_size * 2,
                                         seq_len=q_len,
                                         transform='squeeze')

            if config.share_lstm_weights:
                with tf.variable_scope('u1', reuse=True):
                    h, _ = bi_cudnn_rnn_encoder('lstm', config.hidden_size, 1,
                                                1 - config.input_keep_prob,
                                                tf.squeeze(xx, axis=1),
                                                tf.squeeze(x_len, axis=1),
                                                self.is_train)
                    h = h[:, ax, :, :]
            else:
                with tf.variable_scope('h1'):
                    h, _ = bi_cudnn_rnn_encoder('lstm', config.hidden_size, 1,
                                                1 - config.input_keep_prob,
                                                tf.squeeze(xx, axis=1),
                                                tf.squeeze(x_len, axis=1),
                                                self.is_train)
                    h = h[:, ax, :, :]

            self.tensor_dict['u'] = u
            self.tensor_dict['h'] = h

        with tf.variable_scope("main"):
            context_dim = config.hidden_size * 2
            ### Reconstruct before bidaf because otherwise we need to build a larger query tensor.

            x_mask = self.x_mask
            x_len_squeeze = tf.squeeze(x_len, axis=1)
            p0 = h

            ### Main model
            if config.reasoning_layer == 'snmn':
                module_names = ['_Find', '_Compare', '_Relocate', '_NoOp']

                self.snmn = NMN_Model(config, u, qq, u_st, self.q_mask, q_len, p0, x_mask, x_len, module_names, \
                                      self.is_train)
                self.u_weights = self.snmn.cv_list  # question word distribution at each step
                self.module_prob_list = self.snmn.module_prob_list  # module probability at each step

                g0 = tf.squeeze(self.snmn.att_second, axis=-1)

                if config.supervise_bridge_entity:
                    self.hop0_logits = self.snmn.bridge_logits

                if config.self_att:
                    with tf.variable_scope('g0'):
                        g0, _ = bi_cudnn_rnn_encoder(
                            'lstm', config.hidden_size,
                            1, 1 - config.input_keep_prob,
                            tf.squeeze(g0,
                                       axis=1), x_len_squeeze, self.is_train)
                        g0 = g0[:, ax, :, :]
                        g0 = hotpot_biattention(config,
                                                self.is_train,
                                                g0,
                                                tf.squeeze(g0, axis=1),
                                                h_mask=x_mask,
                                                u_mask=tf.squeeze(x_mask,
                                                                  axis=1),
                                                scope="self_att",
                                                tensor_dict=self.tensor_dict)
                    g0 = tf.layers.dense(g0, config.hidden_size * 2)

                with tf.variable_scope('g1'):
                    g1, _ = bi_cudnn_rnn_encoder('lstm', config.hidden_size, 1,
                                                 1 - config.input_keep_prob,
                                                 tf.squeeze(g0, axis=1),
                                                 tf.squeeze(x_len, axis=1),
                                                 self.is_train)
                    g1 = g1[:, ax, :, :]

                logits = get_logits([g1, g0],
                                    d,
                                    True,
                                    wd=config.wd,
                                    input_keep_prob=config.input_keep_prob,
                                    mask=x_mask,
                                    is_train=self.is_train,
                                    func=config.answer_func,
                                    scope='logits1')

                with tf.variable_scope('g2'):
                    a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                                  tf.reshape(logits, [N, M * JX]))
                    a1i = tf.tile(a1i[:, ax, ax, :], [1, M, JX, 1])
                    g2, _ = bi_cudnn_rnn_encoder(
                        'lstm', config.hidden_size, 1,
                        1 - config.input_keep_prob,
                        tf.squeeze(tf.concat(axis=3,
                                             values=[g0, g1, a1i, g0 * a1i]),
                                   axis=1), x_len_squeeze, self.is_train)
                    g2 = g2[:, ax, :, :]
                logits2 = get_logits([g2, g1],
                                     d,
                                     True,
                                     wd=config.wd,
                                     input_keep_prob=config.input_keep_prob,
                                     mask=x_mask,
                                     is_train=self.is_train,
                                     func=config.answer_func,
                                     scope='logits2')

                if config.dataset == 'hotpotqa':
                    with tf.variable_scope('g3'):
                        if config.nmn_qtype_class == 'mem_last':
                            g3 = tf.concat(
                                [self.snmn.mem_last[:, ax, :], u_st[:, ax, :]],
                                axis=-1)
                        elif config.nmn_qtype_class == 'ctrl_st':
                            g3 = self.snmn.c_st_list[0][:, ax, :]
                        else:
                            raise NotImplementedError

                        self.predict_type = dense(g3, 2, scope='predict_type')
                        g3_1 = self.snmn.mem_last[:, ax, :]
                        self.predict_yesno = dense(g3_1,
                                                   2,
                                                   scope='predict_yesno')

            flat_logits = tf.reshape(logits, [-1, M * JX])
            flat_yp = tf.nn.softmax(flat_logits)  # [-1, M * JX]
            flat_logits2 = tf.reshape(logits2, [-1, M * JX])
            flat_yp2 = tf.nn.softmax(flat_logits2)
            yp = tf.reshape(flat_yp, [-1, M, JX])
            yp2 = tf.reshape(flat_yp2, [-1, M, JX])
            wyp = tf.nn.sigmoid(logits2)
            self.logits = flat_logits
            self.logits2 = flat_logits2
            self.yp = yp
            self.yp2 = yp2
            self.wyp = wyp

            if config.dataset == 'hotpotqa':
                flat_predict_type = tf.reshape(self.predict_type, [-1, 2])
                flat_yp3 = tf.nn.softmax(flat_predict_type)
                self.yp3 = tf.reshape(flat_yp3, [-1, 1, 2])

                flat_predict_yesno = tf.reshape(self.predict_yesno, [-1, 2])
                flat_yp3_yesno = tf.nn.softmax(flat_predict_yesno)
                self.yp3_yesno = tf.reshape(flat_yp3_yesno, [-1, 1, 2])
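
The a1i features in Examples #4 and #5 are built with softsel; below is a minimal NumPy sketch under the assumption (based on the BiDAF-style codebases these snippets resemble) that softsel(target, logits) applies a softmax over the logits and returns the correspondingly weighted sum of the target vectors.

import numpy as np

def softsel_np(target, logits):
    """Assumed softsel behavior: softmax over logits, then weighted sum of target."""
    weights = np.exp(logits - logits.max(axis=-1, keepdims=True))
    weights = weights / weights.sum(axis=-1, keepdims=True)       # [N, L]
    return (weights[..., None] * target).sum(axis=-2)             # [N, d]

N, L, d = 2, 7, 4
g1_flat = np.random.randn(N, L, 2 * d)       # stands in for tf.reshape(g1, [N, M*JX, 2*d])
start_logits = np.random.randn(N, L)         # stands in for tf.reshape(logits, [N, M*JX])
a1i = softsel_np(g1_flat, start_logits)      # [N, 2*d] attention-pooled summary
print(a1i.shape)                             # (2, 8)
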
Example #6
0
    def _build_forward(self):
        config = self.config
        N, M, JX, JQ, VW, VC, d, W = \
            config.batch_size,  config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
            config.max_word_size
        JQ = JX
        print('VW:{}  NEW_EMB:{}'.format(VW, self.new_emb_mat.get_shape()))
        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

        with tf.variable_scope("emb"):
            if config.use_char_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    char_emb_mat = tf.get_variable("char_emb_mat",
                                                   shape=[VC, dc],
                                                   dtype='float')

                with tf.variable_scope("char"):
                    Acx = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cx)  # [N, M, JX, W, dc]
                    Acq = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cq)  # [N, JQ, W, dc]
                    Acx = tf.reshape(Acx, [-1, JX, W, dc])
                    Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                    filter_sizes = list(
                        map(int, config.out_channel_dims.split(',')))
                    heights = list(map(int, config.filter_heights.split(',')))
                    assert sum(filter_sizes) == dco, (filter_sizes, dco)
                    with tf.variable_scope("conv"):
                        xx = multi_conv1d(Acx,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                        if config.share_cnn_weights:
                            tf.get_variable_scope().reuse_variables()
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="xx")

                        else:
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="qq")
                        xx = tf.reshape(xx, [-1, M, JX, dco])
                        qq = tf.reshape(qq, [-1, JQ, dco])

            if config.use_word_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    if config.mode == 'train':
                        word_emb_mat = tf.get_variable(
                            "word_emb_mat",
                            dtype='float',
                            shape=[VW, dw],
                            initializer=get_initializer(config.emb_mat))
                    else:
                        word_emb_mat = tf.get_variable("word_emb_mat",
                                                       shape=[VW, dw],
                                                       dtype='float')
                    if config.use_glove_for_unk:
                        word_emb_mat = tf.concat(
                            axis=0, values=[word_emb_mat, self.new_emb_mat])

                with tf.name_scope("word"):
                    Ax = tf.nn.embedding_lookup(word_emb_mat,
                                                self.x)  # [N, M, JX, d]
                    Aq = tf.nn.embedding_lookup(word_emb_mat,
                                                self.q)  # [N, JQ, d]

                    self.tensor_dict['x'] = Ax
                    self.tensor_dict['q'] = Aq
                if config.use_char_emb:
                    xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                    qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]

                else:
                    xx = Ax
                    qq = Aq
                    xx = tf.reshape(xx, [-1, M, JX, d])
                    qq = tf.reshape(qq, [-1, JQ, d])
            if config.use_pos_emb:
                with tf.variable_scope("pos_onehot"), tf.device("/cpu:0"):
                    pos_x = tf.one_hot(
                        self.x_pos, depth=config.pos_tag_num)  # [N,M,JX,depth]
                    pos_q = tf.one_hot(
                        self.q_pos, depth=config.pos_tag_num)  # [N,JQ,depth]
                    xx = tf.concat(axis=3, values=[xx,
                                                   pos_x])  # [N, M, JX, di]
                    qq = tf.concat(axis=2, values=[qq, pos_q])
            if config.use_sem_emb:
                with tf.variable_scope("sem_onehot"), tf.device("/cpu:0"):
                    sem_x = tf.one_hot(self.x_sem, depth=3)  # [N,M,JX,3]
                    sem_q = tf.one_hot(self.q_sem, depth=3)  # [N,JQ,3]
                    xx = tf.concat(axis=3, values=[xx, sem_x])
                    qq = tf.concat(axis=2, values=[qq, sem_q])
            if config.use_neg_emb:
                with tf.variable_scope("neg_onehot"), tf.device("/cpu:0"):
                    neg_x = tf.one_hot(self.x_neg, depth=2)  # [N,M,JX,2]
                    neg_q = tf.one_hot(self.q_neg, depth=2)  # [N,JQ,2]
                    xx = tf.concat(axis=3, values=[xx, neg_x])
                    qq = tf.concat(axis=2, values=[qq, neg_q])

        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)
                tf.get_variable_scope().reuse_variables()
                qq = highway_network(qq,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)
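                # highway_network is a repository helper; in the standard
                # highway formulation each layer computes
                #   y = t * g(W x + b) + (1 - t) * x,  t = sigmoid(W_t x + b_t),
                # i.e. a learned gate mixes transformed and untransformed features.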

        self.tensor_dict['xx'] = xx
        self.tensor_dict['qq'] = qq

        cell_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell_fw = SwitchableDropoutWrapper(
            cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell_bw = SwitchableDropoutWrapper(
            cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell_fw2 = BasicLSTMCell(d, state_is_tuple=True)
        cell_bw2 = BasicLSTMCell(d, state_is_tuple=True)
        d_cell_fw2 = SwitchableDropoutWrapper(
            cell_fw2, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell_bw2 = SwitchableDropoutWrapper(
            cell_bw2, self.is_train, input_keep_prob=config.input_keep_prob)
        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]
        if config.lstm:
            with tf.variable_scope("prepro"):
                (fw_u, bw_u), ((_, fw_u_f),
                               (_, bw_u_f)) = bidirectional_dynamic_rnn(
                                   d_cell_fw,
                                   d_cell_bw,
                                   qq,
                                   q_len,
                                   dtype='float',
                                   scope='u1')  # [N, J, d], [N, d]
                print('fw_u_f shape: {}'.format(fw_u_f.get_shape()))
                u = tf.concat(axis=2, values=[fw_u, bw_u])  #[N,JQ,2d]
                if config.share_lstm_weights:
                    tf.get_variable_scope().reuse_variables()
                    (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                        cell_fw, cell_bw, xx, x_len, dtype='float',
                        scope='u1')  # [N, M, JX, 2d]
                    h = tf.concat(axis=3, values=[fw_h,
                                                  bw_h])  # [N, M, JX, 2d]
                    print('fw_u_f shape (shared weights): {}'.format(fw_u_f.get_shape()))
                else:
                    (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                        cell_fw, cell_bw, xx, x_len, dtype='float',
                        scope='h1')  # [N, M, JX, 2d]
                    h = tf.concat(axis=3, values=[fw_h,
                                                  bw_h])  # [N, M, JX, 2d]
                self.tensor_dict['u'] = u
                self.tensor_dict['h'] = h
        else:
            h = xx
            u = qq
        h1 = h[:, 0, :, :]
        h2 = h[:, 1, :, :]
        h3 = h[:, 2, :, :]
        h4 = h[:, 3, :, :]

        n_1 = tf.reshape(self.x_mask[:, 0, :], [N, JX])
        n_2 = tf.reshape(self.x_mask[:, 1, :], [N, JX])
        n_3 = tf.reshape(self.x_mask[:, 2, :], [N, JX])
        n_4 = tf.reshape(self.x_mask[:, 3, :], [N, JX])
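        # h1..h4 and n_1..n_4 slice out the first four sentences of the context
        # (presumably the four candidate plots, given config.plot_encoder) and
        # their token masks so each can be encoded and self-attended separately.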

        if config.self_attention:
            with tf.variable_scope("h_self_weight"):

                print(h.get_shape())
                for i in range(2):
                    with tf.variable_scope("self-attention"):

                        h1 = self_attention_layer(
                            config,
                            self.is_train,
                            h1,
                            p_mask=tf.expand_dims(n_1, -1),
                            scope="{}_layer_self_att_enc_e".format(
                                i))  # [N, len, dim]
                        tf.get_variable_scope().reuse_variables()
                        h2 = self_attention_layer(
                            config,
                            self.is_train,
                            h2,
                            p_mask=tf.expand_dims(n_2, -1),
                            scope="{}_layer_self_att_enc_e".format(i))
                        tf.get_variable_scope().reuse_variables()
                        h3 = self_attention_layer(
                            config,
                            self.is_train,
                            h3,
                            p_mask=tf.expand_dims(n_3, -1),
                            scope="{}_layer_self_att_enc_e".format(i))
                        tf.get_variable_scope().reuse_variables()
                        h4 = self_attention_layer(
                            config,
                            self.is_train,
                            h4,
                            p_mask=tf.expand_dims(n_4, -1),
                            scope="{}_layer_self_att_enc_e".format(i))
                    with tf.variable_scope("self-attention"):
                        u = self_attention_layer(
                            config,
                            self.is_train,
                            u,
                            p_mask=tf.expand_dims(self.q_mask, -1),
                            scope="{}_layer_self_att_enc_p".format(i))
        if config.plot_encoder == "concate":
            h = tf.concat([h1, h2, h3, h4], axis=1)
            print("h concate shape".format(h.get_shape()))
            n_n = tf.concat([n_1, n_2, n_3, n_4], axis=1)
        elif config.plot_encoder == "sum":
            h1 = tf.expand_dims(h1, axis=1)
            h2 = tf.expand_dims(h2, axis=1)
            h3 = tf.expand_dims(h3, axis=1)
            h4 = tf.expand_dims(h4, axis=1)
            h = tf.concat([h1, h2, h3, h4], axis=1)

            h = tf.reduce_sum(h, axis=1)
            print("h sum shape".format(h.get_shape()))
        elif config.plot_encoder == "lstm":
            # h1 = tf.reduce_sum(h1, axis=1)
            h1 = tf.expand_dims(tf.reduce_sum(h1, axis=-1), axis=1)
            h2 = tf.expand_dims(tf.reduce_sum(h2, axis=-1), axis=1)
            h3 = tf.expand_dims(tf.reduce_sum(h3, axis=-1), axis=1)
            h4 = tf.expand_dims(tf.reduce_sum(h4, axis=-1), axis=1)
            (fw_u, bw_u), ((_, fw_u_f), (_,
                                         bw_u_f)) = bidirectional_dynamic_rnn(
                                             d_cell_fw2,
                                             d_cell_bw2,
                                             tf.concat([h1, h2, h3, h4],
                                                       axis=1),
                                             dtype='float',
                                             scope='1')  # [N, J, d], [N, d]
            print('fw_u_f shape: {}'.format(fw_u_f.get_shape()))
            h = tf.concat(axis=2, values=[fw_u, bw_u])  # [N,JQ,2d]
            u = tf.expand_dims(tf.reduce_sum(u, axis=-1), axis=1)
            tf.get_variable_scope().reuse_variables()
            (fw_u, bw_u), ((_, fw_u_f), (_,
                                         bw_u_f)) = bidirectional_dynamic_rnn(
                                             d_cell_fw2,
                                             d_cell_bw2,
                                             tf.concat([u], axis=1),
                                             dtype='float',
                                             scope='1')  # [N, J, d], [N, d]
            print('fw_u_f shape: {}'.format(fw_u_f.get_shape()))
            u = tf.concat(axis=2, values=[fw_u, bw_u])  # [N,JQ,2d]

        if config.interact:
            with tf.variable_scope("interact"):

                def get_attention(h, u, m):
                    JX = tf.shape(h)[1]
                    JQ = tf.shape(u)[1]
                    h = tf.expand_dims(h, 2)
                    u = tf.expand_dims(u, 1)
                    h = tf.tile(h, [1, 1, JQ, 1])
                    u = tf.tile(u, [1, JX, 1, 1])
                    attention = h * u  # N,JX,JQ,2d

                    return attention

                if config.plot_encoder == "concate":
                    attention = get_attention(h, u, M)
                else:
                    attention = get_attention(h, u, 1)

            with tf.variable_scope('conv_dense'):
                if config.plot_encoder == "concate":
                    out_final = dense_net(config, attention, self.is_train)
                else:
                    out_final = tf.reshape(attention, shape=[N, -1])

        else:
            h = tf.reshape(h, [-1, M * 2 * d * JX])
            print("h shape {}".format(h.get_shape()))
            u = tf.reshape(u, [-1, 2 * d * JQ])
            print("U shape {}".format(u.get_shape()))
            attention = tf.concat([h, u], axis=-1)
            out_final = attention

            out_final = linear(tf.concat([attention], axis=-1),
                               1000,
                               True,
                               bias_start=0.0,
                               scope="logit8",
                               squeeze=False,
                               wd=config.wd,
                               input_keep_prob=config.output_keep_pro,
                               is_train=self.is_train)
            out_final = tf.nn.relu(out_final)
            out_final = linear(tf.concat([out_final], axis=-1),
                               400,
                               True,
                               bias_start=0.0,
                               scope="logit9",
                               squeeze=False,
                               wd=config.wd,
                               input_keep_prob=config.output_keep_pro,
                               is_train=self.is_train)
            out_final = tf.nn.relu(out_final)

            out_final = linear(out_final,
                               300,
                               True,
                               bias_start=0.0,
                               scope="logit3",
                               squeeze=False,
                               wd=config.wd,
                               input_keep_prob=config.output_keep_pro,
                               is_train=self.is_train)

            out_final = tf.nn.relu(out_final)

        with tf.variable_scope('conv_dense'):

            if config.hao:
                out_final = linear(tf.concat(
                    [out_final, self.haoruopeng_feature], axis=-1),
                                   200,
                                   True,
                                   bias_start=0.0,
                                   scope="logit",
                                   squeeze=False,
                                   wd=config.wd,
                                   input_keep_prob=config.output_keep_pro,
                                   is_train=self.is_train)
                out_final = tf.nn.relu(out_final)
                out_final = linear(out_final,
                                   100,
                                   True,
                                   bias_start=0.0,
                                   scope="logit3",
                                   squeeze=False,
                                   wd=config.wd,
                                   input_keep_prob=config.output_keep_pro,
                                   is_train=self.is_train)

                out_final = tf.nn.relu(out_final)
            else:
                out_final = linear(tf.concat([out_final], axis=-1),
                                   200,
                                   True,
                                   bias_start=0.0,
                                   scope="logit",
                                   squeeze=False,
                                   wd=config.wd,
                                   input_keep_prob=config.output_keep_pro,
                                   is_train=self.is_train)
                out_final = linear(out_final,
                                   100,
                                   True,
                                   bias_start=0.0,
                                   scope="logit3",
                                   squeeze=False,
                                   wd=config.wd,
                                   input_keep_prob=config.output_keep_pro,
                                   is_train=self.is_train)

                out_final = tf.nn.relu(out_final)

            self.tensor_dict['outfinal'] = out_final
            self.prediction = linear(tf.concat([out_final], axis=-1),
                                     1,
                                     True,
                                     bias_start=0.0,
                                     scope="logit2",
                                     squeeze=False,
                                     wd=config.wd,
                                     input_keep_prob=config.output_keep_pro,
                                     is_train=self.is_train)
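
The element-wise interaction built by get_attention above can be illustrated with a small, self-contained sketch (toy sizes, not repository code): every context position is paired with every question position via expand_dims/tile, and the two vectors are multiplied.

import tensorflow as tf

N, JX, JQ, D = 2, 3, 4, 8                               # toy sizes (assumed)
h = tf.random.normal([N, JX, D])                        # context encoding
u = tf.random.normal([N, JQ, D])                        # question encoding

h_aug = tf.tile(tf.expand_dims(h, 2), [1, 1, JQ, 1])    # [N, JX, JQ, D]
u_aug = tf.tile(tf.expand_dims(u, 1), [1, JX, 1, 1])    # [N, JX, JQ, D]
attention = h_aug * u_aug                               # element-wise product
print(attention.shape)                                  # (2, 3, 4, 8)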
Beispiel #7
0
    def _build_forward(self):
        config = self.config
        N, M, JX, JQ, VW, VC, d, W = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
            config.max_word_size
        JX = tf.shape(self.x)[2]
        JQ = tf.shape(self.q)[1]
        M = tf.shape(self.x)[1]
        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

        with tf.variable_scope("emb"):
            if config.use_char_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    char_emb_mat = tf.get_variable("char_emb_mat",
                                                   shape=[VC, dc],
                                                   dtype='float')

                with tf.variable_scope("char"):
                    Acx = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cx)  # [N, M, JX, W, dc]
                    Acq = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cq)  # [N, JQ, W, dc]
                    Acx = tf.reshape(Acx, [-1, JX, W, dc])
                    Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                    filter_sizes = list(
                        map(int, config.out_channel_dims.split(',')))
                    heights = list(map(int, config.filter_heights.split(',')))
                    assert sum(filter_sizes) == dco, (filter_sizes, dco)
                    with tf.variable_scope("conv"):
                        xx = multi_conv1d(Acx,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                        if config.share_cnn_weights:
                            tf.get_variable_scope().reuse_variables()
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="xx")
                        else:
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="qq")
                        xx = tf.reshape(xx, [-1, M, JX, dco])
                        qq = tf.reshape(qq, [-1, JQ, dco])

            if config.use_word_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    if config.mode == 'train':
                        word_emb_mat = tf.get_variable(
                            "word_emb_mat",
                            dtype='float',
                            shape=[VW, dw],
                            initializer=get_initializer(config.emb_mat))
                    else:
                        word_emb_mat = tf.get_variable("word_emb_mat",
                                                       shape=[VW, dw],
                                                       dtype='float')
                    if config.use_glove_for_unk:
                        word_emb_mat = tf.concat(
                            axis=0, values=[word_emb_mat, self.new_emb_mat])

                with tf.name_scope("word"):
                    Ax = tf.nn.embedding_lookup(word_emb_mat,
                                                self.x)  # [N, M, JX, d]
                    Aq = tf.nn.embedding_lookup(word_emb_mat,
                                                self.q)  # [N, JQ, d]
                    self.tensor_dict['x'] = Ax
                    self.tensor_dict['q'] = Aq
                if config.use_char_emb:
                    xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                    qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
                else:
                    xx = Ax
                    qq = Aq

        # highway network
        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)
                tf.get_variable_scope().reuse_variables()
                qq = highway_network(qq,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)

        self.tensor_dict['xx'] = xx
        self.tensor_dict['qq'] = qq

        cell_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell_fw = SwitchableDropoutWrapper(
            cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell_bw = SwitchableDropoutWrapper(
            cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell2_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell2_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell2_fw = SwitchableDropoutWrapper(
            cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell2_bw = SwitchableDropoutWrapper(
            cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell3_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell3_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell3_fw = SwitchableDropoutWrapper(
            cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell3_bw = SwitchableDropoutWrapper(
            cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell4_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell4_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell4_fw = SwitchableDropoutWrapper(
            cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell4_bw = SwitchableDropoutWrapper(
            cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

        with tf.variable_scope("prepro"):
            (fw_u, bw_u), ((_, fw_u_f), (_,
                                         bw_u_f)) = bidirectional_dynamic_rnn(
                                             d_cell_fw,
                                             d_cell_bw,
                                             qq,
                                             q_len,
                                             dtype='float',
                                             scope='u1')  # [N, J, d], [N, d]
            u = tf.concat(axis=2, values=[fw_u, bw_u])
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, xx, x_len, dtype='float',
                    scope='u1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
            else:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, xx, x_len, dtype='float',
                    scope='h1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
            self.tensor_dict['u'] = u
            self.tensor_dict['h'] = h

        with tf.variable_scope("main"):
            if config.dynamic_att:
                p0 = h
                u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                               [N * M, JQ, 2 * d])
                q_mask = tf.reshape(
                    tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                    [N * M, JQ])
                first_cell_fw = AttentionCell(
                    cell2_fw,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
                first_cell_bw = AttentionCell(
                    cell2_bw,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
                second_cell_fw = AttentionCell(
                    cell3_fw,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
                second_cell_bw = AttentionCell(
                    cell3_bw,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
            else:
                p0 = attention_layer(config,
                                     self.is_train,
                                     h,
                                     u,
                                     h_mask=self.x_mask,
                                     u_mask=self.q_mask,
                                     scope="p0",
                                     tensor_dict=self.tensor_dict)
                first_cell_fw = d_cell2_fw
                second_cell_fw = d_cell3_fw
                first_cell_bw = d_cell2_bw
                second_cell_bw = d_cell3_bw

            config.ruminating_layer = True

            if config.ruminating_layer:
                '''
                RUMINATING LAYER
                '''

                with tf.variable_scope('rum_layer'):
                    print('-' * 5 + "RUMINATING LAYER" + '-' * 5)
                    print("Context", xx)  #[N,M,JX,2d]
                    print("Question", qq)  #[N,JQ,2d]
                    print("p0", p0)  #[N,M,JX,8D]

                    sum_cell = BasicLSTMCell(d, state_is_tuple=True)
                    (s_f,
                     s_b), _ = bidirectional_dynamic_rnn(sum_cell,
                                                         sum_cell,
                                                         p0,
                                                         x_len,
                                                         dtype=tf.float32,
                                                         scope="sum_layer")

                    batch_lens = (tf.reshape(x_len, [N * M]))
                    s_f = tf.reshape(s_f, [N * M, JX, d])
                    s_b = tf.reshape(s_b, [N * M, JX, d])

                    s_fout = tf.reshape(extract_axis_1(s_f, batch_lens),
                                        [N, M, d])
                    s_bout = tf.reshape(extract_axis_1(s_b, batch_lens),
                                        [N, M, d])

                    s = tf.concat(axis=2, values=[s_fout, s_bout])  # [N,M,2d]

                    print("summarization layer", s)

                    print('-' * 5 + "QUESTION RUMINATE LAYER" + '-' * 5)

                    S_Q = tf.tile(tf.expand_dims(s, 2),
                                  [1, 1, JQ, 1])  # [N,M,JQ,2d]
                    S_cell_fw = BasicLSTMCell(d, state_is_tuple=True)
                    S_cell_bw = BasicLSTMCell(d, state_is_tuple=True)
                    (fw_hq,
                     bw_hq), _ = bidirectional_dynamic_rnn(S_cell_fw,
                                                           S_cell_bw,
                                                           S_Q,
                                                           q_len,
                                                           dtype=tf.float32,
                                                           scope="S_Q")
                    S_Q = tf.concat(axis=3, values=[fw_hq, bw_hq])
                    q_m = tf.reshape(tf.expand_dims(qq, 1), [N, M, JQ, 2 * d])

                    with tf.variable_scope("question_rum_layer"):
                        Q_hat = ruminating_layer(S_Q, q_m, N, M, JQ, d)
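                    # ruminating_layer is a repository helper; judging from the
                    # shape comments it fuses the summary-conditioned question
                    # representation S_Q with the original question encoding q_m
                    # and keeps the [N, M, JQ, 2d] shape.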

                    print("Q_hat", Q_hat)  #[N,M,JQ,2d]

                    print('-' * 5 + "CONTEXT RUMINATE LAYER" + '-' * 5)
                    S_C = tf.tile(tf.expand_dims(s, 2),
                                  [1, 1, JX, 1])  # [N,M,JX,2d]

                    C_cell_fw = BasicLSTMCell(d, state_is_tuple=True)
                    C_cell_bw = BasicLSTMCell(d, state_is_tuple=True)
                    (fw_h,
                     bw_h), _ = bidirectional_dynamic_rnn(C_cell_fw,
                                                          C_cell_bw,
                                                          S_C,
                                                          x_len,
                                                          dtype=tf.float32,
                                                          scope="S_C")
                    S_C = tf.concat(axis=3, values=[fw_h, bw_h])  #[N,M,JX,2d]
                    with tf.variable_scope("context_rum_layer"):
                        C_hat = ruminating_layer(S_C, xx, N, M, JX, d)

                    print("C_hat", C_hat)  #[N,M,JX,2d]

                    #Second Hop bi-Attention

                    print('-' * 5 + "SECOND HOP ATTENTION" + '-' * 5)
                    sh_aug = tf.tile(tf.expand_dims(C_hat, 3),
                                     [1, 1, 1, JQ, 1])  #[N,M,JX,JQ,2d]
                    su_aug = tf.tile(tf.expand_dims(Q_hat, 2),
                                     [1, 1, JX, 1, 1])  #[N,M,JX,JQ,2d]

                    sh_mask_aug = tf.tile(tf.expand_dims(self.x_mask, -1),
                                          [1, 1, 1, JQ])
                    su_mask_aug = tf.tile(
                        tf.expand_dims(tf.expand_dims(self.q_mask, 1), 1),
                        [1, M, JX, 1])
                    shu_mask = sh_mask_aug & su_mask_aug
                    su_logits = get_logits([sh_aug, su_aug],
                                           None,
                                           True,
                                           wd=config.wd,
                                           mask=shu_mask,
                                           is_train=True,
                                           func=config.logit_func,
                                           scope='su_logits')
                    su_a = softsel(su_aug, su_logits)
                    sh_a = softsel(C_hat, tf.reduce_max(su_logits, 3))
                    sh_a = tf.tile(tf.expand_dims(sh_a, 2), [1, 1, JX, 1])
                    p00 = tf.concat(
                        axis=3,
                        values=[C_hat, su_a, C_hat * su_a, C_hat * sh_a])
                    print("p00", p00)  #[N,M,JX,8d]
                    p0 = p00
                    print('-' * 5 + "END RUMINATING LAYER" + '-' * 5)

            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
                first_cell_fw,
                first_cell_bw,
                p0,
                x_len,
                dtype='float',
                scope='g0')  # [N, M, JX, 2d]
            g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
            (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
                second_cell_fw,
                second_cell_bw,
                g0,
                x_len,
                dtype='float',
                scope='g1')  # [N, M, JX, 2d]
            g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])

            logits = get_logits([g1, p0],
                                d,
                                True,
                                wd=config.wd,
                                input_keep_prob=config.input_keep_prob,
                                mask=self.x_mask,
                                is_train=self.is_train,
                                func=config.answer_func,
                                scope='logits1')
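            # softsel is a repository helper; it appears to softmax the logits
            # over the flattened M*JX positions and return the correspondingly
            # weighted sum of g1, i.e. one attended summary vector per example,
            # which is then tiled back to [N, M, JX, 2d] below.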
            a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                          tf.reshape(logits, [N, M * JX]))
            a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1),
                          [1, M, JX, 1])

            (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
                d_cell4_fw,
                d_cell4_bw,
                tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]),
                x_len,
                dtype='float',
                scope='g2')  # [N, M, JX, 2d]
            g2 = tf.concat(axis=3, values=[fw_g2, bw_g2])
            logits2 = get_logits([g2, p0],
                                 d,
                                 True,
                                 wd=config.wd,
                                 input_keep_prob=config.input_keep_prob,
                                 mask=self.x_mask,
                                 is_train=self.is_train,
                                 func=config.answer_func,
                                 scope='logits2')

            flat_logits = tf.reshape(logits, [-1, M * JX])
            flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
            flat_logits2 = tf.reshape(logits2, [-1, M * JX])
            flat_yp2 = tf.nn.softmax(flat_logits2)
            self.tensor_dict['g1'] = g1
            self.tensor_dict['g2'] = g2

            if config.na:
                na_bias = tf.get_variable("na_bias", shape=[], dtype='float')
                na_bias_tiled = tf.tile(tf.reshape(na_bias, [1, 1]),
                                        [N, 1])  # [N, 1]
                concat_flat_logits = tf.concat(
                    axis=1, values=[na_bias_tiled, flat_logits])
                concat_flat_yp = tf.nn.softmax(concat_flat_logits)
                na_prob = tf.squeeze(tf.slice(concat_flat_yp, [0, 0], [-1, 1]),
                                     [1])
                flat_yp = tf.slice(concat_flat_yp, [0, 1], [-1, -1])

                concat_flat_logits2 = tf.concat(
                    axis=1, values=[na_bias_tiled, flat_logits2])
                concat_flat_yp2 = tf.nn.softmax(concat_flat_logits2)
                na_prob2 = tf.squeeze(
                    tf.slice(concat_flat_yp2, [0, 0], [-1, 1]), [1])  # [N]
                flat_yp2 = tf.slice(concat_flat_yp2, [0, 1], [-1, -1])

                self.concat_logits = concat_flat_logits
                self.concat_logits2 = concat_flat_logits2
                self.na_prob = na_prob * na_prob2

            yp = tf.reshape(flat_yp, [-1, M, JX])
            yp2 = tf.reshape(flat_yp2, [-1, M, JX])
            wyp = tf.nn.sigmoid(logits2)

            self.logits = flat_logits
            self.logits2 = flat_logits2
            self.yp = yp
            self.yp2 = yp2
            self.wyp = wyp
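
The config.na branch above can be summarised with a toy sketch (values assumed, not repository code): a learned scalar bias is prepended to the flattened span logits, a joint softmax puts P(no answer) in the first slot, and the remaining entries form the renormalised answer distribution.

import tensorflow as tf

flat_logits = tf.constant([[1.0, 2.0, 0.5]])                 # [N, M*JX]
na_bias_tiled = tf.constant([[0.3]])                         # [N, 1]
concat_flat_logits = tf.concat([na_bias_tiled, flat_logits], axis=1)
concat_flat_yp = tf.nn.softmax(concat_flat_logits)
na_prob = concat_flat_yp[:, 0]                               # P(no answer)
flat_yp = concat_flat_yp[:, 1:]                              # answer distribution
print(na_prob.numpy(), flat_yp.numpy())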
Beispiel #8
0
    def _build_forward(self):
        config = self.config

        N = config.batch_size
        M = config.max_num_sents
        JX = config.max_sent_size
        JQ = config.max_ques_size
        VW = config.word_vocab_size
        VC = config.char_vocab_size
        W = config.max_word_size
        d = config.hidden_size

        JX = tf.shape(self.x)[2]  # JX: max sentence length
        JQ = tf.shape(self.q)[1]  # JQ: max question length
        M = tf.shape(self.x)[1]  # M: max number of sentences
        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size
        # dc = 8: each character is mapped to an 8-dimensional vector ("char-level word embedding size")
        with tf.variable_scope("emb"):
            if config.use_char_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    char_emb_mat = tf.get_variable("char_emb_mat",
                                                   shape=[VC, dc],
                                                   dtype='float')
                    # 330,8 a matrix for each char to its 8-number vector

                with tf.variable_scope("char"):
                    Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)
                    # [N, M, JX, W, dc], e.g. (60, None, None, 16, 8)
                    # N: batch size
                    # M: max number of sentences
                    # JX: max sentence length
                    # W: max word length
                    # dc: embedding size per character
                    # maps each character to a vector

                    Acq = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cq)  # [N, JQ, W, dc]
                    # JQ: max question length
                    # W: max word length
                    # maps each character in the question to a vector

                    Acx = tf.reshape(Acx, [-1, JX, W, dc])
                    Acq = tf.reshape(Acq, [-1, JQ, W, dc])
                    # reshaped to [-1, max sentence/question length, max_word_size (16), char_emb_size (8)]

                    filter_sizes = list(
                        map(int, config.out_channel_dims.split(',')))
                    heights = list(map(int, config.filter_heights.split(',')))
                    # out_channel_dims gives the number of filters per height and filter_heights their widths;
                    # here there is a single group of 100 filters of width 5

                    assert sum(filter_sizes) == dco, (filter_sizes, dco)
                    with tf.variable_scope("conv"):
                        xx = multi_conv1d(Acx,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                        if config.share_cnn_weights:
                            tf.get_variable_scope().reuse_variables()
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="xx")
                        else:
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="qq")
                        xx = tf.reshape(xx, [-1, M, JX, dco])
                        qq = tf.reshape(qq, [-1, JQ, dco])  # xx and qq are now the char-CNN outputs

            if config.use_word_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    if config.mode == 'train':
                        word_emb_mat = tf.get_variable(
                            "word_emb_mat",
                            dtype='float',
                            shape=[VW, dw],
                            initializer=get_initializer(config.emb_mat))
                    else:
                        word_emb_mat = tf.get_variable("word_emb_mat",
                                                       shape=[VW, dw],
                                                       dtype='float')
                    if config.use_glove_for_unk:  # append the GloVe vectors used for out-of-vocabulary words
                        word_emb_mat = tf.concat(
                            [word_emb_mat, self.new_emb_mat], 0)

                with tf.name_scope("word"):
                    Ax = tf.nn.embedding_lookup(word_emb_mat,
                                                self.x)  # [N, M, JX, d]
                    Aq = tf.nn.embedding_lookup(word_emb_mat,
                                                self.q)  # [N, JQ, d]
                    self.tensor_dict['x'] = Ax
                    self.tensor_dict['q'] = Aq
                if config.use_char_emb:
                    xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                    qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
                else:
                    xx = Ax
                    qq = Aq  # the char-CNN plus word embedding represent each word with a 200-dim vector
        # so far: xx is (batch_size, #sentences, #words, embedding) and qq is (batch_size, #words, embedding)
        # highway network
        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)
                tf.get_variable_scope().reuse_variables()
                qq = highway_network(qq,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)

        self.tensor_dict['xx'] = xx
        self.tensor_dict['qq'] = qq
        # same shapes as the xx/qq stored above
        cell = BasicLSTMCell(
            d, state_is_tuple=True)  # d = 100: hidden state size
        d_cell = SwitchableDropoutWrapper(
            cell, self.is_train, input_keep_prob=config.input_keep_prob)
        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'),
                              2)  # [N, M], [60,?]
        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N] [60]
        # the masks are boolean; summing them counts the non-padding tokens
        with tf.variable_scope("prepro"):
            (fw_u, bw_u), ((_, fw_u_f), (_,
                                         bw_u_f)) = bidirectional_dynamic_rnn(
                                             d_cell,
                                             d_cell,
                                             qq,
                                             q_len,
                                             dtype='float',
                                             scope='u1')  # [N, J, d], [N, d]
            u = tf.concat(
                [fw_u, bw_u],
                2)  # (60, ?, 200): 200 because the two 100-dim directions are concatenated
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, xx, x_len, dtype='float',
                    scope='u1')  # [N, M, JX, 2d]
                h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
            else:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, xx, x_len, dtype='float',
                    scope='h1')  # [N, M, JX, 2d]
                h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
            self.tensor_dict['u'] = u  # [60, ?, 200] for question
            self.tensor_dict['h'] = h  # [60, ?, ?, 200] for article

        with tf.variable_scope("main"):
            if config.dynamic_att:  # TODO: what is this dynamic attention?
                p0 = h
                u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                               [N * M, JQ, 2 * d])
                q_mask = tf.reshape(
                    tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                    [N * M, JQ])
                first_cell = AttentionCell(
                    cell,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
            else:
                p0 = attention_layer(config,
                                     self.is_train,
                                     h,
                                     u,
                                     h_mask=self.x_mask,
                                     u_mask=self.q_mask,
                                     scope="p0",
                                     tensor_dict=self.tensor_dict)
                cell2 = BasicLSTMCell(
                    d, state_is_tuple=True)  # d = 100: hidden state size
                first_cell = SwitchableDropoutWrapper(
                    cell2,
                    self.is_train,
                    input_keep_prob=config.input_keep_prob)

            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
                first_cell,
                first_cell,
                inputs=p0,
                sequence_length=x_len,
                dtype='float',
                scope='g0')  # [N, M, JX, 2d]

            g0 = tf.concat([fw_g0, bw_g0], 3)
            cell3 = BasicLSTMCell(
                d, state_is_tuple=True)  # d = 100: hidden state size
            first_cell3 = SwitchableDropoutWrapper(
                cell3, self.is_train, input_keep_prob=config.input_keep_prob)

            (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
                first_cell3, first_cell3, g0, x_len, dtype='float',
                scope='g1')  # [N, M, JX, 2d]
            g1 = tf.concat([fw_g1, bw_g1], 3)

            logits = get_logits([g1, p0],
                                d,
                                True,
                                wd=config.wd,
                                input_keep_prob=config.input_keep_prob,
                                mask=self.x_mask,
                                is_train=self.is_train,
                                func=config.answer_func,
                                scope='logits1')
            a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                          tf.reshape(logits, [N, M * JX]))
            a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1),
                          [1, M, JX, 1])
            cell4 = BasicLSTMCell(
                d, state_is_tuple=True)  # d = 100: hidden state size
            first_cell4 = SwitchableDropoutWrapper(
                cell4, self.is_train, input_keep_prob=config.input_keep_prob)

            (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
                first_cell4,
                first_cell4,
                tf.concat([p0, g1, a1i, g1 * a1i], 3),
                x_len,
                dtype='float',
                scope='g2')  # [N, M, JX, 2d]
            g2 = tf.concat([fw_g2, bw_g2], 3)
            logits2 = get_logits([g2, p0],
                                 d,
                                 True,
                                 wd=config.wd,
                                 input_keep_prob=config.input_keep_prob,
                                 mask=self.x_mask,
                                 is_train=self.is_train,
                                 func=config.answer_func,
                                 scope='logits2')

            flat_logits = tf.reshape(logits, [-1, M * JX])
            flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
            yp = tf.reshape(flat_yp, [-1, M, JX])
            flat_logits2 = tf.reshape(logits2, [-1, M * JX])
            flat_yp2 = tf.nn.softmax(flat_logits2)
            yp2 = tf.reshape(flat_yp2, [-1, M, JX])

            self.tensor_dict['g1'] = g1
            self.tensor_dict['g2'] = g2

            self.logits = flat_logits
            self.logits2 = flat_logits2
            self.yp = yp
            self.yp2 = yp2
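
multi_conv1d is a repository helper; the following generic stand-in (assumed, not its actual implementation) shows the char-CNN idea the comments above describe: a 1-D convolution over the character embeddings of a word, max-pooled over the character axis, yields a fixed-size char-level word vector.

import tensorflow as tf

num_words, W, dc = 5, 16, 8        # toy sizes: words, max word length, char emb dim
char_embs = tf.random.normal([num_words, W, dc])
conv = tf.keras.layers.Conv1D(filters=100, kernel_size=5, padding='valid')
char_word_vecs = tf.reduce_max(conv(char_embs), axis=1)      # [num_words, 100]
print(char_word_vecs.shape)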
Beispiel #9
0
    def _build_forward(self):
        # config holds the pre-configured hyperparameters
        config = self.config
        N, M, JX, JQ, VW, VC, d, W = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
            config.max_word_size
        JX = tf.shape(self.x)[2]
        JQ = tf.shape(self.q)[1]
        M = tf.shape(self.x)[1]
        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

        # embedding layer
        with tf.variable_scope("emb"):
            # character embedding layer
            if config.use_char_emb:  # only if character embeddings are enabled
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    char_emb_mat = tf.get_variable("char_emb_mat",
                                                   shape=[VC, dc],
                                                   dtype='float')

                with tf.variable_scope("char"):
                    Acx = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cx)  # [N, M, JX, W, dc]
                    Acq = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cq)  # [N, JQ, W, dc]
                    Acx = tf.reshape(Acx, [-1, JX, W, dc])
                    Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                    # CNN filter parameters
                    filter_sizes = list(
                        map(int, config.out_channel_dims.split(',')))
                    heights = list(map(int, config.filter_heights.split(',')))
                    assert sum(filter_sizes) == dco, (filter_sizes, dco)

                    with tf.variable_scope("conv"):
                        xx = multi_conv1d(Acx,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                        if config.share_cnn_weights:
                            tf.get_variable_scope().reuse_variables()
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="xx")
                        else:
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="qq")
                        xx = tf.reshape(xx, [-1, M, JX, dco])
                        qq = tf.reshape(qq, [-1, JQ, dco])

            # word embedding layer
            if config.use_word_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    if config.mode == 'train':
                        word_emb_mat = tf.get_variable(
                            "word_emb_mat",
                            dtype='float',
                            shape=[VW, dw],
                            initializer=get_initializer(config.emb_mat))
                    else:
                        word_emb_mat = tf.get_variable("word_emb_mat",
                                                       shape=[VW, dw],
                                                       dtype='float')
                    if config.use_glove_for_unk:  # if a pre-trained embedding file (GloVe) is used
                        word_emb_mat = tf.concat(
                            0, [word_emb_mat, self.new_emb_mat])

                with tf.name_scope("word"):
                    # convert the context x and the query q into word vectors
                    # embedding_lookup(params, ids) returns the rows of params indexed by ids
                    Ax = tf.nn.embedding_lookup(word_emb_mat,
                                                self.x)  # [N, M, JX, d]
                    Aq = tf.nn.embedding_lookup(word_emb_mat,
                                                self.q)  # [N, JQ, d]
                    self.tensor_dict['x'] = Ax
                    self.tensor_dict['q'] = Aq
                if config.use_char_emb:  # if char embeddings were computed, concatenate them with the word embeddings along the given axis
                    xx = tf.concat(3, [xx, Ax])  # [N, M, JX, di]
                    qq = tf.concat(2, [qq, Aq])  # [N, JQ, di]
                else:
                    xx = Ax
                    qq = Aq

        # two highway-network layers yield the context vectors X ∈ R^(d×T) and the query vectors Q ∈ R^(d×J)
        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)
                tf.get_variable_scope().reuse_variables()
                qq = highway_network(qq,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)

        self.tensor_dict['xx'] = xx
        self.tensor_dict['qq'] = qq

        cell = BasicLSTMCell(d, state_is_tuple=True)
        # SwitchableDropoutWrapper is a custom DropoutWrapper class
        d_cell = SwitchableDropoutWrapper(
            cell, self.is_train, input_keep_prob=config.input_keep_prob)
        # cast maps the input tensor to the given type (here x_mask to int32); reduce_sum then counts the non-padding tokens in x and q
        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

        # Contextual Embedding Layer: run a BiLSTM over X and over Q to capture the local interactions among the words of each
        with tf.variable_scope("prepro"):
            (fw_u, bw_u), ((_, fw_u_f), (_,
                                         bw_u_f)) = bidirectional_dynamic_rnn(
                                             d_cell,
                                             d_cell,
                                             qq,
                                             q_len,
                                             dtype='float',
                                             scope='u1')  # [N, J, d], [N, d]
            # fw_u and bw_u are the forward and backward LSTM outputs
            u = tf.concat(2, [fw_u, bw_u])  #[N, J, 2d]
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, xx, x_len, dtype='float',
                    scope='u1')  # [N, M, JX, 2d]
                h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
            else:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, xx, x_len, dtype='float',
                    scope='h1')  # [N, M, JX, 2d]
                h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
            self.tensor_dict['u'] = u
            self.tensor_dict['h'] = h

        # core layer: Attention Flow Layer
        with tf.variable_scope("main"):
            if config.dynamic_att:
                p0 = h
                # expand_dims() inserts a new axis at the given position
                # tile() replicates the tensor along the given axes
                u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [
                    N * M, JQ, 2 * d
                ])  # add an axis at index 1 and tile it M times (max #sentences in the context) so u matches h
                q_mask = tf.reshape(
                    tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                    [N * M, JQ])
                first_cell = AttentionCell(
                    cell,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
            else:
                p0 = attention_layer(config,
                                     self.is_train,
                                     h,
                                     u,
                                     h_mask=self.x_mask,
                                     u_mask=self.q_mask,
                                     scope="p0",
                                     tensor_dict=self.tensor_dict)
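                # attention_layer is a repository helper that builds the BiDAF
                # attention output p0; the shape comments in the other examples
                # suggest it concatenates h with the query-aware and
                # context-aware attended vectors, roughly [N, M, JX, 8d].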
                first_cell = d_cell

            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
                first_cell, first_cell, p0, x_len, dtype='float',
                scope='g0')  # [N, M, JX, 2d]
            g0 = tf.concat(3, [fw_g0, bw_g0])
            (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
                first_cell, first_cell, g0, x_len, dtype='float',
                scope='g1')  # [N, M, JX, 2d]
            g1 = tf.concat(3, [fw_g1, bw_g1])

            logits = get_logits([g1, p0],
                                d,
                                True,
                                wd=config.wd,
                                input_keep_prob=config.input_keep_prob,
                                mask=self.x_mask,
                                is_train=self.is_train,
                                func=config.answer_func,
                                scope='logits1')
            a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                          tf.reshape(logits, [N, M * JX]))
            a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1),
                          [1, M, JX, 1])

            (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
                d_cell,
                d_cell,
                tf.concat(3, [p0, g1, a1i, g1 * a1i]),
                x_len,
                dtype='float',
                scope='g2')  # [N, M, JX, 2d]
            g2 = tf.concat(3, [fw_g2, bw_g2])
            logits2 = get_logits([g2, p0],
                                 d,
                                 True,
                                 wd=config.wd,
                                 input_keep_prob=config.input_keep_prob,
                                 mask=self.x_mask,
                                 is_train=self.is_train,
                                 func=config.answer_func,
                                 scope='logits2')

            flat_logits = tf.reshape(logits, [-1, M * JX])
            flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
            yp = tf.reshape(flat_yp, [-1, M, JX])
            flat_logits2 = tf.reshape(logits2, [-1, M * JX])
            flat_yp2 = tf.nn.softmax(flat_logits2)
            yp2 = tf.reshape(flat_yp2, [-1, M, JX])

            self.tensor_dict['g1'] = g1
            self.tensor_dict['g2'] = g2

            self.logits = flat_logits
            self.logits2 = flat_logits2
            self.yp = yp
            self.yp2 = yp2
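
The output step above reduces to a flatten/softmax/reshape over all (sentence, token) positions; a short sketch with toy tensors (names assumed, not repository code):

import tensorflow as tf

N, M, JX = 2, 3, 4
logits = tf.random.normal([N, M, JX])
flat_yp = tf.nn.softmax(tf.reshape(logits, [-1, M * JX]))    # joint softmax over all positions
yp = tf.reshape(flat_yp, [-1, M, JX])                        # per-token start probabilities
print(tf.reduce_sum(yp, axis=[1, 2]).numpy())                # each example sums to 1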
Beispiel #10
0
    def _build_forward(self):
        config = self.config
        N, M, JX, JQ, VW, VC, d, W = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
            config.max_word_size
        JX = tf.shape(self.x)[2]
        JQ = tf.shape(self.q)[1]
        M = tf.shape(self.x)[1]
        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size
        with tf.variable_scope("emb"):
            if config.use_char_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')

                with tf.variable_scope("char"):
                    Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                    Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                    Acx = tf.reshape(Acx, [-1, JX, W, dc])
                    Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                    filter_sizes = list(map(int, config.out_channel_dims.split(',')))
                    heights = list(map(int, config.filter_heights.split(',')))
                    assert sum(filter_sizes) == dco, (filter_sizes, dco)
                    with tf.variable_scope("conv"):
                        xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob,
                                          scope="xx")
                        if config.share_cnn_weights:
                            tf.get_variable_scope().reuse_variables()
                            qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob,
                                              scope="xx")
                        else:
                            qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob,
                                              scope="qq")
                        xx = tf.reshape(xx, [-1, M, JX, dco])
                        qq = tf.reshape(qq, [-1, JQ, dco])

            if config.use_word_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    if config.mode == 'train':
                        word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw],
                                                       initializer=get_initializer(config.emb_mat))
                    else:
                        word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                    if config.use_glove_for_unk:
                        word_emb_mat = tf.concat([word_emb_mat, self.new_emb_mat], 0)

                with tf.name_scope("word"):
                    Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                    Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                    self.tensor_dict['x'] = Ax
                    self.tensor_dict['q'] = Aq
                if config.use_char_emb:
                    xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                    qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
                else:
                    xx = Ax
                    qq = Aq

        # highway network
        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
                tf.get_variable_scope().reuse_variables()
                qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)

        self.tensor_dict['xx'] = xx
        self.tensor_dict['qq'] = qq

        cell_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell_fw = SwitchableDropoutWrapper(cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell_bw = SwitchableDropoutWrapper(cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell2_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell2_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell2_fw = SwitchableDropoutWrapper(cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell2_bw = SwitchableDropoutWrapper(cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell3_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell3_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell3_fw = SwitchableDropoutWrapper(cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell3_bw = SwitchableDropoutWrapper(cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell4_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell4_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell4_fw = SwitchableDropoutWrapper(cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell4_bw = SwitchableDropoutWrapper(cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

        with tf.variable_scope("prepro"):
            (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(d_cell_fw, d_cell_bw, qq, q_len,
                                                                                 dtype='float',
                                                                                 scope='u1')  # [N, J, d], [N, d]
            u = tf.concat(axis=2, values=[fw_u, bw_u])
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float',
                                                            scope='u1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
            else:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float',
                                                            scope='h1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
            self.tensor_dict['u'] = u
            self.tensor_dict['h'] = h

        with tf.variable_scope("main"):
            if config.dynamic_att:
                p0 = h
                u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d])
                q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ])
                first_cell_fw = AttentionCell(cell2_fw, u, mask=q_mask, mapper='sim',
                                              input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
                first_cell_bw = AttentionCell(cell2_bw, u, mask=q_mask, mapper='sim',
                                              input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
                second_cell_fw = AttentionCell(cell3_fw, u, mask=q_mask, mapper='sim',
                                               input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
                second_cell_bw = AttentionCell(cell3_bw, u, mask=q_mask, mapper='sim',
                                               input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            else:
                p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0",
                                     tensor_dict=self.tensor_dict)
                first_cell_fw = d_cell2_fw
                second_cell_fw = d_cell3_fw
                first_cell_bw = d_cell2_bw
                second_cell_bw = d_cell3_bw

            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(first_cell_fw, first_cell_bw, p0, x_len, dtype='float',
                                                          scope='g0')  # [N, M, JX, 2d]
            g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
            (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(second_cell_fw, second_cell_bw, g0, x_len, dtype='float',
                                                          scope='g1')  # [N, M, JX, 2d]
            g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])

        with tf.variable_scope("output"):
            if config.model_name == "basic":
                logits = get_logits([g1, p0], d, True, wd=config.wd,
                                    input_keep_prob=config.input_keep_prob,
                                    mask=self.x_mask, is_train=self.is_train,
                                    func=config.answer_func, scope='logits1')
                a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                              tf.reshape(logits, [N, M * JX]))
                a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1),
                              [1, M, JX, 1])
                (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(d_cell4_fw, d_cell4_bw,
                                                              tf.concat([p0, g1, a1i, g1 * a1i], 3),
                                                              x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
                g2 = tf.concat([fw_g2, bw_g2], 3)
                logits2 = get_logits([g2, p0], d, True, wd=config.wd,
                                     input_keep_prob=config.input_keep_prob, mask=self.x_mask,
                                     is_train=self.is_train, func=config.answer_func,
                                     scope='logits2')
                flat_logits = tf.reshape(logits, [-1, M * JX])
                flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
                yp = tf.reshape(flat_yp, [-1, M, JX])
                flat_logits2 = tf.reshape(logits2, [-1, M * JX])
                flat_yp2 = tf.nn.softmax(flat_logits2)
                yp2 = tf.reshape(flat_yp2, [-1, M, JX])

                self.tensor_dict['g1'] = g1
                self.tensor_dict['g2'] = g2

                self.logits = flat_logits
                self.logits2 = flat_logits2
                self.yp = yp
                self.yp2 = yp2

            elif config.model_name == "basic-class":
                C = 3 if config.data_dir.startswith('data/snli') else 2
                (fw_g2, bw_g2) = (fw_g1, bw_g1)

                if config.classifier == 'maxpool':
                    g2 = tf.concat([fw_g2, bw_g2], 3)  # [N, M, JX, 2d]
                    g2 = tf.reduce_max(g2, 2)  # [N, M, 2d]
                    g2_dim = 2 * d
                elif config.classifier == 'sumpool':
                    g2 = tf.concat([fw_g2, bw_g2], 3)
                    g2 = tf.reduce_sum(g2, 2)
                    g2_dim = 2 * d
                else:
                    fw_g2_ = tf.gather(tf.transpose(fw_g2, [2, 0, 1, 3]), JX - 1)
                    bw_g2_ = tf.gather(tf.transpose(bw_g2, [2, 0, 1, 3]), 0)
                    g2 = tf.concat([fw_g2_, bw_g2_], 2)
                    g2_dim = 2 * d

                g2_ = tf.reshape(g2, [N, g2_dim])

                logits0 = linear(g2_, C, True, wd=config.wd, input_keep_prob=config.input_keep_prob,
                                 is_train=self.is_train, scope='classifier')
                flat_yp0 = tf.nn.softmax(logits0)
                yp0 = tf.reshape(flat_yp0, [N, M, C])
                self.tensor_dict['g1'] = g1
                self.logits0 = logits0
                self.yp0 = yp0
                self.logits = logits0
                self.yp = yp0
Beispiel #11
0
 def highway(self, X, name=""):
     return highway_network(X, 2, True, is_train=True, scope=name)
Beispiel #12
0
    def _build_forward(self):
        config = self.config
        N, M, JX, JQ, VW, VC, d, W = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
            config.max_word_size
        print("parameters", N, M, JX, JQ, VW, VC, d, W)
        JX = tf.shape(self.x)[2]
        JQ = tf.shape(self.q)[1]
        M = tf.shape(self.x)[1]
        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

        with tf.variable_scope("emb"):
            if config.use_char_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')

                with tf.variable_scope("char"):
                    Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                    Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                    Acx = tf.reshape(Acx, [-1, JX, W, dc])
                    Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                    filter_sizes = list(map(int, config.out_channel_dims.split(',')))
                    heights = list(map(int, config.filter_heights.split(',')))
                    assert sum(filter_sizes) == dco, (filter_sizes, dco)
                    with tf.variable_scope("conv"):
                        xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",  self.is_train, config.keep_prob, scope="xx")
                        if config.share_cnn_weights:
                            tf.get_variable_scope().reuse_variables()
                            qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                        else:
                            qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq")
                        xx = tf.reshape(xx, [-1, M, JX, dco])
                        qq = tf.reshape(qq, [-1, JQ, dco])

            if config.use_word_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    if config.mode == 'train':
                        word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat))
                    else:
                        word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                    if config.use_glove_for_unk:
                        word_emb_mat = tf.concat(axis=0, values=[word_emb_mat, self.new_emb_mat])

                with tf.name_scope("word"):
                    Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                    Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                    self.tensor_dict['x'] = Ax
                    self.tensor_dict['q'] = Aq
                if config.use_char_emb:
                    xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                    qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
                else:
                    xx = Ax
                    qq = Aq

        # highway network
        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
                tf.get_variable_scope().reuse_variables()
                qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)

        self.tensor_dict['xx'] = xx
        self.tensor_dict['qq'] = qq

        # prepro layer cell
        cell_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell_fw = SwitchableDropoutWrapper(cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell_bw = SwitchableDropoutWrapper(cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell_fwh = BasicLSTMCell(d, state_is_tuple=True)
        cell_bwh = BasicLSTMCell(d, state_is_tuple=True)
        d_cell_fwh = SwitchableDropoutWrapper(cell_fwh, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell_bwh = SwitchableDropoutWrapper(cell_bwh, self.is_train, input_keep_prob=config.input_keep_prob)

        # attention layer cell
        cell2_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell2_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell2_fw = SwitchableDropoutWrapper(cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell2_bw = SwitchableDropoutWrapper(cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell3_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell3_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell3_fw = SwitchableDropoutWrapper(cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell3_bw = SwitchableDropoutWrapper(cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)

        # out layer cell_bw
        cell4_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell4_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell4_fw = SwitchableDropoutWrapper(cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell4_bw = SwitchableDropoutWrapper(cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

        with tf.variable_scope("prepro"):
            (fw_h, bw_h), ((_, fw_h_f), (_, bw_h_f)) = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1')
            h = tf.concat(axis=3, values=[fw_h, bw_h])

            (fw_u, bw_u), ((_, fw_h_f), (_, bw_h_f)) = bidirectional_dynamic_rnn(d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
            u = tf.concat(axis=2, values=[fw_u, bw_u])

            
            # if config.share_lstm_weights:
            #     tf.get_variable_scope().reuse_variables()
            #     (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
            #     h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
            # else:
            #     (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
            #     h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]

            self.tensor_dict['u'] = u
            self.tensor_dict['h'] = h
            # h = tf.Print(h, [tf.reduce_max(h), tf.reduce_min(h), "h"], summarize=1000)

        with tf.variable_scope("main"):
            if config.dynamic_att:
                p0 = h
                u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d])
                q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ])
                first_cell_fw = AttentionCell(cell2_fw, u, mask=q_mask, mapper='sim',
                                              input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
                first_cell_bw = AttentionCell(cell2_bw, u, mask=q_mask, mapper='sim',
                                              input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
                second_cell_fw = AttentionCell(cell3_fw, u, mask=q_mask, mapper='sim',
                                            input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
                second_cell_bw = AttentionCell(cell3_bw, u, mask=q_mask, mapper='sim',
                                               input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            else:
                p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict)
                first_cell_fw = d_cell2_fw
                second_cell_fw = d_cell3_fw
                first_cell_bw = d_cell2_bw
                second_cell_bw = d_cell3_bw

            # p0 = tf.Print(p0, [p0, "lstm output-1:"],  summarize=200)

            # two layer LSTM
            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(first_cell_fw, first_cell_bw, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
            g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
            # g0 = tf.Print(g0, [g0, "lstm output0:"],  summarize=200)
            (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(second_cell_fw, second_cell_bw, g0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
            g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])
            # g1 = tf.Print(g1, [g1, "lstm output:"],  summarize=200)
            

        # output through a dense layer
        with tf.variable_scope("output"):
            #lstm_out = g1[:,0,-1,:]
            lstm_out = tf.reduce_sum(g1, axis=2)
            lstm_out = tf.reshape(lstm_out, [-1, 2 * d])
            # NOTE: this dropout is not gated by is_train, so it is also applied at inference time
            lstm_out = tf.nn.dropout(lstm_out, 0.5)
            #logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob,
            #                    mask=self.x_mask, is_train=self.is_train, func='linear', scope='logits1')
            #print (logits)
            #lstm_out.set_shape([N, M*JX*2*d])
            dense1 = tf.layers.dense(inputs=lstm_out, units=64, activation=tf.nn.relu,
                                     kernel_regularizer=tf.contrib.layers.l2_regularizer(0.003))
            score = tf.layers.dense(inputs=dense1, units=2, activation=None,
                                    kernel_regularizer=tf.contrib.layers.l2_regularizer(0.003))
            self.probs = tf.nn.softmax(score)

            # self.tensor_dict['g1'] = g1
            # self.tensor_dict['g2'] = g2
            self.score = score
Beispiel #13
0
    def _build_forward(self):
        config = self.config
        N, M, JX, JQ, VW, VC, d, W, EW, WOW = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
            config.max_word_size, config.word_vocab_size - config.vw_wo_entity_size, config.vw_wo_entity_size
        JX = tf.shape(self.x)[2]  # words
        JQ = tf.shape(self.q)[1] # words
        M = tf.shape(self.x)[1]
        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

        #print ("dhruv is here",N, self.x.get_shape(), JX, self.q.get_shape(), VW, VC, d, W,dc, dw, dco)
        with tf.variable_scope("emb"):
            if config.use_char_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')

                with tf.variable_scope("char"):
                    Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                    Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                    Acx = tf.reshape(Acx, [-1, JX, W, dc])
                    Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                    filter_sizes = list(map(int, config.out_channel_dims.split(',')))
                    heights = list(map(int, config.filter_heights.split(',')))
                    assert sum(filter_sizes) == dco, (filter_sizes, dco)
                    with tf.variable_scope("conv"):
                        xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",  self.is_train, config.keep_prob, scope="xx")
                        if config.share_cnn_weights:
                            tf.get_variable_scope().reuse_variables()
                            qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                        else:
                            qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq")
                        xx = tf.reshape(xx, [-1, M, JX, dco])
                        qq = tf.reshape(qq, [-1, JQ, dco])

            if config.use_word_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    if config.mode == 'train':
                        entity_emb_mat = tf.get_variable("entity_emb_mat", dtype='float', shape=[EW, EW], initializer=get_initializer(config.onehot_encoded))
                        entity_emb_out = _linear(entity_emb_mat, dw, True, bias_initializer=tf.constant_initializer(0.0))
                        word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[WOW, dw], initializer=get_initializer(config.emb_mat))
                        word_emb_mat = tf.concat(axis=0,values=[word_emb_mat, entity_emb_out])
                    else:
                        word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')

                    if config.use_glove_for_unk:
                        word_emb_mat = tf.concat(axis=0, values=[word_emb_mat, self.new_emb_mat])
                with tf.name_scope("word"):
                    Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d] i.e. [batch size, max sentences, max words, embedding size]
                    Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d] i.e. [batch size, max words, embedding size]
                    self.tensor_dict['x'] = Ax
                    self.tensor_dict['q'] = Aq
                if config.use_char_emb:
                    xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                    qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
                else:
                    xx = Ax
                    qq = Aq

        # highway network
        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
                tf.get_variable_scope().reuse_variables()
                qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)

        self.tensor_dict['xx'] = xx
        self.tensor_dict['qq'] = qq
        #xx = tf.Print(xx,[tf.shape(xx),xx],message="DHRUV xx=",summarize=20)
        cell_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell_fw = SwitchableDropoutWrapper(cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell_bw = SwitchableDropoutWrapper(cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell2_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell2_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell2_fw = SwitchableDropoutWrapper(cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell2_bw = SwitchableDropoutWrapper(cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell3_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell3_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell3_fw = SwitchableDropoutWrapper(cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell3_bw = SwitchableDropoutWrapper(cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell4_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell4_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell4_fw = SwitchableDropoutWrapper(cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell4_bw = SwitchableDropoutWrapper(cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

        with tf.variable_scope("prepro"):
            (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
            u = tf.concat(axis=2, values=[fw_u, bw_u])
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
            else:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
            self.tensor_dict['u'] = u
            self.tensor_dict['h'] = h

        with tf.variable_scope("main"):
            if config.dynamic_att: # not true
                p0 = h
                u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d])
                q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ])
                first_cell_fw = AttentionCell(cell2_fw, u, mask=q_mask, mapper='sim',
                                              input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
                first_cell_bw = AttentionCell(cell2_bw, u, mask=q_mask, mapper='sim',
                                              input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
                second_cell_fw = AttentionCell(cell3_fw, u, mask=q_mask, mapper='sim',
                                            input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
                second_cell_bw = AttentionCell(cell3_bw, u, mask=q_mask, mapper='sim',
                                               input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            else:
                p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict) # p0 seems to be G in paper
                first_cell_fw = d_cell2_fw
                second_cell_fw = d_cell3_fw
                first_cell_bw = d_cell2_bw
                second_cell_bw = d_cell3_bw

            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(first_cell_fw, first_cell_bw, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
            g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
            (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(second_cell_fw, second_cell_bw, g0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
            g1 = tf.concat(axis=3, values=[fw_g1, bw_g1]) # g1 seems to be M in paper

            g1 = tf.Print(g1, [tf.shape(g1)], message="g1 shape", first_n=5, summarize=200)
            p0 = tf.Print(p0, [tf.shape(p0)], message="p0 shape", first_n=5, summarize=200)

            my_cell_fw = BasicLSTMCell(d, state_is_tuple=True)
            my_cell_fw_d = SwitchableDropoutWrapper(my_cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
            my_cell_bw = BasicLSTMCell(d, state_is_tuple=True)
            my_cell_bw_d = SwitchableDropoutWrapper(my_cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)

            (fw_g11, bw_g11), (my_fw_final_state, my_bw_final_state), g11_len = my_bidirectional_dynamic_rnn(
                my_cell_fw_d, my_cell_bw_d, g1, x_len, dtype='float', scope='my_g2')  # [N, M, JX, 2d]
            g11 = tf.concat(axis=2, values=[fw_g11, bw_g11])

            my_encoder_final_state_c = tf.concat(values=(my_fw_final_state.c, my_bw_final_state.c),
                                                 axis=1, name="my_encoder_final_state_c")
            my_encoder_final_state_h = tf.concat(values=(my_fw_final_state.h, my_bw_final_state.h),
                                                 axis=1, name="my_encoder_final_state_h")
            my_encoder_final_state = tf.contrib.rnn.LSTMStateTuple(c=my_encoder_final_state_c,
                                                                   h=my_encoder_final_state_h)
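            # my_encoder_final_state packs the concatenated forward/backward final LSTM
            # states into a single LSTMStateTuple; decode() below uses it as the decoder's
            # initial state, while decode_with_attention() starts from a zero state.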

            # compute start/end span indices as the second task in multi-task learning
            logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob,
                                mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1')
            logits = tf.Print(logits, [tf.shape(logits)], message="logits shape", first_n=5, summarize=200)
            a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX]))
            a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1])

            (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(d_cell4_fw, d_cell4_bw,
                                                          tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]),
                                                          x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
            g2 = tf.concat(axis=3, values=[fw_g2, bw_g2])
            logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob,
                                 mask=self.x_mask,
                                 is_train=self.is_train, func=config.answer_func, scope='logits2')

            flat_logits = tf.reshape(logits, [-1, M * JX])
            flat_logits = tf.Print(flat_logits, [tf.shape(flat_logits),flat_logits], message="flat_logits shape and contents", first_n=5, summarize=200)
            flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
            flat_logits2 = tf.reshape(logits2, [-1, M * JX])
            flat_yp2 = tf.nn.softmax(flat_logits2)

            tgt_vocab_size = config.len_new_emb_mat # hparam # FIXME: Obtain embeddings differently?
            print("length is",config.len_new_emb_mat)
            tgt_embedding_size = dw # hparam

            # Look up embedding
            decoder_emb_inp = tf.nn.embedding_lookup(word_emb_mat, self.decoder_inputs) # [batch_size, max words, embedding_size]



            def decode_with_attention(helper, scope, reuse=None, maximum_iterations=None):
                with tf.variable_scope(scope, reuse=reuse):
                    attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units=d, memory=g11)
                    cell = tf.contrib.rnn.GRUCell(num_units=d)
                    # attention_layer_size must be an integer (d / 2 would be a float under Python 3)
                    attn_cell = tf.contrib.seq2seq.AttentionWrapper(cell, attention_mechanism,
                                                                    attention_layer_size=d // 2)
                    out_cell = tf.contrib.rnn.OutputProjectionWrapper(attn_cell, tgt_vocab_size, reuse=reuse)
                    decoder = tf.contrib.seq2seq.BasicDecoder(cell=out_cell, helper=helper,
                                                              initial_state=out_cell.zero_state(
                                                                  dtype=tf.float32, batch_size=N))
                    # initial_state=encoder_final_state)
                    outputs = tf.contrib.seq2seq.dynamic_decode(decoder=decoder, output_time_major=False,
                                                                impute_finished=True,
                                                                maximum_iterations=maximum_iterations)
                    return outputs[0]

            def decode(helper, scope, reuse=None, maximum_iterations=None):
                with tf.variable_scope(scope, reuse=reuse):
                    decoder_cell = BasicLSTMCell(2 * d, state_is_tuple=True)  # hparam
                    projection_layer = layers_core.Dense(tgt_vocab_size, use_bias=False)  # hparam
                    decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, my_encoder_final_state,
                                                              output_layer=projection_layer)
                    final_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, output_time_major=False,
                                                                            impute_finished=True,
                                                                            maximum_iterations=maximum_iterations)  # dynamic decoding
                    return final_outputs
            # Decoder
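            # Training feeds the gold answer embeddings through a TrainingHelper; at
            # inference a GreedyEmbeddingHelper decodes greedily from tgt_sos_id until
            # tgt_eos_id (capped at 100 steps).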
            if config.mode == 'train':  # TODO: doesn't seem correct to use this variable for dev
                training_helper = tf.contrib.seq2seq.TrainingHelper(decoder_emb_inp, self.target_sequence_length,
                                                                    time_major=False)
                #final_outputs = decode(helper=training_helper, scope="HAHA", reuse=None)
                final_outputs = decode_with_attention(helper=training_helper, scope="HAHA", reuse=None)

            else:
                inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(word_emb_mat, tf.fill([N], self.tgt_sos_id),
                                                                            self.tgt_eos_id)
                #final_outputs = decode(helper=inference_helper, scope="HAHA", reuse=True, maximum_iterations=100)
                final_outputs = decode_with_attention(helper=inference_helper, scope="HAHA", reuse=True,
                                                      maximum_iterations=100)

            self.decoder_logits_train = final_outputs.rnn_output
            self.index_start = flat_logits
            self.index_end = flat_logits2
Beispiel #14
0
    def _build_forward(self):
        config = self.config
        N, M, JX, JQ, VW, VC, d, W = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
            config.max_word_size
        print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ)
        JA = config.max_answer_length
        JX = tf.shape(self.x)[2]
        JQ = tf.shape(self.q)[1]
        M = tf.shape(self.x)[1]
        print("VW:", VW, "N:", N, "M:", M, "JX:", JX, "JQ:", JQ)
        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

        with tf.variable_scope("emb"):
            # Char-CNN Embedding
            if config.use_char_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')

                with tf.variable_scope("char"):
                    Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                    Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                    Acx = tf.reshape(Acx, [-1, JX, W, dc])
                    Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                    filter_sizes = list(map(int, config.out_channel_dims.split(','))) # [100]
                    heights = list(map(int, config.filter_heights.split(','))) # [5]
                    assert sum(filter_sizes) == dco, (filter_sizes, dco) # Make sure filter channels = char_cnn_out size
                    with tf.variable_scope("conv"):
                        xx = multi_conv1d(Acx, filter_sizes, heights, "VALID",  self.is_train, config.keep_prob, scope="xx")
                        if config.share_cnn_weights:
                            tf.get_variable_scope().reuse_variables()
                            qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                        else:
                            qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq")
                        xx = tf.reshape(xx, [-1, M, JX, dco])
                        qq = tf.reshape(qq, [-1, JQ, dco])

            # Word Embedding
            if config.use_word_emb:
                with tf.variable_scope("emb_var") as scope, tf.device("/cpu:0"):
                    if config.mode == 'train':
                        word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat))
                    else:
                        word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                    tf.get_variable_scope().reuse_variables()
                    self.word_emb_scope = scope
                    if config.use_glove_for_unk:
                        word_emb_mat = tf.concat([word_emb_mat, self.new_emb_mat], 0)

                with tf.name_scope("word"):
                    Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
                    Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
                    self.tensor_dict['x'] = Ax
                    self.tensor_dict['q'] = Aq
                # Concat Char-CNN Embedding and Word Embedding
                if config.use_char_emb:
                    xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                    qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
                else:
                    xx = Ax
                    qq = Aq

            # exact match
            if config.use_exact_match: # TODO: What does it mean?
                emx = tf.expand_dims(tf.cast(self.emx, tf.float32), -1)
                xx = tf.concat([xx, emx], 3)  # [N, M, JX, di+1]
                emq = tf.expand_dims(tf.cast(self.emq, tf.float32), -1)
                qq = tf.concat([qq, emq], 2)  # [N, JQ, di+1]


        # 2 layer highway network on Concat Embedding
        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
                tf.get_variable_scope().reuse_variables()
                qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)

        self.tensor_dict['xx'] = xx
        self.tensor_dict['qq'] = qq

        # Bidirection-LSTM (3rd layer on paper)
        cell = GRUCell(d) if config.GRU else BasicLSTMCell(d, state_is_tuple=True)
        d_cell = SwitchableDropoutWrapper(cell, self.is_train, input_keep_prob=config.input_keep_prob)
        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]
        flat_x_len = flatten(x_len, 0)  # [N * M]

        with tf.variable_scope("prepro"):
            if config.use_fused_lstm: #yes
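                # LSTMBlockFusedCell consumes time-major inputs, so the code below
                # transposes to [time_len, batch_size, dim] and emulates the backward
                # direction by reversing each sequence before and after the fused cell.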
                with tf.variable_scope("u1"):
                    fw_inputs = tf.transpose(qq, [1, 0, 2]) #[time_len, batch_size, input_size]
                    bw_inputs = tf.reverse_sequence(fw_inputs, q_len, batch_dim=1, seq_dim=0)
                    fw_inputs = tf.nn.dropout(fw_inputs, config.input_keep_prob)
                    bw_inputs = tf.nn.dropout(bw_inputs, config.input_keep_prob)
                    prep_fw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                    prep_bw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                    fw_outputs, fw_final = prep_fw_cell(fw_inputs, dtype=tf.float32, sequence_length=q_len, scope="fw")
                    bw_outputs, bw_final = prep_bw_cell(bw_inputs, dtype=tf.float32, sequence_length=q_len, scope="bw")
                    bw_outputs = tf.reverse_sequence(bw_outputs, q_len, batch_dim=1, seq_dim = 0)
                    current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                    output = tf.transpose(current_inputs, [1, 0, 2])
                    u = output
                flat_xx = flatten(xx, 2)  # [N * M, JX, d]
                if config.share_lstm_weights: # Yes
                    tf.get_variable_scope().reuse_variables()
                    with tf.variable_scope("u1"):
                        fw_inputs = tf.transpose(flat_xx, [1, 0, 2]) #[time_len, batch_size, input_size]
                        bw_inputs = tf.reverse_sequence(fw_inputs, flat_x_len, batch_dim=1, seq_dim=0)
                        # fw_inputs = tf.nn.dropout(fw_inputs, config.input_keep_prob)
                        # bw_inputs = tf.nn.dropout(bw_inputs, config.input_keep_prob)
                        fw_outputs, fw_final = prep_fw_cell(fw_inputs, dtype=tf.float32, sequence_length=flat_x_len, scope="fw")
                        bw_outputs, bw_final = prep_bw_cell(bw_inputs, dtype=tf.float32, sequence_length=flat_x_len, scope="bw")
                        bw_outputs = tf.reverse_sequence(bw_outputs, flat_x_len, batch_dim=1, seq_dim=0)
                        current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                        output = tf.transpose(current_inputs, [1, 0, 2])
                else: # No
                    with tf.variable_scope("h1"):
                        fw_inputs = tf.transpose(flat_xx, [1, 0, 2]) #[time_len, batch_size, input_size]
                        bw_inputs = tf.reverse_sequence(fw_inputs, flat_x_len, batch_dim=1, seq_dim=0)
                        # fw_inputs = tf.nn.dropout(fw_inputs, config.input_keep_prob)
                        # bw_inputs = tf.nn.dropout(bw_inputs, config.input_keep_prob)
                        prep_fw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                        prep_bw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                        fw_outputs, fw_final = prep_fw_cell(fw_inputs, dtype=tf.float32, sequence_length=flat_x_len, scope="fw")
                        bw_outputs, bw_final = prep_bw_cell(bw_inputs, dtype=tf.float32, sequence_length=flat_x_len, scope="bw")
                        bw_outputs = tf.reverse_sequence(bw_outputs, flat_x_len, batch_dim=1, seq_dim=0)
                        current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                        output = tf.transpose(current_inputs, [1, 0, 2])
                h = tf.expand_dims(output, 1) # [N, M, JX, 2d]
            else:
                (fw_u, bw_u), _ = bidirectional_dynamic_rnn(d_cell, d_cell, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
                u = tf.concat([fw_u, bw_u], 2)
                if config.share_lstm_weights:
                    tf.get_variable_scope().reuse_variables()
                    (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
                    h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
                else:
                    (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
                    h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
            self.tensor_dict['u'] = u # hidden state of Q = u
            self.tensor_dict['h'] = h # hidden state of C = h

        # Attention Flow Layer (4th layer on paper)
        with tf.variable_scope("main"):
            if config.dynamic_att:
                p0 = h
                u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d])
                q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ])
                first_cell = AttentionCell(cell, u, size=d, mask=q_mask, mapper='sim',
                                           input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            else:
                p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict)
                first_cell = d_cell # a GRU cell with dropout wrapper
            tp0 = p0 # Output of Attention layer

        # Modeling layer (5th layer on paper)
        with tf.variable_scope('modeling_layer'):
            if config.use_fused_lstm:
                g1, encoder_state_final = build_fused_bidirectional_rnn(inputs=p0,
                                                                        num_units=config.hidden_size,
                                                                        num_layers=config.num_modeling_layers,
                                                                        inputs_length=flat_x_len,
                                                                        input_keep_prob=config.input_keep_prob,
                                                                        scope='modeling_layer_g')

            else:
                for layer_idx in range(config.num_modeling_layers-1):
                    (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(first_cell, first_cell, p0, x_len,
                                                                  dtype='float', scope="g_{}".format(layer_idx))  # [N, M, JX, 2d]
                    p0 = tf.concat([fw_g0, bw_g0], 3)
                (fw_g1, bw_g1), (fw_s_f, bw_s_f) = bidirectional_dynamic_rnn(first_cell, first_cell, p0, x_len,
                                                                             dtype='float', scope='g1')  # [N, M, JX, 2d]
                g1 = tf.concat([fw_g1, bw_g1], 3)  # [N, M, JX, 2d]

        # Self match layer
        if config.use_self_match:
            s0 = tf.reshape(g1, [N * M, JX, 2 * d])                     # [N * M, JX, 2d]
            x_mask = tf.reshape(self.x_mask, [N * M, JX])               # [N * M, JX]
            if config.use_static_self_match:
                with tf.variable_scope("StaticSelfMatch"):              # implemented follow r-net section 3.3
                    W_x_Vj = tf.contrib.layers.fully_connected(         # [N * M, JX, d]
                        s0, int(d / 2), scope='row_first',
                        activation_fn=None, biases_initializer=None
                    )
                    W_x_Vt = tf.contrib.layers.fully_connected(         # [N * M, JX, d]
                        s0, int(d / 2), scope='col_first',
                        activation_fn=None, biases_initializer=None
                    )
                    sum_rc = tf.add(                                    # [N * M, JX, JX, d]
                        tf.expand_dims(W_x_Vj, 1),
                        tf.expand_dims(W_x_Vt, 2)
                    )
                    v = tf.get_variable('second', shape=[1, 1, 1, int(d / 2)], dtype=tf.float32)
                    Sj = tf.reduce_sum(tf.multiply(v, tf.tanh(sum_rc)), -1)     # [N * M, JX, JX]
                    Ai = softmax(Sj, mask = tf.expand_dims(x_mask, 1))          # [N * M, JX, JX]
                    Ai = tf.expand_dims(Ai, -1)                                 # [N * M, JX, JX, 1]
                    Vi = tf.expand_dims(s0, 1)                                  # [N * M, 1, JX, 2d]
                    Ct = tf.reduce_sum(                                         # [N * M, JX, 2d]
                        tf.multiply(Ai, Vi),
                        axis = 2
                    )
                    inputs_Vt_Ct = tf.concat([s0, Ct], 2)                       # [N * M, JX, 4d]
                    if config.use_fused_lstm:
                        fw_inputs = tf.transpose(inputs_Vt_Ct, [1, 0, 2])  # [time_len, batch_size, input_size]
                        bw_inputs = tf.reverse_sequence(fw_inputs, flat_x_len, batch_dim=1, seq_dim=0)
                        fw_inputs = tf.nn.dropout(fw_inputs, config.input_keep_prob)
                        bw_inputs = tf.nn.dropout(bw_inputs, config.input_keep_prob)
                        prep_fw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                        prep_bw_cell = LSTMBlockFusedCell(d, cell_clip=0)
                        fw_outputs, fw_s_f = prep_fw_cell(fw_inputs, dtype=tf.float32, sequence_length=flat_x_len,
                                                            scope="fw")
                        bw_outputs, bw_s_f = prep_bw_cell(bw_inputs, dtype=tf.float32, sequence_length=flat_x_len,
                                                            scope="bw")
                        fw_s_f = LSTMStateTuple(c=fw_s_f[0], h=fw_s_f[1])
                        bw_s_f = LSTMStateTuple(c=bw_s_f[0], h=bw_s_f[1])
                        bw_outputs = tf.reverse_sequence(bw_outputs, flat_x_len, batch_dim=1, seq_dim=0)
                        current_inputs = tf.concat((fw_outputs, bw_outputs), 2)
                        s1 = tf.transpose(current_inputs, [1, 0, 2])
                    else:
                        (fw_s, bw_s), (fw_s_f, bw_s_f) = bidirectional_dynamic_rnn(first_cell, first_cell, inputs_Vt_Ct,
                                                                                   flat_x_len, dtype='float',
                                                                                   scope='s')  # [N, M, JX, 2d]
                        s1 = tf.concat([fw_s, bw_s], 2)  # [N * M, JX, 2d], M == 1
            else:
                with tf.variable_scope("DynamicSelfMatch"):
                    first_cell = AttentionCell(cell, s0, size=d, mask=x_mask, is_train=self.is_train)
                    (fw_s, bw_s), (fw_s_f, bw_s_f) = bidirectional_dynamic_rnn(first_cell, first_cell, s0, x_len,
                                                                               dtype='float', scope='s')  # [N, M, JX, 2d]
                    s1 = tf.concat([fw_s, bw_s], 2)  # [N * M, JX, 2d], M == 1
            g1 = tf.expand_dims(s1, 1) # [N, M, JX, 2d]

        # prepare for PtrNet
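        # Zero out padded positions in the modeling-layer output and assemble the final
        # bidirectional state; both feed the pointer-network decoder in the output scope.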
        encoder_output = g1  # [N, M, JX, 2d]
        encoder_output = tf.expand_dims(tf.cast(self.x_mask, tf.float32), -1) * encoder_output  # [N, M, JX, 2d]

        if config.use_self_match or not config.use_fused_lstm:
            if config.GRU:
                encoder_state_final = tf.concat((fw_s_f, bw_s_f), 1, name='encoder_concat')
            else:
                if isinstance(fw_s_f, LSTMStateTuple):
                    encoder_state_c = tf.concat(
                        (fw_s_f.c, bw_s_f.c), 1, name='encoder_concat_c')
                    encoder_state_h = tf.concat(
                        (fw_s_f.h, bw_s_f.h), 1, name='encoder_concat_h')
                    encoder_state_final = LSTMStateTuple(c=encoder_state_c, h=encoder_state_h)
                elif isinstance(fw_s_f, tf.Tensor):
                    encoder_state_final = tf.concat((fw_s_f, bw_s_f), 1, name='encoder_concat')
                else:
                    encoder_state_final = None
                    tf.logging.error("encoder_state_final not set")

        print("encoder_state_final:", encoder_state_final)

        with tf.variable_scope("output"):
            # eos_symbol = config.eos_symbol
            # next_symbol = config.next_symbol

            # NOTE: this assert op is not added to any control dependency, so it only documents
            # the assumption; dynamic M is currently not supported and M == 1 is assumed
            tf.assert_equal(M, 1)
            answer_string = tf.placeholder(
                shape=(N, 1, JA + 1),
                dtype=tf.int32,
                name='answer_string'
            )  # [N, M, JA + 1]
            answer_string_mask = tf.placeholder(
                shape=(N, 1, JA + 1),
                dtype=tf.bool,
                name='answer_string_mask'
            )  # [N, M, JA + 1]
            answer_string_length = tf.placeholder(
                shape=(N, 1),
                dtype=tf.int32,
                name='answer_string_length',
            ) # [N, M]
            self.tensor_dict['answer_string'] = answer_string
            self.tensor_dict['answer_string_mask'] = answer_string_mask
            self.tensor_dict['answer_string_length'] = answer_string_length
            self.answer_string = answer_string
            self.answer_string_mask = answer_string_mask
            self.answer_string_length = answer_string_length

            answer_string_flattened = tf.reshape(answer_string, [N * M, JA + 1])
            self.answer_string_flattened = answer_string_flattened  # [N * M, JA+1]
            print("answer_string_flattened:", answer_string_flattened)

            answer_string_length_flattened = tf.reshape(answer_string_length, [N * M])
            self.answer_string_length_flattened = answer_string_length_flattened  # [N * M]
            print("answer_string_length_flattened:", answer_string_length_flattened)

            decoder_cell = GRUCell(2 * d) if config.GRU else BasicLSTMCell(2 * d, state_is_tuple=True)

            with tf.variable_scope("Decoder"):
                decoder_train_logits = ptr_decoder(decoder_cell,
                                                   tf.reshape(tp0, [N * M, JX, 2 * d]),  # [N * M, JX, 2d]
                                                   tf.reshape(encoder_output, [N * M, JX, 2 * d]),  # [N * M, JX, 2d]
                                                   flat_x_len,
                                                   encoder_final_state=encoder_state_final,
                                                   max_encoder_length=config.sent_size_th,
                                                   decoder_output_length=answer_string_length_flattened,  # [N * M]
                                                   batch_size=N,  # N * M (M=1)
                                                   attention_proj_dim=self.config.decoder_proj_dim,
                                                   scope='ptr_decoder')  # [batch_size, dec_len*, enc_seq_len + 1]

                self.decoder_train_logits = decoder_train_logits
                print("decoder_train_logits:", decoder_train_logits)
                self.decoder_train_softmax = tf.nn.softmax(self.decoder_train_logits)
                self.decoder_inference = tf.argmax(decoder_train_logits, axis=2,
                                                   name='decoder_inference')  # [N, JA + 1]

            self.yp = tf.ones([N, M, JX], dtype=tf.int32) * -1
            self.yp2 = tf.ones([N, M, JX], dtype=tf.int32) * -1
Beispiel #15
0
    def _build_forward(self):
        config = self.config
        N, M, JX, JQ, VW, VC, d, W = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
            config.max_word_size
        JX = tf.shape(self.x)[2]
        JQ = tf.shape(self.q)[1]
        M = tf.shape(self.x)[1]
        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

        with tf.variable_scope("emb"):
            if config.use_char_emb:  # compute character embeddings
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    char_emb_mat = tf.get_variable("char_emb_mat",
                                                   shape=[VC, dc],
                                                   dtype='float')

                with tf.variable_scope("char"):
                    Acx = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cx)  # [N, M, JX, W, dc]
                    Acq = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cq)  # [N, JQ, W, dc]
                    Acx = tf.reshape(Acx, [-1, JX, W, dc])
                    Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                    filter_sizes = list(
                        map(int,
                            config.out_channel_dims.split(',')))  # output channels per char-CNN filter
                    heights = list(map(int, config.filter_heights.split(',')))
                    assert sum(filter_sizes) == dco, (filter_sizes, dco)
                    with tf.variable_scope("conv"):
                        xx = multi_conv1d(Acx,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                        if config.share_cnn_weights:
                            with tf.variable_scope(tf.get_variable_scope(),
                                                   reuse=True):
                                qq = multi_conv1d(Acq,
                                                  filter_sizes,
                                                  heights,
                                                  "VALID",
                                                  self.is_train,
                                                  config.keep_prob,
                                                  scope="xx")
                        else:
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="qq")
                        xx = tf.reshape(xx, [-1, M, JX, dco])
                        qq = tf.reshape(qq, [-1, JQ, dco])

            if config.use_word_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    if config.mode == 'train':
                        word_emb_mat = tf.get_variable(
                            "word_emb_mat",
                            dtype='float',
                            shape=[VW, dw],
                            initializer=get_initializer(
                                config.emb_mat))  # emb_mat is glove
                    else:
                        word_emb_mat = tf.get_variable("word_emb_mat",
                                                       shape=[VW, dw],
                                                       dtype='float')
                    if config.use_glove_for_unk:
                        word_emb_mat = tf.concat(
                            [word_emb_mat, self.new_emb_mat], 0)

                with tf.name_scope("word"):
                    Ax = tf.nn.embedding_lookup(word_emb_mat,
                                                self.x)  # [N, M, JX, d]
                    Aq = tf.nn.embedding_lookup(word_emb_mat,
                                                self.q)  # [N, JQ, d]
                    self.tensor_dict['x'] = Ax
                    self.tensor_dict['q'] = Aq
                if config.use_char_emb:
                    xx = tf.concat([xx, Ax], 3)  # [N, M, JX, di]
                    qq = tf.concat([qq, Aq], 2)  # [N, JQ, di]
                else:
                    xx = Ax
                    qq = Aq

        # highway network
        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)
                with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                    qq = highway_network(qq,
                                         config.highway_num_layers,
                                         True,
                                         wd=config.wd,
                                         is_train=self.is_train)

        self.tensor_dict['xx'] = xx
        self.tensor_dict['qq'] = qq

        cell = BasicLSTMCell(d, state_is_tuple=True)
        d_cell = SwitchableDropoutWrapper(
            cell, self.is_train, input_keep_prob=config.input_keep_prob)
        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

        with tf.variable_scope("prepro"):
            (fw_u, bw_u), ((_, fw_u_f), (_,
                                         bw_u_f)) = bidirectional_dynamic_rnn(
                                             d_cell,
                                             d_cell,
                                             qq,
                                             q_len,
                                             dtype='float',
                                             scope='u1')  # [N, J, d], [N, d]
            u = tf.concat([fw_u, bw_u], 2)
            if config.share_lstm_weights:
                with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                    (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                        cell, cell, xx, x_len, dtype='float',
                        scope='u1')  # [N, M, JX, 2d]; JX is the padded length, x_len holds the true lengths
                    h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
            else:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, xx, x_len, dtype='float',
                    scope='h1')  # [N, M, JX, 2d]
                h = tf.concat([fw_h, bw_h], 3)  # [N, M, JX, 2d]
            self.tensor_dict['u'] = u
            self.tensor_dict['h'] = h

        with tf.variable_scope("main"):
            if config.dynamic_att:
                p0 = h
                u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                               [N * M, JQ, 2 * d])
                q_mask = tf.reshape(
                    tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                    [N * M, JQ])
                first_cell = AttentionCell(
                    cell,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
            else:
                p0 = attention_layer(config,
                                     self.is_train,
                                     h,
                                     u,
                                     h_mask=self.x_mask,
                                     u_mask=self.q_mask,
                                     scope="p0",
                                     tensor_dict=self.tensor_dict)
                first_cell = d_cell

            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
                first_cell, first_cell, p0, x_len, dtype='float',
                scope='g0')  # [N, M, JX, 2d]
            g0 = tf.concat([fw_g0, bw_g0], 3)
            (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
                first_cell, first_cell, g0, x_len, dtype='float',
                scope='g1')  # [N, M, JX, 2d]
            g1 = tf.concat([fw_g1, bw_g1], 3)

            # Output Layer
            logits = get_logits([g1, p0],
                                d,
                                True,
                                wd=config.wd,
                                input_keep_prob=config.input_keep_prob,
                                mask=self.x_mask,
                                is_train=self.is_train,
                                func=config.answer_func,
                                scope='logits1')
            a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                          tf.reshape(logits, [N, M * JX]))
            a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1),
                          [1, M, JX, 1])

            (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
                d_cell,
                d_cell,
                tf.concat([p0, g1, a1i, g1 * a1i], 3),
                x_len,
                dtype='float',
                scope='g2')  # [N, M, JX, 2d]
            g2 = tf.concat([fw_g2, bw_g2], 3)
            logits2 = get_logits([g2, p0],
                                 d,
                                 True,
                                 wd=config.wd,
                                 input_keep_prob=config.input_keep_prob,
                                 mask=self.x_mask,
                                 is_train=self.is_train,
                                 func=config.answer_func,
                                 scope='logits2')

            flat_logits = tf.reshape(logits, [-1, M * JX])
            flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
            yp = tf.reshape(flat_yp, [-1, M, JX])
            flat_logits2 = tf.reshape(logits2, [-1, M * JX])
            flat_yp2 = tf.nn.softmax(flat_logits2)
            yp2 = tf.reshape(flat_yp2, [-1, M, JX])

            self.tensor_dict['g1'] = g1
            self.tensor_dict['g2'] = g2

            self.logits = flat_logits
            self.logits2 = flat_logits2
            self.yp = yp
            self.yp2 = yp2
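
Note: yp and yp2 above are per-position start and end probabilities over the flattened M * JX tokens. At prediction time they are usually combined by searching for the span (start, end) with start <= end that maximizes yp[start] * yp2[end]. The helper below is an illustrative sketch of that search (NumPy only, with an assumed maximum span length; it is not part of the listing above).

import numpy as np

def best_span(p_start, p_end, max_span_len=15):
    """p_start, p_end: 1-D probability vectors over the flattened M * JX positions.
    Returns the (start, end) pair maximizing p_start[start] * p_end[end]
    subject to start <= end < start + max_span_len."""
    best_score, best_pair = -1.0, (0, 0)
    for start in range(len(p_start)):
        for end in range(start, min(start + max_span_len, len(p_end))):
            score = p_start[start] * p_end[end]
            if score > best_score:
                best_score, best_pair = score, (start, end)
    return best_pair

p_start = np.random.dirichlet(np.ones(20))
p_end = np.random.dirichlet(np.ones(20))
print(best_span(p_start, p_end))
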
Beispiel #16
0
    def __init__(self,
                 config,
                 seq_length,
                 emb_dim,
                 hidden_dim,
                 emb_train,
                 embeddings=None,
                 pred_size=3,
                 context_seq_len=None,
                 query_seq_len=None):
        ## Define hyperparameters
        # tf.reset_default_graph()
        self.embedding_dim = emb_dim
        self.dim = hidden_dim
        self.sequence_length = seq_length
        self.pred_size = pred_size
        self.context_seq_len = context_seq_len
        self.query_seq_len = query_seq_len
        # self.config = config

        ## Define the placeholders
        self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length],
                                        name='premise')
        self.hypothesis_x = tf.placeholder(tf.int32,
                                           [None, self.sequence_length],
                                           name='hypothesis')
        self.premise_pos = tf.placeholder(tf.int32,
                                          [None, self.sequence_length, 47],
                                          name='premise_pos')
        self.hypothesis_pos = tf.placeholder(tf.int32,
                                             [None, self.sequence_length, 47],
                                             name='hypothesis_pos')
        self.premise_char = tf.placeholder(
            tf.int32, [None, self.sequence_length, config.char_in_word_size],
            name='premise_char')
        self.hypothesis_char = tf.placeholder(
            tf.int32, [None, self.sequence_length, config.char_in_word_size],
            name='hypothesis_char')
        self.premise_exact_match = tf.placeholder(
            tf.int32, [None, self.sequence_length, 1],
            name='premise_exact_match')
        self.hypothesis_exact_match = tf.placeholder(
            tf.int32, [None, self.sequence_length, 1],
            name='hypothesis_exact_match')

        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        self.dropout_keep_rate = tf.train.exponential_decay(
            config.keep_rate,
            self.global_step,
            config.dropout_decay_step,
            config.dropout_decay_rate,
            staircase=False,
            name='dropout_keep_rate')
        config.keep_rate = self.dropout_keep_rate
        tf.summary.scalar('dropout_keep_rate', self.dropout_keep_rate)

        self.y = tf.placeholder(tf.int32, [None], name='label_y')
        self.keep_rate_ph = tf.placeholder(tf.float32, [], name='keep_prob')
        self.is_train = tf.placeholder('bool', [], name='is_train')

        ## Function for embedding lookup and dropout at the embedding layer
        def emb_drop(E, x):
            emb = tf.nn.embedding_lookup(E, x)
            emb_drop = tf.cond(self.is_train,
                               lambda: tf.nn.dropout(emb, config.keep_rate),
                               lambda: emb)
            return emb_drop

        # Get lengths of unpadded sentences
        prem_seq_lengths, prem_mask = blocks.length(
            self.premise_x)  # mask [N, L , 1]
        hyp_seq_lengths, hyp_mask = blocks.length(self.hypothesis_x)
        self.prem_mask = prem_mask
        self.hyp_mask = hyp_mask

        ### Embedding layer ###
        with tf.variable_scope("emb"):
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                self.E = tf.Variable(embeddings, trainable=emb_train)
                premise_in = emb_drop(self.E, self.premise_x)  #P
                hypothesis_in = emb_drop(self.E, self.hypothesis_x)  #H

        with tf.variable_scope("char_emb"):
            char_emb_mat = tf.get_variable(
                "char_emb_mat",
                shape=[config.char_vocab_size, config.char_emb_size])
            with tf.variable_scope("char") as scope:
                char_pre = tf.nn.embedding_lookup(char_emb_mat,
                                                  self.premise_char)
                char_hyp = tf.nn.embedding_lookup(char_emb_mat,
                                                  self.hypothesis_char)

                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))  #[100]
                heights = list(map(int,
                                   config.filter_heights.split(',')))  #[5]
                assert sum(filter_sizes) == config.char_out_size, (
                    filter_sizes, config.char_out_size)
                with tf.variable_scope("conv") as scope:
                    conv_pre = multi_conv1d(char_pre,
                                            filter_sizes,
                                            heights,
                                            "VALID",
                                            self.is_train,
                                            config.keep_rate,
                                            scope='conv')
                    scope.reuse_variables()
                    conv_hyp = multi_conv1d(char_hyp,
                                            filter_sizes,
                                            heights,
                                            "VALID",
                                            self.is_train,
                                            config.keep_rate,
                                            scope='conv')
                    conv_pre = tf.reshape(
                        conv_pre,
                        [-1, self.sequence_length, config.char_out_size])
                    conv_hyp = tf.reshape(
                        conv_hyp,
                        [-1, self.sequence_length, config.char_out_size])
            premise_in = tf.concat([premise_in, conv_pre], axis=2)
            hypothesis_in = tf.concat([hypothesis_in, conv_hyp], axis=2)

        premise_in = tf.concat(
            (premise_in, tf.cast(self.premise_pos, tf.float32)), axis=2)
        hypothesis_in = tf.concat(
            (hypothesis_in, tf.cast(self.hypothesis_pos, tf.float32)), axis=2)

        premise_in = tf.concat(
            [premise_in,
             tf.cast(self.premise_exact_match, tf.float32)],
            axis=2)
        hypothesis_in = tf.concat(
            [hypothesis_in,
             tf.cast(self.hypothesis_exact_match, tf.float32)],
            axis=2)

        with tf.variable_scope("highway") as scope:
            premise_in = highway_network(premise_in,
                                         config.highway_num_layers,
                                         True,
                                         wd=config.wd,
                                         is_train=self.is_train)
            scope.reuse_variables()
            hypothesis_in = highway_network(hypothesis_in,
                                            config.highway_num_layers,
                                            True,
                                            wd=config.wd,
                                            is_train=self.is_train)

        with tf.variable_scope("prepro") as scope:
            pre = premise_in
            hyp = hypothesis_in
            for i in range(config.self_att_enc_layers):
                with tf.variable_scope(tf.get_variable_scope(), reuse=False):
                    p = self_attention_layer(
                        config,
                        self.is_train,
                        pre,
                        p_mask=prem_mask,
                        scope="{}_layer_self_att_enc".format(
                            i))  # [N, len, dim]
                    h = self_attention_layer(
                        config,
                        self.is_train,
                        hyp,
                        p_mask=hyp_mask,
                        scope="{}_layer_self_att_enc_h".format(i))
                    pre = p
                    hyp = h
                    variable_summaries(p,
                                       "p_self_enc_summary_layer_{}".format(i))
                    variable_summaries(h,
                                       "h_self_enc_summary_layer_{}".format(i))

        with tf.variable_scope("main") as scope:

            def model_one_side(config, main, support, main_length,
                               support_length, main_mask, support_mask, scope):
                bi_att_mx = bi_attention_mx(config,
                                            self.is_train,
                                            main,
                                            support,
                                            p_mask=main_mask,
                                            h_mask=support_mask)  # [N, PL, HL]

                bi_att_mx = tf.cond(
                    self.is_train,
                    lambda: tf.nn.dropout(bi_att_mx, config.keep_rate),
                    lambda: bi_att_mx)
                out_final = dense_net(config, bi_att_mx, self.is_train)

                return out_final

            premise_final = model_one_side(config,
                                           p,
                                           h,
                                           prem_seq_lengths,
                                           hyp_seq_lengths,
                                           prem_mask,
                                           hyp_mask,
                                           scope="premise_as_main")
            f0 = premise_final
            print('f0:', f0.get_shape().as_list())

        self.logits = linear(f0,
                             self.pred_size,
                             True,
                             bias_start=0.0,
                             scope="logit",
                             squeeze=False,
                             wd=config.wd,
                             input_keep_prob=config.keep_rate,
                             is_train=self.is_train)

        tf.summary.histogram('logit_histogram', self.logits)

        # Define the cost function
        self.total_cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y,
                                                           logits=self.logits))
        self.acc = tf.reduce_mean(
            tf.cast(
                tf.equal(tf.argmax(self.logits, axis=1),
                         tf.cast(self.y, tf.int64)), tf.float32))
        tf.summary.scalar('acc', self.acc)

        tf.summary.scalar('loss', self.total_cost)

        # calculate acc

        # L2 Loss
        if config.l2_loss:
            if config.sigmoid_growing_l2loss:
                weights_added = tf.add_n([
                    tf.nn.l2_loss(tensor)
                    for tensor in tf.trainable_variables()
                    if tensor.name.endswith("weights:0")
                    and not tensor.name.endswith("weighted_sum/weights:0")
                    or tensor.name.endswith('kernel:0')
                ])
                full_l2_step = tf.constant(config.weight_l2loss_step_full_reg,
                                           dtype=tf.int32,
                                           shape=[],
                                           name='full_l2reg_step')
                full_l2_ratio = tf.constant(config.l2_regularization_ratio,
                                            dtype=tf.float32,
                                            shape=[],
                                            name='l2_regularization_ratio')
                gs_flt = tf.cast(self.global_step, tf.float32)
                half_l2_step_flt = tf.cast(full_l2_step / 2, tf.float32)

                # (self.global_step - full_l2_step / 2)
                # tf.cast((self.global_step - full_l2_step / 2) * 8, tf.float32) / tf.cast(full_l2_step / 2 ,tf.float32)
                # l2loss_ratio = tf.sigmoid( tf.cast((self.global_step - full_l2_step / 2) * 8, tf.float32) / tf.cast(full_l2_step / 2 ,tf.float32)) * full_l2_ratio
                l2loss_ratio = tf.sigmoid(((gs_flt - half_l2_step_flt) * 8) /
                                          half_l2_step_flt) * full_l2_ratio
                tf.summary.scalar('l2loss_ratio', l2loss_ratio)
                l2loss = weights_added * l2loss_ratio
            else:
                l2loss = tf.add_n([
                    tf.nn.l2_loss(tensor)
                    for tensor in tf.trainable_variables() if tensor.name.
                    endswith("weights:0") or tensor.name.endswith('kernel:0')
                ]) * tf.constant(config.l2_regularization_ratio,
                                 dtype='float',
                                 shape=[],
                                 name='l2_regularization_ratio')
            tf.summary.scalar('l2loss', l2loss)
            self.total_cost += l2loss

        if config.wo_enc_sharing or config.wo_highway_sharing_but_penalize_diff:
            diffs = []
            for i in range(config.self_att_enc_layers):
                for tensor in tf.trainable_variables():
                    print(tensor.name)
                    if tensor.name == "prepro/{}_layer_self_att_enc/self_attention/h_logits/first/kernel:0".format(
                            i):
                        l_lg = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_attention/h_logits/first/kernel:0".format(
                            i):
                        r_lg = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/lhs_1/kernel:0".format(
                            i):
                        l_fg_lhs_1 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/lhs_1/kernel:0".format(
                            i):
                        r_fg_lhs_1 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/rhs_1/kernel:0".format(
                            i):
                        l_fg_rhs_1 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/rhs_1/kernel:0".format(
                            i):
                        r_fg_rhs_1 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/lhs_2/kernel:0".format(
                            i):
                        l_fg_lhs_2 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/lhs_2/kernel:0".format(
                            i):
                        r_fg_lhs_2 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/rhs_2/kernel:0".format(
                            i):
                        l_fg_rhs_2 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/rhs_2/kernel:0".format(
                            i):
                        r_fg_rhs_2 = tensor

                    if config.two_gate_fuse_gate:
                        if tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/lhs_3/kernel:0".format(
                                i):
                            l_fg_lhs_3 = tensor
                        elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/lhs_3/kernel:0".format(
                                i):
                            r_fg_lhs_3 = tensor
                        elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/rhs_3/kernel:0".format(
                                i):
                            l_fg_rhs_3 = tensor
                        elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/rhs_3/kernel:0".format(
                                i):
                            r_fg_rhs_3 = tensor

                diffs += [
                    l_lg - r_lg, l_fg_lhs_1 - r_fg_lhs_1,
                    l_fg_rhs_1 - r_fg_rhs_1, l_fg_lhs_2 - r_fg_lhs_2,
                    l_fg_rhs_2 - r_fg_rhs_2
                ]
                if config.two_gate_fuse_gate:
                    diffs += [l_fg_lhs_3 - r_fg_lhs_3, l_fg_rhs_3 - r_fg_rhs_3]

            diff_loss = tf.add_n([tf.nn.l2_loss(tensor)
                                  for tensor in diffs]) * tf.constant(
                                      config.diff_penalty_loss_ratio,
                                      dtype='float',
                                      shape=[],
                                      name='diff_penalty_loss_ratio')
            tf.summary.scalar('diff_penalty_loss', diff_loss)
            self.total_cost += diff_loss

        self.summary = tf.summary.merge_all()

        total_parameters = 0
        for v in tf.global_variables():
            if not v.name.endswith("weights:0") and not v.name.endswith(
                    "biases:0") and not v.name.endswith(
                        'kernel:0') and not v.name.endswith('bias:0'):
                continue
            print(v.name)
            # print(type(v.name))
            shape = v.get_shape().as_list()
            param_num = 1
            for dim in shape:
                param_num *= dim
            print(param_num)
            total_parameters += param_num
        print(total_parameters)
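
Note: when config.sigmoid_growing_l2loss is set, the listing above scales the summed L2 norms by sigmoid(8 * (step - half) / half) * l2_regularization_ratio, where half = weight_l2loss_step_full_reg / 2, so the penalty grows smoothly from near zero to its full value. A quick NumPy sketch of that schedule (the step count and ratio below are illustrative placeholders, not values from the actual config):

import numpy as np

def sigmoid_growing_l2_ratio(step, full_l2_step, full_ratio):
    """sigmoid(8 * (step - half) / half) * full_ratio, with half = full_l2_step / 2."""
    half = full_l2_step / 2.0
    return full_ratio / (1.0 + np.exp(-8.0 * (step - half) / half))

for step in (0, 25000, 50000, 75000, 100000):
    # ramps from nearly 0 at step 0 to roughly full_ratio at full_l2_step
    print(step, sigmoid_growing_l2_ratio(step, full_l2_step=100000, full_ratio=9e-5))
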
Beispiel #17
0
    def _build_forward(self):
        config = self.config
        N, M, JX, JQ, VW, VC, d, W = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
            config.max_word_size
        JX = tf.shape(self.x)[2]
        JQ = tf.shape(self.q)[1]
        M = tf.shape(self.x)[1]
        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

        with tf.variable_scope("emb"):
            if config.use_char_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    char_emb_mat = tf.get_variable("char_emb_mat",
                                                   shape=[VC, dc],
                                                   dtype='float')

                with tf.variable_scope("char"):
                    Acx = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cx)  # [N, M, JX, W, dc]
                    Acq = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cq)  # [N, JQ, W, dc]
                    Acx = tf.reshape(Acx, [-1, JX, W, dc])
                    Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                    filter_sizes = list(
                        map(int, config.out_channel_dims.split(',')))
                    heights = list(map(int, config.filter_heights.split(',')))
                    assert sum(filter_sizes) == dco
                    with tf.variable_scope("conv"):
                        xx = multi_conv1d(Acx,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                        if config.share_cnn_weights:
                            tf.get_variable_scope().reuse_variables()
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="xx")
                        else:
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="qq")
                        xx = tf.reshape(xx, [-1, M, JX, dco])
                        qq = tf.reshape(qq, [-1, JQ, dco])

            if config.use_word_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    if config.mode == 'train':
                        word_emb_mat = tf.get_variable(
                            "word_emb_mat",
                            dtype='float',
                            shape=[VW, dw],
                            initializer=get_initializer(config.emb_mat))
                    else:
                        word_emb_mat = tf.get_variable("word_emb_mat",
                                                       shape=[VW, dw],
                                                       dtype='float')
                    if config.use_glove_for_unk:
                        word_emb_mat = tf.concat(
                            0, [word_emb_mat, self.new_emb_mat])

                with tf.name_scope("word"):
                    Ax = tf.nn.embedding_lookup(word_emb_mat,
                                                self.x)  # [N, M, JX, d]
                    Aq = tf.nn.embedding_lookup(word_emb_mat,
                                                self.q)  # [N, JQ, d]
                    self.tensor_dict['x'] = Ax
                    self.tensor_dict['q'] = Aq
                if config.use_char_emb:
                    xx = tf.concat(3, [xx, Ax])  # [N, M, JX, di]
                    qq = tf.concat(2, [qq, Aq])  # [N, JQ, di]
                else:
                    xx = Ax
                    qq = Aq

        # highway network
        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)
                tf.get_variable_scope().reuse_variables()
                qq = highway_network(qq,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)

        self.tensor_dict['xx'] = xx
        self.tensor_dict['qq'] = qq

        cell = BasicLSTMCell(d, state_is_tuple=True)
        d_cell = SwitchableDropoutWrapper(
            cell, self.is_train, input_keep_prob=config.input_keep_prob)
        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

        with tf.variable_scope("prepro"):
            (fw_u, bw_u), ((_, fw_u_f), (_,
                                         bw_u_f)) = bidirectional_dynamic_rnn(
                                             d_cell,
                                             d_cell,
                                             qq,
                                             q_len,
                                             dtype='float',
                                             scope='u1')  # [N, J, d], [N, d]
            u = tf.concat(2, [fw_u, bw_u])
            if config.two_prepro_layers:
                (fw_u, bw_u), ((_, fw_u_f),
                               (_, bw_u_f)) = bidirectional_dynamic_rnn(
                                   d_cell,
                                   d_cell,
                                   u,
                                   q_len,
                                   dtype='float',
                                   scope='u2')  # [N, J, d], [N, d]
                u = tf.concat(2, [fw_u, bw_u])
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, xx, x_len, dtype='float',
                    scope='u1')  # [N, M, JX, 2d]
                h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
                if config.two_prepro_layers:
                    (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                        cell, cell, h, x_len, dtype='float',
                        scope='u2')  # [N, M, JX, 2d]
                    h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]

            else:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, xx, x_len, dtype='float',
                    scope='h1')  # [N, M, JX, 2d]
                h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
                if config.two_prepro_layers:
                    (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                        cell, cell, h, x_len, dtype='float',
                        scope='h2')  # [N, M, JX, 2d]
                    h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
            self.tensor_dict['u'] = u
            self.tensor_dict['h'] = h

        with tf.variable_scope("main"):
            if config.dynamic_att:
                p0 = h
                u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                               [N * M, JQ, 2 * d])
                q_mask = tf.reshape(
                    tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                    [N * M, JQ])
                first_cell = AttentionCell(
                    cell,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
            else:
                p0 = attention_layer(config,
                                     self.is_train,
                                     h,
                                     u,
                                     h_mask=self.x_mask,
                                     u_mask=self.q_mask,
                                     scope="p0",
                                     tensor_dict=self.tensor_dict)
                first_cell = d_cell
            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
                first_cell, first_cell, p0, x_len, dtype='float',
                scope='g0')  # [N, M, JX, 2d]
            g0 = tf.concat(3, [fw_g0, bw_g0])
            (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
                first_cell, first_cell, g0, x_len, dtype='float',
                scope='g1')  # [N, M, JX, 2d]
            g1 = tf.concat(3, [fw_g1, bw_g1])

            if config.late:
                (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
                    d_cell,
                    d_cell,
                    tf.concat(3, [g1, p0]),
                    x_len,
                    dtype='float',
                    scope='g2')  # [N, M, JX, 2d]
                g2 = tf.concat(3, [fw_g2, bw_g2])
                # logits2 = u_logits(config, self.is_train, tf.concat(3, [g1, a1i]), u, h_mask=self.x_mask, u_mask=self.q_mask, scope="logits2")

                logits = get_logits([g1, g2, p0],
                                    d,
                                    True,
                                    wd=config.wd,
                                    input_keep_prob=config.input_keep_prob,
                                    mask=self.x_mask,
                                    is_train=self.is_train,
                                    func=config.answer_func,
                                    scope='logits1')

                if config.feed_gt:
                    logy = tf.log(tf.cast(self.y, 'float') + VERY_SMALL_NUMBER)
                    logits = tf.cond(self.is_train, lambda: logy,
                                     lambda: logits)
                if config.feed_hard:
                    hard_yp = tf.argmax(tf.reshape(logits, [N, M * JX]), 1)
                    hard_logits = tf.reshape(tf.one_hot(hard_yp, M * JX),
                                             [N, M, JX])  # [N, M, JX]
                    logits = tf.cond(self.is_train, lambda: logits,
                                     lambda: hard_logits)

                flat_logits = tf.reshape(logits, [-1, M * JX])
                flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
                yp = tf.reshape(flat_yp, [-1, M, JX])

                logits2 = get_logits([g1, g2, p0],
                                     d,
                                     True,
                                     wd=config.wd,
                                     input_keep_prob=config.input_keep_prob,
                                     mask=self.x_mask,
                                     is_train=self.is_train,
                                     func=config.answer_func,
                                     scope='logits2')

                flat_logits2 = tf.reshape(logits2, [-1, M * JX])
                flat_yp2 = tf.nn.softmax(flat_logits2)
                yp2 = tf.reshape(flat_yp2, [-1, M, JX])
            else:
                logits = get_logits([g1, p0],
                                    d,
                                    True,
                                    wd=config.wd,
                                    input_keep_prob=config.input_keep_prob,
                                    mask=self.x_mask,
                                    is_train=self.is_train,
                                    func=config.answer_func,
                                    scope='logits1')
                a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                              tf.reshape(logits, [N, M * JX]))

                if config.feed_gt:
                    logy = tf.log(tf.cast(self.y, 'float') + VERY_SMALL_NUMBER)
                    logits = tf.cond(self.is_train, lambda: logy,
                                     lambda: logits)
                if config.feed_hard:
                    hard_yp = tf.argmax(tf.reshape(logits, [N, M * JX]), 1)
                    hard_logits = tf.reshape(tf.one_hot(hard_yp, M * JX),
                                             [N, M, JX])  # [N, M, JX]
                    logits = tf.cond(self.is_train, lambda: logits,
                                     lambda: hard_logits)

                flat_logits = tf.reshape(logits, [-1, M * JX])
                flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
                yp = tf.reshape(flat_yp, [-1, M, JX])

                a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1),
                              [1, M, JX, 1])
                yp_aug = tf.expand_dims(yp, -1)
                g1yp = g1 * yp_aug
                if config.prev_mode == 'a':
                    prev = a1i
                elif config.prev_mode == 'y':
                    prev = yp_aug
                elif config.prev_mode == 'gy':
                    prev = g1yp
                else:
                    raise Exception()
                (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
                    d_cell,
                    d_cell,
                    tf.concat(3, [p0, g1, prev, g1 * prev]),
                    x_len,
                    dtype='float',
                    scope='g2')  # [N, M, JX, 2d]
                g2 = tf.concat(3, [fw_g2, bw_g2])
                # logits2 = u_logits(config, self.is_train, tf.concat(3, [g1, a1i]), u, h_mask=self.x_mask, u_mask=self.q_mask, scope="logits2")
                logits2 = get_logits([g2, p0],
                                     d,
                                     True,
                                     wd=config.wd,
                                     input_keep_prob=config.input_keep_prob,
                                     mask=self.x_mask,
                                     is_train=self.is_train,
                                     func=config.answer_func,
                                     scope='logits2')

                flat_logits2 = tf.reshape(logits2, [-1, M * JX])
                flat_yp2 = tf.nn.softmax(flat_logits2)
                yp2 = tf.reshape(flat_yp2, [-1, M, JX])

            self.tensor_dict['g1'] = g1
            self.tensor_dict['g2'] = g2

            self.logits = flat_logits
            self.logits2 = flat_logits2
            self.yp = yp
            self.yp2 = yp2
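
Note: the config.feed_hard branch in this variant replaces the soft start distribution with a one-hot map over the M * JX flattened positions at inference time (tf.one_hot(tf.argmax(...))), so the second reading pass conditions on a hard start prediction. A small NumPy sketch of that conversion, with assumed toy shapes:

import numpy as np

def harden_start_logits(logits, M, JX):
    """Collapse start logits [N, M, JX] to a one-hot map over the flattened
    M * JX positions, mirroring tf.one_hot(tf.argmax(...)) in the listing."""
    flat = logits.reshape(-1, M * JX)
    hard = np.zeros_like(flat)
    hard[np.arange(flat.shape[0]), flat.argmax(axis=1)] = 1.0
    return hard.reshape(-1, M, JX)

hard = harden_start_logits(np.random.randn(2, 3, 4), M=3, JX=4)
print(hard.sum(axis=(1, 2)))  # [1. 1.] -- exactly one position selected per example
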
Beispiel #18
0
    def _build_forward(self):
        config = self.config
        N, M, JX, JQ, VW, VC, d, W = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
            config.max_word_size
        beam_width = config.beam_width
        GO_TOKEN = 0
        EOS_TOKEN = 1

        JX = tf.shape(self.x)[2]
        JQ = tf.shape(self.q)[1]
        M = tf.shape(self.x)[1]
        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

        with tf.variable_scope("emb"):
            if config.use_char_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    char_emb_mat = tf.get_variable("char_emb_mat",
                                                   shape=[VC, dc],
                                                   dtype='float')

                with tf.variable_scope("char"):
                    Acx = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cx)  # [N, M, JX, W, dc]
                    Acq = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cq)  # [N, JQ, W, dc]
                    Acx = tf.reshape(Acx, [-1, JX, W, dc])
                    Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                    filter_sizes = list(
                        map(int, config.out_channel_dims.split(',')))
                    heights = list(map(int, config.filter_heights.split(',')))
                    assert sum(filter_sizes) == dco, (filter_sizes, dco)
                    with tf.variable_scope("conv"):
                        xx = multi_conv1d(Acx,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                        if config.share_cnn_weights:
                            tf.get_variable_scope().reuse_variables()
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="xx")
                        else:
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="qq")
                        xx = tf.reshape(xx, [-1, M, JX, dco])
                        qq = tf.reshape(qq, [-1, JQ, dco])

            if config.use_word_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    if config.mode == 'train':
                        word_emb_mat = tf.get_variable(
                            "word_emb_mat",
                            dtype='float',
                            shape=[VW, dw],
                            initializer=get_initializer(config.emb_mat),
                            trainable=True)
                    else:
                        word_emb_mat = tf.get_variable("word_emb_mat",
                                                       shape=[VW, dw],
                                                       dtype='float')
                    if config.use_glove_for_unk:
                        word_emb_mat = tf.concat(
                            axis=0, values=[word_emb_mat, self.new_emb_mat])
                with tf.name_scope("word"):
                    Ax = tf.nn.embedding_lookup(word_emb_mat,
                                                self.x)  # [N, M, JX, d]
                    Aq = tf.nn.embedding_lookup(word_emb_mat,
                                                self.q)  # [N, JQ, d]
                    self.tensor_dict['x'] = Ax
                    self.tensor_dict['q'] = Aq
                if config.use_char_emb:
                    xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                    qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
                else:
                    xx = Ax
                    qq = Aq

        # highway network
        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)
                tf.get_variable_scope().reuse_variables()
                qq = highway_network(qq,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)

        self.tensor_dict['xx'] = xx
        self.tensor_dict['qq'] = qq

        cell_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell_fw = SwitchableDropoutWrapper(
            cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell_bw = SwitchableDropoutWrapper(
            cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell2_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell2_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell2_fw = SwitchableDropoutWrapper(
            cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell2_bw = SwitchableDropoutWrapper(
            cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell3_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell3_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell3_fw = SwitchableDropoutWrapper(
            cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell3_bw = SwitchableDropoutWrapper(
            cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell4_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell4_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell4_fw = SwitchableDropoutWrapper(
            cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell4_bw = SwitchableDropoutWrapper(
            cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

        with tf.variable_scope("prepro"):
            (fw_u, bw_u), ((_, fw_u_f), (_,
                                         bw_u_f)) = bidirectional_dynamic_rnn(
                                             d_cell_fw,
                                             d_cell_bw,
                                             qq,
                                             q_len,
                                             dtype='float',
                                             scope='u1')  # [N, J, d], [N, d]
            u = tf.concat(axis=2, values=[fw_u, bw_u])
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), ((_, fw_h_f),
                               (_, bw_h_f)) = bidirectional_dynamic_rnn(
                                   cell_fw,
                                   cell_bw,
                                   xx,
                                   x_len,
                                   dtype='float',
                                   scope='u1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
            else:
                (fw_h, bw_h), ((_, fw_h_f),
                               (_, bw_h_f)) = bidirectional_dynamic_rnn(
                                   cell_fw,
                                   cell_bw,
                                   xx,
                                   x_len,
                                   dtype='float',
                                   scope='h1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
            self.tensor_dict['u'] = u
            self.tensor_dict['h'] = h

        with tf.variable_scope("main"):
            if config.dynamic_att:
                p0 = h
                u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                               [N * M, JQ, 2 * d])
                q_mask = tf.reshape(
                    tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                    [N * M, JQ])
                first_cell_fw = AttentionCell(
                    cell2_fw,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
                first_cell_bw = AttentionCell(
                    cell2_bw,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
                second_cell_fw = AttentionCell(
                    cell3_fw,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
                second_cell_bw = AttentionCell(
                    cell3_bw,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
            else:
                p0 = attention_layer(config,
                                     self.is_train,
                                     h,
                                     u,
                                     h_mask=self.x_mask,
                                     u_mask=self.q_mask,
                                     scope="p0",
                                     tensor_dict=self.tensor_dict)
                first_cell_fw = d_cell2_fw
                second_cell_fw = d_cell3_fw
                first_cell_bw = d_cell2_bw
                second_cell_bw = d_cell3_bw

            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
                first_cell_fw,
                first_cell_bw,
                p0,
                x_len,
                dtype='float',
                scope='g0')  # [N, M, JX, 2d]
            g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
            (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
                second_cell_fw,
                second_cell_bw,
                g0,
                x_len,
                dtype='float',
                scope='g1')  # [N, M, JX, 2d]
            g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])

            logits = get_logits([g1, p0],
                                d,
                                True,
                                wd=config.wd,
                                input_keep_prob=config.input_keep_prob,
                                mask=self.x_mask,
                                is_train=self.is_train,
                                func=config.answer_func,
                                scope='logits1')
            a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                          tf.reshape(logits, [N, M * JX]))
            a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1),
                          [1, M, JX, 1])

            (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
                d_cell4_fw,
                d_cell4_bw,
                tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]),
                x_len,
                dtype='float',
                scope='g2')  # [N, M, JX, 2d]
            g2 = tf.concat(axis=3, values=[fw_g2, bw_g2])
            logits2 = get_logits([g2, p0],
                                 d,
                                 True,
                                 wd=config.wd,
                                 input_keep_prob=config.input_keep_prob,
                                 mask=self.x_mask,
                                 is_train=self.is_train,
                                 func=config.answer_func,
                                 scope='logits2')

            flat_logits = tf.reshape(logits, [-1, M * JX])
            flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
            flat_logits2 = tf.reshape(logits2, [-1, M * JX])
            flat_yp2 = tf.nn.softmax(flat_logits2)
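
            # The config.na branch below adds a learnable "no answer" option: a
            # single bias logit (na_bias) is prepended to the flattened span
            # logits, the softmax runs over the widened vector, column 0 is read
            # off as the no-answer probability, and the remaining columns replace
            # flat_yp / flat_yp2 as the span probabilities.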

            if config.na:
                na_bias = tf.get_variable("na_bias", shape=[], dtype='float')
                na_bias_tiled = tf.tile(tf.reshape(na_bias, [1, 1]),
                                        [N, 1])  # [N, 1]
                concat_flat_logits = tf.concat(
                    axis=1, values=[na_bias_tiled, flat_logits])
                concat_flat_yp = tf.nn.softmax(concat_flat_logits)
                na_prob = tf.squeeze(tf.slice(concat_flat_yp, [0, 0], [-1, 1]),
                                     [1])
                flat_yp = tf.slice(concat_flat_yp, [0, 1], [-1, -1])

                concat_flat_logits2 = tf.concat(
                    axis=1, values=[na_bias_tiled, flat_logits2])
                concat_flat_yp2 = tf.nn.softmax(concat_flat_logits2)
                na_prob2 = tf.squeeze(
                    tf.slice(concat_flat_yp2, [0, 0], [-1, 1]), [1])  # [N]
                flat_yp2 = tf.slice(concat_flat_yp2, [0, 1], [-1, -1])

                self.concat_logits = concat_flat_logits
                self.concat_logits2 = concat_flat_logits2
                self.na_prob = na_prob * na_prob2

            yp = tf.reshape(flat_yp, [-1, M, JX])
            yp2 = tf.reshape(flat_yp2, [-1, M, JX])
            wyp = tf.nn.sigmoid(logits2)

            self.tensor_dict['g1'] = g1
            self.tensor_dict['g2'] = g2

            self.logits = flat_logits
            self.logits2 = flat_logits2
            self.yp = yp
            self.yp2 = yp2
            self.wyp = wyp

        with tf.variable_scope("q_gen"):
            # Question Generation Using (Paragraph & Predicted Ans Pos)
            NM = config.max_num_sents * config.batch_size

            # Separated encoder
            #ss = tf.reshape(xx, (-1, JX, dw+dco))

            q_worthy = tf.reduce_sum(
                tf.to_int32(self.y), axis=2
            )  # per-sentence count of answer tokens, i.e. how answer-worthy each sentence is. (N, M)
            q_worthy = tf.expand_dims(tf.to_int32(tf.argmax(q_worthy, axis=1)),
                                      axis=1)  # (N) -> (N, 1)
            q_worthy = tf.concat([
                tf.expand_dims(tf.range(0, N, dtype=tf.int32), axis=1),
                q_worthy
            ],
                                 axis=1)
            # example pairs of (batch index, chosen sentence index): [0, 9], [1, 11], [2, 8], [3, 5], [4, 0], [5, 1] ...

            ss = tf.gather_nd(xx, q_worthy)
            syp = tf.expand_dims(tf.gather_nd(yp, q_worthy), axis=-1)
            syp2 = tf.expand_dims(tf.gather_nd(yp2, q_worthy), axis=-1)
            ss_with_ans = tf.concat([ss, syp, syp2], axis=2)

            qg_dim = 600
            cell_fw, cell_bw = rnn.DropoutWrapper(rnn.GRUCell(qg_dim), input_keep_prob=config.input_keep_prob), \
                               rnn.DropoutWrapper(rnn.GRUCell(qg_dim), input_keep_prob=config.input_keep_prob)
            s_outputs, s_states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, ss_with_ans, dtype=tf.float32)
            s_outputs = tf.concat(s_outputs, axis=2)
            s_states = tf.concat(s_states, axis=1)

            start_tokens = tf.zeros([N], dtype=tf.int32)
            self.inp_q_with_GO = tf.concat(
                [tf.expand_dims(start_tokens, axis=1), self.q], axis=1)
            # supervise if mode is train
            if config.mode == "train":
                emb_q = tf.nn.embedding_lookup(params=word_emb_mat,
                                               ids=self.inp_q_with_GO)
                #emb_q = tf.reshape(tf.tile(tf.expand_dims(emb_q, axis=1), [1, M, 1, 1]), (NM, JQ+1, dw))
                train_helper = seq2seq.TrainingHelper(emb_q, [JQ] * N)
            else:
                s_outputs = seq2seq.tile_batch(s_outputs,
                                               multiplier=beam_width)
                s_states = seq2seq.tile_batch(s_states, multiplier=beam_width)

            cell = rnn.DropoutWrapper(rnn.GRUCell(num_units=qg_dim * 2),
                                      input_keep_prob=config.input_keep_prob)
            attention_mechanism = seq2seq.BahdanauAttention(num_units=qg_dim *
                                                            2,
                                                            memory=s_outputs)
            attn_cell = seq2seq.AttentionWrapper(cell,
                                                 attention_mechanism,
                                                 attention_layer_size=qg_dim *
                                                 2,
                                                 output_attention=True,
                                                 alignment_history=False)
            total_glove_vocab_size = 78878  #72686
            out_cell = rnn.OutputProjectionWrapper(attn_cell,
                                                   VW + total_glove_vocab_size)
            if config.mode == "train":
                decoder_initial_states = out_cell.zero_state(
                    batch_size=N, dtype=tf.float32).clone(cell_state=s_states)
                decoder = seq2seq.BasicDecoder(
                    cell=out_cell,
                    helper=train_helper,
                    initial_state=decoder_initial_states)
            else:
                decoder_initial_states = out_cell.zero_state(
                    batch_size=N * beam_width,
                    dtype=tf.float32).clone(cell_state=s_states)
                decoder = seq2seq.BeamSearchDecoder(
                    cell=out_cell,
                    embedding=word_emb_mat,
                    start_tokens=start_tokens,
                    end_token=EOS_TOKEN,
                    initial_state=decoder_initial_states,
                    beam_width=beam_width,
                    length_penalty_weight=0.0)
            outputs = seq2seq.dynamic_decode(decoder=decoder,
                                             maximum_iterations=JQ)
            if config.mode == "train":
                gen_q = outputs[0].sample_id
                gen_q_prob = outputs[0].rnn_output
                gen_q_states = outputs[1]
            else:
                gen_q = outputs[0].predicted_ids[:, :, 0]
                gen_q_prob = tf.nn.embedding_lookup(
                    params=word_emb_mat, ids=outputs[0].predicted_ids[:, :, 0])
                gen_q_states = outputs[1]

            self.gen_q = gen_q
            self.gen_q_prob = gen_q_prob
            self.gen_q_states = gen_q_states
Beispiel #19
0
    def _build_forward(self):
        config = self.config
        N, M, JX, JQ, VW, VC, d, W, EW, WOW = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.len_new_emb_mat, config.char_vocab_size, config.hidden_size, \
            config.max_word_size, config.word_vocab_size - config.vw_wo_entity_size, config.vw_wo_entity_size
        JX = tf.shape(self.x)[2]  # words
        JQ = tf.shape(self.q)[1]  # words
        M = tf.shape(self.x)[1]
        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

        with tf.variable_scope("emb"):
            if config.use_char_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    char_emb_mat = tf.get_variable("char_emb_mat",
                                                   shape=[VC, dc],
                                                   dtype='float')

                with tf.variable_scope("char"):
                    Acx = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cx)  # [N, M, JX, W, dc]
                    Acq = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cq)  # [N, JQ, W, dc]
                    Acx = tf.reshape(Acx, [-1, JX, W, dc])
                    Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                    filter_sizes = list(
                        map(int, config.out_channel_dims.split(',')))
                    heights = list(map(int, config.filter_heights.split(',')))
                    assert sum(filter_sizes) == dco, (filter_sizes, dco)
                    with tf.variable_scope("conv"):
                        xx = multi_conv1d(Acx,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                        if config.share_cnn_weights:
                            tf.get_variable_scope().reuse_variables()
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="xx")
                        else:
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="qq")
                        xx = tf.reshape(xx, [-1, M, JX, dco])
                        qq = tf.reshape(qq, [-1, JQ, dco])

            if config.use_word_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    if config.mode == 'train':
                        init_word_emb = tf.random_normal_initializer(-0.5, 0.5)
                        #entity_emb_mat = tf.get_variable("entity_emb_mat", dtype='float', shape=[EW, EW], initializer=get_initializer(config.onehot_encoded))
                        #entity_emb_out = _linear(entity_emb_mat, dw, True, bias_initializer=tf.constant_initializer(0.0))
                        #word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat))
                        word_emb_mat = tf.get_variable(
                            "word_emb_mat",
                            dtype='float',
                            shape=[VW, dw],
                            initializer=init_word_emb)
                        #word_emb_mat = tf.concat(axis=0,values=[word_emb_mat, entity_emb_out])
                    else:
                        word_emb_mat = tf.get_variable("word_emb_mat",
                                                       shape=[VW, dw],
                                                       dtype='float')

                    #if config.use_glove_for_unk:
                    #    word_emb_mat = tf.concat(axis=0, values=[word_emb_mat, self.new_emb_mat])
                with tf.name_scope("word"):
                    Ax = tf.nn.embedding_lookup(
                        word_emb_mat, self.x
                    )  # [N, M, JX, d] i.e. [batch size, max sentences, max words, embedding size]
                    Aq = tf.nn.embedding_lookup(
                        word_emb_mat, self.q
                    )  # [N, JQ, d] i.e. [batch size, max words, embedding size]
                    self.tensor_dict['x'] = Ax
                    self.tensor_dict['q'] = Aq
                if config.use_char_emb:
                    xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                    qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
                else:
                    xx = Ax
                    qq = Aq

        # highway network
        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)
                tf.get_variable_scope().reuse_variables()
                qq = highway_network(qq,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)

        self.tensor_dict['xx'] = xx
        self.tensor_dict['qq'] = qq
        #xx = tf.Print(xx,[tf.shape(xx),xx],message="DHRUV xx=",summarize=20)
        cell_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell_fw = SwitchableDropoutWrapper(
            cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell_bw = SwitchableDropoutWrapper(
            cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell2_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell2_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell2_fw = SwitchableDropoutWrapper(
            cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell2_bw = SwitchableDropoutWrapper(
            cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell3_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell3_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell3_fw = SwitchableDropoutWrapper(
            cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell3_bw = SwitchableDropoutWrapper(
            cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell4_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell4_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell4_fw = SwitchableDropoutWrapper(
            cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell4_bw = SwitchableDropoutWrapper(
            cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N,M]
        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

        with tf.variable_scope("prepro"):
            (fw_u, bw_u), ((_, fw_u_f), (_,
                                         bw_u_f)) = bidirectional_dynamic_rnn(
                                             d_cell_fw,
                                             d_cell_bw,
                                             qq,
                                             q_len,
                                             dtype='float',
                                             scope='u1')  # [N, J, d], [N, d]
            u = tf.concat(axis=2, values=[fw_u, bw_u])
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), (fw_s, bw_s) = bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, xx, x_len, dtype='float',
                    scope='u1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]

            else:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, xx, x_len, dtype='float',
                    scope='h1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
            self.tensor_dict['u'] = u
            self.tensor_dict['h'] = h

        with tf.variable_scope("main"):
            if config.dynamic_att:  # not true
                p0 = h
                u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                               [N * M, JQ, 2 * d])
                q_mask = tf.reshape(
                    tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                    [N * M, JQ])
                first_cell_fw = AttentionCell(
                    cell2_fw,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
                first_cell_bw = AttentionCell(
                    cell2_bw,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
                second_cell_fw = AttentionCell(
                    cell3_fw,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
                second_cell_bw = AttentionCell(
                    cell3_bw,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
            else:
                p0 = attention_layer(
                    config,
                    self.is_train,
                    h,
                    u,
                    h_mask=self.x_mask,
                    u_mask=self.q_mask,
                    scope="p0",
                    tensor_dict=self.tensor_dict)  # p0 seems to be G in paper
                first_cell_fw = d_cell2_fw
                second_cell_fw = d_cell3_fw
                first_cell_bw = d_cell2_bw
                second_cell_bw = d_cell3_bw

            #p1 = tf.reshape(p0,[N , M*JX, 8*d])
            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
                first_cell_fw,
                first_cell_bw,
                p0,
                x_len,
                dtype='float',
                scope='g0')  # [N, M, JX, 2d]
            g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
            (fw_g1, bw_g1), (my_fw_final_state,
                             my_bw_final_state) = bidirectional_dynamic_rnn(
                                 second_cell_fw,
                                 second_cell_bw,
                                 g0,
                                 x_len,
                                 dtype='float',
                                 scope='g1')  # [N, M, JX, 2d]

            g1 = tf.concat(axis=3, values=[fw_g1,
                                           bw_g1])  # g1 seems to be M in paper
            #g1= tf.reshape(g1,[N, M , JX, 2*d]) #reshaping here again, since g1 is used ahead

            g1 = tf.Print(g1, [tf.shape(g1)],
                          message="g1 shape",
                          first_n=5,
                          summarize=200)
            p0 = tf.Print(p0, [tf.shape(p0)],
                          message="p0 shape",
                          first_n=5,
                          summarize=200)

            g11 = tf.reshape(g1, [N, -1, 2 * d])
            my_encoder_final_state_c = tf.concat(
                values=(my_fw_final_state.c, my_bw_final_state.c),
                axis=1,
                name="my_encoder_final_state_c")
            my_encoder_final_state_h = tf.concat(
                values=(my_fw_final_state.h, my_bw_final_state.h),
                axis=1,
                name="my_encoder_final_state_h")
            my_encoder_final_state = tf.contrib.rnn.LSTMStateTuple(
                c=my_encoder_final_state_c, h=my_encoder_final_state_h)

            # compute indices for finding the answer span, as the second task in multi-task learning
            logits = get_logits([g1, p0],
                                d,
                                True,
                                wd=config.wd,
                                input_keep_prob=config.input_keep_prob,
                                mask=self.x_mask,
                                is_train=self.is_train,
                                func=config.answer_func,
                                scope='logits1')
            logits = tf.Print(logits, [tf.shape(logits)],
                              message="logits shape",
                              first_n=5,
                              summarize=200)
            a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                          tf.reshape(logits, [N, M * JX]))
            a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1),
                          [1, M, JX, 1])

            (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
                d_cell4_fw,
                d_cell4_bw,
                tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]),
                x_len,
                dtype='float',
                scope='g2')  # [N, M, JX, 2d]
            g2 = tf.concat(axis=3, values=[fw_g2, bw_g2])
            logits2 = get_logits([g2, p0],
                                 d,
                                 True,
                                 wd=config.wd,
                                 input_keep_prob=config.input_keep_prob,
                                 mask=self.x_mask,
                                 is_train=self.is_train,
                                 func=config.answer_func,
                                 scope='logits2')

            flat_logits = tf.reshape(logits, [-1, M * JX])
            flat_logits = tf.Print(flat_logits,
                                   [tf.shape(flat_logits), flat_logits],
                                   message="flat_logits shape and contents",
                                   first_n=5,
                                   summarize=200)
            self.flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
            flat_logits2 = tf.reshape(logits2, [-1, M * JX])
            self.flat_yp2 = tf.nn.softmax(flat_logits2)

            tgt_vocab_size = config.len_new_emb_mat  # hparam # FIXME: Obtain embeddings differently?
            print("length is", config.len_new_emb_mat)
            nodes = d

            # Look up embedding
            decoder_emb_inp = tf.nn.embedding_lookup(
                word_emb_mat,
                self.decoder_inputs)  # [batch_size, max words, embedding_size]

            with tf.variable_scope("rnn_decoder", reuse=tf.AUTO_REUSE):
                init = tf.random_normal_initializer(0.0, 0.5)
                W_dense = tf.get_variable(name="W_dense",
                                          shape=[2 * nodes, tgt_vocab_size],
                                          dtype=tf.float32,
                                          initializer=init)
                b_dense = tf.get_variable(name="b_dense",
                                          shape=[tgt_vocab_size],
                                          dtype=tf.float32,
                                          initializer=tf.zeros_initializer)

                W_att_dec = tf.get_variable(name="W_att_dec",
                                            shape=[2 * nodes, 2 * nodes],
                                            dtype=tf.float32,
                                            initializer=init)
                W_att_enc = tf.get_variable(name="W_att_enc1",
                                            shape=[1, 1, 2 * nodes, 2 * nodes],
                                            dtype=tf.float32,
                                            initializer=init)
                v_blend = tf.get_variable(name="v_blend",
                                          shape=[1, 2 * nodes],
                                          dtype=tf.float32,
                                          initializer=init)

                pad_time_slice = tf.fill([N], 0, name='PAD')
                pad_step_embedded = tf.nn.embedding_lookup(
                    word_emb_mat, pad_time_slice)

                decoder_cell = tf.contrib.rnn.BasicLSTMCell(
                    2 * nodes, state_is_tuple=True
                )  # doesn't work without the factor of 2?
                '''The loop transition function is a mapping (time, previous_cell_output, previous_cell_state, previous_loop_state) ->
                (elements_finished, input, cell_state, output, loop_state).
                It is called before the RNNCell to prepare its inputs and state. Everything is a Tensor except for the initial call at time=0,
                when everything is None (except time).'''
                def execute_pointer_network(attn_dist):
                    # find the word that received the highest attention probability and pass it to the next step of the decoder
                    index_pos = tf.argmax(attn_dist, axis=1)
                    index_pos = tf.expand_dims(index_pos, 1)
                    index_pos = tf.concat([
                        tf.reshape(tf.range(start=0, limit=N, dtype=tf.int64),
                                   [N, 1]),
                        tf.zeros([N, 1], tf.int64), index_pos
                    ],
                                          axis=1)
                    index_pos = tf.cast(tf.gather_nd(params=self.x,
                                                     indices=index_pos),
                                        dtype=tf.int64)
                    return index_pos

                def execute_normal_decoder(previous_output, W_dense, b_dense):
                    output_logits = tf.add(tf.matmul(previous_output, W_dense),
                                           b_dense)
                    return tf.argmax(output_logits, axis=1)

                def loop_fn_initial():
                    initial_elements_finished = (
                        0 >= self.target_sequence_length
                    )  # all False at the initial step
                    #initial_input = tf.concat([decoder_emb_inp[:,0], my_encoder_final_state_h], 1)
                    initial_input = decoder_emb_inp[:, 0]
                    initial_cell_state = my_encoder_final_state
                    # set the correct shapes, since the initial output is used to determine the emit structure
                    initial_cell_output = tf.cond(
                        self.pointer_gen,
                        lambda: tf.zeros([M * JX], tf.float32),
                        lambda: tf.zeros([2 * nodes], tf.float32))
                    initial_loop_state = None  # we don't need to pass any additional information
                    return (initial_elements_finished, initial_input,
                            initial_cell_state, initial_cell_output,
                            initial_loop_state)

                encoder_output = tf.expand_dims(g11, axis=2)

                def loop_fn_transition(time, previous_output, previous_state,
                                       previous_loop_state):
                    def get_next_input():
                        # compute Bahdanau-style attention
                        # performing a convolution, or reshaping the input to (-1, 2*d) and then doing a matmul, is essentially the same operation
                        # see matrix_mult.py ... conv2d might be faster?
                        # https://stackoverflow.com/questions/38235555/tensorflow-matmul-of-input-matrix-with-batch-data
                        encoder_features = tf.nn.conv2d(
                            encoder_output, W_att_enc, [1, 1, 1, 1], "SAME"
                        )  # shape (batch_size,max_enc_steps,1,attention_vec_size)
                        dec_portion = tf.matmul(previous_state.h, W_att_dec)
                        decoder_features = tf.expand_dims(
                            tf.expand_dims(dec_portion, 1), 1
                        )  # reshape to (batch_size, 1, 1, attention_vec_size)
                        # Python broadcasting allows the two feature tensors to be added
                        e_not_masked = tf.reduce_sum(
                            v_blend *
                            tf.nn.tanh(encoder_features + decoder_features),
                            [2, 3])  # calculate e, (batch_size, max_enc_steps)
                        # The output of a softmax has the same shape as its input: it just normalizes the values.
                        attn_dist = tf.nn.softmax(
                            e_not_masked)  # (batch_size, max_enc_steps)
                        attn_dist = tf.Print(attn_dist, [tf.shape(attn_dist)],
                                             message="attn_dist",
                                             first_n=5,
                                             summarize=200)

                        # weight each 2d encoder vector by its attn_dist value and sum, keeping one 2d context vector per batch example
                        context_vector = tf.reduce_sum(
                            tf.reshape(attn_dist, [N, -1, 1, 1]) *
                            encoder_output,
                            [1, 2])  # shape (batch_size, attn_size).
                        context_vector = tf.reshape(context_vector,
                                                    [-1, 2 * nodes])
                        #next_input = tf.cond(self.is_train, lambda: tf.concat(
                        #    [tf.reshape(decoder_emb_inp[:, time], (N, dw)), context_vector], 1),
                        #                     lambda: tf.concat([tf.nn.embedding_lookup(word_emb_mat, prediction), context_vector], 1))
                        #output_logits = tf.add(tf.matmul(previous_output, W_dense), b_dense)
                        prediction = tf.cond(
                            self.pointer_gen,
                            lambda: execute_pointer_network(attn_dist),
                            lambda: execute_normal_decoder(
                                previous_output, W_dense, b_dense))

                        with tf.variable_scope("modified_dec_inputs",
                                               reuse=tf.AUTO_REUSE):
                            next_input = tf.cond(
                                self.is_train,
                                lambda: _linear(args=[context_vector] + [
                                    tf.reshape(decoder_emb_inp[:, time],
                                               (N, dw))
                                ],
                                                output_size=dw,
                                                bias=True),
                                lambda: _linear([context_vector] + [
                                    tf.nn.embedding_lookup(
                                        word_emb_mat, prediction)
                                ], dw, True))

                        return next_input, attn_dist

                    elements_finished = (
                        time >= self.target_sequence_length
                    )  # this operation produces a boolean tensor of shape [batch_size]
                    # indicating whether the corresponding sequence has ended
                    finished = tf.reduce_all(
                        elements_finished)  # -> boolean scalar
                    #input = tf.cond(finished, lambda: tf.concat([pad_step_embedded, my_encoder_final_state_h], 1),get_next_input)
                    input, attn_distribution = tf.cond(
                        finished, lambda:
                        (pad_step_embedded, tf.zeros([N, M * JX], tf.float32)),
                        get_next_input)
                    attn_distribution = tf.Print(attn_distribution,
                                                 [tf.shape(attn_distribution)],
                                                 message="attn_distribution",
                                                 first_n=5,
                                                 summarize=200)
                    state = previous_state
                    output = tf.cond(self.pointer_gen,
                                     lambda: attn_distribution,
                                     lambda: previous_output)
                    output = tf.Print(output, [tf.shape(output)],
                                      message="OUTPUT",
                                      first_n=5,
                                      summarize=200)

                    loop_state = None

                    return (elements_finished, input, state, output,
                            loop_state)

                def loop_fn(time, previous_output, previous_state,
                            previous_loop_state):
                    if previous_state is None:  # time == 0
                        assert previous_output is None and previous_state is None
                        return loop_fn_initial()
                    else:
                        return loop_fn_transition(time, previous_output,
                                                  previous_state,
                                                  previous_loop_state)

                decoder_outputs_ta, decoder_final_state, _ = tf.nn.raw_rnn(
                    decoder_cell, loop_fn)
                decoder_outputs = decoder_outputs_ta.stack()
                decoder_outputs = tf.Print(decoder_outputs,
                                           [tf.shape(decoder_outputs)],
                                           message="decoder_outputs",
                                           first_n=5,
                                           summarize=200)

                # To do the output projection, we have to temporarily flatten decoder_outputs from [max_steps, batch_size, hidden_dim] to
                #  [max_steps*batch_size, hidden_dim], as tf.matmul here needs rank-2 tensors.
                decoder_max_steps, decoder_batch_size, decoder_dim = tf.unstack(
                    tf.shape(decoder_outputs))
                decoder_outputs_flat = tf.reshape(decoder_outputs,
                                                  (-1, decoder_dim))
                # for the pointer network, there is no need to pass through the dense layer
                decoder_logits_flat = tf.cond(
                    self.pointer_gen,
                    lambda: decoder_outputs_flat, lambda: tf.add(
                        tf.matmul(decoder_outputs_flat, W_dense), b_dense))
                decoder_logits = tf.cond(
                    self.pointer_gen, lambda: tf.reshape(
                        decoder_logits_flat,
                        (decoder_max_steps, decoder_batch_size, decoder_dim)),
                    lambda: tf.reshape(decoder_logits_flat,
                                       (decoder_max_steps, decoder_batch_size,
                                        tgt_vocab_size)))
                decoder_logits = _transpose_batch_time(decoder_logits)
                #decoder_prediction = tf.argmax(decoder_logits, -1)

            #self.decoder_logits_train = final_outputs.rnn_output
            self.decoder_logits_train = decoder_logits
            self.index_start = flat_logits
            self.index_end = flat_logits2
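The attention comments inside get_next_input note that applying a 1x1 conv2d over the encoder output is the same operation as reshaping to (-1, 2*d) and doing a single matmul. Here is a minimal NumPy sketch checking that equivalence, with toy shapes that merely stand in for the model's tensors:

import numpy as np

N, T, D = 2, 5, 4                     # toy batch size, encoder steps, feature size
enc = np.random.rand(N, T, 1, D)      # plays the role of encoder_output
W = np.random.rand(D, D)              # plays the role of W_att_enc squeezed to 2-D

# "1x1 convolution": every (n, t) position is projected independently by W
conv_like = np.einsum('ntod,de->ntoe', enc, W)

# flatten-then-matmul formulation
matmul_like = (enc.reshape(-1, D) @ W).reshape(N, T, 1, D)

assert np.allclose(conv_like, matmul_like)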
    def main_graph(self,
                   trained_model,
                   scope,
                   emb_dim,
                   gru,
                   rnn_dim,
                   rnn_num,
                   drop_out=0.5,
                   emb=None,
                   ngram_embedding=None):
        """
        :param trained_model:
        :param scope:
        :param emb_dim:
        :param gru:
        :param rnn_dim:
        :param rnn_num:
        :param drop_out:
        :param emb:
        :return:
        """
        # trained_model: path where the model is stored
        if trained_model is not None:
            param_dic = {
                'nums_chars': self.nums_chars,
                'nums_tags': self.nums_tags,
                'tag_scheme': self.tag_scheme,
                'crf': self.crf,
                'emb_dim': emb_dim,
                'gru': gru,
                'rnn_dim': rnn_dim,
                'rnn_num': rnn_num,
                'drop_out': drop_out,
                'buckets_char': self.buckets_char,
                'ngram': self.ngram
            }
            print "RNN dimension is %d" % rnn_dim
            print "RNN number is %d" % rnn_num
            print "Character embedding size is %d" % emb_dim
            # save the model hyperparameters
            if self.metric == 'All':
                # rindex() returns the last position at which the substring occurs in the string
                # split the path to get the model file name
                pindex = trained_model.rindex('/') + 1
                for m in self.all_metrics:
                    f_model = open(
                        trained_model[:pindex] + m + '_' +
                        trained_model[pindex:], 'wb')
                    pickle.dump(param_dic, f_model)
                    f_model.close()
            else:
                f_model = open(trained_model, 'wb')
                pickle.dump(param_dic, f_model)
                f_model.close()

        # define shared weights and variables

        dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
        self.drop_out = dr
        self.drop_out_v = drop_out

        # character embedding layer
        # why does the number of characters need the extra 500?
        # emb_dim is the feature-vector dimension of each character; it can be set via a command-line argument
        # weights holds the pre-trained character embeddings; it can also be set via a command-line argument
        self.emb_layer = EmbeddingLayer(self.nums_chars + 500,
                                        emb_dim,
                                        weights=emb,
                                        name='emb_layer')

        if self.ngram is not None:
            if ngram_embedding is not None:
                assert len(ngram_embedding) == len(self.ngram)
            else:
                ngram_embedding = [None for _ in range(len(self.ngram))]
            for i, n_gram in enumerate(self.ngram):
                self.gram_layers.append(
                    EmbeddingLayer(n_gram + 1000 * (i + 2),
                                   emb_dim,
                                   weights=ngram_embedding[i],
                                   name=str(i + 2) + 'gram_layer'))

        # hidden layer: the input is the forward RNN output concatenated with the backward RNN output, so the input dimension is rnn_dim * 2
        # the output dimension equals the number of tags
        tag_output_wrapper = TimeDistributed(HiddenLayer(rnn_dim * 2,
                                                         self.nums_tags[0],
                                                         activation='linear',
                                                         name='tag_hidden'),
                                             name='tag_output_wrapper')

        if self.char_freq_loss:
            freq_output_wrapper = TimeDistributed(HiddenLayer(
                rnn_dim * 2, 1, activation='sigmoid', name='freq_hidden'),
                                                  name='freq_output_wrapper')

        if self.co_train:
            lm_fw_wrapper = TimeDistributed(HiddenLayer(rnn_dim,
                                                        self.nums_chars + 2,
                                                        activation='linear',
                                                        name='lm_fw_hidden'),
                                            name='lm_fw_wrapper')
            lm_bw_wrapper = TimeDistributed(HiddenLayer(rnn_dim,
                                                        self.nums_chars + 2,
                                                        activation='linear',
                                                        name='lm_bw_hidden'),
                                            name='lm_bw_wrapper')

        # define model for each bucket
        # sentences in each bucket have a different length, so a separate model is defined for each bucket
        # bucket: the sentence length for this bucket
        for idx, bucket in enumerate(self.buckets_char):
            if idx == 1:
                # scope is tf.variable_scope("tagger", reuse=None, initializer=initializer)
                # reuse only needs to be set once; all later buckets then reuse the variables
                scope.reuse_variables()
            t1 = time()

            # the input sentences, as integer (one-hot index) vectors
            # shape = (batch_size, sentence length)
            input_sentences = tf.placeholder(tf.int32, [None, bucket],
                                             name='input_' + str(bucket))

            self.input_v.append([input_sentences])

            emb_set = []
            word_out = self.emb_layer(input_sentences)
            emb_set.append(word_out)

            if self.ngram is not None:
                for i in range(len(self.ngram)):
                    input_g = tf.placeholder(tf.int32, [None, bucket],
                                             name='input_g' + str(i) +
                                             str(bucket))
                    self.input_v[-1].append(input_g)
                    gram_out = self.gram_layers[i](input_g)
                    emb_set.append(gram_out)

            if len(emb_set) > 1:
                # concatenate the various character-level embeddings directly (character, radical, n-gram, image features, etc.)
                word_embeddings = tf.concat(axis=2, values=emb_set)

            else:
                word_embeddings = emb_set[0]

            # rnn_out is the concatenation of the forward and backward RNN outputs
            rnn_out_fw, rnn_out_bw = BiRNN(rnn_dim,
                                           p=dr,
                                           concat_output=False,
                                           gru=gru,
                                           name='BiLSTM' + str(bucket),
                                           scope='Tag-BiRNN')(word_embeddings,
                                                              input_sentences)

            tag_rnn_out_fw, tag_rnn_out_bw = rnn_out_fw, rnn_out_bw
            if self.co_train:
                if self.highway_layers > 0:
                    tag_rnn_out_fw = highway_network(rnn_out_fw,
                                                     self.highway_layers,
                                                     True,
                                                     is_train=True,
                                                     scope="tag_fw")
                    tag_rnn_out_bw = highway_network(rnn_out_bw,
                                                     self.highway_layers,
                                                     True,
                                                     is_train=True,
                                                     scope="tag_bw")
            tag_rnn_out = tf.concat(values=[tag_rnn_out_fw, tag_rnn_out_bw],
                                    axis=2)

            # apply a fully connected layer, Wx + b, to get the final output
            output = tag_output_wrapper(tag_rnn_out)
            # why [output] rather than output?
            self.output.append([output])

            self.output_.append([
                tf.placeholder(tf.int32, [None, bucket],
                               name='tags' + str(bucket))
            ])

            self.bucket_dit[bucket] = idx

            if self.co_train:
                # language model
                lm_rnn_out_fw, lm_rnn_out_bw = rnn_out_fw, rnn_out_bw
                if self.highway_layers > 0:
                    lm_rnn_out_fw = highway_network(rnn_out_fw,
                                                    self.highway_layers,
                                                    True,
                                                    is_train=True,
                                                    scope="lm_fw")
                    lm_rnn_out_bw = highway_network(rnn_out_bw,
                                                    self.highway_layers,
                                                    True,
                                                    is_train=True,
                                                    scope="lm_bw")

                self.lm_fw_predictions.append([lm_fw_wrapper(lm_rnn_out_fw)])
                self.lm_bw_predictions.append([lm_bw_wrapper(lm_rnn_out_bw)])
                self.lm_fw_groundtruthes.append([
                    tf.placeholder(tf.int32, [None, bucket],
                                   name='lm_fw_targets' + str(bucket))
                ])
                self.lm_bw_groundtruthes.append([
                    tf.placeholder(tf.int32, [None, bucket],
                                   name='lm_bw_targets' + str(bucket))
                ])

            if self.char_freq_loss:
                freq_rnn_out_fw, freq_rnn_out_bw = rnn_out_fw, rnn_out_bw
                if self.highway_layers > 0:
                    freq_rnn_out_fw = highway_network(rnn_out_fw,
                                                      self.highway_layers,
                                                      True,
                                                      is_train=True,
                                                      scope="freq_fw")
                    freq_rnn_out_bw = highway_network(rnn_out_bw,
                                                      self.highway_layers,
                                                      True,
                                                      is_train=True,
                                                      scope="freq_bw")
                freq_rnn_out = tf.concat(
                    values=[freq_rnn_out_fw, freq_rnn_out_bw], axis=2)

                self.char_freq_groundtruthes.append([
                    tf.placeholder(tf.float32, [None, bucket],
                                   name='freq_targets_%d' % bucket)
                ])
                self.char_freq_predictions.append(
                    [freq_output_wrapper(freq_rnn_out)])

            print('Bucket %d, %f seconds' % (idx + 1, time() - t1))

        assert \
            len(self.input_v) == len(self.output) and \
            len(self.output) == len(self.output_) and \
            len(self.output) == len(self.counts)

        self.params = tf.trainable_variables()

        self.saver = tf.train.Saver()
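The highway_network calls used in these examples gate a learned transform against the raw input, y = t * H(x) + (1 - t) * x, with transform gate t = sigmoid(x W_t + b_t). Below is a minimal single-layer NumPy sketch of that gating, with hypothetical weights chosen only for illustration:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def highway_layer(x, W_h, b_h, W_t, b_t):
    h = np.tanh(x @ W_h + b_h)       # candidate transform H(x)
    t = sigmoid(x @ W_t + b_t)       # transform gate
    return t * h + (1.0 - t) * x     # carry gate is (1 - t)

dim = 8
x = np.random.rand(4, dim)
W_h, b_h = np.random.randn(dim, dim), np.zeros(dim)
W_t, b_t = np.random.randn(dim, dim), np.full(dim, -1.0)   # negative gate bias favors carrying x through
print(highway_layer(x, W_h, b_h, W_t, b_t).shape)           # (4, 8)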