Ejemplo n.º 1
0
 def create_attention(self):
     """Build the decoder-to-passage attention and the vocabulary projection.

     Attends from ``self.decoder_h`` over ``self.passage_enc`` using
     ``func.dot_attention``, concatenates the attention output onto the
     decoder states, multiplies each question position by its own slice of
     ``wt`` (the einsum indexes ``wt`` by position), applies ``tanh`` and
     finally projects with ``ws`` to ``self.vocab_size`` columns.

     Everything produced is stored on ``self``: ``ct``, ``combined_h``,
     ``wt``, ``ws``, ``wt_h`` and ``ws_tanh_wt``.
     """
     with tf.name_scope('attention'):
         context = func.dot_attention(
             self.decoder_h,
             self.passage_enc,
             self.passage_mask,
             config.dot_attention_dim,
             self.input_keep_prob,
         )
         self.ct = context

         # Author's shape note: [batch, question_len, 450].
         merged = tf.concat([self.decoder_h, context], -1, name='combined_h')
         self.combined_h = merged

         # One [combined_dim, decoder_hidden_dim] matrix per question position,
         # allocated for the maximum question length.
         wt_shape = [
             config.max_question_len,
             merged.get_shape()[-1],
             config.decoder_hidden_dim,
         ]
         self.wt = tf.get_variable('wt', shape=wt_shape)

         ws_shape = [config.decoder_hidden_dim, self.vocab_size]
         self.ws = tf.get_variable('ws', shape=ws_shape)

         # Only the first `steps` position matrices are used, where `steps`
         # is the run-time length of axis 1 of the concatenated states.
         steps = tf.shape(merged)[1]
         active_wt = self.wt[:steps, :, :]
         self.wt_h = tf.einsum('bij,ijk->bik', merged, active_wt, name='wt_h')

         squashed = tf.tanh(self.wt_h)
         self.ws_tanh_wt = tf.einsum('bik,kj->bij', squashed, self.ws)
Ejemplo n.º 2
0
    def ready(self):
        """Assemble the graph that scores the answer alternatives.

        Stages, in order: word-embedding lookup for passage, question and
        alternatives; one 3-layer GRU encoder applied to both passage and
        question; question-to-passage attention followed by a GRU ("q2c");
        question-to-alternative attention ("q2o"); alternative-to-passage
        attention averaged over axis 1 ("o2c"); and a bilinear score per
        alternative ("predict").

        Sets ``self.yp`` (arg-max over the alternative logits) and
        ``self.loss`` (mean softmax cross-entropy against ``self.y``).
        Shape comments below are the original author's notes.
        """
        config = self.config
        # The two max-length attributes are unpacked but not used below.
        N, _c_max, _q_max, d = (config.batch_size, self.c_maxlen,
                                self.q_maxlen, config.hidden)
        gru = cudnn_gru if config.use_cudnn else native_gru

        # Keyword arguments repeated by every dropout-aware helper call.
        drop_kw = dict(keep_prob=config.keep_prob, is_train=self.is_train)
        rnn_kw = dict(num_units=d, batch_size=N, **drop_kw)

        # Word-embedding layer.
        with tf.variable_scope("emb"):
            with tf.name_scope("word"):
                c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
                q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)
                alter_emb = tf.nn.embedding_lookup(self.word_mat,
                                                   self.alternatives)

        # Encoding layer.
        with tf.variable_scope("encoding"):
            emb_dim = c_emb.get_shape().as_list()[-1]
            encoder = gru(num_layers=3, input_size=emb_dim, **rnn_kw)
            # [batch, c, 2*d*3]
            # 2: bidirectional GRU; 3: the outputs of the three layers are
            # concatenated to form the final output.
            c = encoder(c_emb, seq_len=self.c_len)
            q = encoder(q_emb, seq_len=self.q_len)

        # with tf.variable_scope("mru_encoder"):
        #     c_m = mru(c, self.c_maxlen, self.c_mask, mru_range, 250)

        with tf.variable_scope("q2c"):
            # q2c.shape=[b,c,c.shape[-1]+q.shape[-1]]=[b,c,12d]
            q2c = dot_attention(c, q, mask=self.q_mask, **drop_kw)
            q2c_dim = q2c.get_shape().as_list()[-1]
            passage_rnn = gru(num_layers=1, input_size=q2c_dim, **rnn_kw)
            # v_c.shape=[b,c,2d]
            v_c = passage_rnn(q2c, seq_len=self.c_len)

        with tf.variable_scope("q2o"):
            # alter.shape=[b,3,6d]
            alter = tf.layers.dense(alter_emb,
                                    units=6 * d,
                                    activation=tf.nn.relu)
            # q2o.shape=[b,3,12d]
            q2o = dot_attention(alter, q, mask=self.q_mask, **drop_kw)

        with tf.variable_scope("o2c"):
            # v_o.shape=[b,3,2d]
            v_o = tf.layers.dense(q2o, units=2 * d, activation=tf.nn.relu)
            # o2c.shape=[b,c,4d]
            o2c = dot_attention(v_c, v_o, mask=self.alter_mask, **drop_kw)
            r_c = tf.reduce_mean(o2c, axis=1, keepdims=True)

        with tf.variable_scope("predict"):
            # logits.shape=[b,3]
            scores = bilinear(r_c, v_o)
            num_alternatives = v_o.get_shape().as_list()[1]
            logits = tf.reshape(scores, [N, num_alternatives])

            probs = tf.nn.softmax(logits)
            self.yp = tf.argmax(probs, axis=1)

            per_example = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits, labels=tf.stop_gradient(self.y))
            self.loss = tf.reduce_mean(per_example)
Ejemplo n.º 3
0
    def ready(self):
        """Build the span-extraction graph plus a document-selection head.

        Stages, in order: word-embedding lookup (title, passage, question);
        a 3-layer GRU encoder applied to passage and question;
        question-to-passage attention with a GRU; passage self-attention
        with a GRU; a pointer network producing start/end logits; and a
        "select" head that scores the whole passage.

        Sets on ``self``:
            yp1, yp2: arg-max start / end positions taken from the outer
                product of the two softmax distributions.
            loss: mean of the summed start and end cross-entropies.
            s: overwritten with its ``tf.float32`` cast.
            loss_s: per-example sigmoid cross-entropy of the selection
                logit against ``self.s`` (not reduced).
        """
        config = self.config
        N, PL, QL, d = config.batch_size, self.c_maxlen, self.q_maxlen, config.hidden
        gru = cudnn_gru if config.use_cudnn else native_gru

        with tf.name_scope("embedding"):
            with tf.name_scope("title"):
                # NOTE(review): the title embedding is looked up but not
                # consumed anywhere below.
                t_emb = tf.nn.embedding_lookup(self.word_mat, self.t)

            with tf.name_scope("word"):
                c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
                q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

        # c_emb = tf.concat([c_emb, ch_emb], axis=2)
        # q_emb = tf.concat([q_emb, qh_emb], axis=2)

        with tf.variable_scope("encoding"):
            # The same encoder object is applied to passage and question.
            rnn = gru(num_layers=3,
                      num_units=d,
                      batch_size=N,
                      input_size=c_emb.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            c = rnn(c_emb, seq_len=self.c_len)
            q = rnn(q_emb, seq_len=self.q_len)

        with tf.variable_scope("attention"):
            qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                                   keep_prob=config.keep_prob,
                                   is_train=self.is_train)
            rnn = gru(num_layers=1,
                      num_units=d,
                      batch_size=N,
                      input_size=qc_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            att = rnn(qc_att, seq_len=self.c_len)

        with tf.variable_scope("match"):
            self_att = dot_attention(att, att, mask=self.c_mask, hidden=d,
                                     keep_prob=config.keep_prob,
                                     is_train=self.is_train)
            rnn = gru(num_layers=1,
                      num_units=d,
                      batch_size=N,
                      input_size=self_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            match = rnn(self_att, seq_len=self.c_len)

        with tf.variable_scope("pointer"):
            # Only the last 2*d features of the question encoding are
            # summarised into the pointer network's initial state.
            init = summ(q[:, :, -2 * d:], d, mask=self.q_mask,
                        keep_prob=config.ptr_keep_prob, is_train=self.is_train)
            pointer = ptr_net(batch=N,
                              hidden=init.get_shape().as_list()[-1],
                              keep_prob=config.ptr_keep_prob,
                              is_train=self.is_train)
            logits1, logits2 = pointer(init, match, d, self.c_mask)

        # answer predict
        with tf.variable_scope("predict"):
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            # Restricting the answer span length is currently disabled:
            # outer = tf.matrix_band_part(outer, 0, 15)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits1, labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits2, labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

        # document selected
        with tf.variable_scope("select"):
            # batch_size dim
            c_cum = attention_pooling(match, init, self.c_mask, hidden=d)
            fuse = tf.concat([c_cum, init], axis=1)
            fuse = dense(fuse, hidden=d, use_bias=False, scope="fully1")
            fuse = dense(fuse, hidden=1, use_bias=False, scope="fully2")
            # batch_size 1
            # NOTE(review): the probability is computed but not stored on
            # self or used below.
            logits_s = tf.sigmoid(fuse)
            # Squeeze only the trailing size-1 axis produced by the final
            # dense layer. An unqualified squeeze would also drop the batch
            # axis when the batch holds a single example, and the logits
            # would then no longer line up with the labels.
            fuse = tf.squeeze(fuse, axis=-1)
            self.s = tf.cast(self.s, tf.float32)
            self.loss_s = tf.nn.sigmoid_cross_entropy_with_logits(
                logits=fuse, labels=self.s)
Ejemplo n.º 4
0
    def ready(self):
        """Build the span-extraction graph with word and character embeddings.

        Stages, in order: character embeddings summarised per token by a
        bidirectional GRU (final forward/backward states concatenated), word
        embeddings concatenated with them, a 3-layer GRU encoder applied to
        passage and question, question-to-passage attention with a GRU,
        passage self-attention with a GRU, and a pointer network that
        yields start and end logits.

        Sets ``self.yp1`` / ``self.yp2`` (arg-max start / end positions,
        restricted to spans where ``0 <= end - start <= 15``) and
        ``self.loss`` (mean of the summed start and end cross-entropies).
        """
        config = self.config
        N, d = config.batch_size, config.hidden
        PL, QL = self.c_maxlen, self.q_maxlen
        CL, dc, dg = config.char_limit, config.char_dim, config.char_hidden
        gru = cudnn_gru if config.use_cudnn else native_gru

        # Keyword arguments shared by the helper calls below.
        drop_kw = dict(keep_prob=config.keep_prob, is_train=self.is_train)
        ptr_kw = dict(keep_prob=config.ptr_keep_prob, is_train=self.is_train)
        rnn_kw = dict(num_units=d, batch_size=N, **drop_kw)

        with tf.variable_scope("emb"):
            with tf.variable_scope("char"):
                # Flatten to one row of characters per token.
                passage_chars = tf.nn.embedding_lookup(self.char_mat, self.ch)
                passage_chars = tf.reshape(passage_chars, [N * PL, CL, dc])
                question_chars = tf.nn.embedding_lookup(self.char_mat, self.qh)
                question_chars = tf.reshape(question_chars, [N * QL, CL, dc])

                passage_chars = dropout(passage_chars, **drop_kw)
                question_chars = dropout(question_chars, **drop_kw)

                # The same pair of cells is used for passage and question.
                cell_fw = tf.contrib.rnn.GRUCell(dg)
                cell_bw = tf.contrib.rnn.GRUCell(dg)

                _, passage_states = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, passage_chars, self.ch_len,
                    dtype=tf.float32)
                fw_state, bw_state = passage_states
                ch_emb = tf.concat([fw_state, bw_state], axis=1)

                _, question_states = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, question_chars, self.qh_len,
                    dtype=tf.float32)
                fw_state, bw_state = question_states
                qh_emb = tf.concat([fw_state, bw_state], axis=1)

                # Restore the [batch, tokens, 2 * char_hidden] layout.
                qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
                ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])

            with tf.name_scope("word"):
                c_word = tf.nn.embedding_lookup(self.word_mat, self.c)
                q_word = tf.nn.embedding_lookup(self.word_mat, self.q)

            c_emb = tf.concat([c_word, ch_emb], axis=2)
            q_emb = tf.concat([q_word, qh_emb], axis=2)

        with tf.variable_scope("encoding"):
            encoder = gru(num_layers=3,
                          input_size=c_emb.get_shape().as_list()[-1],
                          **rnn_kw)
            c = encoder(c_emb, seq_len=self.c_len)
            q = encoder(q_emb, seq_len=self.q_len)

        with tf.variable_scope("attention"):
            qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                                   **drop_kw)
            att_rnn = gru(num_layers=1,
                          input_size=qc_att.get_shape().as_list()[-1],
                          **rnn_kw)
            att = att_rnn(qc_att, seq_len=self.c_len)

        with tf.variable_scope("match"):
            self_att = dot_attention(att, att, mask=self.c_mask, hidden=d,
                                     **drop_kw)
            match_rnn = gru(num_layers=1,
                            input_size=self_att.get_shape().as_list()[-1],
                            **rnn_kw)
            match = match_rnn(self_att, seq_len=self.c_len)

        with tf.variable_scope("pointer"):
            # Only the last 2*d features of the question encoding feed the
            # pointer network's initial state.
            question_tail = q[:, :, -2 * d:]
            init = summ(question_tail, d, mask=self.q_mask, **ptr_kw)
            pointer = ptr_net(batch=N,
                              hidden=init.get_shape().as_list()[-1],
                              **ptr_kw)
            logits1, logits2 = pointer(init, match, d, self.c_mask)

        with tf.variable_scope("predict"):
            start_probs = tf.expand_dims(tf.nn.softmax(logits1), axis=2)
            end_probs = tf.expand_dims(tf.nn.softmax(logits2), axis=1)
            outer = tf.matmul(start_probs, end_probs)
            outer = tf.matrix_band_part(outer, 0, 15)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

            start_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits1, labels=self.y1)
            end_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits2, labels=self.y2)
            self.loss = tf.reduce_mean(start_loss + end_loss)
Ejemplo n.º 5
0
    def ready(self):
        """Build the multi-passage graph: span extraction, passage selection
        and in-answer tagging.

        The batch is treated as ``config.batch_size / config.passage_num``
        queries with ``config.passage_num`` passages each. On top of the
        encoder / attention / pointer stack the method adds a content model,
        a passage-to-passage attention, soft "pseudo" selection labels and a
        passage-selection classifier.

        Sets on ``self``: ``query_num``, ``is_select`` (overwritten with its
        per-query normalised form), ``pse_is_select``, ``fuse_label``,
        ``yp1``, ``yp2``, ``is_select_p``, ``loss_dict`` and ``loss``.
        """
        config = self.config
        N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden
        gru = cudnn_gru if config.use_cudnn else native_gru

        with tf.variable_scope("emb"):
            with tf.variable_scope("char"):
                # Flatten to one row of characters per token before the
                # character-level RNN.
                ch_emb = tf.reshape(tf.nn.embedding_lookup(
                    self.char_mat, self.ch), [N * PL, CL, dc])
                qh_emb = tf.reshape(tf.nn.embedding_lookup(
                    self.char_mat, self.qh), [N * QL, CL, dc])
                ch_emb = dropout(
                    ch_emb, keep_prob=config.keep_prob, is_train=self.is_train)
                qh_emb = dropout(
                    qh_emb, keep_prob=config.keep_prob, is_train=self.is_train)
                # The same two cells are used for passage and question.
                cell_fw = tf.contrib.rnn.GRUCell(dg)
                cell_bw = tf.contrib.rnn.GRUCell(dg)
                # Only the final forward/backward states are kept as the
                # per-token character representation.
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
                ch_emb = tf.concat([state_fw, state_bw], axis=1)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
                qh_emb = tf.concat([state_fw, state_bw], axis=1)
                qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
                ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])

            with tf.name_scope("word"):
                c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
                q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

            # Word and character features are joined on the feature axis.
            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

        with tf.variable_scope("encoding"):
            # The same encoder object is applied to passage and question.
            rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape(
            ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train)
            c = rnn(c_emb, seq_len=self.c_len)
            q = rnn(q_emb, seq_len=self.q_len)

        with tf.variable_scope("attention"):
            qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                                   keep_prob=config.keep_prob, is_train=self.is_train)
            rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape(
            ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train)
            att = rnn(qc_att, seq_len=self.c_len)

        with tf.variable_scope("match"):
            # Passage self-attention.
            self_att = dot_attention(
                att, att, mask=self.c_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train)
            rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=self_att.get_shape(
            ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train)
            match = rnn(self_att, seq_len=self.c_len) #[10, ?,300]

        with tf.variable_scope("pointer"):
            # Only the last 2*d features of the question encoding are
            # summarised into the pointer network's initial state.
            init = summ(q[:, :, -2 * d:], d, mask=self.q_mask,
                        keep_prob=config.ptr_keep_prob, is_train=self.is_train)
            pointer = ptr_net(batch=N, hidden=init.get_shape().as_list(
            )[-1], keep_prob=config.ptr_keep_prob, is_train=self.is_train)
            logits1, logits2 = pointer(init, match, d, self.c_mask)

        with tf.variable_scope("content_modeling"):

            # logits4 is later trained against self.in_answer; c_semantics is
            # the per-passage vector used by the cross-passage attention.
            logits4, c_semantics = content_model(init, match, config.hidden)

        with tf.variable_scope("cross_passage_attention"):
            self.query_num = int(config.batch_size/config.passage_num)
            # Regroup the flat batch as [query, passage, features].
            c_semantics = tf.reshape(c_semantics, shape=[self.query_num, config.passage_num, -1])
            # Pairwise dot products between the passages of one query:
            # attnc_w[q, i, j] = <c_semantics[q, i], c_semantics[q, j]>.
            attnc_key = tf.tile(tf.expand_dims(c_semantics, axis=2), [1, 1, config.passage_num, 1])
            attnc_mem = tf.tile(tf.expand_dims(c_semantics, axis=1), [1, config.passage_num, 1, 1])
            attnc_w = tf.reduce_sum(attnc_key*attnc_mem, axis=-1)
            # Ones everywhere except zeros on the diagonal.
            attnc_mask = tf.ones([config.passage_num, config.passage_num])-tf.diag([1.0]*config.passage_num)
            # NOTE(review): multiplying by the mask sets the diagonal score to
            # 0 before the softmax; a score of 0 still receives non-zero
            # weight, so a passage is not actually excluded from attending to
            # itself. Confirm whether an additive large-negative mask was
            # intended.
            attnc_w = tf.nn.softmax(attnc_w*attnc_mask, axis=-1)
            # Attention-weighted sum of the other passages' vectors. The tile
            # width assumes c_semantics has 2*config.hidden features — TODO
            # confirm against content_model.
            attncp = tf.reduce_sum(tf.tile(tf.expand_dims(attnc_w, axis=-1), [1, 1, 1, 2*config.hidden])*attnc_mem, axis= 2)
        
        
        with tf.variable_scope("pseudo_label"):
            self.is_select = tf.reshape(tf.squeeze(self.is_select), shape=[self.query_num, config.passage_num])
            # Normalise the selection labels of each query to sum to one.
            # NOTE(review): no epsilon is added here, so a query whose labels
            # sum to zero divides by zero — confirm the data guarantees at
            # least one selected passage per query.
            self.is_select = self.is_select/tf.tile(tf.reduce_sum(self.is_select, axis=-1, keepdims=True), [1, config.passage_num])
            sim_matrix = attnc_w
            lb_matrix = tf.tile(tf.expand_dims(self.is_select, axis=1), [1, config.passage_num, 1])
            # Pseudo label of passage i = similarity-weighted sum of the
            # labels of the passages in the same query.
            self.pse_is_select = tf.reduce_sum(sim_matrix*lb_matrix, axis=-1) + tf.constant([0.00000001]*config.passage_num, dtype=tf.float32)    # avoid all zero
            self.pse_is_select = self.pse_is_select/tf.tile(tf.reduce_sum(self.pse_is_select, axis=-1, keepdims=True), [1,config.passage_num])
            # Training target: 70% true label, 30% pseudo label; no gradient
            # flows through the pseudo label.
            alpha = 0.7
            self.fuse_label = alpha*self.is_select + (1-alpha)*tf.stop_gradient(self.pse_is_select)
        

        with tf.variable_scope("predict_passage"):
            init = tf.reshape(init, shape=[self.query_num, config.passage_num, -1])
            attn_concat = tf.concat([init, attncp, c_semantics], axis=-1)
            # Three dense layers ending in one selection logit per passage.
            d1 = tf.layers.dense(attn_concat, 2*config.hidden, activation= tf.nn.leaky_relu, bias_initializer= tf.glorot_uniform_initializer()) #150
            d2 = tf.layers.dense(d1, config.hidden, activation= tf.nn.leaky_relu, bias_initializer= tf.glorot_uniform_initializer()) #75
            # NOTE(review): squeeze has no axis, so any size-1 dimension
            # (e.g. a single query) is dropped as well.
            logits3 = tf.squeeze(tf.layers.dense(d2, 1, activation= None, bias_initializer= tf.glorot_uniform_initializer()))
        
        with tf.variable_scope("predict"):
            # Outer product of start and end distributions; rows index the
            # start position and columns the end position.
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            # Keep only spans with 0 <= end - start <= 30.
            outer = tf.matrix_band_part(outer, 0, 30)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            #logits3 = tf.reduce_max(tf.reduce_max(outer, axis=2), axis=1)
            self.is_select_p = tf.nn.sigmoid(logits3)

            losses = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits1, labels=tf.stop_gradient(self.y1))
            losses2 = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits2, labels=tf.stop_gradient(self.y2))
           
            # weighted_loss is defined elsewhere; presumably it re-weights
            # each example's loss based on its labels — TODO confirm.
            weighted_losses = weighted_loss(config, 0.000001, self.y1, losses) #0.01
            weighted_losses2 = weighted_loss(config, 0.000001, self.y2, losses2) #0.01
            
            # Selection loss: softmax over the passages of a query against
            # the fused soft label.
            losses3 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits3, labels=tf.stop_gradient(self.fuse_label)))
            
            # Positions flagged in self.in_answer get weight 4, others 1.
            in_answer_weight = tf.ones_like(self.in_answer) + 3*self.in_answer
            
            losses4 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
                logits=logits4, labels=tf.stop_gradient(self.in_answer))*in_answer_weight, axis=-1)

            weighted_losses4 = weighted_loss(config, 0.000001, self.in_answer, losses4)
            
            # The dictionary holds the unweighted losses, each reduced to its
            # mean. Only existing keys are reassigned inside the loop.
            self.loss_dict = {'pos_s loss':losses, 'pos_e loss':losses2, 'select loss':losses3, 'in answer':losses4}
            for key, values in self.loss_dict.items():
                self.loss_dict[key] = tf.reduce_mean(values)
            
            # Total objective: weighted start + weighted end + selection +
            # weighted in-answer losses.
            self.loss = tf.reduce_mean(weighted_losses + weighted_losses2 + losses3+ weighted_losses4)
    def get_vP(self, i, att_vP, q_, answer_info, y1, y2, c_pr_mask, cmax_c,
               clen_c):
        """Encode paragraph window ``i`` and extend the running accumulators.

        Window ``i`` covers columns ``[i * MPL, (i + 1) * MPL)`` of
        ``self.c_pr`` / ``self.ch_pr`` with ``MPL = config.para_limit``. The
        window is embedded, encoded and attended with the question, and the
        results are appended to the tensors passed in. The signature (counter
        in, counter + 1 out, state threaded through) looks like a
        ``tf.while_loop`` body — TODO confirm against the caller.

        Args:
            i: index of the paragraph window; compared against an int32
                zero constant, so presumably an int32 scalar tensor.
            att_vP: attention outputs accumulated so far (ignored when
                ``i == 0``).
            q_: not read by this method.
            answer_info: returned unchanged.
            y1, y2: label slices accumulated so far (ignored when ``i == 0``).
            c_pr_mask: passage mask accumulated so far (ignored when
                ``i == 0``).
            cmax_c: per-window maximum lengths accumulated so far.
            clen_c: per-example window lengths accumulated so far.

        Returns:
            ``(i + 1, att_vP, q__, answer_info, y1, y2, c_pr_mask, cmax_c,
            clen_c)`` where ``q__`` is the question encoding computed in this
            call and the remaining tensors are the extended accumulators.
        """
        # max para limit
        config = self.config

        # Hard-coded switch: only the trimmed ("opt") path below is ever taken.
        opt = True
        MPL = config.para_limit
        zero = tf.constant(0, dtype=tf.int32)
        # NOTE(review): j is never used.
        j = tf.constant(0, dtype=tf.int32)

        # Word and character ids of the i-th paragraph window.
        c = self.c_pr[:, i * MPL:(i + 1) * MPL]
        ch = self.ch_pr[:, i * MPL:(i + 1) * MPL, :]
        qh = self.qh
        q = self.q

        # Non-zero ids count as real tokens.
        c_mask = tf.cast(c, tf.bool)
        q_mask = self.q_mask

        # passage ranking line:
        #self.pr_mask = tf.cast(self.p, tf.bool)

        c_len = tf.reduce_sum(tf.cast(c_mask, tf.int32), axis=1)
        c_len_int = tf.reshape(c_len, [config.batch_size, 1])
        q_len = self.q_len

        if opt:
            N, CL = config.batch_size, config.char_limit
            # Trim the window to the longest paragraph in this batch.
            c_maxlen = tf.reduce_max(c_len)
            c_maxlen_int = tf.reshape(tf.reduce_max(c_len_int), [1])
            # NOTE(review): q_maxlen is assigned but never read; self.q_maxlen
            # is what is used further down.
            q_maxlen = q_len
            c = tf.slice(c, [0, 0], [N, c_maxlen])
            c_mask = tf.slice(c_mask, [0, 0], [N, c_maxlen])
            q_mask = self.q_mask
            ch = tf.slice(ch, [0, 0, 0], [N, c_maxlen, CL])
            qh = self.qh

            # NOTE(review): temp is never used.
            temp = self.y2[:, i * MPL:(i + 1) * MPL]
            #self.y1 = tf.Print(self.y1,["y1:",tf.shape(self.y1)])
            #self.y2 = tf.Print(self.y2,["y2:",tf.shape(self.y2)])
            # Label slices for this window, trimmed to the same length as c.
            y1__ = tf.slice(self.y1, [0, i * MPL], [N, c_maxlen])
            #y1__ = tf.Print(y1__,["y1__:",tf.shape(y1__)])

            y2__ = tf.slice(self.y2, [0, i * MPL], [N, c_maxlen])

            # First window: start the accumulated mask; afterwards append on
            # the sequence axis.
            def b1():
                return c_mask

            def b2():
                return tf.concat([c_pr_mask, c_mask], axis=1)

            c_pr_mask = tf.cond(tf.equal(i, zero), b1, b2)

            # Same start-or-append pattern for the length bookkeeping.
            def b3():
                return c_maxlen_int, c_len_int

            def b4():
                # Debug output, printed while the graph is being built.
                print(clen_c.get_shape(), c_len_int.get_shape())
                a = tf.concat([cmax_c, c_maxlen_int], axis=0)
                b = tf.concat([clen_c, c_len_int], axis=1)
                return a, b

            cmax_c, clen_c = tf.cond(tf.equal(i, zero), b3, b4)
            # passage ranking
            #print(self.ch_pr.get_shape())
            #print(self.c_pr.get_shape())
            #c_pr_mask = tf.cast(self.c_pr, tf.bool)
            #c_pr_mask = tf.slice(self.c_pr_mask, [0, i*MPL], [N, c_maxlen])
            ###
            ###
            #ch_pr = tf.slice(self.ch_pr, [0, i*MPL, 0], [N, c_maxlen, CL])
        else:
            # NOTE(review): this branch never binds c_maxlen, y1__ or y2__,
            # which are read below, so setting opt to False would raise
            # NameError.
            self.c_maxlen, self.q_maxlen = config.para_limit, config.ques_limit

        # Number of non-zero character ids per token, flattened over batch
        # and token.
        ch_len = tf.reshape(
            tf.reduce_sum(tf.cast(tf.cast(ch, tf.bool), tf.int32), axis=2),
            [-1])
        qh_len = self.qh_len

        config = self.config
        N, PL, QL, CL, d, dc, dg = config.batch_size, c_maxlen, self.q_maxlen, \
         config.char_limit, config.hidden, config.char_dim, config.char_hidden
        gru = cudnn_gru if config.use_cudnn_gru else native_gru

        with tf.variable_scope("emb"):
            with tf.variable_scope("char"):
                #CL = tf.Print(CL,[CL],message="CL:")
                #PL = tf.Print(PL,[PL],message="PL:")
                #self.ch_pr = tf.Print(self.ch_pr,[self.ch_pr.get_shape()],message="ch_pr:")
                #self.c_pr = tf.reshape(self.c_pr, [N, 12, PL])
                #print(self.ch.get_shape())
                #print(self.ch_pr.get_shape())
                #print(self.c.get_shape())
                #print(self.c_pr.get_shape())
                #self.ch_pr = tf.Print(self.ch_pr,[self.ch_pr[:,2:,:]],message="ch_pr")
                # Flatten to one row of characters per token.
                ch_emb = tf.reshape(tf.nn.embedding_lookup(\
                 self.char_mat, ch), [N * PL, CL, dc])
                #	self.char_mat, self.ch), [N * PL, CL, dc])
                # Debug output, printed while the graph is being built.
                print(ch.shape, PL)
                print(qh.shape, QL)
                qh_emb = tf.reshape(tf.nn.embedding_lookup(\
                 self.char_mat, qh), [N * QL, CL, dc])
                ch_emb = dropout(ch_emb,
                                 keep_prob=config.keep_prob,
                                 is_train=self.is_train)
                #ch_emb = tf.Print(ch_emb,[ch_emb],message="ch_emb")
                #qh_emb = tf.Print(qh_emb,[qh_emb],message="qh_emb")
                qh_emb = dropout(qh_emb,
                                 keep_prob=config.keep_prob,
                                 is_train=self.is_train)
                # The cells live on self rather than being created here;
                # only the final forward/backward states are kept.
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    self.cell_fw,
                    self.cell_bw,
                    ch_emb,
                    ch_len,
                    dtype=tf.float32)
                ch_emb = tf.concat([state_fw, state_bw], axis=1)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    self.cell_fw,
                    self.cell_bw,
                    qh_emb,
                    qh_len,
                    dtype=tf.float32)
                #state_fw = tf.Print(state_fw,[state_fw],message="state_fw")
                #state_bw = tf.Print(state_bw,[state_bw],message="state_bw")
                qh_emb = tf.concat([state_fw, state_bw], axis=1)
                qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
                ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])
                #ch_emb = tf.Print(ch_emb,[ch_emb],message="ch_emb")
            with tf.name_scope("word"):
                c_emb = tf.nn.embedding_lookup(self.word_mat, c)
                q_emb = tf.nn.embedding_lookup(self.word_mat, q)

            # Word and character features are joined on the feature axis.
            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

        # AUTO_REUSE lets repeated calls share the variables of this scope.
        with tf.variable_scope("encoding", reuse=tf.AUTO_REUSE):
            # The string below is a no-op expression statement holding an
            # earlier, disabled implementation.
            """
			def f1():
				self.rnn1 = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape(
				).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train)
				return self.rnn1(c_emb, seq_len=self.c_len)
			def f2():
				return self.rnn1(c_emb, seq_len=self.c_len)
			c = tf.cond(tf.equal(i, zero), f1, f2)
			#q = tf.cond(tf.equal(i, zero), f1, f2)
			#c = rnn(c_emb, seq_len=self.c_len)
			q = self.rnn1(q_emb, seq_len=self.q_len)
			self.q_enc = q
			#self.rnn1 = rnn
			"""
            rnn = gru(num_layers=3,
                      num_units=d,
                      batch_size=N,
                      input_size=c_emb.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)

            # The same encoder object is applied to passage and question.
            c = rnn(c_emb, seq_len=c_len)
            q = rnn(q_emb, seq_len=q_len)
            #c_len = tf.Print(c_len,[c_len,tf.shape(c)],message="C:")
            #self.q_enc = q
            # Question encoding handed back to the caller.
            q__ = q

        with tf.variable_scope("attention", reuse=tf.AUTO_REUSE):
            qc_att = dot_attention(c,
                                   q,
                                   mask=q_mask,
                                   hidden=d,
                                   keep_prob=config.keep_prob,
                                   is_train=self.is_train,
                                   name_scope="attention_layer")
            # Another disabled earlier implementation kept as a no-op string.
            """
			print("qc_att:",qc_att.shape)
			def f3():
				self.rnn2 = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape(
				).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train)
				return self.rnn2(qc_att, seq_len=self.c_len)
			def f4():
				return self.rnn2(qc_att, seq_len=self.c_len)
			att = tf.cond(tf.equal(self.i, zero), f3, f4)
			"""
            rnn = gru(num_layers=1,
                      num_units=d,
                      batch_size=N,
                      input_size=qc_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            att = rnn(qc_att, seq_len=c_len)

            ###
            #att = tf.Print(att,[tf.greater(tf.cast(tf.shape(att)[1],tf.int64),y1_),
            #	tf.shape(att)],message="att:")
            # First window: start the accumulator; afterwards append this
            # window's attention output on the sequence axis.
            def f5():
                return att

            def f6():
                return tf.concat([att_vP, att], axis=1)

            #att = rnn(qc_att, seq_len=self.c_len)
            #self.rnn2 = rnn
            # att is the v_P
            att_vP = tf.cond(tf.equal(i, zero), f5, f6)

        # Same start-or-append pattern for the label slices.
        def f7():
            return y1__, y2__

        def f8():
            return tf.concat([y1, y1__], axis=1), tf.concat([y2, y2__], axis=1)

        y1, y2 = tf.cond(tf.equal(i, zero), f7, f8)

        # Advance the window counter and hand back the updated state.
        return tf.add(i, tf.constant(
            1)), att_vP, q__, answer_info, y1, y2, c_pr_mask, cmax_c, clen_c
Ejemplo n.º 7
0
    def ready(self):
        """Build the span-extraction graph and a passage-ranking head.

        The first loop runs ``config.max_para`` times and builds embeddings,
        the encoder, question-to-passage attention, the pointer network and
        the span loss, collecting each attention output in ``att_vP``. The
        second loop builds one passage-ranking score per iteration, and the
        scores are trained against ``self.pr``.

        Sets ``self.yp1``, ``self.yp2`` and ``self.loss`` (from the last
        iteration of the first loop) and ``self.pr_loss``.

        Fixes applied in the ranking head:
            * ``dropout`` was called with the undefined names ``keep_prob``
              and ``is_train``; it now receives ``config.keep_prob`` and
              ``self.is_train`` like every other call in this method.
            * the concatenation read an undefined ``att_rp``; it now uses the
              passage summary ``r_P`` computed just above it, which was
              otherwise unused.
            * the ranking scores were passed through ``tf.nn.softmax`` before
              ``tf.nn.softmax_cross_entropy_with_logits``, which applies the
              softmax itself and expects unscaled logits.

        NOTE(review): every iteration reads the same ``self.pr_ch`` /
        ``self.c`` inputs and re-enters the same variable scopes without
        ``reuse``; confirm how per-paragraph inputs and variable sharing are
        meant to work when ``config.max_para > 1``.
        """
        config = self.config
        N, PL, QL, CL, d, dc, dg = (config.batch_size, self.c_maxlen,
                                    self.q_maxlen, config.char_limit,
                                    config.hidden, config.char_dim,
                                    config.char_hidden)
        gru = cudnn_gru if config.use_cudnn else native_gru

        gi = []
        att_vP = []
        for i in range(config.max_para):
            with tf.variable_scope("emb"):
                with tf.variable_scope("char"):
                    # Flatten to one row of characters per token.
                    ch_emb = tf.reshape(
                        tf.nn.embedding_lookup(self.char_mat, self.pr_ch),
                        [N * PL, CL, dc])
                    qh_emb = tf.reshape(
                        tf.nn.embedding_lookup(self.char_mat, self.qh),
                        [N * QL, CL, dc])
                    ch_emb = dropout(ch_emb,
                                     keep_prob=config.keep_prob,
                                     is_train=self.is_train)
                    qh_emb = dropout(qh_emb,
                                     keep_prob=config.keep_prob,
                                     is_train=self.is_train)
                    cell_fw = tf.contrib.rnn.GRUCell(dg)
                    cell_bw = tf.contrib.rnn.GRUCell(dg)
                    # Only the final forward/backward states are kept as the
                    # per-token character representation.
                    _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw,
                        cell_bw,
                        ch_emb,
                        self.ch_len,
                        dtype=tf.float32)
                    ch_emb = tf.concat([state_fw, state_bw], axis=1)
                    _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw,
                        cell_bw,
                        qh_emb,
                        self.qh_len,
                        dtype=tf.float32)
                    qh_emb = tf.concat([state_fw, state_bw], axis=1)
                    qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
                    ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])

                with tf.name_scope("word"):
                    c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
                    q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

                c_emb = tf.concat([c_emb, ch_emb], axis=2)
                q_emb = tf.concat([q_emb, qh_emb], axis=2)

            with tf.variable_scope("encoding"):
                # The same encoder object is applied to passage and question.
                rnn = gru(num_layers=3,
                          num_units=d,
                          batch_size=N,
                          input_size=c_emb.get_shape().as_list()[-1],
                          keep_prob=config.keep_prob,
                          is_train=self.is_train)
                c = rnn(c_emb, seq_len=self.c_len)
                q = rnn(q_emb, seq_len=self.q_len)

            with tf.variable_scope("attention"):
                qc_att = dot_attention(c,
                                       q,
                                       mask=self.q_mask,
                                       hidden=d,
                                       keep_prob=config.keep_prob,
                                       is_train=self.is_train)
                rnn = gru(num_layers=1,
                          num_units=d,
                          batch_size=N,
                          input_size=qc_att.get_shape().as_list()[-1],
                          keep_prob=config.keep_prob,
                          is_train=self.is_train)
                att = rnn(qc_att, seq_len=self.c_len)
                # att is the v_P
                att_vP.append(att)

            # The self-matching ("match") layer is disabled in this variant;
            # the pointer network reads `att` directly.
            with tf.variable_scope("pointer"):

                # r_Q: summary of the last 2*d features of the question
                # encoding.
                init = summ(q[:, :, -2 * d:],
                            d,
                            mask=self.q_mask,
                            keep_prob=config.ptr_keep_prob,
                            is_train=self.is_train)

                pointer = ptr_net(batch=N,
                                  hidden=init.get_shape().as_list()[-1],
                                  keep_prob=config.ptr_keep_prob,
                                  is_train=self.is_train)
                logits1, logits2 = pointer(init, att, d, self.c_mask)

            with tf.variable_scope("predict"):
                outer = tf.matmul(
                    tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                    tf.expand_dims(tf.nn.softmax(logits2), axis=1))
                # Keep only spans with 0 <= end - start <= 15.
                outer = tf.matrix_band_part(outer, 0, 15)
                self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
                self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
                losses = tf.nn.softmax_cross_entropy_with_logits(
                    logits=logits1, labels=self.y1)
                losses2 = tf.nn.softmax_cross_entropy_with_logits(
                    logits=logits2, labels=self.y2)
                self.loss = tf.reduce_mean(losses + losses2)

        for i in range(config.max_para):
            # Passage ranking
            with tf.variable_scope("passage-ranking-attention"):
                # NOTE(review): `att` and `init` are the values left over
                # from the last iteration of the loop above; att_vP[i] may
                # have been intended — confirm.
                vj_P = dropout(att,
                               keep_prob=config.keep_prob,
                               is_train=self.is_train)
                r_Q = dropout(init,
                              keep_prob=config.keep_prob,
                              is_train=self.is_train)
                r_P = attention(r_Q,
                                vj_P,
                                mask=self.c_mask,
                                hidden=d,
                                keep_prob=config.keep_prob,
                                is_train=self.is_train)

                # Wg: score the passage from the question summary and the
                # question-aware passage summary.
                # NOTE(review): axis=2 assumes both tensors are rank 3 —
                # confirm against the shapes returned by summ / attention.
                concatenate = tf.concat([init, r_P], axis=2)
                g = tf.nn.tanh(
                    dense(concatenate, hidden=d, use_bias=False, scope="g"))
                g_ = dense(g, 1, use_bias=False, scope="g_")
                gi.append(g_)
        gi_ = tf.convert_to_tensor(gi)
        # Feed the raw scores: the loss op applies the softmax internally.
        # NOTE(review): the softmax runs over the last axis of gi_; confirm
        # that this is the passage axis and that it matches self.pr.
        self.pr_loss = tf.nn.softmax_cross_entropy_with_logits(logits=gi_,
                                                               labels=self.pr)
Ejemplo n.º 8
0
def encoder(source, params):
    """Encode a batch of source token ids with a stack of self-attention layers.

    Args:
        source: integer token ids. Positions whose id casts to False (id 0)
            are masked out. Presumably shaped [batch, length] -- TODO confirm.
        params: hyper-parameter object; the attributes read here are
            hidden_size, embed_size, src_vocab, shared_source_target_embedding,
            dropout, num_encoder_layer, num_decoder_layer, num_heads,
            filter_size, deep_transformer_init, initializer_gain and the
            attention/residual/relu dropout rates.

    Returns:
        A dict with:
            "encodes": output of the last encoder layer,
            "decoder_initializer": one {"aan": zeros} entry per decoder layer,
            "mask": the mask returned by util.remove_invalid_seq.
    """
    mask = dtype.tf_to_float(tf.cast(source, tf.bool))
    hidden_size = params.hidden_size
    embed_init = tf.random_normal_initializer(0.0, hidden_size**-0.5)

    source, mask = util.remove_invalid_seq(source, mask)

    # The embedding table's variable name depends on whether source and
    # target share one table.
    if params.shared_source_target_embedding:
        table_name = "embedding"
    else:
        table_name = "src_embedding"
    table_shape = [params.src_vocab.size(), params.embed_size]
    embedding_table = tf.get_variable(table_name,
                                      table_shape,
                                      initializer=embed_init)
    embedding_bias = tf.get_variable("bias", [params.embed_size])

    # Scaled embedding lookup + bias + timing signal, then dropout.
    x = tf.gather(embedding_table, source) * (hidden_size**0.5)
    x = tf.nn.bias_add(x, embedding_bias)
    x = func.add_timing_signal(x)
    x = util.valid_apply_dropout(x, params.dropout)

    with tf.variable_scope("encoder"):
        for depth in range(params.num_encoder_layer):
            # Optional depth-dependent initializer: the gain shrinks with
            # 1/sqrt(depth + 1); otherwise the scope default is used.
            layer_init = None
            if params.deep_transformer_init:
                layer_init = tf.variance_scaling_initializer(
                    params.initializer_gain * (depth + 1)**-0.5,
                    mode="fan_avg",
                    distribution="uniform")

            with tf.variable_scope("layer_{}".format(depth),
                                   initializer=layer_init):
                with tf.variable_scope("self_attention"):
                    attention = func.dot_attention(
                        x,
                        None,
                        func.attention_bias(mask, "masking"),
                        hidden_size,
                        num_heads=params.num_heads,
                        dropout=params.attention_dropout)
                    attended = attention['output']
                    x = func.residual_fn(x,
                                         attended,
                                         dropout=params.residual_dropout)
                    x = func.layer_norm(x)

                with tf.variable_scope("feed_forward"):
                    transformed = func.ffn_layer(
                        x,
                        params.filter_size,
                        hidden_size,
                        dropout=params.relu_dropout,
                    )
                    x = func.residual_fn(x,
                                         transformed,
                                         dropout=params.residual_dropout)
                    x = func.layer_norm(x)

    batch_dim = util.shape_list(x)[0]

    # One zero-filled "aan" state of shape [batch, 1, hidden_size] per decoder
    # layer (presumably the average-attention accumulator -- TODO confirm).
    decoder_states = {}
    for depth in range(params.num_decoder_layer):
        decoder_states["layer_{}".format(depth)] = {
            "aan": dtype.tf_to_float(tf.zeros([batch_dim, 1, hidden_size])),
        }

    return {
        "encodes": x,
        "decoder_initializer": decoder_states,
        "mask": mask
    }
Ejemplo n.º 9
0
    def ready(self):
        """Build the model graph: embeddings (word + char + ELMo), encoder,
        question-to-context attention, self-matching, pointer and span loss.

        Reads the input tensors and settings held on ``self`` (``c``, ``q``,
        ``ch``, ``qh``, ``c_elmo``, ``q_elmo``, the length/mask tensors,
        ``y1``/``y2``, ``word_mat``, ``char_mat``, ``config``) and sets
        ``self.yp1``, ``self.yp2`` and ``self.loss``.
        """
        config = self.config
        N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden
        # This variant always uses native_rnn; the cell type is passed as
        # config.cell at each construction below.
        gru = native_rnn

        # ELMo features for context and question; each is fed to weight_layers
        # twice below (once as 'embedding', once as 'encoding').
        c_elmo_features = self.elmo(self.c_elmo)
        q_elmo_features = self.elmo(self.q_elmo)

        with tf.variable_scope("emb"):
            with tf.variable_scope("char"):
                # Character embeddings, flattened to one row per token:
                # [N * PL, CL, dc] for the context, [N * QL, CL, dc] for the question.
                ch_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.ch),
                    [N * PL, CL, dc])
                qh_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.qh),
                    [N * QL, CL, dc])
                ch_emb = dropout(ch_emb,
                                 keep_prob=config.keep_prob,
                                 is_train=self.is_train)
                qh_emb = dropout(qh_emb,
                                 keep_prob=config.keep_prob,
                                 is_train=self.is_train)
                # The same pair of GRU cells is used for both context and
                # question characters.
                cell_fw = tf.contrib.rnn.GRUCell(dg)
                cell_bw = tf.contrib.rnn.GRUCell(dg)
                # Only the final forward/backward states are kept; concatenated
                # they form a 2 * dg vector per token.
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
                ch_emb = tf.concat([state_fw, state_bw], axis=1)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
                qh_emb = tf.concat([state_fw, state_bw], axis=1)
                # Restore the [batch, tokens, features] layout.
                qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
                ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])

            with tf.name_scope("word"):
                c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
                q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

            # presumably bilm's weight_layers (learned mix of ELMo layers) --
            # confirm against the file's imports.
            c_elmo_emb = weight_layers('embedding',
                                       c_elmo_features,
                                       l2_coef=0.0,
                                       do_layer_norm=False)['weighted_op']
            # Switch the current ("emb") scope to reuse mode so the question
            # call below picks up the 'embedding' weights created just above.
            # Order matters: nothing after this point in the scope may create
            # new variables.
            tf.get_variable_scope().reuse_variables()
            q_elmo_emb = weight_layers('embedding',
                                       q_elmo_features,
                                       l2_coef=0.0,
                                       do_layer_norm=False)['weighted_op']

            c_elmo_emb = dropout(c_elmo_emb,
                                 keep_prob=config.elmo_keep_prob,
                                 is_train=self.is_train)
            q_elmo_emb = dropout(q_elmo_emb,
                                 keep_prob=config.elmo_keep_prob,
                                 is_train=self.is_train)

            # Final token representation: word + char + ELMo features.
            c_emb = tf.concat([c_emb, ch_emb, c_elmo_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb, q_elmo_emb], axis=2)

        with tf.variable_scope("encoding"):
            # One 3-layer RNN object is applied to both context and question.
            rnn = gru(config.cell,
                      num_layers=3,
                      num_units=d,
                      batch_size=N,
                      input_size=c_emb.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            c = rnn(c_emb, seq_len=self.c_len)
            q = rnn(q_emb, seq_len=self.q_len)

        with tf.variable_scope("attention"):
            # Context attends over the question (masked by q_mask), then a
            # single-layer RNN runs over the attended context.
            qc_att = dot_attention(c,
                                   q,
                                   mask=self.q_mask,
                                   hidden=d,
                                   keep_prob=config.keep_prob,
                                   is_train=self.is_train)
            rnn = gru(config.cell,
                      num_layers=1,
                      num_units=d,
                      batch_size=N,
                      input_size=qc_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            att = rnn(qc_att, seq_len=self.c_len)

        with tf.variable_scope("match"):
            # Self-matching: the attended context attends over itself.
            self_att = dot_attention(att,
                                     att,
                                     mask=self.c_mask,
                                     hidden=d,
                                     keep_prob=config.keep_prob,
                                     is_train=self.is_train)
            rnn = gru(config.cell,
                      num_layers=1,
                      num_units=d,
                      batch_size=N,
                      input_size=self_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            match = rnn(self_att, seq_len=self.c_len)

            # Second ELMo mix ('encoding'), again created for the context and
            # reused for the question via reuse_variables().
            c_elmo_enc = weight_layers('encoding',
                                       c_elmo_features,
                                       l2_coef=0.0,
                                       do_layer_norm=False)['weighted_op']
            tf.get_variable_scope().reuse_variables()
            q_elmo_enc = weight_layers('encoding',
                                       q_elmo_features,
                                       l2_coef=0.0,
                                       do_layer_norm=False)['weighted_op']

            c_elmo_enc = dropout(c_elmo_enc,
                                 keep_prob=config.elmo_keep_prob,
                                 is_train=self.is_train)
            q_elmo_enc = dropout(q_elmo_enc,
                                 keep_prob=config.elmo_keep_prob,
                                 is_train=self.is_train)

            # Append the ELMo 'encoding' mix along the feature axis.
            match = tf.concat([match, c_elmo_enc], -1)
            q = tf.concat([q, q_elmo_enc], -1)

        with tf.variable_scope("pointer"):
            # Summarise the question from the last 2 * d features of q into
            # the pointer's initial state. NOTE(review): after the concat
            # above, the trailing features of q include the ELMo 'encoding'
            # mix -- confirm this slice is the intended one.
            init = summ(q[:, :, -2 * d:],
                        d,
                        mask=self.q_mask,
                        keep_prob=config.ptr_keep_prob,
                        is_train=self.is_train)
            pointer = ptr_net(batch=N,
                              hidden=init.get_shape().as_list()[-1],
                              keep_prob=config.ptr_keep_prob,
                              is_train=self.is_train)
            # logits1 / logits2 score the answer start / end position.
            logits1, logits2 = pointer(init, match, d, self.c_mask)

        with tf.variable_scope("predict"):
            # outer[b, i, j] = P(start = i) * P(end = j).
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            # Keep only spans with 0 <= end - start <= 15.
            outer = tf.matrix_band_part(outer, 0, 15)
            # Best start / end index under that span constraint.
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            # The _v2 op can backpropagate into labels; stop_gradient blocks that.
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits1, labels=tf.stop_gradient(self.y1))
            losses2 = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits2, labels=tf.stop_gradient(self.y2))
            # Mean over the batch of (start loss + end loss).
            self.loss = tf.reduce_mean(losses + losses2)
    def ready(self):
        """Build the model graph (char + word embeddings, encoder, attention,
        self-matching, pointer, span loss) and expose debugging tensors.

        Besides ``self.yp1``, ``self.yp2`` and ``self.loss`` this sets
        ``self.att_logits`` / ``self.att_outputs``, ``self.match_logits`` /
        ``self.match_outputs`` and ``self.predict_outer_start`` /
        ``self.predict_outer_end``, and registers several histogram summaries.
        """
        config = self.config
        N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden
        # RNN implementation is selected by config.use_cudnn.
        gru = cudnn_gru if config.use_cudnn else native_gru

        with tf.variable_scope("emb"):
            with tf.variable_scope("char"):
                # Character embeddings, flattened to one row per token.
                ch_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.ch),
                    [N * PL, CL, dc])
                qh_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.qh),
                    [N * QL, CL, dc])
                ch_emb = dropout(ch_emb,
                                 keep_prob=config.keep_prob,
                                 is_train=self.is_train)
                qh_emb = dropout(qh_emb,
                                 keep_prob=config.keep_prob,
                                 is_train=self.is_train)
                # The same GRU cells process context and question characters.
                cell_fw = tf.contrib.rnn.GRUCell(dg)
                cell_bw = tf.contrib.rnn.GRUCell(dg)
                # Keep only the final forward/backward states (2 * dg per token).
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
                ch_emb = tf.concat([state_fw, state_bw], axis=1)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
                qh_emb = tf.concat([state_fw, state_bw], axis=1)
                # Restore the [batch, tokens, features] layout.
                qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
                ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])

            with tf.name_scope("word"):
                c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
                q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

            # Token representation: word embedding + char-level features.
            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

        with tf.variable_scope("encoding"):
            # One 3-layer RNN object is applied to both context and question.
            rnn = gru(num_layers=3,
                      num_units=d,
                      batch_size=N,
                      input_size=c_emb.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            c = rnn(c_emb, seq_len=self.c_len)
            q = rnn(q_emb, seq_len=self.q_len)

        with tf.variable_scope("attention"):
            # Context attends over the question (masked by q_mask).
            qc_att = dot_attention(c,
                                   q,
                                   mask=self.q_mask,
                                   hidden=d,
                                   keep_prob=config.keep_prob,
                                   is_train=self.is_train,
                                   name_scope="attention_layer")
            rnn = gru(num_layers=1,
                      num_units=d,
                      batch_size=N,
                      input_size=qc_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            att = rnn(qc_att, seq_len=self.c_len)
            tf.summary.histogram('vt_P', att)
            # NOTE(review): assumes dot_attention appends to the
            # 'Softmax_logits' / 'MatMul_outputs' graph collections in call
            # order, so index 0 belongs to this layer -- confirm in
            # dot_attention. Reordering the two dot_attention calls would
            # silently swap these tensors.
            self.att_logits = tf.get_collection('Softmax_logits')[0]
            self.att_outputs = tf.get_collection('MatMul_outputs')[0]

        with tf.variable_scope("match"):
            # Self-matching: the attended context attends over itself.
            self_att = dot_attention(att,
                                     att,
                                     mask=self.c_mask,
                                     hidden=d,
                                     keep_prob=config.keep_prob,
                                     is_train=self.is_train,
                                     name_scope="match_layer")
            rnn = gru(num_layers=1,
                      num_units=d,
                      batch_size=N,
                      input_size=self_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            match = rnn(self_att, seq_len=self.c_len)
            tf.summary.histogram('self_match', match)
            # Index 1: presumably the entries added by the second
            # dot_attention call (same assumption as above).
            self.match_logits = tf.get_collection('Softmax_logits')[1]
            self.match_outputs = tf.get_collection('MatMul_outputs')[1]

        with tf.variable_scope("pointer"):
            # r_Q: question summary built from the last 2 * d features of q;
            # it is the pointer network's initial state.
            init = summ(q[:, :, -2 * d:],
                        d,
                        mask=self.q_mask,
                        keep_prob=config.ptr_keep_prob,
                        is_train=self.is_train)

            pointer = ptr_net(batch=N,
                              hidden=init.get_shape().as_list()[-1],
                              keep_prob=config.ptr_keep_prob,
                              is_train=self.is_train)
            # logits1 / logits2 score the answer start / end position.
            logits1, logits2 = pointer(init, match, d, self.c_mask)
            tf.summary.histogram('rQ_init', init)
            tf.summary.histogram('pointer_logits_1', logits1)
            tf.summary.histogram('pointer_logits_2', logits2)

        with tf.variable_scope("predict"):
            # outer[b, i, j] = P(start = i) * P(end = j).
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            # Keep only spans with 0 <= end - start <= 15.
            outer = tf.matrix_band_part(outer, 0, 15)
            # Best start / end index under that span constraint.
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                             labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                              labels=self.y2)
            # Mean over the batch of (start loss + end loss).
            self.loss = tf.reduce_mean(losses + losses2)
            # Per-position best span score, exposed for inspection: for each
            # start the max over ends, and for each end the max over starts.
            self.predict_outer_start = tf.reduce_max(outer, axis=2)
            self.predict_outer_end = tf.reduce_max(outer, axis=1)
            """
Ejemplo n.º 11
0
    def ready(self):
        """Assemble the reading-comprehension graph and define the span loss.

        Stages: char + word embeddings, a shared 3-layer encoder,
        question-to-paragraph attention, self-matching, and a pointer network
        that scores answer start/end positions. Sets ``self.yp1``,
        ``self.yp2`` (predicted start/end indices) and ``self.loss``.
        """
        config = self.config
        batch = config.batch_size
        para_len = self.c_maxlen
        ques_len = self.q_maxlen
        char_len = config.char_limit
        hidden_dim = config.hidden
        char_dim = config.char_dim
        char_hidden = config.char_hidden
        if config.use_cudnn:
            rnn_cls = CudnnGRU
        else:
            rnn_cls = NativeGRU

        def make_rnn(layers, inputs):
            """Create an RNN stack sized for the last dimension of ``inputs``."""
            return rnn_cls(num_layers=layers,
                           num_units=hidden_dim,
                           batch_size=batch,
                           input_size=inputs.get_shape().as_list()[-1],
                           keep_prob=config.keep_prob,
                           is_train=self.is_train)

        with tf.variable_scope("emb"):
            with tf.variable_scope("char"):
                # Character embeddings, flattened to one row per token.
                para_chars = tf.nn.embedding_lookup(self.char_mat, self.ch)
                para_chars = tf.reshape(
                    para_chars, [batch * para_len, char_len, char_dim])
                ques_chars = tf.nn.embedding_lookup(self.char_mat, self.qh)
                ques_chars = tf.reshape(
                    ques_chars, [batch * ques_len, char_len, char_dim])

                para_chars = dropout(para_chars,
                                     keep_prob=config.keep_prob,
                                     is_train=self.is_train)
                ques_chars = dropout(ques_chars,
                                     keep_prob=config.keep_prob,
                                     is_train=self.is_train)

                # One pair of GRU cells serves both paragraph and question.
                fw_cell = tf.contrib.rnn.GRUCell(char_hidden)
                bw_cell = tf.contrib.rnn.GRUCell(char_hidden)

                def final_states(char_emb, lengths):
                    """Return the concatenated final fw/bw states of the char RNN."""
                    _, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn(
                        fw_cell, bw_cell, char_emb, lengths, dtype=tf.float32)
                    return tf.concat([fw_state, bw_state], axis=1)

                para_chars = final_states(para_chars, self.ch_len)
                ques_chars = final_states(ques_chars, self.qh_len)

                # Back to [batch, tokens, 2 * char_hidden].
                ques_chars = tf.reshape(ques_chars,
                                        [batch, ques_len, 2 * char_hidden])
                para_chars = tf.reshape(para_chars,
                                        [batch, para_len, 2 * char_hidden])

            with tf.name_scope("word"):
                para_words = tf.nn.embedding_lookup(self.word_mat, self.c)
                ques_words = tf.nn.embedding_lookup(self.word_mat, self.q)

            para_emb = tf.concat([para_words, para_chars], axis=2)
            ques_emb = tf.concat([ques_words, ques_chars], axis=2)

        with tf.variable_scope("encoding"):
            # The same encoder object produces both representations.
            encoder = make_rnn(3, para_emb)
            para_enc = encoder(para_emb, seq_len=self.c_len)
            ques_enc = encoder(ques_emb, seq_len=self.q_len)

        # Gated attention RNN (implemented with dot-product attention).
        with tf.variable_scope("attention"):
            attended = dot_attention(para_enc,
                                     ques_enc,
                                     mask=self.q_mask,
                                     hidden=hidden_dim,
                                     keep_prob=config.keep_prob,
                                     is_train=self.is_train)
            attention_rnn = make_rnn(1, attended)
            att = attention_rnn(attended, seq_len=self.c_len)

        # Self-matching RNN: the attended paragraph attends over itself.
        with tf.variable_scope("match"):
            self_attended = dot_attention(att,
                                          att,
                                          mask=self.c_mask,
                                          hidden=hidden_dim,
                                          keep_prob=config.keep_prob,
                                          is_train=self.is_train)
            match_rnn = make_rnn(1, self_attended)
            match = match_rnn(self_attended, seq_len=self.c_len)

        with tf.variable_scope("pointer"):
            # Question summary from the last 2 * hidden_dim features.
            question_summary = summ(ques_enc[:, :, -2 * hidden_dim:],
                                    hidden_dim,
                                    mask=self.q_mask,
                                    keep_prob=config.ptr_keep_prob,
                                    is_train=self.is_train)
            summary_width = question_summary.get_shape().as_list()[-1]
            pointer = PointerNet(batch=batch,
                                 hidden=summary_width,
                                 keep_prob=config.ptr_keep_prob,
                                 is_train=self.is_train)
            logits1, logits2 = pointer(question_summary, match, hidden_dim,
                                       self.c_mask)

        with tf.variable_scope("predict"):
            # span_scores[b, i, j] = P(start = i) * P(end = j), restricted to
            # spans with 0 <= end - start <= 15.
            start_prob = tf.expand_dims(tf.nn.softmax(logits1), axis=2)
            end_prob = tf.expand_dims(tf.nn.softmax(logits2), axis=1)
            span_scores = tf.matmul(start_prob, end_prob)
            span_scores = tf.matrix_band_part(span_scores, 0, 15)

            best_per_start = tf.reduce_max(span_scores, axis=2)
            self.yp1 = tf.argmax(best_per_start, axis=1)
            best_per_end = tf.reduce_max(span_scores, axis=1)
            self.yp2 = tf.argmax(best_per_end, axis=1)

            start_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits1, labels=self.y1)
            end_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits2, labels=self.y2)
            self.loss = tf.reduce_mean(start_loss + end_loss)
Ejemplo n.º 12
0
    def ready(self):
        """Build a binary classifier on top of a frozen embedding + encoder.

        Gradients are stopped after the embedding concat and after the
        encoder, so only the attention RNN and the dense head receive
        gradients from the loss. Sets ``self.c_rnn``, ``self.q_rnn``,
        ``self.att`` (list of head activations), ``self.prediction``
        (2-way logits) and ``self.loss``.

        NOTE(review): ``self.loss`` is the unreduced cross-entropy returned
        by the op (no ``tf.reduce_mean``) -- confirm the training code
        expects that.
        """
        config = self.config
        N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden
        gru = cudnn_gru if config.use_cudnn else native_gru

        with tf.variable_scope("emb"):
            with tf.variable_scope("char"):
                # Character embeddings, flattened to one row per token.
                ch_emb = tf.reshape(tf.nn.embedding_lookup(
                    self.char_mat, self.ch), [N * PL, CL, dc])
                qh_emb = tf.reshape(tf.nn.embedding_lookup(
                    self.char_mat, self.qh), [N * QL, CL, dc])
                ch_emb = dropout(
                    ch_emb, keep_prob=config.keep_prob, is_train=self.is_train)
                qh_emb = dropout(
                    qh_emb, keep_prob=config.keep_prob, is_train=self.is_train)
                # The same GRU cells process context and question characters.
                cell_fw = tf.contrib.rnn.GRUCell(dg)
                cell_bw = tf.contrib.rnn.GRUCell(dg)
                # Keep only the final forward/backward states.
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
                ch_emb = tf.concat([state_fw, state_bw], axis=1)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
                qh_emb = tf.concat([state_fw, state_bw], axis=1)
                qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
                ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])

            with tf.name_scope("word"):
                c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
                q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            # Freeze the embedding stage: no gradient flows back past here.
            c_emb = tf.stop_gradient(c_emb)
            q_emb = tf.stop_gradient(q_emb)

        with tf.variable_scope("encoding"):
            rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape(
            ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train)
            self.c_rnn = c = rnn(c_emb, seq_len=self.c_len)
            self.q_rnn = q = rnn(q_emb, seq_len=self.q_len)

            # Freeze the encoder as well (self.c_rnn / self.q_rnn keep the
            # un-stopped tensors).
            c = tf.stop_gradient(c)
            q = tf.stop_gradient(q)

        with tf.variable_scope("attention"):
            qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                                   keep_prob=config.keep_prob, is_train=self.is_train)
            rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape(
            ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train)

            # Use the RNN output at the last time step as the pair feature.
            self.att = [rnn(qc_att, seq_len=self.c_len)[:, -1, :]]

        # Alternatives tried by the original author:
        #self.att = [tf.concat([self.c_rnn[:,-1,:], self.q_rnn[:,-1,:]], 1)]
        #self.att += [tf.stop_gradient(self.att[-1])]

        with tf.variable_scope("binary"):
            for _ in range(3):
                hidden_out = tf.keras.layers.Dense(
                    300, activation='relu')(self.att[-1])
                # Use the is_train-gated dropout helper (as in the char
                # embedding stage above) so the head is deterministic at
                # evaluation time; raw tf.nn.dropout would always drop units.
                hidden_out = dropout(hidden_out,
                                     keep_prob=config.keep_prob,
                                     is_train=self.is_train)
                self.att += [hidden_out]

            self.prediction = tf.keras.layers.Dense(2)(self.att[-1])

        #self.loss = tf.reduce_mean(tf.squared_difference(self.prediction, tf.cast(self.y_target, tf.float32)))
        self.loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.prediction, labels=tf.stop_gradient(self.y_target))
Ejemplo n.º 13
0
    def ready(self):
        """Build the graph: embeddings, encoder, relation module, attention,
        self-matching, pointer network and span loss.

        Sets ``self.start_logits`` / ``self.stop_logits`` (softmax
        probabilities over start / end positions, despite the names),
        ``self.yp1`` / ``self.yp2`` (predicted start / end indices) and
        ``self.loss``.
        """
        config = self.config
        N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden
        gru = cudnn_gru if config.use_cudnn else native_gru

        with tf.variable_scope("emb"):
            with tf.variable_scope("char"):
                # Character embeddings, flattened to one row per token.
                ch_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.ch),
                    [N * PL, CL, dc])
                qh_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.qh),
                    [N * QL, CL, dc])

                ch_emb = dropout(ch_emb,
                                 keep_prob=config.keep_prob,
                                 is_train=self.is_train)
                qh_emb = dropout(qh_emb,
                                 keep_prob=config.keep_prob,
                                 is_train=self.is_train)

                # The same GRU cells process context and question characters.
                cell_fw = tf.contrib.rnn.GRUCell(dg)
                cell_bw = tf.contrib.rnn.GRUCell(dg)

                # Keep only the final forward/backward states.
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
                ch_emb = tf.concat([state_fw, state_bw], axis=1)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
                qh_emb = tf.concat([state_fw, state_bw], axis=1)
                qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
                ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])
            with tf.name_scope("word"):
                c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
                q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

        with tf.variable_scope("encoding"):
            # Encode the embeddings with a 3-layer RNN (the original note said
            # LSTM; the code builds a GRU).
            rnn = gru(num_layers=3,
                      num_units=d,
                      batch_size=N,
                      input_size=c_emb.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)

            # Original note: outputs have size (batch_size, max_len, hidden_dim).
            c = rnn(c_emb, seq_len=self.c_len)
            q = rnn(q_emb, seq_len=self.q_len)

        # Scope names may not contain spaces: "relation analysis" is rejected
        # by TensorFlow with "is not a valid scope name", so an underscore is
        # used instead.
        with tf.variable_scope("relation_analysis"):
            # Question summary; also reused as the pointer's initial state.
            init = summ(q[:, :, -2 * d:],
                        d,
                        mask=self.q_mask,
                        keep_prob=config.ptr_keep_prob,
                        is_train=self.is_train)
            g_theta_layers = [256, 128, 1]  # attention component
            md = Relation_Module(config, self.c_maxlen, self.q_maxlen,
                                 g_theta_layers)
            # Original note: r adds attention weights using the question summary.
            r, alpha = md.hop_2(c,
                                init,
                                phase=self.is_train,
                                activation=tf.nn.relu)
            # The last element of r replaces the context encoding.
            c = r[-1]

        with tf.variable_scope("attention"):
            # Context attends over the question (masked by q_mask).
            qc_att = dot_attention(c,
                                   q,
                                   mask=self.q_mask,
                                   hidden=d,
                                   keep_prob=config.keep_prob,
                                   is_train=self.is_train)
            rnn = gru(num_layers=1,
                      num_units=d,
                      batch_size=N,
                      input_size=qc_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            att = rnn(qc_att, seq_len=self.c_len)

        with tf.variable_scope("match"):
            # Self-matching: the attended context attends over itself.
            self_att = dot_attention(att,
                                     att,
                                     mask=self.c_mask,
                                     hidden=d,
                                     keep_prob=config.keep_prob,
                                     is_train=self.is_train)
            rnn = gru(num_layers=1,
                      num_units=d,
                      batch_size=N,
                      input_size=self_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            match = rnn(self_att, seq_len=self.c_len)

        # rQ (the question summary `init`) was obtained from the encoded
        # question above, so it is not recomputed here.
        with tf.variable_scope("pointer"):
            # init = summ(q[:, :, -2 * d:], d, mask=self.q_mask,
            #             keep_prob=config.ptr_keep_prob, is_train=self.is_train)
            pointer = ptr_net(batch=N,
                              hidden=init.get_shape().as_list()[-1],
                              keep_prob=config.ptr_keep_prob,
                              is_train=self.is_train)
            logits1, logits2 = pointer(init, match, d, self.c_mask)

        with tf.variable_scope("predict"):
            # Softmax probabilities over start / end positions.
            self.start_logits = tf.nn.softmax(logits1)
            self.stop_logits = tf.nn.softmax(logits2)
            # outer[b, i, j] = P(start = i) * P(end = j); reuse the
            # probabilities computed above instead of recomputing the softmax.
            outer = tf.matmul(tf.expand_dims(self.start_logits, axis=2),
                              tf.expand_dims(self.stop_logits, axis=1))
            # Keep only spans with 0 <= end - start <= 15.
            outer = tf.matrix_band_part(outer, 0, 15)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                             labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                              labels=self.y2)
            # Mean over the batch of (start loss + end loss).
            self.loss = tf.reduce_mean(losses + losses2)
Ejemplo n.º 14
0
    def ready(self):
        """Assemble the reading-comprehension graph and its training loss.

        The graph is built in this order: token embeddings (word vectors,
        optionally extended with character-, BPE- and POS-level features),
        one 3-layer encoder applied to both passage and question,
        question-to-passage attention, passage self-matching, the pointer
        network, and finally the span prediction.

        Nothing is returned; the method sets ``self.yp1``, ``self.yp2`` and
        ``self.loss``.
        """
        config = self.config
        N = config.batch_size
        PL = self.c_maxlen
        QL = self.q_maxlen
        CL = config.char_limit
        BL = config.bpe_limit
        d = config.hidden
        dc = config.glove_dim if config.pretrained_char else config.char_dim
        dg = config.char_hidden
        dbpe = (config.bpe_glove_dim
                if config.pretrained_bpe_emb else config.bpe_dim)
        dbpeh = config.bpe_hidden
        gru = cudnn_gru if config.use_cudnn else native_gru

        # Dropout settings used everywhere except the pointer network.
        drop_args = dict(keep_prob=config.keep_prob, is_train=self.is_train)

        def summarize(fw_cell, bw_cell, pieces, lengths):
            # Run a bidirectional RNN and keep only the two final states,
            # concatenated along axis 1.
            _, (fw_last, bw_last) = tf.nn.bidirectional_dynamic_rnn(
                fw_cell, bw_cell, pieces, lengths, dtype=tf.float32)
            return tf.concat([fw_last, bw_last], axis=1)

        with tf.variable_scope("emb"):
            if config.use_char:
                with tf.variable_scope("char"):
                    passage_chars = tf.nn.embedding_lookup(
                        self.char_mat, self.ch)
                    passage_chars = tf.reshape(passage_chars,
                                               [N * PL, CL, dc])
                    question_chars = tf.nn.embedding_lookup(
                        self.char_mat, self.qh)
                    question_chars = tf.reshape(question_chars,
                                                [N * QL, CL, dc])
                    passage_chars = dropout(passage_chars, **drop_args)
                    question_chars = dropout(question_chars, **drop_args)
                    # The same pair of cells serves passage and question.
                    char_fw = tf.contrib.rnn.GRUCell(dg)
                    char_bw = tf.contrib.rnn.GRUCell(dg)
                    passage_chars = summarize(char_fw, char_bw,
                                              passage_chars, self.ch_len)
                    question_chars = summarize(char_fw, char_bw,
                                               question_chars, self.qh_len)
                    qh_emb = tf.reshape(question_chars, [N, QL, 2 * dg])
                    ch_emb = tf.reshape(passage_chars, [N, PL, 2 * dg])

            if config.use_bpe:
                with tf.variable_scope("bpe"):
                    passage_bpe = tf.nn.embedding_lookup(
                        self.bpe_mat, self.cb)
                    passage_bpe = tf.reshape(passage_bpe,
                                             [N * PL, BL, dbpe])
                    question_bpe = tf.nn.embedding_lookup(
                        self.bpe_mat, self.qb)
                    question_bpe = tf.reshape(question_bpe,
                                              [N * QL, BL, dbpe])
                    passage_bpe = dropout(passage_bpe, **drop_args)
                    question_bpe = dropout(question_bpe, **drop_args)
                    # The same pair of cells serves passage and question.
                    bpe_fw = tf.contrib.rnn.GRUCell(dbpeh)
                    bpe_bw = tf.contrib.rnn.GRUCell(dbpeh)
                    passage_bpe = summarize(bpe_fw, bpe_bw,
                                            passage_bpe, self.cb_len)
                    question_bpe = summarize(bpe_fw, bpe_bw,
                                             question_bpe, self.qb_len)
                    qb_emb = tf.reshape(question_bpe, [N, QL, 2 * dbpeh])
                    cb_emb = tf.reshape(passage_bpe, [N, PL, 2 * dbpeh])

            with tf.name_scope("word"):
                c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
                q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

            # Append the optional sub-word features to the word vectors,
            # character features first, then BPE features.
            extras = []
            if config.use_char:
                extras.append((ch_emb, qh_emb))
            if config.use_bpe:
                extras.append((cb_emb, qb_emb))
            for passage_extra, question_extra in extras:
                c_emb = tf.concat([c_emb, passage_extra], axis=2)
                q_emb = tf.concat([q_emb, question_extra], axis=2)

            if config.use_pos:
                cp_emb = tf.nn.embedding_lookup(self.pos_mat, self.cp)
                qp_emb = tf.nn.embedding_lookup(self.pos_mat, self.qp)
                c_emb = tf.concat([c_emb, cp_emb], axis=2)
                q_emb = tf.concat([q_emb, qp_emb], axis=2)

        with tf.variable_scope("encoding"):
            # One encoder instance is applied to both inputs.
            encoder = gru(num_layers=3,
                          num_units=d,
                          batch_size=N,
                          input_size=c_emb.get_shape().as_list()[-1],
                          **drop_args)
            c = encoder(c_emb, seq_len=self.c_len)
            q = encoder(q_emb, seq_len=self.q_len)

        with tf.variable_scope("attention"):
            qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                                   **drop_args)
            att_rnn = gru(num_layers=1,
                          num_units=d,
                          batch_size=N,
                          input_size=qc_att.get_shape().as_list()[-1],
                          **drop_args)
            att = att_rnn(qc_att, seq_len=self.c_len)

        with tf.variable_scope("match"):
            self_att = dot_attention(att, att, mask=self.c_mask, hidden=d,
                                     **drop_args)
            match_rnn = gru(num_layers=1,
                            num_units=d,
                            batch_size=N,
                            input_size=self_att.get_shape().as_list()[-1],
                            **drop_args)
            match = match_rnn(self_att, seq_len=self.c_len)

        with tf.variable_scope("pointer"):
            # The pointer network has its own keep probability.
            ptr_args = dict(keep_prob=config.ptr_keep_prob,
                            is_train=self.is_train)
            init = summ(q[:, :, -2 * d:], d, mask=self.q_mask, **ptr_args)
            pointer = ptr_net(batch=N,
                              hidden=init.get_shape().as_list()[-1],
                              **ptr_args)
            logits1, logits2 = pointer(init, match, d, self.c_mask)

        with tf.variable_scope("predict"):
            start_col = tf.expand_dims(tf.nn.softmax(logits1), axis=2)
            end_row = tf.expand_dims(tf.nn.softmax(logits2), axis=1)
            outer = tf.matmul(start_col, end_row)
            # Keep only pairs where the end index is 0..15 positions after
            # the start index.
            outer = tf.matrix_band_part(outer, 0, 15)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            start_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits1, labels=self.y1)
            end_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits2, labels=self.y2)
            self.loss = tf.reduce_mean(start_loss + end_loss)
Ejemplo n.º 15
0
    def ptrspan(self):
        """Build the sentence/paragraph span-pointer graph and its losses.

        ``self.k`` sentence slices and the whole paragraph are each encoded,
        attended against the question, scored with ``dense_summ`` and fed to
        a span pointer.  The per-source start/end scores are weighted by the
        softmaxed source scores (``self.topk``) and summed to give the final
        start/end logits.

        Nothing is returned.  The method sets ``self.yp1``, ``self.yp2``,
        ``self.loss3`` and ``self.loss_span``, plus a number of intermediate
        tensors stored on ``self`` (``self.para_emb``, ``self.c_s``,
        ``self.match``, ``self.lo1`` .. ``self.lo4``, ``self.present`` ...).
        """
        config = self.config
        # NOTE(review): QL, CL, dc and dg are unpacked here but never used in
        # this method.
        N, QL, CL, d, dc, dg = config.batch_size, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden

        gru = cudnn_gru if config.use_cudnn else native_gru
        # Number of sentence slices processed by every per-sentence loop.
        SN = self.k
        # NOTE(review): W is not used below; d repeats the value already
        # unpacked above.
        W = config.glove_dim
        d = config.hidden

        print('embedding part')

        with tf.name_scope("word"):
            # Only the paragraph goes through the embedding table here; the
            # sentence and question inputs are taken from self as they are.
            para_emb = tf.nn.embedding_lookup(self.word_mat, self.para_slice)
            c_emb = self.sentence_slice
            q_emb = self.q_slice

        with tf.name_scope("para_encode"):

            # Bias-free projections to d units, kernels initialised to ones.
            # NOTE(review): `trainable` receives self.is_train — confirm it
            # is a Python bool rather than a tensor.
            para_emb_linear = tf.layers.dense(
                para_emb,
                d,
                use_bias=False,
                kernel_initializer=tf.ones_initializer(),
                trainable=self.is_train,
                name='para_emb_line')
            q_emb_linear = tf.layers.dense(
                q_emb,
                d,
                use_bias=False,
                kernel_initializer=tf.ones_initializer(),
                trainable=self.is_train,
                name='q_emb_line')
            # Alignment scores between every paragraph and question position.
            align_pq = tf.matmul(para_emb_linear,
                                 tf.transpose(q_emb_linear, [0, 2, 1]))
            # Repeat the question mask along a new axis 1, para_maxlen times.
            pq_mask = tf.tile(tf.expand_dims(self.q_mask, axis=1),
                              [1, self.para_maxlen, 1])
            align_pq = tf.nn.softmax(softmax_mask(align_pq, pq_mask))
            # Question projection averaged with the alignment weights.
            align_para_emb = tf.matmul(align_pq, q_emb_linear)
            para_emb_concat = tf.concat([
                para_emb, align_para_emb, self.para_e_slice, self.para_t_slice
            ],
                                        axis=2)
            self.para_emb = para_emb_concat

        print('encode-part')
        # c_emb = self.sentence_slice

        # Split along axis 1 so each sentence slice can be processed alone.
        c_emb_sen = tf.unstack(c_emb, axis=1)
        sentence_len = tf.unstack(self.sentence_len, axis=1)
        c_s = []
        with tf.variable_scope("sentence_encoding"):
            # A single 3-layer encoder is applied to every sentence and to the
            # paragraph.  Its input_size comes from the first sentence slice;
            # NOTE(review): confirm para_emb_concat has the same last
            # dimension.
            rnn = gru(num_layers=3,
                      num_units=d,
                      batch_size=N,
                      input_size=c_emb_sen[0].get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)

            print('passage-encoder')
            for i in range(SN):
                c_s_emb = rnn(c_emb_sen[i],
                              seq_len=sentence_len[i],
                              concat_layers=False)

                c_s.append(c_s_emb)
            para_gru = rnn(para_emb_concat,
                           seq_len=self.para_len,
                           concat_layers=False)

        with tf.variable_scope("q_encoding"):
            # The question gets its own 3-layer encoder.
            rnn = gru(num_layers=3,
                      num_units=d,
                      batch_size=N,
                      input_size=q_emb.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            q = rnn(q_emb, seq_len=self.q_len, concat_layers=False)

        # Disabled highway-encoding experiment, kept for reference.
        # c_s_h = []
        # with tf.variable_scope("highway_encoding",reuse = tf.AUTO_REUSE):
        #     highway = Highway(hidden_size=2*d,is_train=self.is_train)
        #     for i in range(SN):
        #         c_s_highway = highway(c_s[i])
        #         c_s_h.append(c_s_highway)
        #     para_gru = highway(para_gru)
        #     q = highway(q)
        # c_s = c_s_h

        print('qc_att')
        self.c_s = c_s
        self.para_gru = para_gru
        qc_att = []
        sen_mask = tf.unstack(self.sentence_mask, axis=1)
        # dot_attention is called repeatedly inside this one scope;
        # AUTO_REUSE allows those calls to share variables.
        with tf.variable_scope("sentence_attention", reuse=tf.AUTO_REUSE):
            for i in range(SN):
                qc_att_sample = dot_attention(c_s[i],
                                              q,
                                              mask=self.q_mask,
                                              hidden=d,
                                              keep_prob=config.keep_prob,
                                              is_train=self.is_train)
                qc_att.append(qc_att_sample)

            para_att = dot_attention(para_gru,
                                     q,
                                     mask=self.q_mask,
                                     hidden=d,
                                     keep_prob=config.keep_prob,
                                     is_train=self.is_train)

        att_s = []
        with tf.variable_scope("sentence_qcatt_rnn"):
            # One single-layer rnn runs over every question-attended sequence
            # (sentences and paragraph).
            rnn = gru(num_layers=1,
                      num_units=d,
                      batch_size=N,
                      input_size=qc_att[0].get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            for i in range(SN):
                att_s_single = rnn(qc_att[i], seq_len=sentence_len[i])
                att_s.append(att_s_single)
            para_s = rnn(para_att, seq_len=self.para_len)

        self.sentence_att = qc_att
        self.para_att = para_att

        self_att = []

        # Each sentence attends over the paragraph representation para_s.
        with tf.variable_scope("sentence_cpattention", reuse=tf.AUTO_REUSE):
            for i in range(SN):
                self_att_single = dot_attention(att_s[i],
                                                para_s,
                                                mask=self.para_mask,
                                                hidden=d,
                                                keep_prob=config.keep_prob,
                                                is_train=self.is_train)
                self_att.append(self_att_single)

        # The paragraph attends over itself.
        with tf.variable_scope("para_selfattn"):
            # self.para_enc_slice, mask = self.para_enc_mask_slice,
            para_self_att = dot_attention(para_s,
                                          para_s,
                                          mask=self.para_mask,
                                          hidden=d,
                                          keep_prob=config.keep_prob,
                                          is_train=self.is_train)

        self.sentence_selfatt = self_att
        self.para_selfatt = para_self_att

        match = []
        with tf.variable_scope("sentence_cp_rnn"):
            # One single-layer rnn runs over every paragraph-attended sequence.
            rnn = gru(num_layers=1,
                      num_units=d,
                      batch_size=N,
                      input_size=self_att[0].get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            for i in range(SN):
                match_single = rnn(self_att[i], seq_len=sentence_len[i])
                match.append(match_single)
            para_match = rnn(para_self_att, seq_len=self.para_len)
        self.match = match

        dense_prob = []
        dense_con = []
        with tf.variable_scope("dense_prob", reuse=tf.AUTO_REUSE):
            for i in range(SN):
                # Join the three representations of sentence i on axis 2 and
                # reduce them with dense_summ.
                sentence_con = tf.concat([c_s[i], att_s[i], match[i]], axis=2)
                prob = dense_summ(sentence_con,
                                  d,
                                  mask=sen_mask[i],
                                  keep_prob=config.keep_prob,
                                  is_train=self.is_train)
                dense_prob.append(prob)
                dense_con.append(sentence_con)
            # with tf.variable_scope("para_prob"):
            # Same scoring for the paragraph, appended as the last source.
            para_con = tf.concat([para_gru, para_s, para_match], axis=2)
            para_prob = dense_summ(para_con,
                                   d,
                                   mask=self.para_mask,
                                   keep_prob=config.keep_prob,
                                   is_train=self.is_train)
            dense_prob.append(para_prob)
            # Softmax over the concatenated source scores (sentences first,
            # paragraph last); used further down as mixing weights.
            dense_prob = tf.concat(dense_prob, axis=1)
            self.topk = tf.nn.softmax(dense_prob)

        # Batch index repeated sentence_maxlen times per row; paired with the
        # sentence position indices below to form scatter_nd indices.
        batch_nums = tf.range(0, limit=N)
        batch_nums = tf.expand_dims(batch_nums, 1)
        batch_nums = tf.tile(batch_nums, [1, self.sentence_maxlen])
        # Output shape for the scatter_nd calls below.
        lo_shape = tf.constant([N, config.para_limit])

        sentence_index_slice = tf.unstack(self.sentence_index_slice, axis=1)
        # how to ensure the probability
        # sentence1,sentence2,setence3,q,para =?*4

        # These two empty lists are replaced by the comprehensions below.
        lo1 = []
        lo2 = []
        with tf.variable_scope("sentence_pointer", reuse=tf.AUTO_REUSE):

            # Question summary used as the initial state of both pointers.
            self.init = summ(q[:, :, -2 * d:],
                             d,
                             mask=self.q_mask,
                             keep_prob=config.keep_prob,
                             is_train=self.is_train)
            pointer = ptr_net_span(batch=N,
                                   hidden=self.init.get_shape().as_list()[-1],
                                   keep_prob=config.keep_prob,
                                   is_train=self.is_train)
            indice_test = []
            lo1_test = []
            lo2_test = []
            present = []
            present_inp = []

            for i in range(SN):
                logits1, logits2, inp1, inp2 = pointer(self.init, dense_con[i],
                                                       d, sen_mask[i])
                # Multiply by the float mask so masked positions become 0.
                logits1 = logits1 * tf.cast(sen_mask[i], tf.float32)
                logits2 = logits2 * tf.cast(sen_mask[i], tf.float32)
                # (batch index, position index) pairs for scatter_nd.
                indice = tf.stack([batch_nums, sentence_index_slice[i]],
                                  axis=2)
                inp = tf.stack([inp1, inp2], axis=1)
                present.append(inp)
                present_inp.append(inp2)
                lo1_test.append(logits1)
                lo2_test.append(logits2)
                indice_test.append(indice)

            # NOTE(review): all three come from lo1_test (the first pointer
            # output) and the fixed indices 0..2 require SN >= 3.
            self.lo1 = lo1_test[0]
            self.lo2 = lo1_test[1]
            self.lo3 = lo1_test[2]

            # Scatter each sentence's scores into an lo_shape tensor using
            # its index pairs, then keep the first para_maxlen columns.
            lo1 = [
                tf.slice(tf.scatter_nd(in1, in2, lo_shape), [0, 0],
                         [N, self.para_maxlen])
                for (in1, in2) in zip(indice_test, lo1_test)
            ]
            lo2 = [
                tf.slice(tf.scatter_nd(in1, in2, lo_shape), [0, 0],
                         [N, self.para_maxlen])
                for (in1, in2) in zip(indice_test, lo2_test)
            ]

            # The paragraph uses a separate pointer in its own nested scope.
            with tf.variable_scope("para_pointer"):
                para_pointer = ptr_net_span(
                    batch=N,
                    hidden=self.init.get_shape().as_list()[-1],
                    keep_prob=config.keep_prob,
                    is_train=self.is_train)
                para_lo1, para_lo2, inp1, inp2 = para_pointer(
                    self.init, para_match, d, self.para_mask)
                present_para = tf.stack([inp1, inp2], axis=1)
                para_lo1 = softmax_mask(para_lo1, self.para_mask)
                para_lo2 = softmax_mask(para_lo2, self.para_mask)
            # Repeated 3 times along the last axis, presumably to match the
            # width of the per-sentence entries — TODO confirm.
            present.append(tf.tile(present_para, [1, 1, 3]))
            present_inp.append(inp2)
            lo1.append(para_lo1)
            lo2.append(para_lo2)
            self.lo4 = para_lo2
            self.present = tf.stack(present, axis=2)
            # Stack the sources on axis 1, weight each by its self.topk
            # entry, and sum over the source axis.
            out_lo1 = tf.stack(lo1, axis=1)
            out_lo2 = tf.stack(lo2, axis=1)
            out_lo1 = (tf.expand_dims(self.topk, axis=2)) * out_lo1
            out_logits1 = tf.reduce_sum(out_lo1, axis=1)
            # out_logits1 = tf.slice(out_logits1, [0, 0], [N, self.para_maxlen])
            # out_logits1 = softmax_mask(out_logits1, self.para_mask)
            out_lo2 = (tf.expand_dims(self.topk, axis=2)) * out_lo2
            out_logits2 = tf.reduce_sum(out_lo2, axis=1)
            # out_logits2 = tf.slice(out_logits2, [0, 0], [N, self.para_maxlen])
            # out_logits2 = softmax_mask(out_logits2, self.para_mask)

            # NOTE(review): self.out_lo2 is assigned out_logits1, not
            # anything derived from out_lo2 — confirm this is intended.
            self.out_lo1 = out_lo1
            self.out_lo2 = out_logits1

            # out_logits1 = tf.nn.softmax(out_logits1)
            # out_logits2 = tf.nn.softmax(out_logits2)
            # Outer product of start and end distributions; the band keeps
            # only pairs whose end index is 0..15 positions after the start.
            outer = tf.matmul(
                tf.expand_dims(tf.nn.softmax(out_logits1), axis=2),
                tf.expand_dims(tf.nn.softmax(out_logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, 15)

        with tf.variable_scope("predict"):

            # Start/end index of the highest-scoring allowed pair.
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            # Labels are wrapped in stop_gradient so no gradient flows into
            # them.
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=out_logits1, labels=tf.stop_gradient(self.y1_slice))
            losses2 = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=out_logits2, labels=tf.stop_gradient(self.y2_slice))
            # Both expressions reduce outer over its two trailing axes, so
            # prob_y1 and prob_y2 hold the same per-example maximum.
            prob_y1 = tf.expand_dims(tf.reduce_max(tf.reduce_max(outer,
                                                                 axis=2),
                                                   axis=1),
                                     axis=1)
            prob_y2 = tf.expand_dims(tf.reduce_max(tf.reduce_max(outer,
                                                                 axis=1),
                                                   axis=1),
                                     axis=1)
            prob = tf.concat([prob_y1, prob_y2], axis=1)
            # NOTE(review): lossRL is built but not used afterwards in this
            # method.
            lossRL = -tf.log(prob) * self.reward_Diff
            self.out1 = losses

            self.out2 = losses2
            # Start and end losses side by side, scaled by self.reward_Diff.
            loss = tf.concat([
                tf.expand_dims(losses, axis=1),
                tf.expand_dims(losses2, axis=1)
            ],
                             axis=1)
            final_reward = loss * self.reward_Diff
            # Plain (unweighted) mean of the summed start and end losses.
            self.loss3 = tf.reduce_mean((losses + losses2))
            # NOTE(review): lam is read from config but not used below.
            lam = config.lam
            self.loss_span = tf.reduce_mean(final_reward)
Ejemplo n.º 16
0
def decoder(target, state, params):
    """Run the average-attention decoder and compute its smoothed loss.

    Args:
        target: target token ids.  Non-zero ids count as real tokens when
            the mask is derived.
        state: dict carrying ``'encodes'`` and ``'mask'`` from the encoder.
            When it also holds a ``'decoder'`` entry the call is treated as
            a decoding step: ``state['time']`` is read and the per-layer
            caches under ``state['decoder']['state']`` are updated in place.
        params: hyper-parameter object (sizes, dropout rates, vocabulary,
            embedding-sharing flags, attention strategies).

    Returns:
        A tuple ``(loss, logits, state, per_sample_loss)``.
    """
    mask = dtype.tf_to_float(tf.cast(target, tf.bool))
    model_dim = params.hidden_size
    emb_init = tf.random_normal_initializer(0.0, model_dim**-0.5)

    # No 'decoder' entry in the state means this is a training pass.
    training = 'decoder' not in state
    if training:
        target, mask = util.remove_invalid_seq(target, mask)

    if params.shared_source_target_embedding:
        input_table = "embedding"
    else:
        input_table = "tgt_embedding"
    tgt_emb = tf.get_variable(
        input_table, [params.tgt_vocab.size(), params.embed_size],
        initializer=emb_init)
    tgt_bias = tf.get_variable("bias", [params.embed_size])

    embedded = tf.gather(tgt_emb, target) * (model_dim**0.5)
    embedded = tf.nn.bias_add(embedded, tgt_bias)

    if training:
        # Shift right by one step: pad one position at the front of the
        # time axis and drop the last one.
        shifted = tf.pad(embedded, [[0, 0], [1, 0], [0, 0]])
        shifted = shifted[:, :-1, :]
        inputs = func.add_timing_signal(shifted)
    else:
        # When every target id is the pad id, feed zeros instead of the
        # embedded input.
        only_pad = tf.reduce_all(tf.equal(target, params.tgt_vocab.pad()))
        stepped = tf.cond(only_pad,
                          lambda: tf.zeros_like(embedded),
                          lambda: embedded)
        mask = tf.ones_like(mask)
        inputs = func.add_timing_signal(
            stepped, time=dtype.tf_to_float(state['time']))

    inputs = util.valid_apply_dropout(inputs, params.dropout)

    with tf.variable_scope("decoder"):
        x = inputs
        for layer in range(params.num_decoder_layer):
            layer_key = "layer_{}".format(layer)

            layer_initializer = None
            if params.deep_transformer_init:
                # Initialiser scale shrinks with depth: gain / sqrt(layer+1).
                layer_initializer = tf.variance_scaling_initializer(
                    params.initializer_gain * (layer + 1)**-0.5,
                    mode="fan_avg",
                    distribution="uniform")

            with tf.variable_scope(layer_key, initializer=layer_initializer):
                with tf.variable_scope("average_attention"):
                    # Average the outputs of all configured strategies.
                    strategy_outputs = []
                    for strategy in params.strategies:
                        with tf.variable_scope(strategy):
                            strategy_outputs.append(
                                average_attention_strategy(
                                    strategy, x, mask, state, layer, params))
                    averaged = (tf.add_n(strategy_outputs) /
                                len(strategy_outputs))

                    # Optional feed-forward activation.
                    if params.use_ffn:
                        candidate = func.ffn_layer(
                            averaged,
                            params.filter_size,
                            model_dim,
                            dropout=params.relu_dropout,
                        )
                    else:
                        candidate = averaged

                    # Gate between the layer input and the candidate.
                    gates = func.linear(tf.concat([x, candidate], axis=-1),
                                        model_dim * 2,
                                        scope="z_project")
                    input_gate, forget_gate = tf.split(gates, 2, axis=-1)
                    gated = (tf.sigmoid(input_gate) * x +
                             tf.sigmoid(forget_gate) * candidate)

                    x = func.residual_fn(x, gated,
                                         dropout=params.residual_dropout)
                    x = func.layer_norm(x)

                with tf.variable_scope("cross_attention"):
                    memory = state['encodes']
                    memory_bias = func.attention_bias(state['mask'],
                                                      "masking")
                    if training:
                        layer_cache = None
                    else:
                        layer_cache = state['decoder']['state'][layer_key]
                    attended = func.dot_attention(
                        x,
                        memory,
                        memory_bias,
                        model_dim,
                        num_heads=params.num_heads,
                        dropout=params.attention_dropout,
                        cache=layer_cache)
                    if not training:
                        # Store what the attention call returned under
                        # 'cache' back into this layer's state entry.
                        state['decoder']['state'][layer_key].update(
                            attended['cache'])

                    x = func.residual_fn(x, attended['output'],
                                         dropout=params.residual_dropout)
                    x = func.layer_norm(x)

                with tf.variable_scope("feed_forward"):
                    transformed = func.ffn_layer(
                        x,
                        params.filter_size,
                        model_dim,
                        dropout=params.relu_dropout,
                    )

                    x = func.residual_fn(x, transformed,
                                         dropout=params.residual_dropout)
                    x = func.layer_norm(x)

    # With 'dev_decode' set, only the last time step is scored.
    feature = x[:, -1, :] if 'dev_decode' in state else x

    # Source/target sharing takes precedence over target/softmax sharing.
    output_table = "softmax_embedding"
    if params.shared_target_softmax_embedding:
        output_table = "tgt_embedding"
    if params.shared_source_target_embedding:
        output_table = "embedding"
    softmax_emb = tf.get_variable(
        output_table, [params.tgt_vocab.size(), params.embed_size],
        initializer=emb_init)
    feature = tf.reshape(feature, [-1, params.embed_size])
    logits = tf.matmul(feature, softmax_emb,
                       transpose_a=False, transpose_b=True)

    logits = tf.cast(logits, tf.float32)

    soft_label, normalizer = util.label_smooth(target,
                                               util.shape_list(logits)[-1],
                                               factor=params.label_smooth)
    centropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                          labels=soft_label)
    centropy = centropy - normalizer
    centropy = tf.reshape(centropy, tf.shape(target))

    mask = tf.cast(mask, tf.float32)
    masked_total = tf.reduce_sum(centropy * mask, -1)
    per_sample_loss = masked_total / tf.reduce_sum(mask, -1)
    mean_loss = tf.reduce_mean(per_sample_loss)

    # Guard against zero-sized batches (shapes such as [0, 1]): report a
    # loss of 0 instead of the mean over nothing.
    batch_is_empty = tf.equal(tf.shape(target)[0], 0)
    loss = tf.cond(batch_is_empty,
                   lambda: tf.constant(0, dtype=tf.float32),
                   lambda: mean_loss)

    return loss, logits, state, per_sample_loss
Ejemplo n.º 17
0
    def ready(self):
        config = self.config
        N, QL, CL, d, dc, dg = config.batch_size, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden
        gru = cudnn_gru if config.use_cudnn else native_gru
        SN, SL = self.c_s_maxnum, self.c_s_maxlen
        W = config.glove_dim
        print('embedding part')
        with tf.variable_scope("emb"):
            # with tf.variable_scope("char"):
            #         ch_emb = tf.reshape(tf.nn.embedding_lookup(
            #             self.char_mat, self.csh_slice), [N, SN * SL, CL, dc], name='char_reshape')
            #         qh_emb = tf.reshape(tf.nn.embedding_lookup(
            #             self.char_mat, self.qh_slice), [N, QL, CL, dc])
            #         ch_emb = dropout(
            #             ch_emb, keep_prob=config.keep_prob, is_train=self.is_train)
            #         qh_emb = dropout(
            #             qh_emb, keep_prob=config.keep_prob, is_train=self.is_train)
            # ch_emb_char = tf.unstack(ch_emb, axis=0)
            # qh_emb_char = tf.unstack(qh_emb, axis=0)
            '''

            filter_size = [3, 4, 5]
            att_char = []
            merge_char = []
            q_merge_char = []
            for filter in filter_size:
                with tf.variable_scope("char-cnnencoder-%s" % filter):
                    step_merge_char = []
                    step_att_char = []
                    q_step_merge_char = []
                    q_step_att_char = []
                    for i in range(2):
                        if i==0:
                            input_char=ch_emb
                        else:
                            input_char=qh_emb
                        conv_branch_char = tf.layers.conv2d(
                            inputs=input_char,
                            # use as many filters as the hidden size
                            filters=50,
                            kernel_size=filter,
                            use_bias=True,
                            activation=tf.nn.relu,
                            trainable=True,
                            padding='SAME',
                            name = 'conv_char_' + str(filter),
                            reuse = tf.AUTO_REUSE,
                            data_format='channels_last'
                        )
                        if i ==0:
                            step_att_char.append(conv_branch_char)
                            # pool over the words to obtain: [first_dim x 1* hidden_size]
                            pool_branch_char = tf.reduce_max(conv_branch_char, axis=2)
                            merge_char.append(pool_branch_char)
                        else:
                            q_step_att_char.append(conv_branch_char)
                            # pool over the words to obtain: [first_dim x 1* hidden_size]
                            q_pool_branch_char = tf.reduce_max(conv_branch_char, axis=2)
                            q_merge_char.append(q_pool_branch_char)
                    # batch_merge = tf.stack(step_merge_char, axis=0)
                    # merge_char.append(batch_merge)
                    # batch_merge_q = tf.stack(q_step_merge_char, axis=0)
                    # q_merge_char.append(batch_merge_q)
            ch_con = tf.concat(merge_char, axis=-1)
            ch_con = tf.reshape(ch_con,[N,SN,SL,150])
            qh_con = tf.concat(q_merge_char,axis=-1)
            '''
            # if(use_char):
            #     with tf.variable_scope("char"):
            #         ch_emb = tf.reshape(tf.nn.embedding_lookup(
            #             self.char_mat, self.csh), [N * SN * SL, CL, dc], name='char_reshape')
            #         qh_emb = tf.reshape(tf.nn.embedding_lookup(
            #             self.char_mat, self.qh), [N * QL, CL, dc])
            #         ch_emb = dropout(
            #             ch_emb, keep_prob=config.keep_prob, is_train=self.is_train)
            #         qh_emb = dropout(
            #             qh_emb, keep_prob=config.keep_prob, is_train=self.is_train)
            #         cell_fw = tf.contrib.rnn.GRUCell(dg)
            #         cell_bw = tf.contrib.rnn.GRUCell(dg)
            #         _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
            #             cell_fw, cell_bw, ch_emb, self.csh_len, dtype=tf.float32)
            #         ch_emb = tf.concat([state_fw, state_bw], axis=1)
            #         _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
            #             cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
            #         qh_emb = tf.concat([state_fw, state_bw], axis=1)
            #         qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
            #         ch_emb = tf.reshape(ch_emb, [N, SN, SL, 2 * dg])

            with tf.name_scope("word"):
                c_emb = tf.nn.embedding_lookup(self.word_mat, self.cs_slice)
                q_emb = tf.nn.embedding_lookup(self.word_mat, self.q_slice)

            with tf.name_scope("softemb"):
                c_emb_linear = tf.nn.relu(
                    dense(c_emb, d, use_bias=True, scope="c_emb_linear"))
                q_emb_linear = tf.nn.relu(
                    dense(q_emb, d, use_bias=True, scope="q_emb_linear"))
                c_emb_linear = tf.reshape(
                    c_emb_linear, [N, self.c_s_maxnum * self.c_s_maxlen, d])
                align_cq = tf.matmul(c_emb_linear,
                                     tf.transpose(q_emb_linear, [0, 2, 1]))

                cq_mask = tf.tile(tf.expand_dims(self.q_mask, axis=1),
                                  [1, self.c_s_maxnum * self.c_s_maxlen, 1])
                self.align_cq = tf.nn.softmax(softmax_mask(align_cq, cq_mask))
                align_c_emb = tf.matmul(self.align_cq, q_emb_linear)
                align_c_emb = tf.reshape(
                    align_c_emb, [N, self.c_s_maxnum, self.c_s_maxlen, d])
            c_emb = tf.concat(
                [c_emb, align_c_emb, self.ce_slice, self.ct_slice], axis=3)
            c_emb = tf.reshape(
                c_emb, [N, self.c_s_maxnum, self.c_s_maxlen, W + d + 3 + 19],
                name='c_emb_reshape')

            q_emb = tf.concat([q_emb, self.qt_slice], axis=2)
            self.c_emb = c_emb
            self.q_emb = q_emb
            # c_emb = tf.reshape(c_emb, [N,self.c_s_maxnum,self.c_s_maxlen,W+self.q_maxlen])

        print('encode-part')
        # c_s_len = tf.unstack(self.c_s_len, axis=1)

        cnn_out = []
        c_s_emb = tf.unstack(c_emb, axis=0)
        # q_s_emb = tf.expand_dims(q_emb, axis=1)
        # q_sample_emb = tf.unstack(q_s_emb, axis = 0)

        filter_size = [3, 4, 5]
        att = []
        merge = []
        q_merge = []
        with tf.variable_scope("cnnencoder"):
            for filter in filter_size:
                step_merge = []
                step_att = []
                q_step_merge = []
                q_step_att = []
                with tf.variable_scope("cnnencoder-%s" % filter):
                    for i in range(N):
                        conv_branch = tf.layers.conv1d(
                            inputs=c_s_emb[i],
                            # use as many filters as the hidden size
                            filters=100,
                            kernel_size=[filter],
                            use_bias=True,
                            activation=tf.nn.relu,
                            trainable=True,
                            padding='SAME',
                            name='conv_' + str(filter),
                            reuse=tf.AUTO_REUSE)
                        # tf.get_variable_scope().reuse_variables()
                        step_att.append(conv_branch)
                        # pool over the words to obtain: [first_dim x 1* hidden_size]
                        pool_branch = tf.reduce_max(conv_branch, axis=1)
                        pool_branch = dropout(pool_branch,
                                              keep_prob=config.keep_prob,
                                              is_train=self.is_train)
                        step_merge.append(pool_branch)

                batch_merge = tf.stack(step_merge, axis=0)
                merge.append(batch_merge)
                # batch_merge_q = tf.stack(q_step_merge, axis = 0)
                # q_merge.append(batch_merge_q)

                con = tf.concat(merge, axis=-1)
                # q_con = tf.concat(q_merge, axis = -1)
                #
                # attention_vis = tf.stack(att, axis=0)
                # attention_vis = tf.reduce_mean(attention_vis, axis=0)
                # cnn_out.append(con)
                # c_sen_emb = tf.concat(con, axis = 0)

        with tf.variable_scope("encoding"):
            rnn = gru(num_layers=3,
                      num_units=d,
                      batch_size=N,
                      input_size=con.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            print('passage-encoder')
            c_s = rnn(con, seq_len=self.c_p_len)
            # q = rnn(q_emb, seq_len=self.q_len)
        with tf.variable_scope("qencode"):
            with tf.variable_scope("encoding"):
                rnn = gru(num_layers=3,
                          num_units=d,
                          batch_size=N,
                          input_size=q_emb.get_shape().as_list()[-1],
                          keep_prob=config.keep_prob,
                          is_train=self.is_train)

                q = rnn(q_emb, seq_len=self.q_len)
        self.q_enc = q
        print('qc_att')

        with tf.variable_scope("attention"):
            qc_att = dot_attention(c_s,
                                   q,
                                   mask=self.q_mask,
                                   hidden=d,
                                   keep_prob=config.keep_prob,
                                   is_train=self.is_train)

            rnn = gru(num_layers=1,
                      num_units=d,
                      batch_size=N,
                      input_size=qc_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            self.att_s = rnn(qc_att, seq_len=self.c_p_len)

        # print('pointer')
        with tf.variable_scope("pointer"):
            init = summ(q[:, :, -2 * d:],
                        d,
                        mask=self.q_mask,
                        keep_prob=config.ptr_keep_prob,
                        is_train=self.is_train)
            pointer = ptr_net(batch=N,
                              hidden=init.get_shape().as_list()[-1],
                              keep_prob=config.ptr_keep_prob,
                              is_train=self.is_train,
                              is_sentence=True)

            logits1 = pointer(init, self.att_s, d, self.c_p_mask)
            self.lo = logits1
        with tf.variable_scope("predict"):
            self.outer = tf.nn.softmax(logits1)
            self.yp = tf.argmax(self.outer, axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits1, labels=tf.stop_gradient(self.y_slice))
            self.out1 = tf.nn.top_k(self.outer, config.k).values
            self.policy = tf.nn.top_k(self.outer, 1).values
            self.policy = tf.reduce_sum(tf.nn.top_k(self.outer,
                                                    config.k).values,
                                        axis=-1,
                                        keepdims=True)
            self.policy_log_part = tf.log(self.policy)
            #self.loss = tf.reduce_mean(-1 * self.policy_log_part * self.reward)
            reward = self.advantage
            reward_mean, reward_var = tf.nn.moments(reward, axes=[0])

            reward_std = tf.sqrt(reward_var) + 1e-6
            self.reward_mean = reward_mean
            self.reward_var = reward_std
            reward = tf.div(reward - reward_mean, reward_std)

            self.final_reward = reward - self.baseline
            self.loss = tf.reduce_mean(-1 * self.policy_log_part *
                                       self.advantage)
Ejemplo n.º 18
0
    def ready(self):
        """Build the span-extraction graph and set predictions and loss.

        Stages, each under its own scope:
          * "emb": final states of a character-level bidirectional GRU are
            concatenated with word embeddings, for passage and question.
          * "encoding": one 3-layer GRU encoder applied to both sequences.
          * "attention": question-aware passage representation + 1-layer GRU.
          * "match": passage self-attention + 1-layer GRU.
          * "pointer": start / end logits over passage positions.
          * "predict": decoding and loss.

        Sets:
            self.yp1, self.yp2: argmax start / end indices taken from the
                band-limited outer product of the two softmax distributions.
            self.loss: mean of the summed start and end cross-entropies
                against self.y1 and self.y2.
        """
        cfg = self.config
        batch, hidden = cfg.batch_size, cfg.hidden
        para_len, ques_len = self.c_maxlen, self.q_maxlen
        char_limit, char_dim, char_hidden = (
            cfg.char_limit, cfg.char_dim, cfg.char_hidden)
        if cfg.use_cudnn:
            make_gru = cudnn_gru
        else:
            make_gru = native_gru

        with tf.variable_scope('emb'):
            with tf.variable_scope('char'):
                # Flatten so every token is one row of characters:
                # [batch * tokens, char_limit, char_dim].
                passage_chars = tf.nn.embedding_lookup(self.char_mat, self.ch)
                passage_chars = tf.reshape(
                    passage_chars, [batch * para_len, char_limit, char_dim])
                question_chars = tf.nn.embedding_lookup(
                    self.char_mat, self.qh)
                question_chars = tf.reshape(
                    question_chars, [batch * ques_len, char_limit, char_dim])

                passage_chars = dropout(passage_chars,
                                        keep_prob=cfg.keep_prob,
                                        is_train=self.is_train)
                question_chars = dropout(question_chars,
                                         keep_prob=cfg.keep_prob,
                                         is_train=self.is_train)

                # The same two cell objects are handed to both RNN calls.
                fw_cell = tf.contrib.rnn.GRUCell(char_hidden)
                bw_cell = tf.contrib.rnn.GRUCell(char_hidden)

                _, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn(
                    fw_cell, bw_cell, passage_chars, self.ch_len,
                    dtype=tf.float32)
                passage_chars = tf.concat([fw_state, bw_state], axis=1)

                _, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn(
                    fw_cell, bw_cell, question_chars, self.qh_len,
                    dtype=tf.float32)
                question_chars = tf.concat([fw_state, bw_state], axis=1)

                # Restore the [batch, tokens, 2 * char_hidden] layout.
                question_chars = tf.reshape(
                    question_chars, [batch, ques_len, 2 * char_hidden])
                passage_chars = tf.reshape(
                    passage_chars, [batch, para_len, 2 * char_hidden])

            with tf.name_scope('word'):
                passage_words = tf.nn.embedding_lookup(self.word_mat, self.c)
                question_words = tf.nn.embedding_lookup(self.word_mat, self.q)

            passage_emb = tf.concat([passage_words, passage_chars], axis=2)
            question_emb = tf.concat([question_words, question_chars], axis=2)

        with tf.variable_scope('encoding'):
            encoder = make_gru(
                num_layers=3,
                num_units=hidden,
                batch_size=batch,
                input_size=passage_emb.get_shape().as_list()[-1],
                keep_prob=cfg.keep_prob,
                is_train=self.is_train)
            passage_enc = encoder(passage_emb, seq_len=self.c_len)
            question_enc = encoder(question_emb, seq_len=self.q_len)

        with tf.variable_scope('attention'):
            cross_att = dot_attention(passage_enc,
                                      question_enc,
                                      mask=self.q_mask,
                                      hidden=hidden,
                                      keep_prob=cfg.keep_prob,
                                      is_train=self.is_train)
            att_rnn = make_gru(
                num_layers=1,
                num_units=hidden,
                batch_size=batch,
                input_size=cross_att.get_shape().as_list()[-1],
                keep_prob=cfg.keep_prob,
                is_train=self.is_train)
            attended = att_rnn(cross_att, seq_len=self.c_len)

        with tf.variable_scope('match'):
            self_att = dot_attention(attended,
                                     attended,
                                     mask=self.c_mask,
                                     hidden=hidden,
                                     keep_prob=cfg.keep_prob,
                                     is_train=self.is_train)
            match_rnn = make_gru(
                num_layers=1,
                num_units=hidden,
                batch_size=batch,
                input_size=self_att.get_shape().as_list()[-1],
                keep_prob=cfg.keep_prob,
                is_train=self.is_train)
            matched = match_rnn(self_att, seq_len=self.c_len)

        with tf.variable_scope('pointer'):
            # Only the trailing 2 * hidden features of the question encoding
            # are summarised into the pointer's initial state.
            question_summary = summ(question_enc[:, :, -2 * hidden:],
                                    hidden,
                                    mask=self.q_mask,
                                    keep_prob=cfg.ptr_keep_prob,
                                    is_train=self.is_train)
            pointer = ptr_net(
                batch=batch,
                hidden=question_summary.get_shape().as_list()[-1],
                keep_prob=cfg.ptr_keep_prob,
                is_train=self.is_train)
            start_logits, end_logits = pointer(question_summary, matched,
                                               hidden, self.c_mask)

        with tf.variable_scope('predict'):
            start_probs = tf.expand_dims(tf.nn.softmax(start_logits), axis=2)
            end_probs = tf.expand_dims(tf.nn.softmax(end_logits), axis=1)
            joint = tf.matmul(start_probs, end_probs)
            # Keep only (start, end) pairs with 0 <= end - start <= 15.
            joint = tf.matrix_band_part(joint, 0, 15)
            self.yp1 = tf.argmax(tf.reduce_max(joint, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(joint, axis=1), axis=1)

            start_labels = tf.stop_gradient(self.y1)
            start_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=start_logits, labels=start_labels)
            end_labels = tf.stop_gradient(self.y2)
            end_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=end_logits, labels=end_labels)
            self.loss = tf.reduce_mean(start_loss + end_loss)
Ejemplo n.º 19
0
    def ready(self):
        """Build the graph: embeddings, encoder, and a pointer head.

        The pointer head only receives stop-gradient copies of the encoder
        outputs.

        Stages:
          * "emb": character-level BiGRU final states concatenated with word
            embeddings. Stop-gradient copies are exposed as self.c_emb and
            self.q_emb; the encoder below consumes the original tensors.
          * "encoding": one 3-layer GRU encoder for passage and question,
            exposed as self.c_rnn / self.q_rnn. Everything downstream uses
            stop-gradient copies, so no gradient from this head's loss
            reaches the encoder through those tensors.
          * "attention" / "binary": self.att is a list holding, in order, the
            attention GRU output, its slice at the last time index, and
            three successive Dense(300) + dropout outputs.
          * "badptr" / "badptr_predict": a pointer network initialised from
            self.att[-1] over self.att[0], plus decoding and loss.

        Sets:
            self.c_emb, self.q_emb, self.c_rnn, self.q_rnn, self.att,
            self.yp1_distrib, self.yp2_distrib, self.yp1, self.yp2,
            self.loss.
        """
        config = self.config
        N, PL, QL = config.batch_size, self.c_maxlen, self.q_maxlen
        CL, dc, dg = config.char_limit, config.char_dim, config.char_hidden
        d = config.hidden
        gru = cudnn_gru if config.use_cudnn else native_gru

        with tf.variable_scope("emb"):
            with tf.variable_scope("char"):
                # One row of characters per token:
                # [N * tokens, CL, dc].
                ch_emb = tf.reshape(tf.nn.embedding_lookup(
                    self.char_mat, self.ch), [N * PL, CL, dc])
                qh_emb = tf.reshape(tf.nn.embedding_lookup(
                    self.char_mat, self.qh), [N * QL, CL, dc])
                ch_emb = dropout(
                    ch_emb, keep_prob=config.keep_prob, is_train=self.is_train)
                qh_emb = dropout(
                    qh_emb, keep_prob=config.keep_prob, is_train=self.is_train)
                # The same two cells are passed to both RNN calls.
                cell_fw = tf.contrib.rnn.GRUCell(dg)
                cell_bw = tf.contrib.rnn.GRUCell(dg)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
                ch_emb = tf.concat([state_fw, state_bw], axis=1)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
                qh_emb = tf.concat([state_fw, state_bw], axis=1)
                qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
                ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])

            with tf.name_scope("word"):
                c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
                q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            # Exposed for external consumers only; the locals keep gradients.
            self.c_emb = tf.stop_gradient(c_emb)
            self.q_emb = tf.stop_gradient(q_emb)

        with tf.variable_scope("encoding"):
            rnn = gru(num_layers=3, num_units=d, batch_size=N,
                      input_size=c_emb.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob, is_train=self.is_train)
            self.c_rnn = c = rnn(c_emb, seq_len=self.c_len)
            self.q_rnn = q = rnn(q_emb, seq_len=self.q_len)

            # Cut the gradient path from the layers below into the encoder.
            c = tf.stop_gradient(c)
            q = tf.stop_gradient(q)

        with tf.variable_scope("attention"):
            qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                                   keep_prob=config.keep_prob,
                                   is_train=self.is_train)
            rnn = gru(num_layers=1, num_units=d, batch_size=N,
                      input_size=qc_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob, is_train=self.is_train)

            self.att = [rnn(qc_att, seq_len=self.c_len)]
            # NOTE(review): this takes the last time index for every example
            # regardless of self.c_len, so for shorter passages it presumably
            # reads a padded position -- confirm this is intended.
            self.att += [self.att[-1][:, -1, :]]

        with tf.variable_scope("binary"):
            for _ in range(3):
                projected = tf.keras.layers.Dense(300)(self.att[-1])
                # Use the same is_train-gated helper as the embedding dropout
                # above. The previous raw tf.nn.dropout call had no is_train
                # gate, so it also dropped units at inference time.
                self.att += [dropout(projected,
                                     keep_prob=config.keep_prob,
                                     is_train=self.is_train)]

        with tf.variable_scope("badptr"):
            init = self.att[-1]
            pointer = ptr_net(batch=N,
                              hidden=init.get_shape().as_list()[-1],
                              keep_prob=config.ptr_keep_prob,
                              is_train=self.is_train)
            logits1, logits2 = pointer(init, self.att[0], d, self.c_mask)

        with tf.variable_scope("badptr_predict"):
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            # Keep only (start, end) pairs with 0 <= end - start <= 15.
            outer = tf.matrix_band_part(outer, 0, 15)
            self.yp1_distrib = tf.reduce_max(outer, axis=2)
            self.yp2_distrib = tf.reduce_max(outer, axis=1)
            self.yp1 = tf.argmax(self.yp1_distrib, axis=1)
            self.yp2 = tf.argmax(self.yp2_distrib, axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits1, labels=tf.stop_gradient(self.y1))
            losses2 = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits2, labels=tf.stop_gradient(self.y2))
            self.loss = tf.reduce_mean(losses + losses2)
    def ready(self):
        """Build the multi-paragraph graph with optional passage ranking.

        For each of config.max_para paragraphs, a separate set of scopes
        ("emb<i>", "encoding<i>", "attention<i>") embeds a fixed 400-position
        window of self.ch_pr / self.c_pr together with the question, encodes
        both, and produces a question-aware paragraph representation. These
        are concatenated along axis 1 into one tensor for passage ranking.

        Sets:
            self.ch_pr_: character-id window of the last paragraph processed.
            self.yp1, self.yp2: argmax start / end indices from the
                band-limited outer product of the pointer distributions.
            self.loss: mean of the summed start and end cross-entropies.
            Only when config.with_passage_ranking is true:
            self.losses3, self.pr_loss: ranking cross-entropy against
                self.pr, and its mean.
            self.r, self.e_loss1, self.e_loss2, self.e_loss: the constant
                0.8 and the combination 0.8 * loss + 0.2 * pr_loss.
        """
        cfg = self.config
        batch, hidden = cfg.batch_size, cfg.hidden
        para_len, ques_len = self.c_maxlen, self.q_maxlen
        char_limit, char_dim, char_hidden = (
            cfg.char_limit, cfg.char_dim, cfg.char_hidden)
        if cfg.use_cudnn:
            make_gru = cudnn_gru
        else:
            make_gru = native_gru

        # Width of the window each paragraph occupies along axis 1 of
        # self.ch_pr, self.c_pr and the concatenated attention tensor.
        window = 400

        for idx in range(cfg.max_para):
            print(idx)
            tag = str(idx)
            lo, hi = idx * window, (idx + 1) * window

            with tf.variable_scope("emb" + tag):
                with tf.variable_scope("char" + tag):
                    self.ch_pr_ = self.ch_pr[:, lo:hi, :]
                    print(self.ch_pr_.get_shape())

                    # One row of characters per token:
                    # [batch * tokens, char_limit, char_dim].
                    para_chars = tf.nn.embedding_lookup(
                        self.char_mat, self.ch_pr_)
                    para_chars = tf.reshape(
                        para_chars, [batch * para_len, char_limit, char_dim])
                    ques_chars = tf.nn.embedding_lookup(
                        self.char_mat, self.qh)
                    ques_chars = tf.reshape(
                        ques_chars, [batch * ques_len, char_limit, char_dim])

                    para_chars = dropout(para_chars,
                                         keep_prob=cfg.keep_prob,
                                         is_train=self.is_train)
                    ques_chars = dropout(ques_chars,
                                         keep_prob=cfg.keep_prob,
                                         is_train=self.is_train)

                    # The same two cells are passed to both RNN calls.
                    fw_cell = tf.contrib.rnn.GRUCell(char_hidden)
                    bw_cell = tf.contrib.rnn.GRUCell(char_hidden)

                    _, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn(
                        fw_cell, bw_cell, para_chars, self.ch_len,
                        dtype=tf.float32)
                    para_chars = tf.concat([fw_state, bw_state], axis=1)

                    _, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn(
                        fw_cell, bw_cell, ques_chars, self.qh_len,
                        dtype=tf.float32)
                    ques_chars = tf.concat([fw_state, bw_state], axis=1)

                    ques_chars = tf.reshape(
                        ques_chars, [batch, ques_len, 2 * char_hidden])
                    para_chars = tf.reshape(
                        para_chars, [batch, para_len, 2 * char_hidden])

                with tf.name_scope("word" + tag):
                    para_words = tf.nn.embedding_lookup(
                        self.word_mat, self.c_pr[:, lo:hi])
                    ques_words = tf.nn.embedding_lookup(self.word_mat, self.q)

                para_emb = tf.concat([para_words, para_chars], axis=2)
                ques_emb = tf.concat([ques_words, ques_chars], axis=2)

            with tf.variable_scope("encoding" + tag):
                encoder = make_gru(
                    num_layers=3,
                    num_units=hidden,
                    batch_size=batch,
                    input_size=para_emb.get_shape().as_list()[-1],
                    keep_prob=cfg.keep_prob,
                    is_train=self.is_train)
                para_enc = encoder(para_emb, seq_len=self.c_len)
                ques_enc = encoder(ques_emb, seq_len=self.q_len)

            with tf.variable_scope("attention" + tag):
                cross_att = dot_attention(para_enc,
                                          ques_enc,
                                          mask=self.q_mask,
                                          hidden=hidden,
                                          keep_prob=cfg.keep_prob,
                                          is_train=self.is_train)
                att_rnn = make_gru(
                    num_layers=1,
                    num_units=hidden,
                    batch_size=batch,
                    input_size=cross_att.get_shape().as_list()[-1],
                    keep_prob=cfg.keep_prob,
                    is_train=self.is_train)
                para_att = att_rnn(cross_att, seq_len=self.c_len)

                # Grow the all-paragraph tensor along axis 1, one paragraph
                # per iteration.
                if idx == 0:
                    all_para_att = para_att
                else:
                    all_para_att = tf.concat([all_para_att, para_att], axis=1)
                print("att:", para_att.get_shape().as_list())
                print("att_vP:", all_para_att.get_shape().as_list())
            # A self-attention "match" stage used to follow here; it is
            # disabled.

        # NOTE(review): the pointer below consumes `ques_enc` and `para_att`
        # as left by the final loop iteration, i.e. only the last paragraph,
        # not `all_para_att` -- confirm this is intended.
        with tf.variable_scope("pointer"):
            # Only the trailing 2 * hidden features of the question encoding
            # are summarised into the pointer's initial state.
            ques_summary = summ(ques_enc[:, :, -2 * hidden:],
                                hidden,
                                mask=self.q_mask,
                                keep_prob=cfg.ptr_keep_prob,
                                is_train=self.is_train)
            print("rQ:", ques_summary.get_shape().as_list())
            pointer = ptr_net(batch=batch,
                              hidden=ques_summary.get_shape().as_list()[-1],
                              keep_prob=cfg.ptr_keep_prob,
                              is_train=self.is_train)
            start_logits, end_logits = pointer(ques_summary, para_att, hidden,
                                               self.c_mask)

        with tf.variable_scope("predict"):
            start_probs = tf.expand_dims(tf.nn.softmax(start_logits), axis=2)
            end_probs = tf.expand_dims(tf.nn.softmax(end_logits), axis=1)
            joint = tf.matmul(start_probs, end_probs)
            # Keep only (start, end) pairs with 0 <= end - start <= 15.
            joint = tf.matrix_band_part(joint, 0, 15)
            self.yp1 = tf.argmax(tf.reduce_max(joint, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(joint, axis=1), axis=1)
            start_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=start_logits, labels=self.y1)
            end_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=end_logits, labels=self.y2)
            self.loss = tf.reduce_mean(start_loss + end_loss)

        if cfg.with_passage_ranking:
            rank_logits = None
            for idx in range(cfg.max_para):
                tag = str(idx)
                with tf.variable_scope("passage-ranking-attention" + tag):
                    para_slice = all_para_att[
                        :, idx * window:(idx + 1) * window, :]
                    ranker = pr_attention(
                        batch=batch,
                        hidden=ques_summary.get_shape().as_list()[-1],
                        keep_prob=cfg.keep_prob,
                        is_train=self.is_train)
                    para_summary = ranker(ques_summary, para_slice, hidden,
                                          self.c_mask)

                    # Score the paragraph from [question summary; paragraph
                    # summary] with a tanh layer followed by a 1-unit layer.
                    joined = tf.concat([ques_summary, para_summary], axis=1)
                    projected = dense(joined,
                                      hidden=hidden,
                                      use_bias=False,
                                      scope="g" + tag)
                    activated = tf.nn.tanh(projected)
                    score = dense(activated, 1, use_bias=False,
                                  scope="g_" + tag)

                    # One column per paragraph: [batch, max_para] at the end.
                    column = tf.reshape(score, [batch, 1])
                    if idx == 0:
                        rank_logits = column
                    else:
                        rank_logits = tf.concat([rank_logits, column], axis=1)

            self.losses3 = tf.nn.softmax_cross_entropy_with_logits(
                logits=rank_logits, labels=self.pr)
            self.pr_loss = tf.reduce_mean(self.losses3)

            # Combined objective: r * span loss + (1 - r) * ranking loss.
            self.r = tf.constant(0.8)
            self.e_loss1 = tf.multiply(self.r, self.loss)
            one = tf.constant(1.0)
            ranking_weight = tf.subtract(one, self.r)
            self.e_loss2 = tf.multiply(ranking_weight, self.pr_loss)
            self.e_loss = tf.add(self.e_loss1, self.e_loss2)
Ejemplo n.º 21
0
def decoder(target, state, params):
    """Build the Transformer decoder graph with L0Drop applied to the source.

    The function runs in one of two modes, selected by whether the key
    ``'decoder'`` is present in ``state``: absent means training (teacher
    forcing over the whole target), present means incremental evaluation
    using the cached per-layer attention state.

    Args:
        target: integer tensor of target token ids; indexed as
            ``[batch, time]`` by the code below. Zero ids are treated as
            padding when building the mask.
        state: dict carrying at least ``"encodes"`` (encoder outputs) and
            ``"mask"`` (source mask). In evaluation mode it also provides
            ``'time'`` and ``state['decoder']['state']['layer_<i>']`` caches,
            which are updated in place. The optional key ``'dev_decode'``
            restricts the output feature to the last time step.
        params: hyper-parameter object; the attributes read here include
            ``hidden_size``, ``embed_size``, ``tgt_vocab``, ``dropout``,
            ``num_decoder_layer``, ``num_heads``, ``filter_size``, the
            ``*_dropout`` rates, ``label_smooth``, the ``l0_norm_*`` schedule
            values and the embedding-sharing flags.

    Returns:
        A tuple ``(loss, logits, state, per_sample_loss)`` where ``loss`` is
        the mean masked cross entropy plus the L0 regularisation term (zero
        for an empty batch), ``logits`` are float32 vocabulary scores,
        ``state`` is the (possibly updated) input dict and
        ``per_sample_loss`` is the masked cross entropy per sequence.
    """
    # Non-zero target ids become 1.0, zero ids (padding) become 0.0.
    # NOTE(review): dtype.tf_to_float presumably casts to the model's float
    # type -- confirm against the dtype module.
    mask = dtype.tf_to_float(tf.cast(target, tf.bool))
    hidden_size = params.hidden_size
    initializer = tf.random_normal_initializer(0.0, hidden_size**-0.5)

    # The presence of a 'decoder' cache in state marks incremental decoding.
    is_training = ('decoder' not in state)

    if is_training:
        # NOTE(review): presumably drops all-padding sequences from the
        # batch -- verify against util.remove_invalid_seq.
        target, mask = util.remove_invalid_seq(target, mask)

    # Variable name selects which embedding matrix is (re)used: a matrix
    # shared with the source side, or a target-only one.
    embed_name = "embedding" if params.shared_source_target_embedding \
        else "tgt_embedding"
    tgt_emb = tf.get_variable(embed_name,
                              [params.tgt_vocab.size(), params.embed_size],
                              initializer=initializer)
    tgt_bias = tf.get_variable("bias", [params.embed_size])

    # Embedding lookup, scaled by sqrt(hidden_size), plus a learned bias.
    inputs = tf.gather(tgt_emb, target) * (hidden_size**0.5)
    inputs = tf.nn.bias_add(inputs, tgt_bias)

    # shift
    if is_training:
        # Prepend one zero step on the time axis and drop the last step, so
        # position t only sees the embeddings of tokens before t.
        inputs = tf.pad(inputs, [[0, 0], [1, 0], [0, 0]])
        inputs = inputs[:, :-1, :]
        inputs = func.add_timing_signal(inputs)
    else:
        # If every target id equals the pad id, feed zeros instead of the
        # pad embedding (presumably the first decoding step -- TODO confirm).
        inputs = tf.cond(
            tf.reduce_all(tf.equal(target, params.tgt_vocab.pad())),
            lambda: tf.zeros_like(inputs), lambda: inputs)
        mask = tf.ones_like(mask)
        # The timing signal is offset by the current decoding step.
        inputs = func.add_timing_signal(inputs,
                                        time=dtype.tf_to_float(state['time']))

    inputs = util.valid_apply_dropout(inputs, params.dropout)

    # Applying L0Drop
    # --------
    source_memory = state["encodes"]
    source_mask = state["mask"]

    # source_pruning: log alpha_i = x_i w^T
    # (one scalar gate parameter per source position)
    source_pruning = func.linear(source_memory, 1, scope="source_pruning")

    if is_training:  # training
        # Gate the source memory and keep the sampled gate values.
        source_memory, l0_mask = l0norm.var_train(
            (source_memory, source_pruning))
        # Per-position L0 penalty, averaged over valid source positions of
        # each sentence and then over the batch.
        l0_norm_loss = tf.squeeze(l0norm.l0_norm(source_pruning), -1)
        l0_norm_loss = tf.reduce_sum(l0_norm_loss * source_mask,
                                     -1) / tf.reduce_sum(source_mask, -1)
        l0_norm_loss = tf.reduce_mean(l0_norm_loss)
        # Scale the penalty according to the configured ramp-up schedule.
        l0_norm_loss = l0norm.l0_regularization_loss(
            l0_norm_loss,
            reg_scalar=params.l0_norm_reg_scalar,
            start_reg_ramp_up=params.l0_norm_start_reg_ramp_up,
            end_reg_ramp_up=params.l0_norm_end_reg_ramp_up,
            warm_up=params.l0_norm_warm_up,
        )

        # force the model to only attend to unmasked position
        source_mask = dtype.tf_to_float(
            tf.cast(tf.squeeze(l0_mask, -1), tf.bool)) * source_mask
    else:  # evaluation
        source_memory, l0_mask = l0norm.var_eval(
            (source_memory, source_pruning))
        # No regularisation term outside training.
        l0_norm_loss = 0.0

        # NOTE(review): extract_encodes presumably compacts the memory to the
        # positions kept by the gate and returns how many were kept --
        # confirm against its definition.
        source_memory, source_mask, count_mask = extract_encodes(
            source_memory, source_mask, l0_mask)
        # Add two singleton axes so count_mask can broadcast inside attention.
        count_mask = tf.expand_dims(tf.expand_dims(count_mask, 1), 1)
    # --------

    with tf.variable_scope("decoder"):
        x = inputs
        for layer in range(params.num_decoder_layer):
            if params.deep_transformer_init:
                # Depth-scaled initialisation: the gain shrinks with
                # 1 / sqrt(layer index + 1).
                layer_initializer = tf.variance_scaling_initializer(
                    params.initializer_gain * (layer + 1)**-0.5,
                    mode="fan_avg",
                    distribution="uniform")
            else:
                layer_initializer = None
            with tf.variable_scope("layer_{}".format(layer),
                                   initializer=layer_initializer):
                with tf.variable_scope("self_attention"):
                    # Causal self-attention; in evaluation mode the layer
                    # cache from state is passed in.
                    y = func.dot_attention(
                        x,
                        None,
                        func.attention_bias(tf.shape(mask)[1], "causal"),
                        hidden_size,
                        num_heads=params.num_heads,
                        dropout=params.attention_dropout,
                        cache=None if is_training else
                        state['decoder']['state']['layer_{}'.format(layer)])
                    if not is_training:
                        # k, v
                        # Store the cache returned by attention back into
                        # state (mutates the caller's dict in place).
                        state['decoder']['state']['layer_{}'.format(layer)] \
                            .update(y['cache'])

                    y = y['output']
                    x = func.residual_fn(x, y, dropout=params.residual_dropout)
                    x = func.layer_norm(x)

                with tf.variable_scope("cross_attention"):
                    if is_training:
                        y = func.dot_attention(
                            x,
                            source_memory,
                            func.attention_bias(source_mask, "masking"),
                            hidden_size,
                            num_heads=params.num_heads,
                            dropout=params.attention_dropout,
                        )
                    else:
                        # Evaluation uses the unqualified dot_attention
                        # (not func.dot_attention), which additionally
                        # accepts count_mask and the layer cache.
                        y = dot_attention(x,
                                          source_memory,
                                          func.attention_bias(
                                              source_mask, "masking"),
                                          hidden_size,
                                          count_mask=count_mask,
                                          num_heads=params.num_heads,
                                          dropout=params.attention_dropout,
                                          cache=state['decoder']['state'][
                                              'layer_{}'.format(layer)])

                        # mk, mv
                        state['decoder']['state']['layer_{}'.format(layer)] \
                            .update(y['cache'])

                    y = y['output']
                    x = func.residual_fn(x, y, dropout=params.residual_dropout)
                    x = func.layer_norm(x)

                with tf.variable_scope("feed_forward"):
                    y = func.ffn_layer(
                        x,
                        params.filter_size,
                        hidden_size,
                        dropout=params.relu_dropout,
                    )

                    x = func.residual_fn(x, y, dropout=params.residual_dropout)
                    x = func.layer_norm(x)
    feature = x
    if 'dev_decode' in state:
        # Only the last time step is scored in this mode.
        feature = x[:, -1, :]

    # Pick the output projection matrix by variable name: the shared
    # source/target embedding takes precedence, then the target embedding,
    # otherwise a dedicated softmax matrix.
    embed_name = "tgt_embedding" if params.shared_target_softmax_embedding \
        else "softmax_embedding"
    embed_name = "embedding" if params.shared_source_target_embedding \
        else embed_name
    softmax_emb = tf.get_variable(embed_name,
                                  [params.tgt_vocab.size(), params.embed_size],
                                  initializer=initializer)
    # Flatten to 2-D and project onto the vocabulary: feature @ softmax_emb^T.
    feature = tf.reshape(feature, [-1, params.embed_size])
    logits = tf.matmul(feature, softmax_emb, False, True)

    # The loss is always computed in float32.
    logits = tf.cast(logits, tf.float32)

    soft_label, normalizer = util.label_smooth(target,
                                               util.shape_list(logits)[-1],
                                               factor=params.label_smooth)
    centropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                          labels=soft_label)
    # Subtract the normalizer returned by util.label_smooth.
    centropy -= normalizer
    centropy = tf.reshape(centropy, tf.shape(target))

    # Masked sum over time divided by the number of unmasked positions.
    mask = tf.cast(mask, tf.float32)
    per_sample_loss = tf.reduce_sum(centropy * mask, -1) / tf.reduce_sum(
        mask, -1)
    loss = tf.reduce_mean(per_sample_loss)

    loss = loss + l0_norm_loss

    # these mask tricks mainly used to deal with zero shapes, such as [0, 1]
    # (an empty batch yields a loss of exactly 0)
    loss = tf.cond(tf.equal(tf.shape(target)[0], 0),
                   lambda: tf.constant(0, tf.float32), lambda: loss)

    return loss, logits, state, per_sample_loss
Ejemplo n.º 22
0
    def __init__(self,
                 config,
                 batch,
                 word_mat=None,
                 char_mat=None,
                 pos_mat=None,
                 filter_sizes=None,
                 embedding_size=None,
                 num_filters=None,
                 trainable=True,
                 l2_reg_lambda=0.0,
                 keep_prob=0.9,
                 graph=None):
        """Build the attention + CNN three-way classifier graph.

        The graph embeds context and question (word, char-level and POS
        features), encodes both with a stacked GRU, applies question-to-
        context attention, then runs convolution + max-pooling over the
        attended context and projects the pooled features to 3 scores.

        Args:
            config: settings object. Attributes read here: ``use_cudnn``,
                ``batch_size``, ``para_limit``, ``ques_limit``,
                ``char_limit``, ``hidden``, ``char_dim``, ``char_hidden``,
                ``test_para_limit``, ``test_ques_limit``, ``keep_prob``,
                ``init_lr`` and ``grad_clip``.
            batch: object whose ``get_next()`` returns the input tensors:
                11 of them when ``trainable`` is truthy (including the
                labels ``input_y`` and ``qa_id``), 9 otherwise.
            word_mat: initial word-embedding matrix (array-like of floats).
            char_mat: initial character-embedding matrix.
            pos_mat: initial POS-tag embedding matrix.
            filter_sizes: iterable of convolution filter heights.
            embedding_size: unused; kept for interface compatibility.
            num_filters: number of filters per filter size.
            trainable: when truthy, labels are read from ``batch`` and the
                loss, accuracy, optimizer, train op and saver are built.
            l2_reg_lambda: weight of the L2 penalty on the conv and output
                parameters.
            keep_prob: keep probability of the dropout applied to the
                pooled features.
            graph: graph stored on ``self.graph``; a fresh ``tf.Graph()``
                is stored when ``None``.
        """
        self.config = config
        self.graph = graph if graph is not None else tf.Graph()
        self.trainable = trainable
        gru = cudnn_gru if config.use_cudnn else native_gru
        # Mode flag handed to the GRU / attention helpers. It is a boolean
        # and therefore not differentiable, so it must stay out of the
        # trainable-variables collection used by the optimizer below.
        self.is_train = tf.get_variable("is_train",
                                        shape=[],
                                        dtype=tf.bool,
                                        trainable=False)
        # Training batches carry labels (input_y) and example ids (qa_id);
        # inference batches do not.
        if trainable:
            (self.input_x, self.c_pos, self.c_important, self.input_x1,
             self.q_pos, self.q_important, self.ch, self.qh, self.input_y,
             self.qa_id, self.alternatives_tokens) = batch.get_next()
        else:
            (self.input_x, self.c_pos, self.c_important, self.input_x1,
             self.q_pos, self.q_important, self.ch, self.qh,
             self.alternatives_tokens) = batch.get_next()
        self.dropout_keep_prob = keep_prob
        self.global_step = tf.get_variable(
            'global_step',
            shape=[],
            dtype=tf.int32,
            initializer=tf.constant_initializer(0),
            trainable=False)
        self.dropout = tf.placeholder_with_default(0.5, (), name="dropout")
        # Accumulates the L2 penalty of the conv and output parameters.
        l2_loss = tf.constant(0.0)

        # Masks: True wherever the id is non-zero, i.e. not padding.
        # NOTE(review): original notes give shapes (64, 400) for the context
        # and (64, 50) for the question -- confirm against the data pipeline.
        self.c_mask = tf.cast(self.input_x, tf.bool)
        self.q_mask = tf.cast(self.input_x1, tf.bool)
        self.c_pos_mask = tf.cast(self.c_pos, tf.bool)
        self.q_pos_mask = tf.cast(self.q_pos, tf.bool)
        self.c_important_mask = tf.cast(self.c_important, tf.bool)
        self.q_important_mask = tf.cast(self.q_important, tf.bool)

        # Actual (unpadded) lengths: count of non-zero ids along axis 1.
        self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1)
        self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1)
        self.c_pos_len = tf.reduce_sum(tf.cast(self.c_pos_mask, tf.int32),
                                       axis=1)
        self.q_pos_len = tf.reduce_sum(tf.cast(self.q_pos_mask, tf.int32),
                                       axis=1)
        self.c_important_len = tf.reduce_sum(tf.cast(self.c_important_mask,
                                                     tf.int32),
                                             axis=1)
        self.q_important_len = tf.reduce_sum(tf.cast(self.q_important_mask,
                                                     tf.int32),
                                             axis=1)

        # Characters per word (non-zero char ids along axis 2), flattened so
        # there is one length per word.
        self.ch_len = tf.reshape(
            tf.reduce_sum(tf.cast(tf.cast(self.ch, tf.bool), tf.int32),
                          axis=2), [-1])
        self.qh_len = tf.reshape(
            tf.reduce_sum(tf.cast(tf.cast(self.qh, tf.bool), tf.int32),
                          axis=2), [-1])

        # Embedding layer
        N, PL, QL, CL, d, dc, dg = (config.batch_size, config.para_limit,
                                    config.ques_limit, config.char_limit,
                                    config.hidden, config.char_dim,
                                    config.char_hidden)
        self.words_embedding = tf.get_variable("word_mat",
                                               initializer=tf.constant(
                                                   word_mat, dtype=tf.float32),
                                               trainable=True)
        self.pos_W_embedding = tf.get_variable("pos_mat",
                                               initializer=tf.constant(
                                                   pos_mat, dtype=tf.float32),
                                               trainable=True)
        self.char_mat = tf.get_variable("char_mat",
                                        initializer=tf.constant(
                                            char_mat, dtype=tf.float32),
                                        trainable=True)

        # Character-level word representation: a bidirectional GRU is run
        # over the characters of every word and its two final states are
        # concatenated.
        with tf.variable_scope("Input_Embedding_Layer"):
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)
            # The same cells are used for context and question characters.
            cell_fw = tf.contrib.rnn.GRUCell(dg)
            cell_bw = tf.contrib.rnn.GRUCell(dg)
            # Sequence lengths are per word: one entry for each of the
            # N * PL flattened context words.
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
            ch_emb = tf.concat([state_fw, state_bw], axis=1)  # [N*PL, 2*dg]
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
            qh_emb = tf.concat([state_fw, state_bw], axis=1)  # [N*QL, 2*dg]

            # Back to one char-derived vector per word position.
            qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
            ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])

        # Summary vectors of the "important" token sequences (final states
        # of a bidirectional GRU). They are computed but not fed into the
        # scores below; the layer is kept so the variable set is unchanged.
        # The misspelled scope name is part of the variable names and is
        # therefore left as is.
        with tf.variable_scope("Iportant_Embedding_Layer"):
            c_important_emb = tf.nn.embedding_lookup(self.words_embedding,
                                                     self.c_important)
            q_important_emb = tf.nn.embedding_lookup(self.words_embedding,
                                                     self.q_important)
            c_important_emb = tf.nn.dropout(c_important_emb,
                                            1.0 - 0.5 * self.dropout)
            q_important_emb = tf.nn.dropout(q_important_emb,
                                            1.0 - 0.5 * self.dropout)
            cell_fw = tf.contrib.rnn.GRUCell(dg)
            cell_bw = tf.contrib.rnn.GRUCell(dg)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw,
                cell_bw,
                c_important_emb,
                self.c_important_len,
                dtype=tf.float32)
            c_important_emb = tf.concat([state_fw, state_bw], axis=1)
            _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw,
                cell_bw,
                q_important_emb,
                self.q_important_len,
                dtype=tf.float32)
            q_important_emb = tf.concat([state_fw, state_bw], axis=1)

        # POS-tag features: here the per-step GRU *outputs* (not the final
        # states) are kept, so the result has one vector per position.
        with tf.variable_scope("pos_Embedding_Layer"):
            c_pos_em = tf.nn.embedding_lookup(self.pos_W_embedding, self.c_pos)
            q_pos_em = tf.nn.embedding_lookup(self.pos_W_embedding, self.q_pos)
            c_pos_em = tf.nn.dropout(c_pos_em, 1.0 - 0.5 * self.dropout)
            q_pos_em = tf.nn.dropout(q_pos_em, 1.0 - 0.5 * self.dropout)
            cell_fw = tf.contrib.rnn.GRUCell(dg)
            cell_bw = tf.contrib.rnn.GRUCell(dg)
            (out_fw, out_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, c_pos_em, self.c_pos_len, dtype=tf.float32)
            c_pos_em = tf.concat([out_fw, out_bw], axis=2)
            (out_fw, out_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, q_pos_em, self.q_pos_len, dtype=tf.float32)
            q_pos_em = tf.concat([out_fw, out_bw], axis=2)

        with tf.name_scope("embedding"):
            if trainable:
                self.c_maxlen, self.q_maxlen = (config.para_limit,
                                                config.ques_limit)
            else:
                self.c_maxlen, self.q_maxlen = (config.test_para_limit,
                                                config.test_ques_limit)
            self.embedded_chars = tf.nn.embedding_lookup(
                self.words_embedding, self.input_x)
            self.embedded_chars1 = tf.nn.embedding_lookup(
                self.words_embedding, self.input_x1)
            # Per position: word vector + char-level vector + POS features.
            c_emb = tf.concat([self.embedded_chars, ch_emb, c_pos_em], axis=2)
            q_emb = tf.concat([self.embedded_chars1, qh_emb, q_pos_em], axis=2)

        with tf.variable_scope("encoding"):
            # One 3-layer encoder instance is applied to both context and
            # question. input_size is the width of the concatenated
            # embedding.
            # NOTE(review): original notes say the output concatenates all
            # layers ([batch, len, 3 * 2 * d]) -- confirm against the gru
            # helper.
            rnn = gru(num_layers=3,
                      num_units=d,
                      batch_size=N,
                      input_size=c_emb.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            c = rnn(c_emb, seq_len=self.c_len)
            q = rnn(q_emb, seq_len=self.q_len)

        with tf.variable_scope("attention"):
            # Question-aware context representation, followed by one more
            # GRU layer over the context positions.
            qc_att = dot_attention(c,
                                   q,
                                   mask=self.q_mask,
                                   hidden=d,
                                   keep_prob=config.keep_prob,
                                   is_train=self.is_train)
            rnn = gru(num_layers=1,
                      num_units=d,
                      batch_size=N,
                      input_size=qc_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            att = rnn(qc_att, seq_len=self.c_len)

        # Create a convolution + maxpool layer for each filter size
        input_shape = att.get_shape().as_list()
        # Add a trailing channel axis so att can be fed to conv2d.
        att = tf.expand_dims(att, -1)
        pooled_outputs = []
        for filter_size in filter_sizes:
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer: each filter spans filter_size positions
                # and the full feature width.
                filter_shape = [filter_size, input_shape[-1], 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1),
                                name="W")
                l2_loss += tf.nn.l2_loss(W)
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]),
                                name="b")
                l2_loss += tf.nn.l2_loss(b)
                conv_ouput = tf.nn.conv2d(att,
                                          W,
                                          strides=[1, 1, 1, 1],
                                          padding="VALID",
                                          name="conv")
                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv_ouput, b), name="relu")
                # Maxpooling over the outputs: the window covers every valid
                # convolution position of a para_limit-long context.
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, config.para_limit - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])
        # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat,
                                        self.dropout_keep_prob)

        # Final (unnormalized) scores and predictions over the 3 classes
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, 3],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[3]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # Loss, accuracy and optimizer exist only when labels are available.
        if trainable:
            # Calculate mean cross-entropy loss
            with tf.name_scope("loss"):
                losses = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.scores, labels=self.input_y)
                self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
            # Accuracy
            with tf.name_scope("accuracy"):
                correct_predictions = tf.equal(self.predictions,
                                               tf.argmax(self.input_y, 1))
                self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                       "float"),
                                               name="accuracy")
            # Logarithmic warm-up: the rate grows with log(step + 1) and is
            # capped at config.init_lr (0.001 is reached at step 998).
            self.lr = tf.minimum(
                config.init_lr, 0.001 / tf.log(999.) *
                tf.log(tf.cast(self.global_step, tf.float32) + 1))
            self.opt = tf.train.AdamOptimizer(learning_rate=self.lr,
                                              beta1=0.8,
                                              beta2=0.999,
                                              epsilon=1e-7)
            # Clip by global norm before applying the update.
            grads = self.opt.compute_gradients(self.loss)
            gradients, variables = zip(*grads)
            capped_grads, _ = tf.clip_by_global_norm(gradients,
                                                     config.grad_clip)
            self.train_op = self.opt.apply_gradients(
                zip(capped_grads, variables), global_step=self.global_step)
            self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)
Ejemplo n.º 23
0
    def define_model(self):
        """Build the article/question matching graph and its MSE loss.

        Reads the input tensors already stored on the instance
        (``word_mat``, ``article``, ``question``, their lengths and masks,
        ``keep_prob``, ``is_train``, ``labels``) and sets:

        * ``self.score`` -- flat vector of softmax-normalised match scores;
          the softmax runs over groups of 4 consecutive rows.
        * ``self.loss`` -- mean squared error between ``self.score`` and
          ``self.labels``.
        * ``self.debug_output_name`` / ``self.debug_output`` -- names and
          tensors exposed for debugging.

        Histogram and scalar summaries are registered for the scores and
        the loss.
        """
        config = self.config
        # Effective batch is 4x the configured batch size; the scores are
        # regrouped into rows of 4 before the softmax below.
        # NOTE(review): presumably 4 candidates per example -- confirm with
        # the input pipeline.
        N = config.batch_size * 4
        d = config.hidden_size
        self.debug_output_name = []
        self.debug_output = []
        # Embedding lookups are pinned to the CPU.
        with tf.device("/cpu:0"):
            with tf.variable_scope("emb"):
                with tf.name_scope("word"):
                    c_emb = tf.nn.embedding_lookup(self.word_mat, self.article)
                    q_emb = tf.nn.embedding_lookup(self.word_mat,
                                                   self.question)

        with tf.variable_scope("encoding"):
            c, _ = stacked_gru(c_emb,
                               d,
                               batch=N,
                               num_layers=2,
                               seq_len=self.article_len,
                               keep_prob=self.keep_prob,
                               is_train=self.is_train)
            # The question is encoded with the same weights as the article.
            tf.get_variable_scope().reuse_variables()
            q, _ = stacked_gru(q_emb,
                               d,
                               batch=N,
                               num_layers=2,
                               seq_len=self.question_len,
                               keep_prob=self.keep_prob,
                               is_train=self.is_train)
            # Per the original notes:
            # c size: [batch_size, c_len, 2*d]
            # q size: [batch_size, q_len, 2*d]

        with tf.variable_scope("attention_q2d"):
            qc_att, att_weight_ = dot_attention(c,
                                                q,
                                                mask=self.question_mask,
                                                hidden=d,
                                                keep_prob=self.keep_prob,
                                                is_train=self.is_train)
            # Per the original notes:
            # att_weight_ : [batch_size, c_len, q_len]
            # qc_att: [batch_size, c_len, 2*2*d]

            att, _ = stacked_gru(qc_att,
                                 d,
                                 num_layers=1,
                                 seq_len=self.article_len,
                                 batch=N,
                                 keep_prob=self.keep_prob,
                                 is_train=self.is_train)

        with tf.variable_scope("match"):
            # Self-attention over the question-aware article encoding; the
            # attention weights of this step are not used.
            self_att, _ = dot_attention(att,
                                        att,
                                        mask=self.article_mask,
                                        hidden=d,
                                        keep_prob=self.keep_prob,
                                        is_train=self.is_train)
            match, _ = stacked_gru(self_att,
                                   d,
                                   num_layers=1,
                                   seq_len=self.article_len,
                                   batch=N,
                                   keep_prob=self.keep_prob,
                                   is_train=self.is_train)
            # match size (per the original notes): [batch_size, c_len, 2*d]

        with tf.variable_scope("sum"):
            # Weight of each article word = its attention mass summed over
            # all question words.
            weight_for_each_passage_word = tf.expand_dims(
                tf.reduce_sum(att_weight_, 2), 1)
            # [batch_size, 1, c_len]
            passage_representation = tf.matmul(weight_for_each_passage_word,
                                               match)
            # [batch_size, 1, 2*d]
            # Weight of each question word = its attention mass summed over
            # all article words.
            weight_for_each_question_word = tf.expand_dims(
                tf.reduce_sum(att_weight_, 1), 1)
            # [batch_size, 1, q_len]
            question_representation = tf.matmul(weight_for_each_question_word,
                                                q)
            # [batch_size, 1, 2*d]

        with tf.variable_scope("predict"):
            p_hidden = 2 * d
            q_hidden = 2 * d
            # Bilinear score q^T W p, computed in float64.
            W_predict = tf.get_variable(
                "W_predict", [q_hidden, p_hidden],
                initializer=tf.truncated_normal_initializer(stddev=0.1),
                dtype=tf.float64)
            question_representation = tf.reshape(question_representation,
                                                 [-1, q_hidden])
            # [batch_size, q_hidden]
            question_representation = tf.cast(question_representation,
                                              dtype=tf.float64)
            score = tf.matmul(question_representation, W_predict)
            # [batch_size, p_hidden]
            score = tf.reshape(score, [-1, 1, p_hidden])
            # [batch_size, 1, p_hidden]
            passage_representation = tf.transpose(passage_representation,
                                                  [0, 2, 1])
            passage_representation = tf.cast(passage_representation,
                                             dtype=tf.float64)
            score = tf.matmul(score, passage_representation)
            # [batch_size, 1, 1]
            # Regroup into rows of 4 and normalise each row.
            score = tf.reshape(score, [-1, 4])
            score = tf.nn.softmax(score, dim=1)
            score = tf.cast(score, dtype=tf.float32)
            self.score = tf.reshape(score, [-1])
            tf.summary.histogram('scores', self.score)
            # Keyword arguments make the labels/predictions roles explicit;
            # the positional order of this API is (labels, predictions).
            self.loss = tf.losses.mean_squared_error(labels=self.labels,
                                                     predictions=self.score)
            tf.summary.scalar('loss_function', self.loss)
        self.debug_output_name = ['att_weight_', 'score']
        self.debug_output = [att_weight_, self.score]
Ejemplo n.º 24
0
    def ready(self):
        """Assemble the span-prediction inference graph.

        Embeds context and question (word + char-level), encodes them,
        applies question-to-context attention and self-matching, runs the
        pointer network and stores the predicted start / end indices in
        ``self.yp1`` and ``self.yp2``. Sizes come from a fixed batch of 64,
        the instance's ``c_maxlen`` / ``q_maxlen`` and module-level settings.
        """
        batch = 64
        ctx_len = self.c_maxlen
        qry_len = self.q_maxlen
        chars_per_word = char_limit
        units = hidden
        char_emb_dim = char_dim
        char_units = char_hidden
        make_gru = cudnn_gru if use_cudnn else native_gru

        with tf.variable_scope("emb"):
            with tf.variable_scope("char"):
                # One row per word, one step per character.
                ctx_char_vecs = tf.nn.embedding_lookup(self.char_mat, self.ch)
                ctx_chars = tf.reshape(
                    ctx_char_vecs,
                    [batch * ctx_len, chars_per_word, char_emb_dim])
                qry_char_vecs = tf.nn.embedding_lookup(self.char_mat, self.qh)
                qry_chars = tf.reshape(
                    qry_char_vecs,
                    [batch * qry_len, chars_per_word, char_emb_dim])

                # The same pair of cells serves context and question.
                fw_cell = tf.contrib.rnn.GRUCell(char_units)
                bw_cell = tf.contrib.rnn.GRUCell(char_units)

                # Keep only the final forward / backward states per word.
                _, (fw_final, bw_final) = tf.nn.bidirectional_dynamic_rnn(
                    fw_cell,
                    bw_cell,
                    ctx_chars,
                    self.ch_len,
                    dtype=tf.float32)
                ctx_chars = tf.concat([fw_final, bw_final], axis=1)

                _, (fw_final, bw_final) = tf.nn.bidirectional_dynamic_rnn(
                    fw_cell,
                    bw_cell,
                    qry_chars,
                    self.qh_len,
                    dtype=tf.float32)
                qry_chars = tf.concat([fw_final, bw_final], axis=1)

                # Restore the [batch, length, features] layout.
                qry_chars = tf.reshape(qry_chars,
                                       [batch, qry_len, 2 * char_units])
                ctx_chars = tf.reshape(ctx_chars,
                                       [batch, ctx_len, 2 * char_units])

            with tf.name_scope("word"):
                ctx_words = tf.nn.embedding_lookup(self.word_mat, self.c)
                qry_words = tf.nn.embedding_lookup(self.word_mat, self.q)

            # Word vector followed by the char-level vector of each position.
            ctx_input = tf.concat([ctx_words, ctx_chars], axis=2)
            qry_input = tf.concat([qry_words, qry_chars], axis=2)

        with tf.variable_scope("encoding"):
            # A single encoder instance is applied to both sequences.
            encoder = make_gru(
                num_layers=3,
                num_units=units,
                batch_size=batch,
                input_size=ctx_input.get_shape().as_list()[-1])
            ctx_enc = encoder(ctx_input, seq_len=self.c_len)
            qry_enc = encoder(qry_input, seq_len=self.q_len)

        with tf.variable_scope("attention"):
            qry_aware = dot_attention(ctx_enc,
                                      qry_enc,
                                      mask=self.q_mask,
                                      hidden=units)
            att_rnn = make_gru(
                num_layers=1,
                num_units=units,
                batch_size=batch,
                input_size=qry_aware.get_shape().as_list()[-1])
            att_out = att_rnn(qry_aware, seq_len=self.c_len)

        with tf.variable_scope("match"):
            self_matched = dot_attention(att_out,
                                         att_out,
                                         mask=self.c_mask,
                                         hidden=units)
            match_rnn = make_gru(
                num_layers=1,
                num_units=units,
                batch_size=batch,
                input_size=self_matched.get_shape().as_list()[-1])
            match_out = match_rnn(self_matched, seq_len=self.c_len)

        with tf.variable_scope("pointer"):
            # The pointer network is initialised from a summary of the last
            # 2 * units features of the question encoding.
            qry_summary = summ(qry_enc[:, :, -2 * units:],
                               units,
                               mask=self.q_mask)
            pointer = ptr_net(batch=batch,
                              hidden=qry_summary.get_shape().as_list()[-1])
            logits1, logits2 = pointer(qry_summary, match_out, units,
                                       self.c_mask)

        with tf.variable_scope("predict"):
            # Outer product of start and end probabilities gives a score for
            # every (start, end) pair.
            start_probs = tf.expand_dims(tf.nn.softmax(logits1), axis=2)
            end_probs = tf.expand_dims(tf.nn.softmax(logits2), axis=1)
            span_scores = tf.matmul(start_probs, end_probs)
            # Keep only pairs with 0 <= end - start <= 12.
            span_scores = tf.matrix_band_part(span_scores, 0, 12)
            best_per_start = tf.reduce_max(span_scores, axis=2)
            self.yp1 = tf.argmax(best_per_start, axis=1)
            best_per_end = tf.reduce_max(span_scores, axis=1)
            self.yp2 = tf.argmax(best_per_end, axis=1)
Ejemplo n.º 25
0
    def get_vp(self, i):
        """Build the question-aware encoding of the ``i``-th paragraph window.

        Columns ``[i * MPL, (i + 1) * MPL)`` of ``self.c_pr`` / ``self.ch_pr``
        (``MPL = config.single_para_limit``) are embedded (char GRU + word
        lookup), encoded with the shared ``self.rnn1`` and attended against
        the question encoding ``self.q_enc``; the attended sequence is then
        run through the shared ``self.rnn2``.

        ``self.rnn1``, ``self.q_enc`` and ``self.rnn2`` are only created when
        ``i == 0``; any other ``i`` reuses them, so the ``i == 0`` call has to
        happen first.

        Args:
            i: index of the paragraph window. It is compared with ``0`` in
                plain Python, so presumably a Python int -- TODO confirm.

        Returns:
            Tuple ``(att, c_len, c_mask, y1, y2, seq_mask)``.
        """
        config = self.config
        gru = cudnn_gru if config.use_cudnn else native_gru
        trim_to_batch = True
        window = config.single_para_limit

        # Neither constant is read below; the calls are retained so that this
        # method keeps creating the same graph nodes as before.
        zero = tf.constant(0)
        i_ = tf.constant(i)

        lo = i * window
        hi = (i + 1) * window
        c_pr = self.c_pr[:, lo:hi]
        ch_pr = self.ch_pr[:, lo:hi, :]

        def _true_count(mask):
            # Number of True entries along axis 1.
            return tf.reduce_sum(tf.cast(mask, tf.int32), axis=1)

        def _flat_char_counts(char_ids):
            # Non-zero char ids per word (axis 2), flattened to one vector.
            nonzero = tf.cast(tf.cast(char_ids, tf.bool), tf.int32)
            return tf.reshape(tf.reduce_sum(nonzero, axis=2), [-1])

        # Masks local to this window: a non-zero id counts as a real token.
        c_mask = tf.cast(c_pr, tf.bool)
        q_mask = tf.cast(self.q, tf.bool)
        c_len = _true_count(c_mask)
        q_len = _true_count(q_mask)

        # Disabled experiment: zero out every c_len entry equal to 8, which
        # was described as padding coming from examples that have fewer
        # passages than the batch maximum.
        #     eight_indexes = tf.not_equal(c_len, tf.constant(8, dtype=tf.int32))
        #     eight_indexes = tf.cast(eight_indexes, tf.int32)
        #     c_len = c_len * eight_indexes

        if not trim_to_batch:
            # NOTE(review): this path only sets attributes on self; the locals
            # used further down (c_maxlen, q_maxlen, q, qh, y1, y2, seq_mask)
            # are never bound here. It is unreachable while trim_to_batch is
            # hard-coded to True.
            self.c_maxlen, self.q_maxlen = config.para_limit, config.ques_limit
        else:
            batch, char_limit = config.batch_size, config.char_limit
            c_maxlen = tf.reduce_max(c_len)
            q_maxlen = tf.reduce_max(q_len)
            para_2d = [batch, c_maxlen]
            ques_2d = [batch, q_maxlen]

            # Cut every input down to the longest sequence in the batch.
            c_pr = tf.slice(c_pr, [0, 0], para_2d)
            q = tf.slice(self.q, [0, 0], ques_2d)
            c_mask = tf.slice(c_mask, [0, 0], para_2d)
            q_mask = tf.slice(q_mask, [0, 0], ques_2d)
            ch_pr = tf.slice(ch_pr, [0, 0, 0], para_2d + [char_limit])
            qh = tf.slice(self.qh, [0, 0, 0], ques_2d + [char_limit])
            # NOTE(review): the labels are sliced from column 0 whatever ``i``
            # is -- confirm self.y1 / self.y2 are already per-window.
            y1 = tf.slice(self.y1, [0, 0], para_2d)
            y2 = tf.slice(self.y2, [0, 0], para_2d)

            seq_mask = tf.sequence_mask(c_len, maxlen=c_maxlen)

        ch_len = _flat_char_counts(ch_pr)
        qh_len = _flat_char_counts(qh)

        batch, para_len, ques_len, char_limit, units, char_dim, char_units = (
            config.batch_size, c_maxlen, q_maxlen, config.char_limit,
            config.hidden, config.char_dim, config.char_hidden)
        drop_args = dict(keep_prob=config.keep_prob, is_train=self.is_train)

        with tf.variable_scope("emb"):
            with tf.variable_scope("char"):
                ch_lookup = tf.nn.embedding_lookup(self.char_mat, ch_pr)
                ch_emb = tf.reshape(
                    ch_lookup, [batch * para_len, char_limit, char_dim])
                qh_lookup = tf.nn.embedding_lookup(self.char_mat, qh)
                qh_emb = tf.reshape(
                    qh_lookup, [batch * ques_len, char_limit, char_dim])
                ch_emb = dropout(ch_emb, **drop_args)
                qh_emb = dropout(qh_emb, **drop_args)

                # The char cells live on self and are used for both the
                # passage and the question; only the final states are kept.
                _, (para_fw, para_bw) = tf.nn.bidirectional_dynamic_rnn(
                    self.cell_fw, self.cell_bw, ch_emb, ch_len,
                    dtype=tf.float32)
                ch_emb = tf.concat([para_fw, para_bw], axis=1)
                _, (ques_fw, ques_bw) = tf.nn.bidirectional_dynamic_rnn(
                    self.cell_fw, self.cell_bw, qh_emb, qh_len,
                    dtype=tf.float32)
                qh_emb = tf.concat([ques_fw, ques_bw], axis=1)
                qh_emb = tf.reshape(qh_emb, [batch, ques_len, 2 * char_units])
                ch_emb = tf.reshape(ch_emb, [batch, para_len, 2 * char_units])

            with tf.name_scope("word"):
                c_words = tf.nn.embedding_lookup(self.word_mat, c_pr)
                q_words = tf.nn.embedding_lookup(self.word_mat, q)

            c_emb = tf.concat([c_words, ch_emb], axis=2)
            q_emb = tf.concat([q_words, qh_emb], axis=2)

        with tf.variable_scope("encoding"):
            if i == 0:
                # First window: create the shared encoder and encode the
                # question once.
                self.rnn1 = gru(num_layers=3,
                                num_units=units,
                                batch_size=batch,
                                input_size=c_emb.get_shape().as_list()[-1],
                                **drop_args)
                self.q_enc = self.rnn1(q_emb, seq_len=q_len)
            c = self.rnn1(c_emb, seq_len=c_len)

        with tf.variable_scope("attention"):
            qc_att = dot_attention(c,
                                   self.q_enc,
                                   mask=q_mask,
                                   hidden=units,
                                   name_scope="attention_layer",
                                   **drop_args)

            if i == 0:
                # First window: create the shared post-attention GRU.
                self.rnn2 = gru(num_layers=1,
                                num_units=units,
                                batch_size=batch,
                                input_size=qc_att.get_shape().as_list()[-1],
                                **drop_args)
            att = self.rnn2(qc_att, seq_len=c_len)

        return att, c_len, c_mask, y1, y2, seq_mask
Ejemplo n.º 26
0
    def ready(self):
        """Build the two-stage ("badptr", then main) span-prediction graph.

        Stage 1 (scopes "encoding" and "badptr_*") runs on gradient-stopped
        embeddings and encodings and predicts a span
        ``(self.bad_yp1, self.bad_yp2)``. ``self.loss`` is the mean softmax
        cross-entropy of this stage's logits against ``self.bad_y1`` /
        ``self.bad_y2``; no other loss is defined in this method.

        Stage 2 removes the passage positions inside that predicted span from
        the passage embedding (zero-padding the tail back to the original
        length), re-encodes passage and question with a second encoder built
        under the reused "encoding" scope, and runs attention -> match ->
        pointer to produce ``self.yp1`` / ``self.yp2``.

        Side effects:
            * ``self.c_len`` and ``self.c_mask`` are overwritten with the
              values that apply after the span removal.
            * ``self.assign_trick_ops`` is filled with ops copying the first
              encoder's ``init_fw`` / ``init_bw`` into the second encoder's;
              the caller has to run them (a reminder is printed).
            * Debug handles ``self.c_ck``, ``self.q_ck``, ``self.att_ck`` and
              ``self.match_ck`` expose the stage-2 intermediate tensors.
        """
        config = self.config
        N, PL, QL, CL, d, dc, dg = (
            config.batch_size, self.c_maxlen, self.q_maxlen,
            config.char_limit, config.hidden, config.char_dim,
            config.char_hidden)
        gru = cudnn_gru if config.use_cudnn else native_gru

        with tf.variable_scope("emb"):
            with tf.variable_scope("char"):
                ch_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.ch),
                    [N * PL, CL, dc])
                qh_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.qh),
                    [N * QL, CL, dc])
                ch_emb = dropout(ch_emb,
                                 keep_prob=config.keep_prob,
                                 is_train=self.is_train)
                qh_emb = dropout(qh_emb,
                                 keep_prob=config.keep_prob,
                                 is_train=self.is_train)
                # The same pair of cells encodes passage and question chars;
                # only the final fw/bw states are kept as the char embedding.
                cell_fw = tf.contrib.rnn.GRUCell(dg)
                cell_bw = tf.contrib.rnn.GRUCell(dg)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
                ch_emb = tf.concat([state_fw, state_bw], axis=1)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
                qh_emb = tf.concat([state_fw, state_bw], axis=1)
                qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
                ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])

            with tf.name_scope("word"):
                c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
                q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

            self.c_emb = c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            # Stage 1 must not push gradients back into the embeddings.
            bad_c_emb = tf.stop_gradient(c_emb)
            bad_q_emb = tf.stop_gradient(q_emb)

        with tf.variable_scope("encoding"):
            rnn = gru(num_layers=3,
                      num_units=d,
                      batch_size=N,
                      input_size=bad_c_emb.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            self.c_rnn = rnn(bad_c_emb, seq_len=self.c_len)
            self.q_rnn = rnn(bad_q_emb, seq_len=self.q_len)

            badptr_c = tf.stop_gradient(self.c_rnn)
            badptr_q = tf.stop_gradient(self.q_rnn)
            # Kept so its initial states can be copied into the stage-2
            # encoder (see assign_trick_ops below).
            old_rnn = rnn

        with tf.variable_scope("badptr_attention"):
            qc_att, self.badptr_qc_att = dot_attention(
                badptr_c,
                badptr_q,
                mask=self.q_mask,
                hidden=d,
                keep_prob=config.keep_prob,
                is_train=self.is_train,
                give=True)
            rnn = gru(num_layers=1,
                      num_units=d,
                      batch_size=N,
                      input_size=qc_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)

            # self.att[0] is the full attended sequence, self.att[1] its last
            # time step; the dense outputs below are appended after them.
            self.att = [rnn(qc_att, seq_len=self.c_len)]
            self.att += [self.att[-1][:, -1, :]]

        with tf.variable_scope("badptr_dense"):
            for _ in range(3):
                # NOTE(review): unlike the other dropout calls in this method
                # this one is not gated by self.is_train -- confirm intended.
                self.att += [
                    tf.nn.dropout(tf.keras.layers.Dense(300)(self.att[-1]),
                                  keep_prob=config.keep_prob)
                ]

        with tf.variable_scope("badptr"):
            init = self.att[-1]
            pointer = ptr_net(batch=N,
                              hidden=init.get_shape().as_list()[-1],
                              keep_prob=config.ptr_keep_prob,
                              is_train=self.is_train)
            logits1, logits2 = pointer(init, self.att[0], d, self.c_mask)

        with tf.variable_scope("badptr_predict"):
            # Joint start/end probability, restricted to start <= end and a
            # span of at most 16 tokens by the band mask.
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, 15)
            self.bad_yp1_distrib = tf.reduce_max(outer, axis=2)
            self.bad_yp2_distrib = tf.reduce_max(outer, axis=1)
            self.bad_yp1 = tf.argmax(self.bad_yp1_distrib, axis=1)
            self.bad_yp2 = tf.argmax(self.bad_yp2_distrib, axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits1, labels=tf.stop_gradient(self.bad_y1))
            losses2 = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits2, labels=tf.stop_gradient(self.bad_y2))
            self.loss = tf.reduce_mean(losses + losses2)

        # Recompute the passage with a keep-mask: True strictly before
        # bad_yp1 or strictly after bad_yp2, i.e. outside the predicted span.
        left = tf.sequence_mask(self.bad_yp1, tf.shape(c_emb)[1])
        right = tf.logical_not(
            tf.sequence_mask(self.bad_yp2 + 1,
                             tf.shape(c_emb)[1]))
        self.combo = combo = tf.logical_or(left, right)

        # For testing, the mask can be replaced by an all-True one:
        #     self.combo = combo = tf.cast(tf.ones_like(combo), tf.bool)

        def adjust(c_emb_combo):
            """Drop the masked-out rows of one example and zero-pad the tail."""
            emb_row, keep_row = c_emb_combo
            kept = tf.boolean_mask(emb_row, keep_row)

            # keep_row[0] False together with keep_row[1] True means the
            # predicted span is exactly position 0; in that case the example
            # is passed through untouched.
            # NOTE(review): self.c_len below still excludes position 0 for
            # such an example -- confirm that is intended.
            return tf.cond(
                tf.logical_and(tf.equal(keep_row[0], False),
                               tf.equal(keep_row[1], True)),
                true_fn=lambda: emb_row,
                false_fn=lambda: tf.pad(
                    kept,
                    [[0, tf.shape(emb_row)[0] - tf.shape(kept)[0]], [0, 0]]))

        self.c_emb_new = c_emb_new = tf.map_fn(adjust, (c_emb, combo),
                                               dtype=tf.float32)

        # Number of real passage tokens that survive the span removal. It is
        # computed once from the original mask and used for both the new
        # length and the new mask.
        kept_len = tf.reduce_sum(
            tf.cast(tf.logical_and(self.c_mask, self.combo), tf.int32),
            axis=-1)
        self.c_len = kept_len
        self.c_mask = tf.sequence_mask(kept_len, tf.shape(self.c_mask)[1])

        with tf.variable_scope("encoding", reuse=True):
            rnn = gru(num_layers=3,
                      num_units=d,
                      batch_size=N,
                      input_size=c_emb.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train,
                      super_hacky_reload=True)
            # The sequence length has to be the post-removal one.
            c = rnn(c_emb_new, seq_len=self.c_len)
            q = rnn(q_emb, seq_len=self.q_len)

        # Debug handles for the stage-2 encodings: passage and question.
        self.c_ck = c
        self.q_ck = q

        # These ops are only built here; the caller must run them.
        print('RUN ASSIGN TRICK OPS (model.assign_trick_ops)!!')
        self.assign_trick_ops = []
        for new_fw, old_fw, new_bw, old_bw in zip(rnn.init_fw,
                                                  old_rnn.init_fw,
                                                  rnn.init_bw,
                                                  old_rnn.init_bw):
            self.assign_trick_ops += [
                tf.assign(new_fw, old_fw),
                tf.assign(new_bw, old_bw),
            ]

        with tf.variable_scope("attention"):
            qc_att, self.qc_att = dot_attention(c,
                                                q,
                                                mask=self.q_mask,
                                                hidden=d,
                                                keep_prob=config.keep_prob,
                                                is_train=self.is_train,
                                                give=True)
            rnn = gru(num_layers=1,
                      num_units=d,
                      batch_size=N,
                      input_size=qc_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            att = rnn(qc_att, seq_len=self.c_len)

        self.att_ck = att

        with tf.variable_scope("match"):
            self_att = dot_attention(att,
                                     att,
                                     mask=self.c_mask,
                                     hidden=d,
                                     keep_prob=config.keep_prob,
                                     is_train=self.is_train)
            rnn = gru(num_layers=1,
                      num_units=d,
                      batch_size=N,
                      input_size=self_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            match = rnn(self_att, seq_len=self.c_len)

        self.match_ck = match

        with tf.variable_scope("pointer"):
            # The pointer is initialised from a summary of the last 2*d
            # features of the question encoding.
            init = summ(q[:, :, -2 * d:],
                        d,
                        mask=self.q_mask,
                        keep_prob=config.ptr_keep_prob,
                        is_train=self.is_train)
            pointer = ptr_net(batch=N,
                              hidden=init.get_shape().as_list()[-1],
                              keep_prob=config.ptr_keep_prob,
                              is_train=self.is_train)
            logits1, logits2 = pointer(init, match, d, self.c_mask)

        with tf.variable_scope("predict"):
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, 15)
            self.yp1_distrib = tf.reduce_max(outer, axis=2)
            self.yp2_distrib = tf.reduce_max(outer, axis=1)
            self.yp1 = tf.argmax(self.yp1_distrib, axis=1)
            self.yp2 = tf.argmax(self.yp2_distrib, axis=1)
Ejemplo n.º 27
0
    def ready(self):
        """Assemble the span-prediction graph and its training loss.

        One variable scope per stage:
          * "emb"       -- char-level stacked GRU plus word lookup, concatenated
          * "encoding"  -- 3-layer stacked GRU; passage first, then the
                           question with the scope's variables reused
          * "attention" -- passage-to-question dot attention followed by a GRU
          * "match"     -- passage self-attention followed by a GRU
          * "pointer"   -- one forward (start then end) and one backward (end
                           then start) pointer pass; the logits are averaged
          * "predict"   -- span decoding and softmax cross-entropy loss

        Sets ``self.yp1``, ``self.yp2`` and ``self.loss``.
        """
        config = self.config
        batch, c_steps, q_steps, word_chars, units, char_dim, char_units = (
            config.batch_size, self.c_maxlen, self.q_maxlen,
            config.char_limit, config.hidden, config.char_dim,
            config.char_hidden)
        # Dropout settings shared by every recurrent / attention layer.
        rnn_opts = dict(keep_prob=self.keep_prob, is_train=self.is_train)

        with tf.variable_scope("emb"):
            with tf.variable_scope("char"):
                c_chars = tf.nn.embedding_lookup(self.char_mat, self.ch)
                c_chars = tf.reshape(
                    c_chars, [batch * c_steps, word_chars, char_dim])
                q_chars = tf.nn.embedding_lookup(self.char_mat, self.qh)
                q_chars = tf.reshape(
                    q_chars, [batch * q_steps, word_chars, char_dim])

                # Question first; the passage then reuses the same variables.
                _, q_char_state = stacked_gru(q_chars,
                                              char_units,
                                              num_layers=1,
                                              seq_len=self.qh_len,
                                              **rnn_opts)
                tf.get_variable_scope().reuse_variables()
                _, c_char_state = stacked_gru(c_chars,
                                              char_units,
                                              num_layers=1,
                                              seq_len=self.ch_len,
                                              **rnn_opts)
                q_char_emb = tf.reshape(q_char_state,
                                        [batch, q_steps, 2 * char_units])
                c_char_emb = tf.reshape(c_char_state,
                                        [batch, c_steps, 2 * char_units])

            with tf.name_scope("word"):
                c_word_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
                q_word_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

            c_emb = tf.concat([c_word_emb, c_char_emb], axis=2)
            q_emb = tf.concat([q_word_emb, q_char_emb], axis=2)

        with tf.variable_scope("encoding"):
            c, _ = stacked_gru(c_emb,
                               units,
                               batch=batch,
                               num_layers=3,
                               seq_len=self.c_len,
                               **rnn_opts)
            tf.get_variable_scope().reuse_variables()
            q, _ = stacked_gru(q_emb,
                               units,
                               batch=batch,
                               num_layers=3,
                               seq_len=self.q_len,
                               **rnn_opts)

        with tf.variable_scope("attention"):
            qc_att = dot_attention(c,
                                   q,
                                   mask=self.q_mask,
                                   hidden=units,
                                   **rnn_opts)
            att, _ = stacked_gru(qc_att,
                                 units,
                                 num_layers=1,
                                 seq_len=self.c_len,
                                 **rnn_opts)

        with tf.variable_scope("match"):
            self_att = dot_attention(att,
                                     att,
                                     mask=self.c_mask,
                                     hidden=units,
                                     **rnn_opts)
            match, _ = stacked_gru(self_att,
                                   units,
                                   num_layers=1,
                                   seq_len=self.c_len,
                                   **rnn_opts)

        with tf.variable_scope("pointer"):
            # The pointer stage has its own keep probability.
            ptr_opts = dict(keep_prob=self.ptr_keep_prob,
                            is_train=self.is_train)
            q_tail = q[:, :, -2 * units:]
            init = summ(q_tail, units, mask=self.q_mask, **ptr_opts)
            d_match = dropout(match, **ptr_opts)
            ptr_units = init.get_shape().as_list()[-1]
            cell_fw = GRUCell(ptr_units)
            cell_bw = GRUCell(ptr_units)

            with tf.variable_scope("fw"):
                # Forward pass: start logits first, end logits after one
                # GRU step.
                fw_inp, logits1_fw = pointer(d_match,
                                             init,
                                             units,
                                             mask=self.c_mask)
                _, fw_state = cell_fw(fw_inp, init)
                tf.get_variable_scope().reuse_variables()
                _, logits2_fw = pointer(d_match,
                                        fw_state,
                                        units,
                                        mask=self.c_mask)

            with tf.variable_scope("bw"):
                # Backward pass: end logits first, start logits after one
                # GRU step.
                bw_inp, logits2_bw = pointer(d_match,
                                             init,
                                             units,
                                             mask=self.c_mask)
                _, bw_state = cell_bw(bw_inp, init)
                tf.get_variable_scope().reuse_variables()
                _, logits1_bw = pointer(d_match,
                                        bw_state,
                                        units,
                                        mask=self.c_mask)

            logits1 = (logits1_fw + logits1_bw) / 2.
            logits2 = (logits2_fw + logits2_bw) / 2.

        with tf.variable_scope("predict"):
            # Outer product of start and end probabilities; the band keeps
            # start <= end with at most 15 positions between them.
            start_col = tf.expand_dims(tf.nn.softmax(logits1), axis=2)
            end_row = tf.expand_dims(tf.nn.softmax(logits2), axis=1)
            outer = tf.matmul(start_col, end_row)
            outer = tf.matrix_band_part(outer, 0, 15)

            best_per_start = tf.reduce_max(outer, axis=2)
            self.yp1 = tf.argmax(best_per_start, axis=1)
            best_per_end = tf.reduce_max(outer, axis=1)
            self.yp2 = tf.argmax(best_per_end, axis=1)

            start_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits1, labels=self.y1)
            end_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits2, labels=self.y2)
            self.loss = tf.reduce_mean(start_loss + end_loss)