Beispiel #1
0
    def encode(self, inputs, masks, is_train):
        """Run the context and the question through two separate BiLSTMs.

        Args:
            inputs: Pair ``(context, question)``; each element is handed to
                ``BiLSTM`` as its first positional argument.
            masks: Pair ``(context_mask, question_mask)``; each element is
                handed to ``BiLSTM`` as its second positional argument.
            is_train: Predicate given to ``tf.cond``. When true the
                configured ``self.*_dropout_keep_prob`` values are used,
                otherwise 1.0 (i.e. no dropout).

        Returns:
            A 2-tuple of lists, ``[lstm_out_context, lstm_pool_context]`` and
            ``[lstm_out_question, lstm_pool_question]``. Each ``lstm_out_*``
            is elements 0 and 1 of the second ``BiLSTM`` result concatenated
            along axis 2; each ``lstm_pool_*`` is the first ``BiLSTM`` result
            (original note: [batch_size, 2 * rnn_hidden_units] -- TODO
            confirm against the BiLSTM helper).
        """
        context, question = inputs
        context_mask, question_mask = masks

        def keep_prob(train_value):
            # Dropout is only active while training.
            return tf.cond(is_train, lambda: train_value, lambda: 1.0)

        def run_encoder(sequence, mask, out_name):
            # Keep probabilities are built in the order output, input, state.
            pooled, directions = BiLSTM(
                sequence,
                mask,
                self.hidden_units,
                keep_prob(self.output_dropout_keep_prob),
                keep_prob(self.input_dropout_keep_prob),
                keep_prob(self.state_dropout_keep_prob),
                n_layers=self.n_layers,
                residual=False,
                use_last=True,
                seed=self.seed,
                reuse=False)
            merged = tf.concat([directions[0], directions[1]],
                               2,
                               name=out_name)
            return merged, pooled

        with tf.variable_scope("encode_context"):
            lstm_out_context, lstm_pool_context = run_encoder(
                context, context_mask, 'lstm_out_context')

        with tf.variable_scope('encode_question'):
            lstm_out_question, lstm_pool_question = run_encoder(
                question, question_mask, 'lstm_out_question')

        context_result = [lstm_out_context, lstm_pool_context]
        question_result = [lstm_out_question, lstm_pool_question]
        return context_result, question_result
Beispiel #2
0
 def run_match_lstm(self, context_out, question_out, context_len,
                    question_len, is_train):
     """Attend from the context over the question, then run a BiLSTM.

     Args:
         context_out: First argument of ``scaled_dot_product_attention``.
         question_out: Second argument of ``scaled_dot_product_attention``.
         context_len: Passed to ``BiLSTM`` as its second positional argument.
         question_len: Passed to the attention helper as ``memory_len``.
         is_train: Predicate for ``tf.cond``; selects the configured dropout
             keep probabilities when true and 1.0 otherwise. Also forwarded
             to the attention helper.

     Returns:
         Elements 0 and 1 of the ``BiLSTM`` result concatenated along axis 2
         (op name ``'lstm_out'``).
     """
     attended = scaled_dot_product_attention(context_out,
                                             question_out,
                                             memory_len=question_len,
                                             hidden=self.hidden_units,
                                             keep_prob=self.keep_prob,
                                             is_train=is_train)

     def _keep(train_value):
         # Dropout is disabled (keep probability 1.0) outside training.
         return tf.cond(is_train, lambda: train_value, lambda: 1.0)

     # Built in the same order as the keyword arguments below.
     output_keep = _keep(self.output_keep_prob)
     input_keep = _keep(self.input_keep_prob)
     state_keep = _keep(self.state_keep_prob)

     directions = BiLSTM(attended,
                         context_len,
                         self.hidden_units,
                         output_dropout_keep_prob=output_keep,
                         input_dropout_keep_prob=input_keep,
                         state_dropout_keep_prob=state_keep,
                         use_last=False,
                         seed=self.seed,
                         reuse=False)
     return tf.concat([directions[0], directions[1]], 2, name='lstm_out')
Beispiel #3
0
  def _gen_left_right_ctx(self):
    """Build the attention-pooled feature over the left/right context.

    Creates ``self.layers`` (a ``BiLSTM`` layer plus three attention weight
    variables), encodes ``self.ment_sent_right_ctx_embed`` and
    ``self.ment_sent_left_ctx_embed`` with the same layer, scores every
    position against ``self.ment_surface_feature`` and returns the
    attention-weighted sum after dropout with ``self.keep_prob``.

    Side effects: sets ``self.layers``, ``self.attention_dims``,
    ``self.rnn_size``, ``self.right_feature``, ``self.left_feature`` and
    ``self.att_w2`` (the softmax attention weights), and prints the
    resulting tensor.
    """
    self.layers = {}
    self.attention_dims = 50
    self.rnn_size = 150
    self.layers['BiLSTM'] = BiLSTM(self.rnn_size)

    def small_normal(shape):
      # Weight variable initialised from a truncated normal, stddev 0.01.
      return tf.Variable(tf.truncated_normal(shape, stddev=0.01))

    # Variables are created in the order h_m, h1, h2.
    self.layers['att_weights'] = {
        'h_m': small_normal([self.args.word_dim, self.attention_dims]),
        'h1': small_normal([2 * self.rnn_size, self.attention_dims]),
        'h2': small_normal([self.attention_dims, 1]),
    }

    encoder = self.layers['BiLSTM']
    weights = self.layers['att_weights']

    # The same layer object encodes the right context first, then the left.
    self.right_feature, _, _ = encoder(self.ment_sent_right_ctx_embed)
    self.left_feature, _, _ = encoder(self.ment_sent_left_ctx_embed)

    context = tf.concat([self.right_feature, self.left_feature], 1)

    # Project the mention feature (with an inserted axis 1) so it can be
    # added to every context position.
    mention_term = tf.einsum('aij,jk->aik',
                             tf.expand_dims(self.ment_surface_feature, 1),
                             weights['h_m'])
    context_term = tf.einsum('aij,jk->aik', context, weights['h1'])
    hidden = tf.nn.tanh(context_term + mention_term)

    scores = tf.einsum('aij,jk->aik', hidden, weights['h2'])[:, :, 0]
    self.att_w2 = tf.nn.softmax(scores, -1)

    # Repeat each attention weight across the 2 * rnn_size feature axis.
    tiled_weights = tf.tile(tf.expand_dims(self.att_w2, -1),
                            [1, 1, 2 * self.rnn_size])

    pooled = tf.reduce_sum(tf.multiply(context, tiled_weights), 1)
    pooled = tf.nn.dropout(pooled, self.keep_prob)
    print('lstm_feature:', pooled)
    return pooled
Beispiel #4
0
    def run_lstm(self, context_out, question_pool, context_len, is_train):
        """Concatenate the pooled question onto every context step and run a BiLSTM.

        Args:
            context_out: Context representation; its dimension 1 gives the
                number of times the question vector is repeated.
            question_pool: Pooled question representation; an axis is
                inserted at position 1 and tiled along it (original note:
                (batch_size, 1, D) after the expand).
            context_len: Passed to ``BiLSTM`` as its second positional
                argument.
            is_train: Predicate for ``tf.cond``; selects the configured
                dropout keep probabilities when true and 1.0 otherwise.

        Returns:
            Elements 0 and 1 of the ``BiLSTM`` result concatenated along
            axis 2 (op name ``'lstm_out'``), built inside variable scope
            ``'lstm_'``.
        """
        # Repeat the pooled question once per context position.
        question_expanded = tf.expand_dims(question_pool, 1)
        passage_steps = tf.shape(context_out)[1]
        question_tiled = tf.tile(question_expanded, [1, passage_steps, 1])

        joint_rep = tf.concat([context_out, question_tiled], axis=-1)

        def keep_prob(train_value):
            # Dropout is disabled (keep probability 1.0) outside training.
            return tf.cond(is_train, lambda: train_value, lambda: 1.0)

        with tf.variable_scope('lstm_'):
            directions = BiLSTM(joint_rep,
                                context_len,
                                self.hidden_units,
                                keep_prob(self.output_keep_prob),
                                keep_prob(self.input_keep_prob),
                                keep_prob(self.state_keep_prob),
                                use_last=False,
                                seed=self.seed,
                                reuse=False)
            merged = tf.concat([directions[0], directions[1]],
                               2,
                               name='lstm_out')
        return merged
Beispiel #5
0
    def __init__(self, model, **kwargs):
        """Create the embedding layer and the BiLSTM inside a sub-collection of *model*.

        Keyword Args:
            basename: Passed to ``read_index`` and to
                ``Embeddings.init_from_word2vec``.
            lstm_num_layers: Fourth argument of ``BiLSTM`` (default 2).
            lstm_dim: Third argument of ``BiLSTM`` (default 250).
        """
        self.pc = model.add_subcollection()
        self.kwargs = kwargs

        basename = kwargs.get("basename")
        index = read_index(basename)
        self._num_labels = len(index[DEPREL])

        # Embeddings are created before the LSTM, in that order.
        self.embeddings = Embeddings.init_from_word2vec(
            self.pc, basename, FIELDS, index=index)
        self.lstm = BiLSTM(self.pc,
                           self.embeddings.dim,
                           kwargs.get("lstm_dim", 250),
                           kwargs.get("lstm_num_layers", 2))

        # ``spec`` is a 1-tuple wrapping the keyword arguments.
        self.spec = (kwargs,)
Beispiel #6
0
    def main_graph(self,
                   trained_model,
                   scope,
                   emb_dim,
                   cell,
                   rnn_dim,
                   rnn_num,
                   drop_out=0.5,
                   emb=None):
        """Build the per-bucket, per-GPU tagging graph and create the saver.

        Args:
            trained_model: Path the hyper-parameter dictionary is pickled
                to; nothing is written when it is ``None``.
            scope: Variable scope object; ``scope.reuse_variables()`` is
                called before the second bucket is built.
            emb_dim: Embedding dimension for the character and n-gram
                embedding layers.
            cell: ``'gru'`` selects ``GRUCell``; any other value selects
                ``LSTMCell``.
            rnn_dim: Number of units of each RNN cell.
            rnn_num: Number of stacked RNN layers (stacked only if > 1).
            drop_out: Stored in ``self.drop_out_v`` and in the saved
                parameter dictionary; the graph itself reads the
                ``drop_out_holder`` placeholder.
            emb: Optional initial weights for the character embedding layer.

        Side effects: sets ``self.batch_size_h``, ``self.drop_out``,
        ``self.drop_out_v``, ``self.emb_layer``, ``self.params`` and
        ``self.saver``; appends to ``self.gram_layers``, ``self.input_v1``,
        ``self.input_v2``, ``self.output`` and ``self.output_``; fills
        ``self.bucket_dit``. Those containers must already exist.
        """
        if trained_model is not None:
            param_dic = {
                'nums_chars': self.nums_chars,
                'nums_tags': self.nums_tags,
                'crf': self.crf,
                'emb_dim': emb_dim,
                'cell': cell,
                'rnn_dim': rnn_dim,
                'rnn_num': rnn_num,
                'drop_out': drop_out,
                'buckets_char': self.buckets_char,
                'ngram': self.ngram,
                'is_space': self.is_space,
                'sent_seg': self.sent_seg,
                'emb_path': self.emb_path,
                'tag_scheme': self.tag_scheme
            }
            # The context manager closes the file even if pickling fails.
            # The mode stays 'w' so existing readers of this file keep
            # working. NOTE(review): 'wb' is required on Python 3.
            with open(trained_model, 'w') as f_model:
                pickle.dump(param_dic, f_model)

        # define shared weights and variables
        batch_size_h = tf.placeholder(tf.int32, [], name='batch_size_holder')
        dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
        self.batch_size_h = batch_size_h
        self.drop_out = dr
        self.drop_out_v = drop_out
        self.emb_layer = EmbeddingLayer(self.nums_chars + 20,
                                        emb_dim,
                                        weights=emb,
                                        name='emb_layer')

        if self.ngram is not None:
            # No pre-trained n-gram embeddings are supported in this variant.
            ng_embs = [None for _ in range(len(self.ngram))]
            for i, n_gram in enumerate(self.ngram):
                self.gram_layers.append(
                    EmbeddingLayer(n_gram + 5000 * (i + 2),
                                   emb_dim,
                                   weights=ng_embs[i],
                                   name=str(i + 2) + 'gram_layer'))

        with tf.variable_scope('BiRNN'):

            if cell == 'gru':
                fw_rnn_cell = tf.contrib.rnn.GRUCell(rnn_dim)  # forward
                bw_rnn_cell = tf.contrib.rnn.GRUCell(rnn_dim)  # backward
            else:
                fw_rnn_cell = tf.contrib.rnn.LSTMCell(rnn_dim,
                                                      state_is_tuple=True)
                bw_rnn_cell = tf.contrib.rnn.LSTMCell(rnn_dim,
                                                      state_is_tuple=True)

            if rnn_num > 1:
                # NOTE(review): every layer is the *same* cell object.
                # Newer TensorFlow 1.x releases reject or weight-share such
                # stacks; left unchanged so existing checkpoints still load.
                fw_rnn_cell = tf.contrib.rnn.MultiRNNCell([fw_rnn_cell] *
                                                          rnn_num,
                                                          state_is_tuple=True)
                bw_rnn_cell = tf.contrib.rnn.MultiRNNCell([bw_rnn_cell] *
                                                          rnn_num,
                                                          state_is_tuple=True)

        output_wrapper = HiddenLayer(rnn_dim * 2,
                                     self.nums_tags,
                                     activation='linear',
                                     name='hidden')

        # define model for each bucket
        for idx, bucket in enumerate(self.buckets_char):
            if idx == 1:
                # Variables are created by the first bucket and shared by
                # all later ones.
                scope.reuse_variables()
            t1 = time()
            # Not used below; kept so a missing entry in real_batches still
            # fails here as before.
            batch_size = self.real_batches[idx]

            input_v1 = tf.placeholder(tf.int32, [None, bucket],
                                      name='input_1' + str(bucket))
            input_v2 = tf.placeholder(tf.int32, [None, bucket],
                                      name='input_2' + str(bucket))
            self.input_v1.append([input_v1])
            self.input_v2.append([input_v2])
            output = []
            for i in range(self.num_gpus):
                with tf.device('/gpu:{}'.format(i)):
                    # Each GPU gets its own slice of batch_size_h rows.
                    input_1 = input_v1[i * batch_size_h:(i + 1) * batch_size_h]

                    input_2 = input_v2[i * batch_size_h:(i + 1) * batch_size_h]

                    emb_set1 = []
                    emb_set2 = []

                    word_out1 = self.emb_layer(input_1)
                    word_out2 = self.emb_layer(input_2)
                    emb_set1.append(word_out1)
                    emb_set2.append(word_out2)

                    # n-gram inputs are not wired into this variant, so the
                    # sets hold one entry and the concat branch is inactive.
                    if len(emb_set1) > 1:
                        emb_out1 = tf.concat(axis=2, values=emb_set1)
                        emb_out2 = tf.concat(axis=2, values=emb_set2)

                    else:
                        emb_out1 = emb_set1[0]
                        emb_out2 = emb_set2[0]

                    emb_out1 = DropoutLayer(dr)(emb_out1)
                    emb_out2 = DropoutLayer(dr)(emb_out2)

                    # NOTE(review): the full placeholder input_v1, not the
                    # per-GPU slice input_1, is passed as the last argument.
                    rnn_out = BiLSTM(rnn_dim,
                                     fw_cell=fw_rnn_cell,
                                     bw_cell=bw_rnn_cell,
                                     p=dr,
                                     name='BiLSTM' + str(bucket),
                                     scope='BiRNN')(emb_out1, emb_out2,
                                                    input_v1)

                    output_g = output_wrapper(rnn_out)
                    output.append(output_g)
            self.output.append([output])

            self.output_.append([
                tf.placeholder(tf.int32, [None, bucket - 1],
                               name='tags' + str(bucket))
            ])
            self.bucket_dit[bucket] = idx

            # Single-argument call form prints the same on Python 2 and 3.
            print('Bucket %d, %f seconds' % (idx + 1, time() - t1))

        assert len(self.input_v1) == len(self.output)

        self.params = tf.trainable_variables()

        self.saver = tf.train.Saver()
Beispiel #7
0
    max_epochs = 30
    lstm_dim = 250
    arc_hidden_dim = 100
    label_hidden_dim = 100

    pc = dy.ParameterCollection()
    # embeddings = Embeddings(pc, [(len(index[FORM])+1, 100), (len(index[XPOS])+1, 25)])
    # input_dim = embeddings.dim

    input_dim = 125
    num_labels = len(index[DEPREL])

    form_embeddings = pc.add_lookup_parameters((len(index[FORM]) + 1, 100))
    pos_embeddings = pc.add_lookup_parameters((len(index[XPOS]) + 1, 25))

    bilstm = BiLSTM(pc, input_dim, lstm_dim, 2)

    arc_mlp = MLP()
    label_mlp = MLP()

    arc_mlp.WH = pc.add_parameters((arc_hidden_dim, lstm_dim))
    arc_mlp.WD = pc.add_parameters((arc_hidden_dim, lstm_dim))
    arc_mlp.b1 = pc.add_parameters((arc_hidden_dim))
    arc_mlp.b2 = pc.add_parameters((1, arc_hidden_dim))

    label_mlp.WH = pc.add_parameters((label_hidden_dim, lstm_dim))
    label_mlp.WD = pc.add_parameters((label_hidden_dim, lstm_dim))
    label_mlp.b1 = pc.add_parameters((label_hidden_dim))
    label_mlp.b2 = pc.add_parameters((num_labels, label_hidden_dim))

    def predict_arc(head, dep, h, WH, WD):
Beispiel #8
0
    def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num, drop_out=0.5, rad_dim=30, emb=None,
                   ng_embs=None, pixels=None, con_width=None, filters=None, pooling_size=None):
        """Build the per-bucket tagging graph with optional radical, n-gram and glyph features.

        Args:
            trained_model: Path the hyper-parameter dictionary is pickled
                to; nothing is written when it is ``None``.
            scope: Variable scope object; ``scope.reuse_variables()`` is
                called before the second bucket is built.
            emb_dim: Dimension of the character and n-gram embeddings.
            gru: Truthy selects ``GRUCell``, otherwise ``LSTMCell``.
            rnn_dim: Number of units of each RNN cell.
            rnn_num: Number of stacked RNN layers (stacked only if > 1).
            drop_out: Stored in ``self.drop_out_v`` and in the saved
                parameter dictionary; the graph itself reads the
                ``drop_out_holder`` placeholder.
            rad_dim: Dimension of the radical embedding.
            emb: Optional initial weights for the character embedding.
            ng_embs: Optional list of initial n-gram embedding weights; must
                have one entry per element of ``self.ngram``.
            pixels: Glyph pixel data; required when ``self.graphic`` is set.
                The glyph side length is ``sqrt(len(pixels[0]))``.
            con_width: Convolution width; required when ``self.graphic``.
            filters: Number of convolution filters; required when
                ``self.graphic``.
            pooling_size: Max-pooling size; required when ``self.graphic``.

        Side effects: sets ``self.drop_out``, ``self.drop_out_v``,
        ``self.params``, ``self.saver`` and, depending on the enabled
        features, ``self.emb_layer``, ``self.radical_layer``,
        ``self.input_p`` and ``self.pixels``; appends to
        ``self.gram_layers``, ``self.input_v``, ``self.output`` and
        ``self.output_``; fills ``self.bucket_dit``.
        """
        if trained_model is not None:
            param_dic = {}
            param_dic['nums_chars'] = self.nums_chars
            param_dic['nums_tags'] = self.nums_tags
            param_dic['tag_scheme'] = self.tag_scheme
            param_dic['graphic'] = self.graphic
            param_dic['pic_size'] = self.pic_size
            param_dic['word_vec'] = self.word_vec
            param_dic['radical'] = self.radical
            param_dic['crf'] = self.crf
            param_dic['emb_dim'] = emb_dim
            param_dic['gru'] = gru
            param_dic['rnn_dim'] = rnn_dim
            param_dic['rnn_num'] = rnn_num
            param_dic['drop_out'] = drop_out
            param_dic['filter_size'] = con_width
            param_dic['filters'] = filters
            param_dic['pooling_size'] = pooling_size
            param_dic['font'] = self.font
            param_dic['buckets_char'] = self.buckets_char
            param_dic['ngram'] = self.ngram
            # The context manager closes the file even if pickling fails.
            # The mode stays 'w' so existing readers of this file keep
            # working. NOTE(review): 'wb' is required on Python 3.
            with open(trained_model, 'w') as f_model:
                pickle.dump(param_dic, f_model)

        # define shared weights and variables

        dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
        self.drop_out = dr
        self.drop_out_v = drop_out

        if self.word_vec:
            self.emb_layer = EmbeddingLayer(self.nums_chars + 500, emb_dim, weights=emb, name='emb_layer')

        if self.radical:
            self.radical_layer = EmbeddingLayer(216, rad_dim, name='radical_layer')

        if self.ngram is not None:
            if ng_embs is not None:
                assert len(ng_embs) == len(self.ngram)
            else:
                ng_embs = [None for _ in range(len(self.ngram))]
            for i, n_gram in enumerate(self.ngram):
                self.gram_layers.append(
                    EmbeddingLayer(n_gram + 1000 * (i + 2), emb_dim, weights=ng_embs[i],
                                   name=str(i + 2) + 'gram_layer'))

        # The glyph wrappers only exist when self.graphic is enabled.
        wrapper_conv_1, wrapper_mp_1, wrapper_conv_2, wrapper_mp_2, wrapper_dense, wrapper_dr = \
            None, None, None, None, None, None

        if self.graphic:
            self.input_p = []
            assert pixels is not None and filters is not None and pooling_size is not None and con_width is not None

            self.pixels = pixels
            # Glyphs are stored flattened; recover the side length.
            pixel_dim = int(math.sqrt(len(pixels[0])))

            wrapper_conv_1 = TimeDistributed(Convolution(con_width, 1, filters, name='conv_1'), name='wrapper_c1')
            wrapper_mp_1 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_1'),
                                           name='wrapper_p1')

            p_size_1 = toolbox.down_pool(pixel_dim, pooling_size)

            wrapper_conv_2 = TimeDistributed(Convolution(con_width, filters, filters, name='conv_2'),
                                             name='wrapper_c2')
            wrapper_mp_2 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_2'),
                                           name='wrapper_p2')

            p_size_2 = toolbox.down_pool(p_size_1, pooling_size)

            wrapper_dense = TimeDistributed(
                HiddenLayer(p_size_2 * p_size_2 * filters, 100, activation='tanh', name='conv_dense'),
                name='wrapper_3')
            wrapper_dr = TimeDistributed(DropoutLayer(self.drop_out), name='wrapper_dr')

        with tf.variable_scope('BiRNN'):

            if gru:
                fw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
                bw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
            else:
                fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
                bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)

            if rnn_num > 1:
                # NOTE(review): every layer is the *same* cell object.
                # Newer TensorFlow releases reject or weight-share such
                # stacks; left unchanged so existing checkpoints still load.
                fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([fw_rnn_cell]*rnn_num, state_is_tuple=True)
                bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([bw_rnn_cell]*rnn_num, state_is_tuple=True)

        output_wrapper = TimeDistributed(
            HiddenLayer(rnn_dim * 2, self.nums_tags[0], activation='linear', name='hidden'), name='wrapper')

        # define model for each bucket
        for idx, bucket in enumerate(self.buckets_char):
            if idx == 1:
                # Variables are created by the first bucket and shared by
                # all later ones.
                scope.reuse_variables()
            t1 = time()

            input_v = tf.placeholder(tf.int32, [None, bucket], name='input_' + str(bucket))

            self.input_v.append([input_v])

            emb_set = []

            if self.word_vec:
                word_out = self.emb_layer(input_v)
                emb_set.append(word_out)

            if self.radical:
                input_r = tf.placeholder(tf.int32, [None, bucket], name='input_r' + str(bucket))

                self.input_v[-1].append(input_r)
                radical_out = self.radical_layer(input_r)
                emb_set.append(radical_out)

            if self.ngram is not None:
                for i in range(len(self.ngram)):
                    input_g = tf.placeholder(tf.int32, [None, bucket], name='input_g' + str(i) + str(bucket))
                    self.input_v[-1].append(input_g)
                    gram_out = self.gram_layers[i](input_g)
                    emb_set.append(gram_out)

            if self.graphic:
                input_p = tf.placeholder(tf.float32, [None, bucket, pixel_dim*pixel_dim])
                self.input_p.append(input_p)

                # Restore the 2-D glyph (single channel) and split along the
                # bucket axis so the time-distributed layers get one entry
                # per position.
                pix_out = tf.reshape(input_p, [-1, bucket, pixel_dim, pixel_dim, 1])
                pix_out = tf.unpack(pix_out, axis=1)

                conv_out_1 = wrapper_conv_1(pix_out)
                pooling_out_1 = wrapper_mp_1(conv_out_1)

                conv_out_2 = wrapper_conv_2(pooling_out_1)
                pooling_out_2 = wrapper_mp_2(conv_out_2)

                # Sanity check: the pooled size must match the size the
                # dense layer was built for.
                assert p_size_2 == pooling_out_2[0].get_shape().as_list()[1]
                pooling_out = tf.reshape(pooling_out_2, [-1, bucket, p_size_2 * p_size_2 * filters])
                pooling_out = tf.unpack(pooling_out, axis=1)

                graphic_out = wrapper_dense(pooling_out)
                graphic_out = wrapper_dr(graphic_out)

                emb_set.append(graphic_out)

            # NOTE(review): the result is only unpacked when several feature
            # sets are concatenated; a single feature set is passed on
            # as-is. Confirm this asymmetry is intended.
            if len(emb_set) > 1:
                emb_out = tf.concat(2, emb_set)
                emb_out = tf.unpack(emb_out)

            else:
                emb_out = emb_set[0]

            rnn_out = BiLSTM(rnn_dim, fw_cell=fw_rnn_cell, bw_cell=bw_rnn_cell, p=dr,
                             name='BiLSTM' + str(bucket), scope='BiRNN')(emb_out, input_v)

            output = output_wrapper(rnn_out)

            output_c = tf.pack(output, axis=1)

            self.output.append([output_c])

            self.output_.append([tf.placeholder(tf.int32, [None, bucket], name='tags' + str(bucket))])

            self.bucket_dit[bucket] = idx

            # Single-argument call form prints the same on Python 2 and 3.
            print('Bucket %d, %f seconds' % (idx + 1, time() - t1))

        assert len(self.input_v) == len(self.output) and len(self.output) == len(self.output_) \
            and len(self.output) == len(self.counts)

        self.params = tf.trainable_variables()

        self.saver = tf.train.Saver()
Beispiel #9
0
    def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num, drop_out=0.5, emb=None):
        """Build the per-bucket tagging graph (character + optional n-gram embeddings).

        Args:
            trained_model: Path the hyper-parameter dictionary is pickled
                to; nothing is written when it is ``None``.
            scope: Variable scope object; ``scope.reuse_variables()`` is
                called before the second bucket is built.
            emb_dim: Dimension of the character and n-gram embeddings.
            gru: Truthy selects ``GRUCell``, otherwise ``LSTMCell``.
            rnn_dim: Number of units of each RNN cell.
            rnn_num: Number of stacked RNN layers (stacked only if > 1).
            drop_out: Stored in ``self.drop_out_v`` and in the saved
                parameter dictionary; the graph itself reads the
                ``drop_out_holder`` placeholder.
            emb: Optional initial weights for the character embedding layer.

        Side effects: sets ``self.drop_out``, ``self.drop_out_v``,
        ``self.emb_layer``, ``self.params`` and ``self.saver``; appends to
        ``self.gram_layers``, ``self.input_v``, ``self.output`` and
        ``self.output_``; fills ``self.bucket_dit``.
        """
        if trained_model is not None:
            param_dic = {'nums_chars': self.nums_chars, 'nums_tags': self.nums_tags, 'crf': self.crf,
                         'emb_dim': emb_dim, 'gru': gru, 'rnn_dim': rnn_dim, 'rnn_num': rnn_num,
                         'drop_out': drop_out, 'buckets_char': self.buckets_char, 'ngram': self.ngram,
                         'is_space': self.is_space, 'sent_seg': self.sent_seg, 'emb_path': self.emb_path,
                         'tag_scheme': self.tag_scheme}
            # The context manager closes the file even if pickling fails.
            # The mode stays 'w' so existing readers of this file keep
            # working. NOTE(review): 'wb' is required on Python 3.
            with open(trained_model, 'w') as f_model:
                pickle.dump(param_dic, f_model)

        # define shared weights and variables

        dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
        self.drop_out = dr
        self.drop_out_v = drop_out

        self.emb_layer = EmbeddingLayer(self.nums_chars + 20, emb_dim, weights=emb, name='emb_layer')

        if self.ngram is not None:
            # No pre-trained n-gram embeddings are supported in this variant.
            ng_embs = [None for _ in range(len(self.ngram))]
            for i, n_gram in enumerate(self.ngram):
                self.gram_layers.append(
                    EmbeddingLayer(n_gram + 5000 * (i + 2), emb_dim, weights=ng_embs[i],
                                   name=str(i + 2) + 'gram_layer'))

        with tf.variable_scope('BiRNN'):

            if gru:
                fw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
                bw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
            else:
                fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
                bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)

            if rnn_num > 1:
                # NOTE(review): every layer is the *same* cell object.
                # Newer TensorFlow releases reject or weight-share such
                # stacks; left unchanged so existing checkpoints still load.
                fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([fw_rnn_cell]*rnn_num, state_is_tuple=True)
                bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([bw_rnn_cell]*rnn_num, state_is_tuple=True)

        output_wrapper = TimeDistributed(
            HiddenLayer(rnn_dim * 2, self.nums_tags, activation='linear', name='hidden'), name='wrapper')

        # define model for each bucket
        for idx, bucket in enumerate(self.buckets_char):
            if idx == 1:
                # Variables are created by the first bucket and shared by
                # all later ones.
                scope.reuse_variables()
            t1 = time()

            input_v = tf.placeholder(tf.int32, [None, bucket], name='input_' + str(bucket))

            self.input_v.append([input_v])

            emb_set = []

            word_out = self.emb_layer(input_v)
            emb_set.append(word_out)

            if self.ngram is not None:
                for i in range(len(self.ngram)):
                    input_g = tf.placeholder(tf.int32, [None, bucket], name='input_g' + str(i) + str(bucket))
                    self.input_v[-1].append(input_g)
                    gram_out = self.gram_layers[i](input_g)
                    emb_set.append(gram_out)

            if len(emb_set) > 1:
                emb_out = tf.concat(2, emb_set)

            else:
                emb_out = emb_set[0]

            emb_out = DropoutLayer(dr)(emb_out)
            emb_out = tf.unpack(emb_out)

            rnn_out = BiLSTM(rnn_dim, fw_cell=fw_rnn_cell, bw_cell=bw_rnn_cell, p=dr,
                             name='BiLSTM' + str(bucket), scope='BiRNN')(emb_out, input_v)

            output = output_wrapper(rnn_out)
            output_c = tf.pack(output, axis=1)

            self.output.append([output_c])

            self.output_.append([tf.placeholder(tf.int32, [None, bucket], name='tags' + str(bucket))])
            self.bucket_dit[bucket] = idx

            # Single-argument call form prints the same on Python 2 and 3.
            print('Bucket %d, %f seconds' % (idx + 1, time() - t1))

        assert len(self.input_v) == len(self.output) and len(self.output) == len(self.output_) \
            and len(self.output) == len(self.counts)

        self.params = tf.trainable_variables()

        self.saver = tf.train.Saver()
    def build_model(self):
        """Assemble the reading-comprehension graph: embeddings, aligner, pointer, loss, decoding.

        Stages, in order:
          1. Character and word embeddings, optionally extended with CoVe,
             ELMo and hand-crafted features, encoded by a BiLSTM.
          2. Three stacked ``align_block`` calls sharing ``self.Lambda``.
          3. Start/end pointer logits.
          4. Cross-entropy loss, plus optional L2 and reinforcement terms.
          5. Decoding of the best start/end positions from the banded,
             distance-weighted outer product of the two softmax outputs.

        Side effects: sets ``self.Lambda``, ``self.loss``,
        ``self.mask_output1`` and ``self.mask_output2``; additionally
        ``self.cove_cont`` / ``self.cove_ques`` when ``self.use_cove == 2``
        and ``self.rl_loss`` when ``self.use_rlloss`` is set.
        """
        with tf.variable_scope("Input_Embedding_Layer"):
            with tf.variable_scope("Char_Embedding_Layer"):
                # char embedding
                ## Lookup
                ch_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.contc_input),
                    [-1, self.char_limit, self.char_dim])
                qh_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.quesc_input),
                    [-1, self.char_limit, self.char_dim])
                ch_emb = tf.nn.dropout(ch_emb, 1 - self.dropout_emb)
                qh_emb = tf.nn.dropout(qh_emb, 1 - self.dropout_emb)

                ## BiLSTM (weight-shared ??)
                ch_emb, qh_emb = BiLSTM([ch_emb, qh_emb],
                                        self.char_filters // 2,
                                        dropout=self.dropout_rnn,
                                        name='char_lstm')
                # Max over axis 1, then regroup the rows per sequence.
                ch_emb = tf.reduce_max(ch_emb, axis=1)
                qh_emb = tf.reduce_max(qh_emb, axis=1)
                ch_emb = tf.reshape(ch_emb,
                                    [-1, self.c_maxlen, self.char_filters])
                qh_emb = tf.reshape(qh_emb,
                                    [-1, self.q_maxlen, self.char_filters])

            with tf.variable_scope("Word_Embedding_Layer"):
                # word embedding
                ## Lookup
                c_emb = tf.nn.embedding_lookup(self.word_mat, self.contw_input)
                q_emb = tf.nn.embedding_lookup(self.word_mat, self.quesw_input)
                c_emb = tf.nn.dropout(c_emb, 1.0 - self.dropout_emb)
                q_emb = tf.nn.dropout(q_emb, 1.0 - self.dropout_emb)

            # cove features
            ## extend the word embeddings (c_emb, q_emb) with CoVe features
            if self.use_cove != 0:
                if self.use_cove == 2:
                    # CoVe is computed inside the graph; gradients are
                    # blocked so the CoVe model is not trained.
                    self.cove_cont = tf.stop_gradient(
                        self.cove_model(c_emb))  # [bs, c_len, 2, 600]
                    self.cove_ques = tf.stop_gradient(
                        self.cove_model(q_emb))  # [bs, q_len, 2, 600]
                with tf.variable_scope('Cove_weights', reuse=tf.AUTO_REUSE):
                    cove_context_input = CoveCombineLayer(
                        self.cove_cont, 'input')
                    cove_question_input = CoveCombineLayer(
                        self.cove_ques, 'input')
                c_emb = tf.concat([c_emb, cove_context_input], axis=-1)
                q_emb = tf.concat([q_emb, cove_question_input], axis=-1)

            # elmo features
            ## extend the word embeddings (c_emb, q_emb) with ELMo features
            if self.use_elmo != 0:
                with tf.variable_scope('ELMo_weights', reuse=tf.AUTO_REUSE):
                    elmo_context_input = ElmoCombineLayer(
                        self.elmo_cont, 'input')
                    elmo_question_input = ElmoCombineLayer(
                        self.elmo_ques, 'input')
                    # The 'output' combinations are consumed later by the
                    # answer pointer.
                    elmo_context_output = ElmoCombineLayer(
                        self.elmo_cont, 'output')
                    elmo_question_output = ElmoCombineLayer(
                        self.elmo_ques, 'output')
                c_emb = tf.concat([c_emb, elmo_context_input], axis=-1)
                q_emb = tf.concat([q_emb, elmo_question_input], axis=-1)

            if self.use_feat:
                c_emb = tf.concat(
                    [c_emb, self.cont_feat], axis=-1
                )  ## concat [context_pos, context_ner, context_match]
                q_emb = tf.concat(
                    [q_emb, self.ques_feat],
                    axis=-1)  ## concat [ques_pos, ques_ner, ques_match]

            # combine embedding feats
            ## concat word_embedding, char_embedding
            c_emb = tf.concat([c_emb, ch_emb], axis=-1)
            q_emb = tf.concat([q_emb, qh_emb], axis=-1)

            # BiLSTM Embedding (weight-shared ??)
            with tf.variable_scope("BiLSTM_Embedding_Layer"):
                c_emb, q_emb = BiLSTM([c_emb, q_emb],
                                      self.filters // 2,
                                      dropout=self.dropout_rnn,
                                      name='encoder')

        with tf.variable_scope("Iterative_Reattention_Aligner"):
            self.Lambda = tf.get_variable('Lambda',
                                          dtype=tf.float32,
                                          initializer=self.init_lambda)
            with tf.variable_scope("Aligning_Block1"):
                R, Z1, E, B = align_block(u=c_emb,
                                          v=q_emb,
                                          c_mask=self.c_mask,
                                          q_mask=self.q_mask,
                                          Lambda=self.Lambda,
                                          filters=self.filters,
                                          dropout=self.dropout_rnn)
                R = tf.nn.dropout(R, 1.0 - self.dropout_att)
            with tf.variable_scope("Aligning_Block2"):
                # Later blocks receive the previous block's E and B.
                R, Z2, E, B = align_block(u=R,
                                          v=q_emb,
                                          c_mask=self.c_mask,
                                          q_mask=self.q_mask,
                                          E_0=E,
                                          B_0=B,
                                          Lambda=self.Lambda,
                                          filters=self.filters,
                                          dropout=self.dropout_rnn)
                R = tf.nn.dropout(R, 1.0 - self.dropout_att)
            with tf.variable_scope("Aligning_Block3"):
                # The last block additionally sees the earlier Z outputs.
                R, Z3, E, B = align_block(u=R,
                                          v=q_emb,
                                          c_mask=self.c_mask,
                                          q_mask=self.q_mask,
                                          E_0=E,
                                          B_0=B,
                                          Z_0=[Z1, Z2],
                                          Lambda=self.Lambda,
                                          filters=self.filters,
                                          dropout=self.dropout_rnn)
                R = tf.nn.dropout(R, 1.0 - self.dropout_att)

        with tf.variable_scope("Answer_Pointer"):
            # logits
            if self.use_elmo != 0:
                elmo_output_feats = ElmoAttention(
                    [elmo_context_output, elmo_question_output], self.c_maxlen,
                    self.q_maxlen, self.q_mask, self.dropout)
                R = tf.concat([R, elmo_output_feats], axis=-1)
            s = summary_vector(q_emb, self.c_maxlen, mask=self.q_mask)
            s = tf.nn.dropout(s, 1 - self.dropout)
            logits1 = start_logits(R,
                                   s,
                                   mask=self.c_mask,
                                   filters=self.filters,
                                   name='Start_Pointer')  # [bs, c_len]
            logits2 = end_logits(R,
                                 logits1,
                                 s,
                                 mask=self.c_mask,
                                 filters=self.filters,
                                 name='End_Pointer')  # [bs, c_len]

        with tf.variable_scope("Loss_Layer"):
            # maximum-likelihood (ML) loss for dataset V2.0
            start_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits1, labels=self.y_start)
            end_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits2, labels=self.y_end)
            self.loss = tf.reduce_mean(start_loss + end_loss)

            # l2 loss
            if self.l2_norm is not None:
                decay_costs = []
                for var in tf.trainable_variables():
                    decay_costs.append(tf.nn.l2_loss(var))
                self.loss += tf.multiply(self.l2_norm, tf.add_n(decay_costs))

            # RL loss
            if self.use_rlloss:
                with tf.variable_scope("Reinforcement_Loss"):
                    self.rl_loss, _, _ = rl_loss(logits1, logits2,
                                                 self.y_start, self.y_end,
                                                 self.c_maxlen)
                    self.loss += (self.rlw * self.rl_loss)

        with tf.variable_scope('Output_Layer'):
            softmax_start_scores = tf.nn.softmax(logits1)
            softmax_end_scores = tf.nn.softmax(logits2)

            # Entry [b, i, j] is p_start(i) * p_end(j). The band keeps only
            # pairs with 0 <= j - i <= ans_limit.
            outer = tf.matmul(tf.expand_dims(softmax_start_scores, axis=2),
                              tf.expand_dims(softmax_end_scores, axis=1))
            outer = tf.matrix_band_part(outer, 0, self.ans_limit)

            def position_encoding(x):
                """Return a copy of *x* with entries where j - i > 5 set to 1 / log(j - i + 1)."""
                import math
                # The array handed to a py_func callback may share memory
                # with the TensorFlow tensor, so never write to it in place.
                x = x.copy()
                n_rows, n_cols = x.shape[0], x.shape[1]
                for i in range(n_rows):
                    # Only columns with j - i > 5 are rewritten, i.e. j >= i + 6.
                    for j in range(i + 6, n_cols):
                        x[i][j] = float(1.0 / math.log(j - i + 1))
                return x

            mask_mat = tf.ones((self.c_maxlen, self.c_maxlen))
            mask_mat = tf.expand_dims(tf.py_func(position_encoding, [mask_mat],
                                                 tf.float32),
                                      axis=0)
            mask_mat = tf.tile(mask_mat, [self.un_size, 1, 1])

            outer_masked = outer * mask_mat
            # Best start index: row with the largest entry. Best end index:
            # column with the largest entry.
            self.mask_output1 = tf.argmax(tf.reduce_max(outer_masked, axis=2),
                                          axis=1)
            self.mask_output2 = tf.argmax(tf.reduce_max(outer_masked, axis=1),
                                          axis=1)
Beispiel #11
0
class MSTParser(object):
    """Base class for a dependency parser built from embeddings plus a BiLSTM.

    The class wires an ``Embeddings`` layer into a ``BiLSTM`` encoder and
    offers arc/label prediction and decoding on top of the encoded sentence.
    The actual scoring functions, ``_predict_arc`` and ``_predict_labels``,
    are abstract and must be supplied by subclasses.

    Keyword arguments read here: ``basename`` (passed to ``read_index`` and
    ``Embeddings.init_from_word2vec``), ``lstm_num_layers`` (default 2),
    ``lstm_dim`` (default 250), ``input_dropout`` and ``lstm_dropout``
    (both default 0, used by ``enable_dropout``).
    """

    # Python 2 spelling of "this class is abstract".
    __metaclass__ = ABCMeta

    def __init__(self, model, **kwargs):
        """Create the parameter sub-collection, the embeddings and the encoder."""
        self.pc = model.add_subcollection()
        self.kwargs = kwargs

        basename = kwargs.get("basename")
        index = read_index(basename)
        self._num_labels = len(index[DEPREL])

        self.embeddings = Embeddings.init_from_word2vec(
            self.pc, basename, FIELDS, index=index)
        self.lstm = BiLSTM(
            self.pc,
            self.embeddings.dim,
            kwargs.get("lstm_dim", 250),
            kwargs.get("lstm_num_layers", 2),
        )

        # Deliberately a one-element tuple holding the kwargs dict.
        self.spec = (kwargs,)

    def transduce(self, feats):
        """Embed ``feats`` and return the encoder output for them."""
        embedded = self.embeddings(feats)
        return self.lstm(embedded)

    @abstractmethod
    def _predict_arc(self, head, dep, h):
        """Score the arc ``head -> dep`` given the encoded sentence ``h``."""
        raise NotImplementedError()

    def predict_arcs(self, h):
        """Return one concatenated head-score expression per dependent.

        Dependents run over ``1 .. len(h) - 1``; for each of them every node
        ``0 .. len(h) - 1`` is scored as a candidate head, with ``dy.zeros(1)``
        standing in for the (impossible) self-arc.
        """
        num_nodes = len(h)
        per_dependent = []
        for dep in range(1, num_nodes):
            candidates = []
            for head in range(num_nodes):
                if head == dep:
                    candidates.append(dy.zeros(1))
                else:
                    candidates.append(self._predict_arc(head, dep, h))
            per_dependent.append(dy.concatenate(candidates))
        return per_dependent

    @abstractmethod
    def _predict_labels(self, head, dep, h):
        """Score the labels of the arc ``head -> dep`` given ``h``."""
        raise NotImplementedError()

    def predict_labels(self, heads, h):
        """Return label scores for each dependent ``1 .. len(h) - 1``.

        ``heads`` is indexed with ``dep - 1`` here, i.e. its first entry is
        taken as the head of dependent 1.
        """
        label_scores = []
        for dep in range(1, len(h)):
            label_scores.append(self._predict_labels(heads[dep - 1], dep, h))
        return label_scores

    def _parse_heads(self, heads, h):
        """Decode heads from the arc scores via ``parse_nonprojective``.

        The score matrix gets an all-zero first row before being transposed.
        ``parse_nonprojective`` presumably writes its result into ``heads``
        (its return value is not used) -- confirm against its definition.
        """
        rows = [np.zeros(len(h))]
        rows.extend(score.npvalue() for score in self.predict_arcs(h))
        weights = np.transpose(np.vstack(rows))
        parse_nonprojective(weights, heads)

    def _parse_labels(self, heads, labels, h):
        """Overwrite ``labels`` in place with the arg-max label ids (1-based)."""
        label_scores = self.predict_labels(heads, h)
        labels[:] = [np.argmax(score.npvalue()) + 1 for score in label_scores]

    def parse(self, feats):
        """Parse one sentence and return the resulting ``DepTree``."""
        dy.renew_cg()
        embedded = self.embeddings(feats)
        encoded = self.lstm(embedded)
        tree = DepTree(len(embedded))
        self._parse_heads(tree.heads, encoded)
        self._parse_labels(tree.heads, tree.labels, encoded)
        return tree

    def disable_dropout(self):
        """Switch dropout off in both the embeddings and the encoder."""
        self.embeddings.disable_dropout()
        self.lstm.disable_dropout()

    def enable_dropout(self):
        """Restore the dropout rates given at construction (0 when absent)."""
        input_rate = self.kwargs.get("input_dropout", 0)
        self.embeddings.set_dropout(input_rate)
        lstm_rate = self.kwargs.get("lstm_dropout", 0)
        self.lstm.set_dropout(lstm_rate)

    def param_collection(self):
        """Return the parameter sub-collection owned by this parser."""
        return self.pc
Beispiel #12
0
    def main_graph(self,
                   trained_model,
                   scope,
                   emb_dim,
                   gru,
                   rnn_dim,
                   rnn_num,
                   fnn_dim,
                   window_size,
                   drop_out=0.5,
                   rad_dim=30,
                   emb=None,
                   ng_embs=None,
                   pixels=None,
                   con_width=None,
                   filters=None,
                   pooling_size=None):
        """Build the tagging graph: shared layers first, then one sub-graph per bucket.

        When ``trained_model`` is given, the hyper-parameters are pickled to
        that path before any graph construction. With ``self.metric == 'All'``
        one copy is written per entry of ``self.all_metrics``, the file name
        being prefixed with ``<metric>_``.

        The sentence encoder depends on ``self.mode``: ``'RNN'`` uses the
        ``BiLSTM`` wrapper fed with GRU/LSTM cells, any other value uses a
        single ``conv2d`` over ``2 * window_size + 1`` positions followed by
        ``tanh``.

        Side effects on ``self``: sets ``drop_out``, ``drop_out_v``, the
        embedding layers that are enabled, ``params`` and ``saver``; appends
        per-bucket entries to ``input_v``, ``output``, ``output_`` (and
        ``input_p`` when ``self.graphic``); fills ``bucket_dit``.

        :param trained_model: path for the pickled hyper-parameters, or None
            to skip writing them.
        :param scope: variable scope; ``reuse_variables()`` is called on it
            right before the second bucket is built.
        :param emb_dim: size of the character and n-gram embeddings.
        :param gru: truthy for GRU cells, otherwise LSTM cells.
        :param rnn_dim: number of units per recurrent cell.
        :param rnn_num: number of stacked recurrent layers (stacking only
            happens when > 1).
        :param fnn_dim: number of output channels of the window convolution.
        :param window_size: number of positions padded on each side of the
            sentence in the non-RNN mode.
        :param drop_out: stored as ``self.drop_out_v``; the graph itself uses
            the scalar placeholder stored as ``self.drop_out``.
        :param rad_dim: size of the radical embeddings.
        :param emb: optional initial weights for the character embeddings.
        :param ng_embs: optional initial weights, one entry per element of
            ``self.ngram``.
        :param pixels: glyph pixel data, required when ``self.graphic``; each
            row is presumably a flattened square image -- confirm.
        :param con_width: convolution width, required when ``self.graphic``.
        :param filters: number of convolution filters, required when
            ``self.graphic``.
        :param pooling_size: max-pooling size, required when ``self.graphic``.
        """
        if trained_model is not None:
            param_dic = {
                'nums_chars': self.nums_chars,
                'nums_tags': self.nums_tags,
                'tag_scheme': self.tag_scheme,
                'graphic': self.graphic,
                'pic_size': self.pic_size,
                'word_vec': self.word_vec,
                'radical': self.radical,
                'crf': self.crf,
                'emb_dim': emb_dim,
                'gru': gru,
                'rnn_dim': rnn_dim,
                'rnn_num': rnn_num,
                'fnn_dim': fnn_dim,
                'window_size': window_size,
                'drop_out': drop_out,
                'filter_size': con_width,
                'filters': filters,
                'pooling_size': pooling_size,
                'font': self.font,
                'buckets_char': self.buckets_char,
                'ngram': self.ngram,
                'mode': self.mode,
            }
            if self.metric == 'All':
                # Insert "<metric>_" in front of the file-name part of the path.
                pindex = trained_model.rindex('/') + 1
                param_paths = [
                    trained_model[:pindex] + m + '_' + trained_model[pindex:]
                    for m in self.all_metrics
                ]
            else:
                param_paths = [trained_model]
            for param_path in param_paths:
                # Pickle data is binary; "with" also closes the handle if
                # dumping fails.
                with open(param_path, 'wb') as f_model:
                    pickle.dump(param_dic, f_model)

        # define shared weights and variables

        dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
        self.drop_out = dr
        self.drop_out_v = drop_out

        # Running total of the feature width after concatenating all
        # enabled embeddings; needed for the FNN convolution kernel shape.
        concat_emb_dim = 0

        if self.word_vec:
            self.emb_layer = EmbeddingLayer(self.nums_chars + 500,
                                            emb_dim,
                                            weights=emb,
                                            name='emb_layer')
            concat_emb_dim += emb_dim

        if self.radical:
            self.radical_layer = EmbeddingLayer(216,
                                                rad_dim,
                                                name='radical_layer')
            concat_emb_dim += rad_dim

        if self.ngram is not None:
            if ng_embs is not None:
                assert len(ng_embs) == len(self.ngram)
            else:
                ng_embs = [None for _ in range(len(self.ngram))]
            for i, n_gram in enumerate(self.ngram):
                self.gram_layers.append(
                    EmbeddingLayer(n_gram + 1000 * (i + 2),
                                   emb_dim,
                                   weights=ng_embs[i],
                                   name=str(i + 2) + 'gram_layer'))
                concat_emb_dim += emb_dim

        wrapper_conv_1, wrapper_mp_1, wrapper_conv_2 = None, None, None
        wrapper_mp_2, wrapper_dense, wrapper_dr = None, None, None

        if self.graphic:
            self.input_p = []
            assert pixels is not None and filters is not None and pooling_size is not None and con_width is not None

            self.pixels = pixels
            pixel_dim = int(math.sqrt(len(pixels[0])))

            wrapper_conv_1 = Convolution(con_width, 1, filters, name='conv_1')
            wrapper_mp_1 = Maxpooling(pooling_size,
                                      pooling_size,
                                      name='pooling_1')

            p_size_1 = toolbox.down_pool(pixel_dim, pooling_size)

            wrapper_conv_2 = Convolution(con_width,
                                         filters,
                                         filters,
                                         name='conv_2')
            wrapper_mp_2 = Maxpooling(pooling_size,
                                      pooling_size,
                                      name='pooling_2')
            p_size_2 = toolbox.down_pool(p_size_1, pooling_size)

            wrapper_dense = HiddenLayer(p_size_2 * p_size_2 * filters,
                                        100,
                                        activation='tanh',
                                        name='conv_dense')
            wrapper_dr = DropoutLayer(self.drop_out)

            # The dense layer above projects the glyph features to 100 dims.
            concat_emb_dim += 100

        fw_rnn_cell, bw_rnn_cell = None, None

        if self.mode == 'RNN':
            with tf.variable_scope('BiRNN'):

                if gru:
                    fw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
                    bw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
                else:
                    fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim,
                                                          state_is_tuple=True)
                    bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim,
                                                          state_is_tuple=True)

                if rnn_num > 1:
                    # NOTE(review): the same cell object is repeated for every
                    # layer; later TensorFlow 1.x releases are reported to
                    # reject or weight-share this -- confirm against the
                    # TensorFlow version in use.
                    fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(
                        [fw_rnn_cell] * rnn_num, state_is_tuple=True)
                    bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(
                        [bw_rnn_cell] * rnn_num, state_is_tuple=True)

            output_wrapper = HiddenLayer(rnn_dim * 2,
                                         self.nums_tags[0],
                                         activation='linear',
                                         name='out_wrapper')
            fnn_weights, fnn_bias = None, None

        else:

            with tf.variable_scope('FNN'):
                fnn_weights = tf.get_variable(
                    'conv_w',
                    [2 * window_size + 1, concat_emb_dim, 1, fnn_dim])
                fnn_bias = tf.get_variable(
                    'conv_b', [fnn_dim],
                    initializer=tf.constant_initializer(0.1))

            output_wrapper = HiddenLayer(fnn_dim,
                                         self.nums_tags[0],
                                         activation='linear',
                                         name='out_wrapper')

        # define model for each bucket
        for idx, bucket in enumerate(self.buckets_char):
            if idx == 1:
                # From the second bucket on, variables are reused.
                scope.reuse_variables()
            t1 = time()

            input_v = tf.placeholder(tf.int32, [None, bucket],
                                     name='input_' + str(bucket))

            self.input_v.append([input_v])

            emb_set = []

            if self.word_vec:
                word_out = self.emb_layer(input_v)
                emb_set.append(word_out)

            if self.radical:
                input_r = tf.placeholder(tf.int32, [None, bucket],
                                         name='input_r' + str(bucket))

                self.input_v[-1].append(input_r)
                radical_out = self.radical_layer(input_r)
                emb_set.append(radical_out)

            if self.ngram is not None:
                for i in range(len(self.ngram)):
                    input_g = tf.placeholder(tf.int32, [None, bucket],
                                             name='input_g' + str(i) +
                                             str(bucket))
                    self.input_v[-1].append(input_g)
                    gram_out = self.gram_layers[i](input_g)
                    emb_set.append(gram_out)

            if self.graphic:
                input_p = tf.placeholder(tf.float32,
                                         [None, bucket, pixel_dim * pixel_dim])
                self.input_p.append(input_p)
                # Fold the bucket axis into the batch axis so every glyph is
                # convolved as an independent single-channel image.
                pix_out = tf.reshape(input_p, [-1, pixel_dim, pixel_dim, 1])

                conv_out_1 = wrapper_conv_1(pix_out)
                pooling_out_1 = wrapper_mp_1(conv_out_1)

                conv_out_2 = wrapper_conv_2(pooling_out_1)
                pooling_out_2 = wrapper_mp_2(conv_out_2)

                assert p_size_2 == pooling_out_2[0].get_shape().as_list()[1]

                # Restore the [batch, bucket, features] layout.
                pooling_out = tf.reshape(
                    pooling_out_2, [-1, bucket, p_size_2 * p_size_2 * filters])

                graphic_out = wrapper_dense(pooling_out)
                graphic_out = wrapper_dr(graphic_out)

                emb_set.append(graphic_out)

            if len(emb_set) > 1:
                emb_out = tf.concat(axis=2, values=emb_set)

            else:
                emb_out = emb_set[0]

            if self.mode == 'RNN':
                rnn_out = BiLSTM(rnn_dim,
                                 fw_cell=fw_rnn_cell,
                                 bw_cell=bw_rnn_cell,
                                 p=dr,
                                 name='BiLSTM' + str(bucket),
                                 scope='BiRNN')(emb_out, input_v)

                output = output_wrapper(rnn_out)

            else:
                # Zero-pad the sequence axis so the VALID convolution yields
                # exactly `bucket` positions.
                emb_out = tf.pad(emb_out,
                                 [[0, 0], [window_size, window_size], [0, 0]])
                emb_out = tf.reshape(
                    emb_out, [-1, bucket + 2 * window_size, concat_emb_dim, 1])
                conv_out = tf.nn.conv2d(emb_out,
                                        fnn_weights, [1, 1, 1, 1],
                                        padding='VALID') + fnn_bias
                fnn_out = tf.nn.tanh(conv_out)
                fnn_out = tf.reshape(fnn_out, [-1, bucket, fnn_dim])

                output = output_wrapper(fnn_out)

            self.output.append([output])

            self.output_.append([
                tf.placeholder(tf.int32, [None, bucket],
                               name='tags' + str(bucket))
            ])

            self.bucket_dit[bucket] = idx

            # Single parenthesised argument: prints the same text whether
            # print is a statement (Python 2) or a function.
            print('Bucket %d, %f seconds' % (idx + 1, time() - t1))

        assert len(self.input_v) == len(self.output) and len(self.output) == len(self.output_) \
               and len(self.output) == len(self.counts)

        self.params = tf.trainable_variables()

        self.saver = tf.train.Saver()
Beispiel #13
0
    def main_graph(self, trained_model, scope, emb_dim, gru, rnn_dim, rnn_num, drop_out=0.5, rad_dim=30, emb=None,
                   ngram_embedding=None, pixels=None, con_width=None, filters=None, pooling_size=None):
        """
        Build the tagger graph plus an auxiliary language-model head, one sub-graph per bucket.

        When ``trained_model`` is given, the hyper-parameters are pickled to that path first (with
        ``self.metric == 'All'`` one copy per entry of ``self.all_metrics``, file name prefixed with
        ``<metric>_``).

        Side effects on ``self``: sets ``drop_out``, ``drop_out_v``, the enabled embedding layers, ``params`` and
        ``saver``; appends per-bucket entries to ``input_v``, ``output``, ``output_``, ``lm_predictions``,
        ``lm_groundtruthes`` (and ``input_p`` when ``self.graphic``); fills ``bucket_dit``.

        :param trained_model: path for the pickled hyper-parameters, or None to skip writing them
        :param scope: variable scope; ``reuse_variables()`` is called on it right before the second bucket
        :param emb_dim: size of the character and n-gram embeddings
        :param gru: truthy for GRU cells, otherwise LSTM cells
        :param rnn_dim: number of units per recurrent cell (also used for the language-model RNN)
        :param rnn_num: number of stacked recurrent layers (stacking only happens when > 1)
        :param drop_out: stored as ``self.drop_out_v``; the graph uses the placeholder stored as ``self.drop_out``
        :param rad_dim: size of the radical embeddings
        :param emb: optional initial weights for the character embeddings
        :param ngram_embedding: optional pre-trained n-gram embeddings, one entry per element of ``self.ngram``
        :param pixels: glyph pixel data, required when ``self.graphic``
        :param con_width: convolution width, required when ``self.graphic``
        :param filters: number of convolution filters, required when ``self.graphic``
        :param pooling_size: max-pooling size, required when ``self.graphic``
        :return: None
        """
        # trained_model: path where the model is stored
        if trained_model is not None:
            param_dic = {'nums_chars': self.nums_chars, 'nums_tags': self.nums_tags, 'tag_scheme': self.tag_scheme,
                         'graphic': self.graphic, 'pic_size': self.pic_size, 'word_vec': self.word_vec,
                         'radical': self.radical, 'crf': self.crf, 'emb_dim': emb_dim, 'gru': gru, 'rnn_dim': rnn_dim,
                         'rnn_num': rnn_num, 'drop_out': drop_out, 'filter_size': con_width, 'filters': filters,
                         'pooling_size': pooling_size, 'font': self.font, 'buckets_char': self.buckets_char,
                         'ngram': self.ngram}
            # Single parenthesised argument: same output whether print is a statement or a function.
            print("RNN dimension is %d" % rnn_dim)
            print("RNN number is %d" % rnn_num)
            print("Character embedding size is %d" % emb_dim)
            print("Ngram embedding dimension is %d" % emb_dim)
            # Store the model hyper-parameters.
            if self.metric == 'All':
                # rindex() gives the last position of '/', so pindex splits directory and file name.
                pindex = trained_model.rindex('/') + 1
                param_paths = [trained_model[:pindex] + m + '_' + trained_model[pindex:] for m in self.all_metrics]
            else:
                param_paths = [trained_model]
            for param_path in param_paths:
                # Pickle data is binary; "with" also closes the handle if dumping fails.
                with open(param_path, 'wb') as f_model:
                    pickle.dump(param_dic, f_model)

        # define shared weights and variables

        dr = tf.placeholder(tf.float32, [], name='drop_out_holder')
        self.drop_out = dr
        self.drop_out_v = drop_out

        # Character embedding layer.
        # (Open question from the original author: why is 500 added to the number of characters?)
        # emb_dim is the feature-vector size of each character; weights are the pre-trained character vectors.
        if self.word_vec:
            self.emb_layer = EmbeddingLayer(self.nums_chars + 500, emb_dim, weights=emb, name='emb_layer')

        # Radical embeddings.
        # Per the original author: the Kangxi dictionary has 214 radicals; only radicals of common Chinese
        # characters are used, and uncommon / non-Chinese characters map to two extra special symbols,
        # which gives 216 entries.
        if self.radical:
            self.radical_layer = EmbeddingLayer(216, rad_dim, name='radical_layer')

        if self.ngram is not None:
            if ngram_embedding is not None:
                assert len(ngram_embedding) == len(self.ngram)
            else:
                ngram_embedding = [None for _ in range(len(self.ngram))]
            for i, n_gram in enumerate(self.ngram):
                self.gram_layers.append(EmbeddingLayer(n_gram + 1000 * (i + 2), emb_dim, weights=ngram_embedding[i],
                                                       name=str(i + 2) + 'gram_layer'))

        wrapper_conv_1, wrapper_mp_1, wrapper_conv_2, wrapper_mp_2, wrapper_dense, wrapper_dr = \
            None, None, None, None, None, None

        if self.graphic:
            # Glyph image features are enabled, which requires the CNN stack below.
            self.input_p = []
            assert pixels is not None and filters is not None and pooling_size is not None and con_width is not None

            self.pixels = pixels
            pixel_dim = int(math.sqrt(len(pixels[0])))

            wrapper_conv_1 = TimeDistributed(Convolution(con_width, 1, filters, name='conv_1'), name='wrapper_c1')
            wrapper_mp_1 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_1'), name='wrapper_p1')

            p_size_1 = toolbox.down_pool(pixel_dim, pooling_size)

            wrapper_conv_2 = TimeDistributed(Convolution(con_width, filters, filters, name='conv_2'), name='wrapper_c2')
            wrapper_mp_2 = TimeDistributed(Maxpooling(pooling_size, pooling_size, name='pooling_2'), name='wrapper_p2')

            p_size_2 = toolbox.down_pool(p_size_1, pooling_size)

            wrapper_dense = TimeDistributed(
                HiddenLayer(p_size_2 * p_size_2 * filters, 100, activation='tanh', name='conv_dense'), name='wrapper_3')
            wrapper_dr = TimeDistributed(DropoutLayer(self.drop_out), name='wrapper_dr')

        with tf.variable_scope('BiRNN'):

            if gru:
                fw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
                bw_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_dim)
            else:
                fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)
                bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(rnn_dim, state_is_tuple=True)

            if rnn_num > 1:
                # NOTE(review): the same cell object is repeated for every layer; later TensorFlow 1.x releases
                # are reported to reject or weight-share this -- confirm against the TensorFlow version in use.
                fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([fw_rnn_cell] * rnn_num, state_is_tuple=True)
                bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([bw_rnn_cell] * rnn_num, state_is_tuple=True)

        # Hidden layer: its input is the forward RNN output joined with the backward RNN output, hence
        # rnn_dim * 2 inputs; the output size is the number of tags.
        output_wrapper = TimeDistributed(
            HiddenLayer(rnn_dim * 2, self.nums_tags[0], activation='linear', name='hidden'),
            name='wrapper')

        # define model for each bucket
        # Sentences in different buckets have different lengths, so each bucket gets its own sub-graph.
        # bucket: the sentence length used for that bucket's placeholders.
        for idx, bucket in enumerate(self.buckets_char):
            if idx == 1:
                # Per the original author, scope is
                # tf.variable_scope("tagger", reuse=None, initializer=initializer);
                # reuse only has to be switched on once, every later bucket then reuses the variables.
                scope.reuse_variables()
            t1 = time()

            # Input sentences as integer ids, shape [batch, bucket].
            input_sentences = tf.placeholder(tf.int32, [None, bucket], name='input_' + str(bucket))

            self.input_v.append([input_sentences])

            emb_set = []

            if self.word_vec:
                # Look up the character vector of every id.
                # word_out is presumably [batch, bucket, emb_dim] -- confirm against EmbeddingLayer.
                word_out = self.emb_layer(input_sentences)
                emb_set.append(word_out)

            if self.radical:
                # Radical ids, shape [batch, bucket].
                input_radicals = tf.placeholder(tf.int32, [None, bucket], name='input_r' + str(bucket))

                self.input_v[-1].append(input_radicals)
                radical_out = self.radical_layer(input_radicals)
                emb_set.append(radical_out)

            if self.ngram is not None:
                for i in range(len(self.ngram)):
                    input_g = tf.placeholder(tf.int32, [None, bucket], name='input_g' + str(i) + str(bucket))
                    self.input_v[-1].append(input_g)
                    gram_out = self.gram_layers[i](input_g)
                    emb_set.append(gram_out)

            if self.graphic:
                input_p = tf.placeholder(tf.float32, [None, bucket, pixel_dim * pixel_dim])
                self.input_p.append(input_p)

                pix_out = tf.reshape(input_p, [-1, bucket, pixel_dim, pixel_dim, 1])

                conv_out_1 = wrapper_conv_1(pix_out)
                pooling_out_1 = wrapper_mp_1(conv_out_1)

                conv_out_2 = wrapper_conv_2(pooling_out_1)
                pooling_out_2 = wrapper_mp_2(conv_out_2)

                assert p_size_2 == pooling_out_2[0].get_shape().as_list()[1]
                pooling_out = tf.reshape(pooling_out_2, [-1, bucket, p_size_2 * p_size_2 * filters])
                pooling_out = tf.unstack(pooling_out, axis=1)

                graphic_out = wrapper_dense(pooling_out)
                graphic_out = wrapper_dr(graphic_out)

                emb_set.append(graphic_out)

            if self.window_size > 1:
                # NOTE(review): this branch reads word_out, which only exists when self.word_vec is set.

                # Integer arithmetic keeps the window bounds independent of whether "/" is true or
                # floor division in this module.
                padding_size = self.window_size // 2
                word_padded = tf.pad(word_out, [[0, 0], [padding_size, padding_size], [0, 0]], 'CONSTANT')

                # One projection per window width q, each consuming q concatenated character vectors.
                Ws = []
                for q in range(1, self.window_size + 1):
                    Ws.append(tf.get_variable("W_%d" % q, shape=[q * emb_dim, self.filters_number]))
                b = tf.get_variable("b", shape=[self.filters_number])

                z = [None for _ in range(0, bucket)]

                for q in range(1, self.window_size + 1):
                    for i in range(padding_size, bucket + padding_size):
                        # Window of exactly q positions around i:
                        # (q - 1) // 2 == floor((q - 1) / 2) and (q + 2) // 2 == ceil((q + 1) / 2).
                        low = i - (q - 1) // 2
                        high = i + (q + 2) // 2
                        x = word_padded[:, low, :]
                        for j in range(low + 1, high):
                            x = tf.concat(values=[x, word_padded[:, j, :]], axis=1)
                        z_iq = tf.tanh(tf.nn.xw_plus_b(x, Ws[q - 1], b))
                        if z[i - padding_size] is None:
                            z[i - padding_size] = z_iq
                        else:
                            z[i - padding_size] = tf.concat([z[i - padding_size], z_iq], axis=1)

                z = tf.stack(z, axis=1)
                # Keep the emb_dim largest responses per position so the result lines up with word_out.
                values, _ = tf.nn.top_k(z, sorted=False, k=emb_dim)

                # highway layer
                X = tf.unstack(word_out, axis=1)
                Conv_X = tf.unstack(values, axis=1)
                X_hat = []
                W_t = tf.get_variable("W_t", shape=[emb_dim, emb_dim])
                b_t = tf.get_variable("b_t", shape=[emb_dim])
                for x, conv_x in zip(X, Conv_X):
                    # T_x gates between the convolved features and the raw character vector.
                    T_x = tf.sigmoid(tf.nn.xw_plus_b(x, W_t, b_t))
                    X_hat.append(tf.multiply(conv_x, T_x) + tf.multiply(x, 1 - T_x))
                X_hat = tf.stack(X_hat, axis=1)
                emb_set.append(X_hat)
            if len(emb_set) > 1:
                # Concatenate all feature embeddings (characters, radicals, n-grams, glyph images, ...).
                emb_out = tf.concat(axis=2, values=emb_set)

            else:
                emb_out = emb_set[0]

            # Per the original author, rnn_out is the forward and backward RNN outputs concatenated.
            rnn_out = BiLSTM(rnn_dim, fw_cell=fw_rnn_cell, bw_cell=bw_rnn_cell, p=dr,
                             name='BiLSTM' + str(bucket), scope='BiRNN')(self.highway(emb_out, "tag"), input_sentences)

            # Apply the fully connected layer (Wx + b) to obtain the final output.
            output = output_wrapper(rnn_out)
            # (Open question from the original author: why [output] rather than output?)
            self.output.append([output])

            self.output_.append([tf.placeholder(tf.int32, [None, bucket], name='tags' + str(bucket))])

            self.bucket_dit[bucket] = idx

            # language model
            lm_rnn_dim = rnn_dim
            with tf.variable_scope('LM-BiRNN'):
                if gru:
                    lm_fw_rnn_cell = tf.nn.rnn_cell.GRUCell(lm_rnn_dim)
                    lm_bw_rnn_cell = tf.nn.rnn_cell.GRUCell(lm_rnn_dim)
                else:
                    lm_fw_rnn_cell = tf.nn.rnn_cell.LSTMCell(lm_rnn_dim, state_is_tuple=True)
                    lm_bw_rnn_cell = tf.nn.rnn_cell.LSTMCell(lm_rnn_dim, state_is_tuple=True)

                if rnn_num > 1:
                    lm_fw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([lm_fw_rnn_cell] * rnn_num, state_is_tuple=True)
                    lm_bw_rnn_cell = tf.nn.rnn_cell.MultiRNNCell([lm_bw_rnn_cell] * rnn_num, state_is_tuple=True)
            # The language model only sees the first feature in emb_set, not the full concatenation.
            lm_rnn_output = BiLSTM(lm_rnn_dim, fw_cell=lm_fw_rnn_cell,
                                   bw_cell=lm_bw_rnn_cell, p=dr,
                                   name='LM-BiLSTM' + str(bucket),
                                   scope='LM-BiRNN')(self.highway(emb_set[0]), input_sentences)

            lm_output_wrapper = TimeDistributed(
                HiddenLayer(lm_rnn_dim * 2, self.nums_chars + 2, activation='linear', name='lm_hidden'),
                name='lm_wrapper')
            lm_final_output = lm_output_wrapper(lm_rnn_output)
            self.lm_predictions.append([lm_final_output])
            self.lm_groundtruthes.append([tf.placeholder(tf.int32, [None, bucket], name='lm_targets' + str(bucket))])

            print('Bucket %d, %f seconds' % (idx + 1, time() - t1))

        assert \
            len(self.input_v) == len(self.output) and \
            len(self.output) == len(self.output_) and \
            len(self.lm_predictions) == len(self.lm_groundtruthes) and \
            len(self.output) == len(self.counts)

        self.params = tf.trainable_variables()

        self.saver = tf.train.Saver()
    def build_model(self):
        with tf.variable_scope("Input_Embedding_Layer"):
            with tf.variable_scope("Char_Embedding_Layer"):
                # char embedding
                ch_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.contc_input),
                    [-1, self.char_limit, self.char_dim])
                qh_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.quesc_input),
                    [-1, self.char_limit, self.char_dim])
                ch_emb = tf.nn.dropout(ch_emb, 1 - self.dropout_emb)
                qh_emb = tf.nn.dropout(qh_emb, 1 - self.dropout_emb)

                ch_emb, qh_emb = BiLSTM([ch_emb, qh_emb],
                                        self.char_dim // 2,
                                        dropout=self.dropout_rnn,
                                        name='char_lstm',
                                        return_state=True)
                ch_emb = tf.reshape(ch_emb, [-1, self.c_maxlen, self.char_dim])
                qh_emb = tf.reshape(qh_emb, [-1, self.q_maxlen, self.char_dim])

            with tf.variable_scope("Word_Embedding_Layer"):
                # word embedding
                c_emb = tf.nn.embedding_lookup(self.word_mat, self.contw_input)
                q_emb = tf.nn.embedding_lookup(self.word_mat, self.quesw_input)
                c_emb = tf.nn.dropout(c_emb, 1.0 - self.dropout_emb)
                q_emb = tf.nn.dropout(q_emb, 1.0 - self.dropout_emb)

            # cove features
            if self.use_cove != 0:
                if self.use_cove == 2:
                    self.cove_cont = tf.stop_gradient(
                        self.cove_model(c_emb))  # [bs, c_len, 2, 600]
                    self.cove_ques = tf.stop_gradient(
                        self.cove_model(q_emb))  # [bs, q_len, 2, 600]
                with tf.variable_scope('Cove_weights', reuse=tf.AUTO_REUSE):
                    cove_context_input = CoveCombineLayer(
                        self.cove_cont, 'input')
                    cove_question_input = CoveCombineLayer(
                        self.cove_ques, 'input')
                c_emb = tf.concat([c_emb, cove_context_input], axis=-1)
                q_emb = tf.concat([q_emb, cove_question_input], axis=-1)

            # elmo features
            if self.use_elmo != 0:
                with tf.variable_scope('ELMo_weights', reuse=tf.AUTO_REUSE):
                    elmo_context_input = ElmoCombineLayer(
                        self.elmo_cont, 'input')
                    elmo_question_input = ElmoCombineLayer(
                        self.elmo_ques, 'input')
                    elmo_context_output = ElmoCombineLayer(
                        self.elmo_cont, 'output')
                    elmo_question_output = ElmoCombineLayer(
                        self.elmo_ques, 'output')
                c_emb = tf.concat([c_emb, elmo_context_input], axis=-1)
                q_emb = tf.concat([q_emb, elmo_question_input], axis=-1)

            if self.use_feat:
                c_emb = tf.concat([c_emb, self.cont_feat], axis=-1)
                q_emb = tf.concat([q_emb, self.ques_feat], axis=-1)

            # combine embedding feats
            c_emb = tf.concat([c_emb, ch_emb], axis=-1)
            q_emb = tf.concat([q_emb, qh_emb], axis=-1)

            # BiLSTM Embedding
            with tf.variable_scope("BiLSTM_Embedding_Layer"):
                c_emb, q_emb = BiLSTM([c_emb, q_emb],
                                      self.filters // 2,
                                      dropout=self.dropout_rnn,
                                      name='encoder')

        with tf.variable_scope("Iterative_Reattention_Aligner"):
            self.Lambda = tf.get_variable('Lambda',
                                          dtype=tf.float32,
                                          initializer=self.init_lambda)
            with tf.variable_scope("Aligning_Block1"):
                R, Z1, E, B = align_block(u=c_emb,
                                          v=q_emb,
                                          c_mask=self.c_mask,
                                          q_mask=self.q_mask,
                                          Lambda=self.Lambda,
                                          filters=self.filters,
                                          dropout=self.dropout_rnn)
                R = tf.nn.dropout(R, 1.0 - self.dropout_att)
            with tf.variable_scope("Aligning_Block2"):
                R, Z2, E, B = align_block(u=R,
                                          v=q_emb,
                                          c_mask=self.c_mask,
                                          q_mask=self.q_mask,
                                          E_0=E,
                                          B_0=B,
                                          Lambda=self.Lambda,
                                          filters=self.filters,
                                          dropout=self.dropout_rnn)
                R = tf.nn.dropout(R, 1.0 - self.dropout_att)
            with tf.variable_scope("Aligning_Block3"):
                R, Z3, E, B = align_block(u=R,
                                          v=q_emb,
                                          c_mask=self.c_mask,
                                          q_mask=self.q_mask,
                                          E_0=E,
                                          B_0=B,
                                          Z_0=[Z1, Z2],
                                          Lambda=self.Lambda,
                                          filters=self.filters,
                                          dropout=self.dropout_rnn)
                R = tf.nn.dropout(R, 1.0 - self.dropout_att)

        with tf.variable_scope("Answer_Pointer"):
            # logits
            if self.use_elmo != 0:
                elmo_output_feats = ElmoAttention(
                    [elmo_context_output, elmo_question_output], self.c_maxlen,
                    self.q_maxlen, self.q_mask, self.dropout)
                R = tf.concat([R, elmo_output_feats], axis=-1)
            s = summary_vector(q_emb, self.c_maxlen, mask=self.q_mask)
            s = tf.nn.dropout(s, 1 - self.dropout)
            logits1 = start_logits(R,
                                   s,
                                   mask=self.c_mask,
                                   filters=self.filters,
                                   name='Start_Pointer')  # [bs, c_len]
            logits2 = end_logits(R,
                                 logits1,
                                 s,
                                 mask=self.c_mask,
                                 filters=self.filters,
                                 name='End_Pointer')  # [bs, c_len]
            self.unanswer_bias = tf.get_variable(
                "unanswer_bias", [1], initializer=tf.zeros_initializer())
            self.unanswer_bias = tf.reshape(
                tf.tile(self.unanswer_bias, [self.un_size]), [-1, 1])
            logits1 = tf.concat((self.unanswer_bias, logits1), axis=-1)
            logits2 = tf.concat((self.unanswer_bias, logits2), axis=-1)

            logits1p = start_logits(R,
                                    s,
                                    mask=self.c_mask,
                                    filters=self.filters,
                                    name='Start_Pointer2')  # [bs, c_len]
            logits2p = end_logits(R,
                                  logits1p,
                                  s,
                                  mask=self.c_mask,
                                  filters=self.filters,
                                  name='End_Pointer2')  # [bs, c_len]

        with tf.variable_scope("Loss_Layer"):
            # maximum-likelihood (ML) loss for dataset V2.0
            # loss a
            start_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits1, labels=self.y_start)
            end_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits2, labels=self.y_end)
            self.loss = tf.reduce_mean(start_loss + end_loss)

            # loss b
            pstart_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits1p, labels=self.yp_start)
            pend_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits2p, labels=self.yp_end)
            self.loss += self.gamma_b * tf.reduce_mean(pstart_loss + pend_loss)

            # loss c
            answer_exist_label = tf.cast(
                tf.slice(self.y_start, [0, 0], [-1, 1]), tf.float32)
            self.loss += self.gamma_c * tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=self.unanswer_bias, labels=answer_exist_label))

            # l2 loss
            if self.l2_norm is not None:
                decay_costs = []
                for var in tf.trainable_variables():
                    decay_costs.append(tf.nn.l2_loss(var))
                self.loss += tf.multiply(self.l2_norm, tf.add_n(decay_costs))

            # RL loss
            if self.use_rlloss:
                with tf.variable_scope("Reinforcement_Loss"):
                    self.rl_loss_a, _, _ = rl_loss(logits1, logits2,
                                                   self.y_start, self.y_end,
                                                   self.c_maxlen + 1)
                    self.rl_loss_b, _, _ = rl_loss(logits1p, logits2p,
                                                   self.yp_start, self.yp_end,
                                                   self.c_maxlen)
                    self.loss += (
                        self.rlw *
                        (self.rl_loss_a + self.gamma_b * self.rl_loss_b))

        with tf.variable_scope('Output_Layer'):
            softmax_start_scores = tf.nn.softmax(
                tf.slice(logits1, [0, 1], [-1, -1]))
            softmax_end_scores = tf.nn.softmax(
                tf.slice(logits2, [0, 1], [-1, -1]))

            unanswer_mask1 = tf.cast(
                tf.argmax(tf.nn.softmax(logits1), axis=-1), tf.int64)
            unanswer_mask1 = tf.cast(
                tf.cast(unanswer_mask1,
                        tf.bool), tf.int64)  # [bs,] has answer=1 no answer=0
            unanswer_move1 = unanswer_mask1 - 1  # [bs,] has answer=0 no answer=-1
            unanswer_mask2 = tf.cast(
                tf.argmax(tf.nn.softmax(logits2), axis=-1), tf.int64)
            unanswer_mask2 = tf.cast(tf.cast(unanswer_mask2, tf.bool),
                                     tf.int64)  # [bs,]
            unanswer_move2 = unanswer_mask2 - 1

            # Distributions from the second pointer head ('Start_Pointer2' /
            # 'End_Pointer2'). The start distribution must come from the start
            # logits (logits1p, trained against yp_start in the loss layer);
            # using logits2p for both would make start and end identical.
            softmax_start_p = tf.nn.softmax(logits1p)
            softmax_end_p = tf.nn.softmax(logits2p)
            softmax_start_scores = (
                1 - self.gamma_b
            ) * softmax_start_scores + self.gamma_b * softmax_start_p
            softmax_end_scores = (
                1 - self.gamma_b
            ) * softmax_end_scores + self.gamma_b * softmax_end_p

            outer = tf.matmul(tf.expand_dims(softmax_start_scores, axis=2),
                              tf.expand_dims(softmax_end_scores, axis=1))
            outer = tf.matrix_band_part(outer, 0, self.ans_limit)

            def position_encoding(x):
                """Return a copy of ``x`` with long-span entries re-weighted.

                For every cell ``(i, j)`` with ``j - i > 5`` the value is
                replaced by ``1 / ln(j - i + 1)``; all other cells keep their
                original value.

                Args:
                    x: 2-D numpy array (it is fed through ``tf.py_func``).

                Returns:
                    A new array with the same shape and dtype as ``x``. The
                    input is not modified, since the buffer handed over by
                    ``tf.py_func`` should not be written to in place.
                """
                import numpy as np

                n_rows, n_cols = x.shape[0], x.shape[1]
                # span[i, j] == j - i, built by broadcasting a row of column
                # indices against a column of row indices.
                span = (np.arange(n_cols)[np.newaxis, :] -
                        np.arange(n_rows)[:, np.newaxis])
                far = span > 5
                weights = x.copy()
                # Only the selected cells are evaluated, so the log argument
                # is always >= 7 and never zero or negative.
                weights[far] = 1.0 / np.log(span[far] + 1.0)
                return weights

            mask_mat = tf.ones((self.c_maxlen, self.c_maxlen))
            mask_mat = tf.expand_dims(tf.py_func(position_encoding, [mask_mat],
                                                 tf.float32),
                                      axis=0)
            mask_mat = tf.tile(mask_mat, [self.un_size, 1, 1])

            outer_masked = outer * mask_mat
            self.mask_output1 = tf.argmax(
                tf.reduce_max(outer_masked, axis=2),
                axis=1) * unanswer_mask1 + unanswer_move1
            self.mask_output2 = tf.argmax(
                tf.reduce_max(outer_masked, axis=1),
                axis=1) * unanswer_mask2 + unanswer_move2