Example #1
    def biLSTM_layer(self, lstm_inputs, lstm_dim, lengths, num_layers, keep_prob=1.0):
        """
        :param lstm_inputs: [batch_size, num_steps, emb_size]
        :return: [batch_size, num_steps, 2*lstm_dim]
        """
        batch_size = shape(lstm_inputs, 0)
        with tf.variable_scope("BiLSTM"):
            for layer in range(num_layers):
              with tf.variable_scope("layer_{}".format(layer)):
                with tf.variable_scope("forward"):
                    cell_fw = CustomLSTMCell(lstm_dim, batch_size, keep_prob)
                with tf.variable_scope("backward"):
                    cell_bw = CustomLSTMCell(lstm_dim, batch_size, keep_prob)
                state_fw = tf.contrib.rnn.LSTMStateTuple(tf.tile(cell_fw.initial_state.c, [batch_size, 1]),
                                                        tf.tile(cell_fw.initial_state.h, [batch_size, 1]))
                state_bw = tf.contrib.rnn.LSTMStateTuple(tf.tile(cell_bw.initial_state.c, [batch_size, 1]),
                                                        tf.tile(cell_bw.initial_state.h, [batch_size, 1]))

                (fw_outputs, bw_outputs), _ = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=cell_fw,
                    cell_bw=cell_bw,
                    inputs=lstm_inputs,
                    sequence_length=lengths,
                    initial_state_fw=state_fw,
                    initial_state_bw=state_bw
                    )
                text_outputs = tf.concat([fw_outputs, bw_outputs], 2)  # [batch_size, num_steps, 2*lstm_dim]
                text_outputs = tf.nn.dropout(text_outputs, keep_prob)
                if layer > 0:
                    # highway connection with the previous layer's output
                    highway_gates = tf.sigmoid(
                        projection(text_outputs, shape(text_outputs, 2)))
                    text_outputs = highway_gates * text_outputs + (1 - highway_gates) * lstm_inputs
                lstm_inputs = text_outputs
                
            return lstm_inputs
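
A note on shape(): the snippets on this page call a shape(tensor, dim) helper that is not included here. Below is a minimal sketch of the usual pattern such a helper follows (static size when it is known at graph-construction time, dynamic tf.shape otherwise); the project's actual definition may differ.

import tensorflow as tf

def shape(tensor, dim):
    # Hypothetical sketch: prefer the statically known size of a dimension,
    # fall back to the dynamic value from tf.shape when it is unknown.
    static = tensor.get_shape().as_list()[dim]
    if static is not None:
        return static
    return tf.shape(tensor)[dim]
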
Example #2
def _shift(BD):  # select elements
    """
    convert:
        -3 -2 -1 0 1 2
        -3 -2 -1 0 1 2
        -3 -2 -1 0 1 2
    to:
        0   1  2
        -1  0  1
        -2 -1  0
    """
    bsz = shape(BD, 0)
    n_head = shape(BD, 1)
    max_len = shape(BD, 2)
    # pad one zero column, then reshape so that every row is shifted by one step
    zero_pad = tf.zeros(shape=(bsz, n_head, max_len, 1))
    BD = tf.reshape(tf.concat([BD, zero_pad], axis=-1),
                    shape=(bsz, n_head, -1, max_len))  # (bsz, n_head, 2*max_len+1, max_len)
    BD = tf.reshape(BD[:, :, :-1],
                    shape=(bsz, n_head, max_len, -1))  # (bsz, n_head, max_len, 2*max_len)
    BD = BD[:, :, :, max_len:]  # keep the aligned right half: (bsz, n_head, max_len, max_len)
    return BD
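
A quick sanity check of _shift on a toy tensor (TF 1.x graph mode), reproducing the conversion shown in the docstring; the toy values and variable names below are illustrative only.

import numpy as np
import tensorflow as tf

max_len = 3
# every query row starts with the same relative offsets [-3, -2, -1, 0, 1, 2]
rel = np.tile(np.arange(-max_len, max_len, dtype=np.float32), (max_len, 1))
BD_toy = tf.constant(rel[None, None])   # (bsz=1, n_head=1, max_len=3, 2*max_len=6)
shifted = _shift(BD_toy)                # (1, 1, 3, 3)

with tf.Session() as sess:
    print(sess.run(shifted)[0, 0])
    # [[ 0.  1.  2.]
    #  [-1.  0.  1.]
    #  [-2. -1.  0.]]
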
Example #3
    def project_layer(self, lstm_outputs):
        """
        hidden layer between lstm layer and logits
        :param lstm_outputs: [batch_size, num_steps, emb_size]
        :return: [batch_size, num_steps, num_tags]
        """
        num_steps = shape(lstm_outputs, 1)
        hidden_size = shape(lstm_outputs, 2)
        with tf.variable_scope("project"):
            output = tf.reshape(lstm_outputs, shape=[-1, hidden_size])
            with tf.variable_scope("logits"):
                W = tf.get_variable("W",
                                    shape=[hidden_size, self.num_tags],
                                    dtype=tf.float32,
                                    initializer=self.initializer)

                b = tf.get_variable("b",
                                    shape=[self.num_tags],
                                    dtype=tf.float32,
                                    initializer=tf.zeros_initializer())

                pred = tf.nn.xw_plus_b(output, W, b)

            return tf.reshape(pred, [-1, num_steps, self.num_tags])
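
The logits from project_layer later feed a CRF loss layer (loss_layer in Example #6), which is not included in these snippets. Below is a minimal sketch of what such a layer typically looks like with tf.contrib.crf, assuming the batch_label placeholder and num_tags attribute from Example #6; it is an illustration, not the project's actual implementation.

    def loss_layer(self, logits, lengths):
        # Hypothetical sketch: linear-chain CRF negative log-likelihood over
        # the per-token tag scores produced by project_layer.
        with tf.variable_scope("crf_loss"):
            trans = tf.get_variable("transitions",
                                    shape=[self.num_tags, self.num_tags],
                                    initializer=self.initializer)
            log_likelihood, trans = tf.contrib.crf.crf_log_likelihood(
                inputs=logits,
                tag_indices=self.batch_label,
                sequence_lengths=lengths,
                transition_params=trans)
            return tf.reduce_mean(-log_likelihood), trans
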
Example #4
    def adapting_transformer_layer(self,
                                   batch_input,
                                   mask,
                                   ffnn_size,
                                   num_heads=8,
                                   attn_blocks_num=2,
                                   attention_keep_prob=1.0,
                                   ffnn_keep_prob=1.0):

        attn_outs = batch_input
        attention_size = shape(attn_outs, -1)
        local_bias = get_local_bias(shape(attn_outs, 1), 1)
        for block_id in range(attn_blocks_num):
            with tf.variable_scope("num_blocks_{}".format(block_id)):
                if block_id == 0:
                    attn_outs = local_multi_head_attention(
                        attn_outs,
                        mask,
                        attention_size,
                        num_heads,
                        attention_keep_prob,
                        reuse=False,
                        local_bias=local_bias)
                else:
                    attn_outs = relative_multi_head_attention(
                        attn_outs,
                        mask,
                        attention_size,
                        num_heads,
                        attention_keep_prob,
                        reuse=False)
                attn_outs = feedforward(attn_outs, [ffnn_size, attention_size],
                                        ffnn_keep_prob,
                                        reuse=False)

        return attn_outs
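
get_local_bias is called above but not defined in these snippets. One plausible reading, given that its result is handed to local_multi_head_attention, is an additive band mask that keeps only positions within a fixed window of each query; this sketch is an assumption, not the original code.

def get_local_bias(length, window):
    # Hypothetical sketch: 0 inside the +/-window band, a large negative value
    # outside it, so softmax attention is restricted to a local neighbourhood.
    positions = tf.range(length)
    distance = tf.abs(positions[None, :] - positions[:, None])  # (length, length)
    in_window = tf.cast(distance <= window, tf.float32)
    bias = (1.0 - in_window) * -1e9
    return bias[None, None, :, :]   # broadcastable over batch and heads
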
Example #5
def relative_multi_head_attention(x,
                                  mask,
                                  attention_size,
                                  num_heads,
                                  drop_keep_rate=1.0,
                                  reuse=None):
    # borrowed from: https://github.com/Kyubyong/transformer/blob/master/modules.py
    with tf.variable_scope("relative_multi_head_attention", reuse=reuse):
        # attention_size must be consistent with the last dimension of the queries/keys
        batch_size = shape(x, 0)
        # attention_size = x.get_shape().as_list()[-1]
        max_time = shape(x, 1)

        pos_embed = relative_positional_encoding(max_time,
                                                 attention_size // num_heads,
                                                 True)

        # linear projections, shape=(batch_size, max_time, attention_size)
        query = tf.layers.dense(
            x,
            attention_size,
            use_bias=False,
            name="query_project",
            kernel_initializer=tf.contrib.layers.xavier_initializer())
        # query = tf.layers.dense(x, attention_size, activation=tf.nn.relu, name="query_project",
        #                         kernel_initializer=tf.contrib.layers.xavier_initializer())
        # the keys are not passed through a dense projection in this model
        key = x
        # value = tf.layers.dense(x, attention_size, activation=tf.nn.relu, name="value_project",
        #                         kernel_initializer=tf.contrib.layers.xavier_initializer())
        value = tf.layers.dense(
            x,
            attention_size,
            use_bias=False,
            name="value_project",
            kernel_initializer=tf.contrib.layers.xavier_initializer())

        # split into heads and stack, shape=(batch_size, num_heads, max_time, attention_size / num_heads)
        query_ = tf.stack(tf.split(query, num_heads, axis=2), axis=1)
        key_ = tf.stack(tf.split(key, num_heads, axis=2), axis=1)
        # value_ = tf.concat(tf.split(value, num_heads, axis=2), axis=0)
        value_ = tf.stack(tf.split(value, num_heads, axis=2), axis=1)
        # shape =(num_heads, attention_size / num_heads)
        u = tf.get_variable('var_u',
                            shape=[num_heads, attention_size // num_heads],
                            initializer=tf.glorot_normal_initializer())
        v = tf.get_variable('var_v',
                            shape=[num_heads, attention_size // num_heads],
                            initializer=tf.glorot_normal_initializer())

        Qu = query_ + u[:, None]
        QKuK = tf.einsum('bnqd,bnkd->bnqk', Qu, key_)

        vR = tf.einsum('nd,ld->nl', v, pos_embed)[None, :, None]
        QR = tf.einsum('bnqd,ld->bnql', query_, pos_embed)
        QRvR = QR + vR
        QRvR = _shift(QRvR)  # align the relative-position scores per query

        attn_outs = QKuK + QRvR  # [batch_size, num_heads, max_time, max_time]
        # attn_outs = tf.reshape(attn_outs, shape=(batch_size*num_heads, max_time, max_time))
        # attn_outs = tf.concat(tf.unstack(attn_outs, axis=1), axis=0)
        # activation
        #apply talking heads before softmax
        # pre_softmax_weight=tf.get_variable('pre_softmax_weight',shape=[num_heads,num_heads],initializer=tf.glorot_normal_initializer())
        # attn_outs=tf.einsum("BNFT,NL->BLFT", attn_outs,pre_softmax_weight)
        # mask out padded positions with a large negative bias before softmax
        ret = (1.0 - tf.cast(mask, tf.float32)) * -1e9
        bias = tf.expand_dims(tf.expand_dims(ret, 1), 1)
        attn_outs += bias
        attn_outs = tf.nn.softmax(attn_outs)
        #apply talking heads after softmax
        # post_softmax_weight=tf.get_variable('post_softmax_weight',shape=[num_heads,num_heads],initializer=tf.glorot_normal_initializer())
        # attn_outs=tf.einsum("BNFT,NL->BLFT", attn_outs,post_softmax_weight)
        # dropout
        attn_outs = tf.nn.dropout(attn_outs, drop_keep_rate)
        # attn_outs = tf.concat(tf.unstack(attn_outs, axis=1), axis=0)
        # weighted sum
        outputs = tf.matmul(attn_outs, value_)
        # restore shape
        # outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)
        outputs = _combine_heads(outputs)
        # outputs = tf.layers.dense(outputs,attention_size,use_bias=False,name="output_project",
        #                         kernel_initializer=tf.contrib.layers.xavier_initializer())
        # residual connection
        outputs += x
        outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6,
                                                     dtype="float32")(outputs)

    return outputs
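
Two helpers used above, _combine_heads and relative_positional_encoding, are not part of these snippets. The sketches below assume that heads are merged back with a transpose-and-reshape (using the same shape helper) and that the relative encoding is sinusoidal over offsets -max_time .. max_time-1, i.e. length 2*max_time as _shift expects, with an even per-head depth; the original definitions may differ.

def _combine_heads(x):
    # (batch_size, num_heads, max_time, depth) -> (batch_size, max_time, num_heads * depth)
    x = tf.transpose(x, [0, 2, 1, 3])
    return tf.reshape(x, [shape(x, 0), shape(x, 1), shape(x, 2) * shape(x, 3)])

def relative_positional_encoding(max_time, depth, flag=True):
    # Hypothetical sketch: sinusoidal embeddings for relative offsets
    # -max_time .. max_time-1, shape (2*max_time, depth). The meaning of the
    # third flag in the original call is unknown and it is unused here.
    positions = tf.cast(tf.range(-max_time, max_time), tf.float32)[:, None]
    dims = tf.cast(tf.range(depth // 2), tf.float32)[None, :]
    angle_rates = 1.0 / tf.pow(10000.0, 2.0 * dims / tf.cast(depth, tf.float32))
    angles = positions * angle_rates                      # (2*max_time, depth // 2)
    return tf.concat([tf.sin(angles), tf.cos(angles)], axis=-1)
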
Example #6
    def __init__(self, config, data):

        self.config = config
        self.data = data
        self.num_tags = data.label_alphabet_size
        self.gaz_emb_dim = data.gaz_emb_dim
        self.word_emb_dim = data.word_emb_dim
        self.biword_emb_dim = data.biword_emb_dim
        # parameter initialization
        self.initializer = initializers.xavier_initializer()
        # add placeholders for the model
        self.is_train = tf.placeholder(dtype=tf.bool,
                                       shape=[],
                                       name='is_train')

        self.word_inputs = tf.placeholder(dtype=tf.int32,
                                          shape=[None, None],
                                          name="word_inputs")

        self.biword_inputs = tf.placeholder(dtype=tf.int32,
                                            shape=[None, None],
                                            name='biword_inputs')

        self.mask = tf.placeholder(dtype=tf.int32,
                                   shape=[None, None],
                                   name='mask')

        self.word_seq_lengths = tf.placeholder(dtype=tf.int32,
                                               shape=[None],
                                               name="word_seq_lengths")

        self.batch_label = tf.placeholder(dtype=tf.int32,
                                          shape=[None, None],
                                          name="batch_label")

        self.layer_gaz = tf.placeholder(dtype=tf.int32,
                                        shape=[None, None, 4, None],
                                        name="layer_gaz")
        self.gaz_mask_input = tf.placeholder(dtype=tf.int32,
                                             shape=[None, None, 4, None],
                                             name="gaz_mask_input")

        self.gaz_count = tf.placeholder(dtype=tf.int32,
                                        shape=[None, None, 4, None],
                                        name="gaz_count")

        self.embedding_keep_prob = self.get_keep_rate(
            self.config['embedding_dropout'], self.is_train)
        self.fc_keep_prob = self.get_keep_rate(self.config['fc_dropout'],
                                               self.is_train)
        self.attention_keep_prob = self.get_keep_rate(
            self.config['attention_dropout'], self.is_train)
        self.ffnn_keep_prob = self.get_keep_rate(self.config['ffnn_dropout'],
                                                 self.is_train)

        batch_size = shape(self.word_inputs, 0)
        seq_len = shape(self.word_inputs, 1)

        #word
        word_embs = self.word_embedding_layer(self.word_inputs,
                                              data.word_alphabet.size(),
                                              self.word_emb_dim)
        word_embs = word_embs * tf.expand_dims(
            tf.cast(self.mask, dtype=tf.float32), -1)
        #biword
        biword_embs = self.biword_embedding_layer(self.biword_inputs,
                                                  data.biword_alphabet.size(),
                                                  self.biword_emb_dim)

        biword_embs = biword_embs * tf.expand_dims(
            tf.cast(self.mask, dtype=tf.float32), -1)

        word_inputs_d = tf.concat([word_embs, biword_embs], -1)
        word_inputs_d = tf.nn.dropout(word_inputs_d, self.embedding_keep_prob)
        #gaz
        gaz_embeds = self.gaz_embedding_layer(self.layer_gaz,
                                              data.gaz_alphabet.size(),
                                              self.gaz_emb_dim)
        gaz_embeds = tf.nn.dropout(gaz_embeds, self.embedding_keep_prob)  # dropout
        gaz_embeds = gaz_embeds * (1.0 - tf.expand_dims(
            tf.cast(self.gaz_mask_input, dtype=tf.float32), -1))  # zero out padded gaz slots

        count_sum = tf.reduce_sum(self.gaz_count, 3,
                                  keepdims=True)  # (b, l, 4, gn): total word count at each position
        count_sum = tf.reduce_sum(count_sum, 2,
                                  keepdims=True)  # (b, l, 1, 1); should the 4 position slots be counted too?

        weights = tf.divide(self.gaz_count,
                            count_sum)  # (b, l, 4, g); tf.int32 / tf.int32 -> tf.float64
        weights = weights * 4
        weights = tf.cast(tf.expand_dims(weights, -1), tf.float32)
        gaz_embeds = weights * gaz_embeds  #(b,l,4,g,e)
        gaz_embeds = tf.reduce_sum(gaz_embeds, 3)  #(b,l,4,e)

        gaz_embeds_cat = tf.reshape(
            gaz_embeds,
            (batch_size, seq_len, 4 * self.gaz_emb_dim))  #(b,l,4*ge) l=length

        word_input_cat = tf.concat([word_inputs_d, gaz_embeds_cat],
                                   -1)  #(b,l,we+4*ge)

        # inputs = tf.layers.dense(word_input_cat,self.config['attention_size'], name='input_fc',
        #                          kernel_initializer=self.initializer)

        #on-lstm
        inputs = self.onLSTM_layer(word_input_cat,
                                   self.config['attention_size'],
                                   self.word_seq_lengths, 1, 16)
        outputs = self.adapting_transformer_layer(
            inputs, self.mask, self.config['ffnn_size'],
            self.config['num_heads'], self.config['attn_blocks_num'],
            self.attention_keep_prob, self.ffnn_keep_prob)
        # fc_dropout
        outputs = tf.nn.dropout(outputs, self.fc_keep_prob)
        # classification
        self.logits = self.project_layer(outputs)
        # CRF loss
        self.loss, self.trans = self.loss_layer(self.logits,
                                                self.word_seq_lengths)

        num_train_steps = math.ceil(
            self.config['train_examples_len'] /
            self.config["batch_size"]) * self.config["epochs"]
        num_warmup_steps = int(num_train_steps *
                               self.config['warmup_proportion'])
        self.global_step = tf.train.get_or_create_global_step()
        trainable_params = tf.trainable_variables()
        for var in trainable_params:
            print(" trainable_params name = %s, shape = %s" %
                  (var.name, var.shape))

        self.train_op = optimization.create_optimizer(
            trainable_params, self.loss, self.config['other_learning_rate'],
            self.config['crf_learning_rate'], num_train_steps,
            num_warmup_steps, self.global_step)
        # saver of the model
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
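
get_keep_rate turns a dropout rate from the config into a keep probability that is only applied while training. It is not shown in these snippets; below is a minimal sketch of that behaviour, assuming the boolean is_train placeholder defined above.

    def get_keep_rate(self, dropout_rate, is_train):
        # Hypothetical sketch: keep probability 1 - dropout_rate during
        # training, 1.0 (no dropout) at inference time.
        return tf.cond(is_train,
                       lambda: tf.constant(1.0 - dropout_rate, dtype=tf.float32),
                       lambda: tf.constant(1.0, dtype=tf.float32))
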
Example #7
    def get_predictions(self, input_ids, input_mask, input_lens, segment_ids,
                        a_input_ids, a_labels_ids, a_input_mask, a_input_lens,
                        a_segment_ids, is_train):

        self.keep_prob = self.get_keep_rate(self.config['dropout_rate'], is_train)
        # self.lstm_keep_prob = self.get_keep_rate(self.config['lstm_dropout'], is_train)
        self.attention_keep_prob = self.get_keep_rate(self.config['attention_dropout'], is_train)

        model = modeling.BertModel(
            config=self.bert_config,
            is_training=is_train,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=False
        )
        output_layer = model.get_sequence_output()
        seq_length = shape(output_layer, -2)

        # auxiliary inputs
        detect_batch_size = shape(a_input_ids, 0)
        detect_a_batch_size = shape(a_input_ids, 1)
        a_input_ids = tf.reshape(a_input_ids, [detect_batch_size * detect_a_batch_size, -1])
        a_input_mask = tf.reshape(a_input_mask, [detect_batch_size * detect_a_batch_size, -1])
        a_labels_ids = tf.reshape(a_labels_ids, [detect_batch_size * detect_a_batch_size, -1])
        a_segment_ids = tf.reshape(a_segment_ids, [detect_batch_size * detect_a_batch_size, -1])
        a_model = modeling.BertModel(
            config=self.bert_config,
            is_training=is_train,
            input_ids=a_input_ids,
            input_mask=a_input_mask,
            token_type_ids=a_segment_ids,
            use_one_hot_embeddings=False
        )
        a_output_layer = a_model.get_sequence_output()

        a_labels_emb = self.label_embedding_layer(a_labels_ids, True)
        label_emb_size = shape(a_labels_emb, -1)
        a_labels_emb = tf.reshape(a_labels_emb, [detect_batch_size, detect_a_batch_size, -1, label_emb_size])

        a_hidden_size = shape(a_output_layer, -1)
        a_hidden_input = tf.reshape(a_output_layer, [detect_batch_size, detect_a_batch_size, -1, a_hidden_size])

        hidden_input = tf.expand_dims(output_layer, 1)
        a_hidden_input = tf.transpose(a_hidden_input, [0, 1, 3, 2])
        temp_feature = tf.matmul(hidden_input, a_hidden_input)
        prob_feature = tf.nn.softmax(temp_feature)
        aug_represent = tf.matmul(prob_feature, a_labels_emb)
        aug_represent = tf.reduce_mean(aug_represent, axis=1)

        final_represent = tf.concat([output_layer, aug_represent], 2)
        final_represent = tf.nn.dropout(final_represent, keep_prob=self.keep_prob)
        
        attention_outputs = self.self_attention(final_represent, input_mask, keep_prob=self.attention_keep_prob)
        
        with tf.variable_scope("logits"):

            final_size = shape(attention_outputs, -1)

            output_weight = tf.get_variable(
                "output_weights", [self.num_tags,final_size],
                initializer=tf.truncated_normal_initializer(stddev=0.02)
            )
            output_bias = tf.get_variable(
                "output_bias", [self.num_tags], initializer=tf.zeros_initializer()
            )
            output_layer = tf.reshape(attention_outputs, [-1, final_size])
            logits = tf.matmul(output_layer, output_weight, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            logits = tf.reshape(logits, shape=(-1, seq_length, self.num_tags))

            return logits
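
At inference time the returned logits are typically decoded with a CRF transition matrix (compare loss_layer and self.trans in Example #6). Below is a hedged sketch of one plausible decoding step with tf.contrib.crf.crf_decode; trans is assumed to come from such a loss layer and is not defined in these snippets.

# Hypothetical usage: Viterbi-decode tag ids from the logits returned above,
# given a CRF transition matrix trans and the true sequence lengths.
pred_ids, _ = tf.contrib.crf.crf_decode(potentials=logits,
                                        transition_params=trans,
                                        sequence_length=input_lens)
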