def biLSTM_layer(self, lstm_inputs, lstm_dim, lengths, num_layers, keep_prob=1.):
    """
    :param lstm_inputs: [batch_size, num_steps, emb_size]
    :return: [batch_size, num_steps, 2*lstm_dim]
    """
    batch_size = shape(lstm_inputs, 0)
    with tf.variable_scope("BiLSTM"):
        for layer in range(num_layers):
            with tf.variable_scope("layer_{}".format(layer)):
                with tf.variable_scope("forward"):
                    cell_fw = CustomLSTMCell(lstm_dim, batch_size, keep_prob)
                with tf.variable_scope("backward"):
                    cell_bw = CustomLSTMCell(lstm_dim, batch_size, keep_prob)
                state_fw = tf.contrib.rnn.LSTMStateTuple(
                    tf.tile(cell_fw.initial_state.c, [batch_size, 1]),
                    tf.tile(cell_fw.initial_state.h, [batch_size, 1]))
                state_bw = tf.contrib.rnn.LSTMStateTuple(
                    tf.tile(cell_bw.initial_state.c, [batch_size, 1]),
                    tf.tile(cell_bw.initial_state.h, [batch_size, 1]))
                (fw_outputs, bw_outputs), _ = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=cell_fw,
                    cell_bw=cell_bw,
                    inputs=lstm_inputs,
                    sequence_length=lengths,
                    initial_state_fw=state_fw,
                    initial_state_bw=state_bw)
                text_outputs = tf.concat([fw_outputs, bw_outputs], 2)  # [num_sentences, max_sentence_length, emb]
                text_outputs = tf.nn.dropout(text_outputs, keep_prob)
                if layer > 0:
                    highway_gates = tf.sigmoid(
                        projection(text_outputs, shape(text_outputs, 2)))  # [num_sentences, max_sentence_length, emb]
                    text_outputs = highway_gates * text_outputs + (1 - highway_gates) * lstm_inputs
                lstm_inputs = text_outputs
    return lstm_inputs
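# These layers rely on helpers defined elsewhere in the repo and not shown in this
# excerpt (shape, projection, CustomLSTMCell). A minimal sketch of what the
# shape(tensor, dim) utility is assumed to do: return the static size of a dimension
# when it is known, otherwise the dynamic size.
import tensorflow as tf

def shape(x, dim):
    """Assumed behavior: static size of dimension `dim` if known, else the dynamic size."""
    static = x.get_shape().as_list()
    return static[dim] if static[dim] is not None else tf.shape(x)[dim]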
def _shift(BD):  # pick out the needed elements
    """
    convert:
        -3 -2 -1  0  1  2
        -3 -2 -1  0  1  2
        -3 -2 -1  0  1  2
    to:
         0  1  2
        -1  0  1
        -2 -1  0
    """
    bsz = shape(BD, 0)
    n_head = shape(BD, 1)
    max_len = shape(BD, 2)
    zero_pad = tf.zeros(shape=(bsz, n_head, max_len, 1))
    BD = tf.reshape(tf.concat([BD, zero_pad], axis=-1),
                    shape=(bsz, n_head, -1, max_len))
    BD = tf.reshape(BD[:, :, :-1], shape=(bsz, n_head, max_len, -1))
    BD = BD[:, :, :, max_len:]
    return BD
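# A quick way to see what _shift does: run it on a toy tensor whose rows hold the raw
# relative offsets from the docstring. This is only an illustration (it assumes the
# shape helper sketched above and a TF 1.x session), not part of the model.
import numpy as np
import tensorflow as tf

toy = np.tile(np.arange(-3, 3, dtype=np.float32), (1, 1, 3, 1))  # (bsz=1, n_head=1, max_len=3, 2*max_len=6)
with tf.Session() as sess:
    shifted = sess.run(_shift(tf.constant(toy)))
print(shifted[0, 0])
# [[ 0.  1.  2.]
#  [-1.  0.  1.]
#  [-2. -1.  0.]]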
def project_layer(self, lstm_outputs):
    """
    linear projection from the encoder output to per-tag logits
    :param lstm_outputs: [batch_size, num_steps, emb_size]
    :return: [batch_size, num_steps, num_tags]
    """
    num_steps = shape(lstm_outputs, 1)
    hidden_size = shape(lstm_outputs, 2)
    with tf.variable_scope("project"):
        output = tf.reshape(lstm_outputs, shape=[-1, hidden_size])
        with tf.variable_scope("logits"):
            W = tf.get_variable("W", shape=[hidden_size, self.num_tags],
                                dtype=tf.float32, initializer=self.initializer)
            b = tf.get_variable("b", shape=[self.num_tags],
                                dtype=tf.float32, initializer=tf.zeros_initializer())
            pred = tf.nn.xw_plus_b(output, W, b)
    return tf.reshape(pred, [-1, num_steps, self.num_tags])
def adapting_transformer_layer(self, batch_input, mask, ffnn_size, num_heads=8,
                               attn_blocks_num=2, attention_keep_prob=1.0,
                               ffnn_keep_prob=1.0):
    attn_outs = batch_input
    attention_size = shape(attn_outs, -1)
    local_bias = get_local_bias(shape(attn_outs, 1), 1)
    for block_id in range(attn_blocks_num):
        with tf.variable_scope("num_blocks_{}".format(block_id)):
            if block_id == 0:
                # first block: locally-biased multi-head attention
                attn_outs = local_multi_head_attention(
                    attn_outs, mask, attention_size, num_heads,
                    attention_keep_prob, reuse=False, local_bias=local_bias)
            else:
                # later blocks: multi-head attention with relative position encoding
                attn_outs = relative_multi_head_attention(
                    attn_outs, mask, attention_size, num_heads,
                    attention_keep_prob, reuse=False)
            attn_outs = feedforward(attn_outs, [ffnn_size, attention_size],
                                    ffnn_keep_prob, reuse=False)
    return attn_outs
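# get_local_bias and local_multi_head_attention are also defined elsewhere in the repo.
# For reference, a hypothetical sketch of a local bias with window size 1: an additive
# mask that keeps each token attending only to itself and its immediate neighbours
# (not necessarily the repo's exact implementation).
import tensorflow as tf

def get_local_bias_sketch(length, window):
    """Hypothetical: 0 inside a +/- `window` band, -1e9 outside; broadcasts over [batch, heads, q, k]."""
    band = tf.matrix_band_part(tf.ones([length, length]), window, window)
    return (1.0 - band) * -1e9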
def relative_multi_head_attention(x, mask, attention_size, num_heads,
                                  drop_keep_rate=1.0, reuse=None):
    # borrowed from: https://github.com/Kyubyong/transformer/blob/master/modules.py
    with tf.variable_scope("relative_multi_head_attention", reuse=reuse):
        # attention_size must be consistent with the last dimension of queries/keys
        batch_size = shape(x, 0)
        # attention_size = x.get_shape().as_list()[-1]
        max_time = shape(x, 1)
        pos_embed = relative_positional_encoding(max_time, attention_size // num_heads, True)

        # linear projections, shape=(batch_size, max_time, attention_size)
        query = tf.layers.dense(
            x, attention_size, use_bias=False, name="query_project",
            kernel_initializer=tf.contrib.layers.xavier_initializer())
        # query = tf.layers.dense(x, attention_size, activation=tf.nn.relu, name="query_project",
        #                         kernel_initializer=tf.contrib.layers.xavier_initializer())
        # keys are not projected in this model
        key = x
        # value = tf.layers.dense(x, attention_size, activation=tf.nn.relu, name="value_project",
        #                         kernel_initializer=tf.contrib.layers.xavier_initializer())
        value = tf.layers.dense(
            x, attention_size, use_bias=False, name="value_project",
            kernel_initializer=tf.contrib.layers.xavier_initializer())

        # split into heads, shape=(batch_size, num_heads, max_time, attention_size / num_heads)
        query_ = tf.stack(tf.split(query, num_heads, axis=2), axis=1)
        key_ = tf.stack(tf.split(key, num_heads, axis=2), axis=1)
        # value_ = tf.concat(tf.split(value, num_heads, axis=2), axis=0)
        value_ = tf.stack(tf.split(value, num_heads, axis=2), axis=1)

        # learned per-head biases, shape=(num_heads, attention_size / num_heads)
        u = tf.get_variable('var_u', shape=[num_heads, attention_size // num_heads],
                            initializer=tf.glorot_normal_initializer())
        v = tf.get_variable('var_v', shape=[num_heads, attention_size // num_heads],
                            initializer=tf.glorot_normal_initializer())

        # content-based and position-based attention scores
        Qu = query_ + u[:, None]
        QKuK = tf.einsum('bnqd,bnkd->bnqk', Qu, key_)
        vR = tf.einsum('nd,ld->nl', v, pos_embed)[None, :, None]
        QR = tf.einsum('bnqd,ld->bnql', query_, pos_embed)
        QRvR = QR + vR
        QRvR = _shift(QRvR)

        attn_outs = QKuK + QRvR  # [batch_size, num_heads, max_time, max_time]
        # attn_outs = tf.reshape(attn_outs, shape=(batch_size * num_heads, max_time, max_time))
        # attn_outs = tf.concat(tf.unstack(attn_outs, axis=1), axis=0)

        # activation: mask out padded positions, then softmax
        # (optional) talking heads before softmax
        # pre_softmax_weight = tf.get_variable('pre_softmax_weight', shape=[num_heads, num_heads],
        #                                      initializer=tf.glorot_normal_initializer())
        # attn_outs = tf.einsum("BNFT,NL->BLFT", attn_outs, pre_softmax_weight)
        ret = (1.0 - tf.cast(mask, tf.float32)) * -1e9
        bias = tf.expand_dims(tf.expand_dims(ret, 1), 1)
        attn_outs += bias
        attn_outs = tf.nn.softmax(attn_outs)

        # (optional) talking heads after softmax
        # post_softmax_weight = tf.get_variable('post_softmax_weight', shape=[num_heads, num_heads],
        #                                       initializer=tf.glorot_normal_initializer())
        # attn_outs = tf.einsum("BNFT,NL->BLFT", attn_outs, post_softmax_weight)

        # dropout
        attn_outs = tf.nn.dropout(attn_outs, drop_keep_rate)
        # attn_outs = tf.concat(tf.unstack(attn_outs, axis=1), axis=0)

        # weighted sum
        outputs = tf.matmul(attn_outs, value_)

        # restore shape (batch_size, max_time, attention_size)
        # outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)
        outputs = _combine_heads(outputs)
        # outputs = tf.layers.dense(outputs, attention_size, use_bias=False, name="output_project",
        #                           kernel_initializer=tf.contrib.layers.xavier_initializer())

        # residual connection and layer normalization
        outputs += x
        outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6, dtype="float32")(outputs)
    return outputs
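# The score above follows the Transformer-XL style decomposition: for query position q and
# key position k, score(q, k) = (Q_q + u) . K_k + (Q_q + v) . R_{k-q}, where u and v are the
# learned per-head biases and R holds relative position embeddings; _shift aligns the R term
# so that column k of row q corresponds to offset k - q. A hypothetical usage sketch with
# fixed shapes (it assumes relative_positional_encoding, _shift, and _combine_heads from
# this repo are importable):
import tensorflow as tf

x_in = tf.placeholder(tf.float32, shape=[None, 50, 128], name="block_input")  # [batch, max_time, attention_size]
pad_mask = tf.placeholder(tf.int32, shape=[None, 50], name="pad_mask")        # 1 for real tokens, 0 for padding
out = relative_multi_head_attention(x_in, pad_mask, attention_size=128,
                                    num_heads=8, drop_keep_rate=0.9, reuse=False)
# out: [batch, 50, 128], residual-added and layer-normalized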
def __init__(self, config, data):
    self.config = config
    self.data = data
    self.num_tags = data.label_alphabet_size
    self.gaz_emb_dim = data.gaz_emb_dim
    self.word_emb_dim = data.word_emb_dim
    self.biword_emb_dim = data.biword_emb_dim

    # parameter initialization
    self.initializer = initializers.xavier_initializer()

    # add placeholders for the model
    self.is_train = tf.placeholder(dtype=tf.bool, shape=[], name='is_train')
    self.word_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name="word_inputs")
    self.biword_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='biword_inputs')
    self.mask = tf.placeholder(dtype=tf.int32, shape=[None, None], name='mask')
    self.word_seq_lengths = tf.placeholder(dtype=tf.int32, shape=[None], name="word_seq_lengths")
    self.batch_label = tf.placeholder(dtype=tf.int32, shape=[None, None], name="batch_label")
    self.layer_gaz = tf.placeholder(dtype=tf.int32, shape=[None, None, 4, None], name="layer_gaz")
    self.gaz_mask_input = tf.placeholder(dtype=tf.int32, shape=[None, None, 4, None], name="gaz_mask_input")
    self.gaz_count = tf.placeholder(dtype=tf.int32, shape=[None, None, 4, None], name="gaz_count")

    # dropout keep rates (switched to 1.0 at inference time)
    self.embedding_keep_prob = self.get_keep_rate(self.config['embedding_dropout'], self.is_train)
    self.fc_keep_prob = self.get_keep_rate(self.config['fc_dropout'], self.is_train)
    self.attention_keep_prob = self.get_keep_rate(self.config['attention_dropout'], self.is_train)
    self.ffnn_keep_prob = self.get_keep_rate(self.config['ffnn_dropout'], self.is_train)

    batch_size = shape(self.word_inputs, 0)
    seq_len = shape(self.word_inputs, 1)

    # word embeddings
    word_embs = self.word_embedding_layer(self.word_inputs, data.word_alphabet.size(), self.word_emb_dim)
    word_embs = word_embs * tf.expand_dims(tf.cast(self.mask, dtype=tf.float32), -1)

    # biword embeddings
    biword_embs = self.biword_embedding_layer(self.biword_inputs, data.biword_alphabet.size(), self.biword_emb_dim)
    biword_embs = biword_embs * tf.expand_dims(tf.cast(self.mask, dtype=tf.float32), -1)

    word_inputs_d = tf.concat([word_embs, biword_embs], -1)
    word_inputs_d = tf.nn.dropout(word_inputs_d, self.embedding_keep_prob)

    # gazetteer (lexicon) embeddings
    gaz_embeds = self.gaz_embedding_layer(self.layer_gaz, data.gaz_alphabet.size(), self.gaz_emb_dim)
    gaz_embeds = tf.nn.dropout(gaz_embeds, self.embedding_keep_prob)
    gaz_embeds = gaz_embeds * (1.0 - tf.expand_dims(tf.cast(self.gaz_mask_input, dtype=tf.float32), -1))

    # count-based weighting of matched lexicon words
    count_sum = tf.reduce_sum(self.gaz_count, 3, keepdims=True)  # (b,l,4,gn) total word count at each position
    count_sum = tf.reduce_sum(count_sum, 2, keepdims=True)       # (b,l,1,1) (original note: do the 4 position slots need to be counted too?)
    weights = tf.divide(self.gaz_count, count_sum)  # (b,l,4,g) tf.int32 / tf.int32 -> tf.float64
    weights = weights * 4
    weights = tf.cast(tf.expand_dims(weights, -1), tf.float32)
    gaz_embeds = weights * gaz_embeds                # (b,l,4,g,e)
    gaz_embeds = tf.reduce_sum(gaz_embeds, 3)        # (b,l,4,e)
    gaz_embeds_cat = tf.reshape(gaz_embeds, (batch_size, seq_len, 4 * self.gaz_emb_dim))  # (b,l,4*ge), l = length

    word_input_cat = tf.concat([word_inputs_d, gaz_embeds_cat], -1)  # (b,l,we+4*ge)

    # inputs = tf.layers.dense(word_input_cat, self.config['attention_size'], name='input_fc',
    #                          kernel_initializer=self.initializer)

    # ON-LSTM encoder
    inputs = self.onLSTM_layer(word_input_cat, self.config['attention_size'],
                               self.word_seq_lengths, 1, 16)

    outputs = self.adapting_transformer_layer(
        inputs, self.mask, self.config['ffnn_size'], self.config['num_heads'],
        self.config['attn_blocks_num'], self.attention_keep_prob, self.ffnn_keep_prob)

    # fc dropout
    outputs = tf.nn.dropout(outputs, self.fc_keep_prob)

    # per-token classification logits
    self.logits = self.project_layer(outputs)

    # CRF loss
    self.loss, self.trans = self.loss_layer(self.logits, self.word_seq_lengths)

    num_train_steps = math.ceil(
        self.config['train_examples_len'] / self.config["batch_size"]) * self.config["epochs"]
    num_warmup_steps = int(num_train_steps * self.config['warmup_proportion'])
    self.global_step = tf.train.get_or_create_global_step()

    trainable_params = tf.trainable_variables()
    for var in trainable_params:
        print(" trainable_params name = %s, shape = %s" % (var.name, var.shape))

    self.train_op = optimization.create_optimizer(
        trainable_params, self.loss, self.config['other_learning_rate'],
        self.config['crf_learning_rate'], num_train_steps, num_warmup_steps,
        self.global_step)

    # saver of the model
    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
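# loss_layer is not included in this excerpt. For context, a minimal sketch of a typical
# TF 1.x CRF loss over the projected logits (hypothetical, not necessarily the author's
# implementation): mean negative log-likelihood with a learned tag-transition matrix.
import tensorflow as tf

def crf_loss_layer_sketch(logits, labels, lengths, num_tags):
    """Hypothetical CRF loss: negative log-likelihood plus learned transitions."""
    with tf.variable_scope("crf_loss"):
        trans = tf.get_variable("transitions", shape=[num_tags, num_tags],
                                initializer=tf.contrib.layers.xavier_initializer())
        log_likelihood, trans = tf.contrib.crf.crf_log_likelihood(
            inputs=logits, tag_indices=labels, sequence_lengths=lengths,
            transition_params=trans)
        return tf.reduce_mean(-log_likelihood), trans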
def get_predictions(self, input_ids, input_mask, input_lens, segment_ids,
                    a_input_ids, a_labels_ids, a_input_mask, a_input_lens,
                    a_segment_ids, is_train):
    self.keep_prob = self.get_keep_rate(self.config['dropout_rate'], is_train)
    # self.lstm_keep_prob = self.get_keep_rate(self.config['lstm_dropout'], is_train)
    self.attention_keep_prob = self.get_keep_rate(self.config['attention_dropout'], is_train)

    model = modeling.BertModel(
        config=self.bert_config,
        is_training=is_train,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=False)
    output_layer = model.get_sequence_output()
    seq_length = shape(output_layer, -2)

    # auxiliary sentences and their label sequences
    detect_batch_size = shape(a_input_ids, 0)
    detect_a_batch_size = shape(a_input_ids, 1)
    a_input_ids = tf.reshape(a_input_ids, [detect_batch_size * detect_a_batch_size, -1])
    a_input_mask = tf.reshape(a_input_mask, [detect_batch_size * detect_a_batch_size, -1])
    a_labels_ids = tf.reshape(a_labels_ids, [detect_batch_size * detect_a_batch_size, -1])
    a_segment_ids = tf.reshape(a_segment_ids, [detect_batch_size * detect_a_batch_size, -1])

    a_model = modeling.BertModel(
        config=self.bert_config,
        is_training=is_train,
        input_ids=a_input_ids,
        input_mask=a_input_mask,
        token_type_ids=a_segment_ids,
        use_one_hot_embeddings=False)
    a_output_layer = a_model.get_sequence_output()

    a_labels_emb = self.label_embedding_layer(a_labels_ids, True)
    label_emb_size = shape(a_labels_emb, -1)
    a_labels_emb = tf.reshape(a_labels_emb, [detect_batch_size, detect_a_batch_size, -1, label_emb_size])
    a_hidden_size = shape(a_output_layer, -1)
    a_hidden_input = tf.reshape(a_output_layer, [detect_batch_size, detect_a_batch_size, -1, a_hidden_size])

    # attend from each input token to the tokens of each auxiliary sentence,
    # then aggregate the auxiliary label embeddings with the attention weights
    hidden_input = tf.expand_dims(output_layer, 1)
    a_hidden_input = tf.transpose(a_hidden_input, [0, 1, 3, 2])
    temp_feature = tf.matmul(hidden_input, a_hidden_input)
    prob_feature = tf.nn.softmax(temp_feature)
    aug_represent = tf.matmul(prob_feature, a_labels_emb)
    aug_represent = tf.reduce_mean(aug_represent, axis=1)

    final_represent = tf.concat([output_layer, aug_represent], 2)
    final_represent = tf.nn.dropout(final_represent, keep_prob=self.keep_prob)
    attention_outputs = self.self_attention(final_represent, input_mask, keep_prob=self.attention_keep_prob)

    with tf.variable_scope("logits"):
        final_size = shape(attention_outputs, -1)
        output_weight = tf.get_variable(
            "output_weights", [self.num_tags, final_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))
        output_bias = tf.get_variable(
            "output_bias", [self.num_tags],
            initializer=tf.zeros_initializer())
        output_layer = tf.reshape(attention_outputs, [-1, final_size])
        logits = tf.matmul(output_layer, output_weight, transpose_b=True)
        logits = tf.reshape(logits, [-1, seq_length, self.num_tags])
        logits = tf.nn.bias_add(logits, output_bias)
    return logits
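# Hypothetical decoding sketch: if no CRF layer is stacked on top, the returned logits can be
# decoded greedily per token, with predictions on padded positions zeroed out by the mask.
import tensorflow as tf

def greedy_decode_sketch(logits, input_mask):
    """Hypothetical greedy decode of [batch, seq_length, num_tags] logits."""
    pred_ids = tf.argmax(logits, axis=-1, output_type=tf.int32)  # [batch, seq_length]
    return pred_ids * tf.cast(input_mask, tf.int32)              # keep only real tokens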