def build_model(self):
    temp = self.all_sequence[-1]
    with tf.variable_scope("lstm"):
        temp = dropout(temp, 0.1)
        seq_len = tf.reduce_sum(self.sent_mask, axis=1)
        gru_fw = GRUCell(num_units=768, activation=tf.tanh)
        gru_bw = GRUCell(num_units=768, activation=tf.tanh)
        outputs, output_states = bidirectional_dynamic_rnn(
            gru_fw, gru_bw, temp, sequence_length=seq_len, dtype=tf.float32)
        gru_output = tf.concat(outputs, axis=2)
        # gru_output = dropout(gru_output, 0.1)
        gru_output = tf.layers.dense(gru_output,
                                     units=768,
                                     kernel_initializer=create_initializer(0.02))
        gru_output = dropout(gru_output, 0.1)
        outputs = layer_norm(gru_output + temp)
        in_outputs = tf.layers.dense(outputs,
                                     units=768,
                                     activation=tf.tanh,
                                     kernel_initializer=create_initializer(0.02))
        layer_output = tf.layers.dense(in_outputs,
                                       768,
                                       kernel_initializer=create_initializer(0.02))
        layer_output = dropout(layer_output, 0.1)
        layer_output = layer_norm(layer_output + outputs)
        return layer_output

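# The helpers used above (dropout, layer_norm, create_initializer) are not
# defined in this file; a minimal sketch, assuming they match the helpers of
# the same names in BERT's modeling.py:
import tensorflow as tf

def dropout(input_tensor, dropout_prob):
    """Applies dropout; `dropout_prob` is the probability of DROPPING, not keeping."""
    if dropout_prob is None or dropout_prob == 0.0:
        return input_tensor
    return tf.nn.dropout(input_tensor, keep_prob=1.0 - dropout_prob)

def layer_norm(input_tensor, name=None):
    """Layer normalization over the last dimension of the tensor."""
    return tf.contrib.layers.layer_norm(
        inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)

def create_initializer(initializer_range=0.02):
    """Truncated-normal initializer as used throughout BERT."""
    return tf.truncated_normal_initializer(stddev=initializer_range)
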
def transformer_model(input_tensor,
                      neg_attention_mask,
                      num_hidden_layers=12,
                      intermediate_act_fn=gelu):
    # NOTE: num_attention_heads, size_per_head, batch_size, seq_length,
    # intermediate_size and the *_kernel/*_bias/*_beta/*_gamma tensors are
    # resolved from an enclosing scope, so every layer applies the same
    # externally supplied parameters (cross-layer weight sharing).
    hidden_size = num_attention_heads * size_per_head
    neg_attention_mask = tf.reshape(neg_attention_mask,
                                    [batch_size, 1, 1, seq_length])
    neg_attention_mask *= tf.ones(
        shape=[batch_size, num_attention_heads, seq_length, seq_length],
        dtype=tf.float32)

    prev_output = input_tensor
    for layer_idx in range(num_hidden_layers):
        with tf.variable_scope("layer_%d" % layer_idx):
            layer_input = prev_output

            with tf.variable_scope("attention"):
                with tf.variable_scope("self"):
                    attention_head = attention_layer(
                        layer_input,
                        neg_attention_mask,
                        query_kernel=query_kernel,
                        query_bias=query_bias,
                        key_kernel=key_kernel,
                        key_bias=key_bias,
                        value_kernel=value_kernel,
                        value_bias=value_bias,
                        num_attention_heads=num_attention_heads,
                        size_per_head=size_per_head,
                        batch_size=batch_size,
                        seq_length=seq_length)
                with tf.variable_scope("output"):
                    attention_output = tf.layers.Dense(
                        hidden_size,
                        weights=[attention_output_kernel, attention_output_bias]
                    ).apply(attention_head)
                    attention_output = layer_norm(attention_output + layer_input,
                                                  beta=attention_norm_beta,
                                                  gamma=attention_norm_gamma)

            with tf.variable_scope("intermediate"):
                intermediate_output = tf.layers.Dense(
                    intermediate_size,
                    activation=intermediate_act_fn,
                    weights=[intermediate_kernel, intermediate_bias]
                ).apply(attention_output)

            with tf.variable_scope("output"):
                layer_output = tf.layers.Dense(
                    hidden_size,
                    weights=[output_kernel, output_bias]
                ).apply(intermediate_output)
                layer_output = layer_norm(layer_output + attention_output,
                                          beta=output_norm_beta,
                                          gamma=output_norm_gamma)

            prev_output = layer_output

    return prev_output

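# A hedged sketch of how the additive `neg_attention_mask` fed to
# transformer_model could be derived from a [batch_size, seq_length] input
# mask. This mirrors BERT's -10000.0 "adder" trick; the helper name is
# illustrative, not from the source. transformer_model then reshapes the
# result to [batch, 1, 1, seq] and broadcasts it to [batch, heads, seq, seq].
def make_neg_attention_mask(input_mask):
    input_mask = tf.cast(input_mask, tf.float32)  # 1.0 at real tokens, 0.0 at padding
    return (1.0 - input_mask) * -10000.0          # large negative logits at padded slots
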
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions):
    """Get log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        # sampled_logits = tf.multinomial(logits, 1)

    return log_probs

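# gather_indexes is referenced throughout these snippets but never defined
# here; this is the implementation from BERT's run_pretraining.py. It flattens
# the batch so the positions of all sequences can be gathered in a single op.
def gather_indexes(sequence_tensor, positions):
    """Gathers the vectors at the specific positions over a minibatch."""
    sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3)
    batch_size = sequence_shape[0]
    seq_length = sequence_shape[1]
    width = sequence_shape[2]

    flat_offsets = tf.reshape(
        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence_tensor = tf.reshape(sequence_tensor,
                                      [batch_size * seq_length, width])
    output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
    return output_tensor
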
def get_candi_output(bert_config, input_tensor, output_weights):
    """Get top-k candidate token ids for every position in the sequence."""
    # input_tensor = gather_indexes(input_tensor, positions)
    sequence_shape = modeling.get_shape_list(input_tensor, expected_rank=3)
    batch_size = sequence_shape[0]
    seq_length = sequence_shape[1]
    width = sequence_shape[2]
    input_tensor = tf.reshape(input_tensor, [batch_size * seq_length, width])

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights,
                           transpose_b=True)  # [batch*seq, vocab_size]
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        log_probs = tf.reshape(log_probs, [batch_size, seq_length, -1])
        _, top_k_idx = tf.nn.top_k(log_probs, FLAGS.top_k)

    return top_k_idx

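# Hedged usage sketch for get_candi_output above: it assumes an integer flag
# FLAGS.top_k is defined at module level; the names below are illustrative,
# not taken from the source.
# flags = tf.flags
# flags.DEFINE_integer("top_k", 5, "Candidate tokens kept per position.")
# FLAGS = flags.FLAGS
#
# top_k_idx has shape [batch_size, seq_length, FLAGS.top_k] and holds the ids
# of the k most probable vocabulary entries at every position:
# top_k_idx = get_candi_output(bert_config, model.get_sequence_output(),
#                              model.get_embedding_table())
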
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM.

    input_tensor   --> [batch_size, seq_length, hidden_size]
    output_weights --> [vocab_size, embedding_size]
    positions      --> [batch_size, max_predictions_per_seq]
    label_ids      --> [batch_size, max_predictions_per_seq]
    label_weights  --> [batch_size, max_predictions_per_seq]
    """
    tf.logging.info(f'get_masked_lm_output--positions:{positions}')
    input_tensor = gather_indexes(
        input_tensor,
        positions)  # [batch_size*max_predictions_per_seq, hidden_size]

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(
                input_tensor)  # [batch_size*max_predictions_per_seq, hidden_size]

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(
            input_tensor, output_weights,
            transpose_b=True)  # [batch_size*max_predictions_per_seq, vocab_size]
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(
            logits, axis=-1)  # [batch_size*max_predictions_per_seq, vocab_size]

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(
            label_weights, [-1])  # [batch_size*max_predictions_per_seq]
        one_hot_labels = tf.one_hot(
            label_ids, depth=bert_config.vocab_size,
            dtype=tf.float32)  # [batch_size*max_predictions_per_seq, vocab_size]

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(
            log_probs * one_hot_labels,
            axis=[-1])  # cross-entropy, [batch_size*max_predictions_per_seq]
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)

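# Worked example of the weighted mean above: with
#   per_example_loss = [2.0, 1.0, 3.0] and label_weights = [1.0, 1.0, 0.0]
# (the last prediction is padding), numerator = 2.0 + 1.0 = 3.0 and
# denominator = 2.0 + 1e-5, so loss ≈ 1.5: padded positions contribute nothing.
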
def get_mlm_logits(input_tensor, albert_config, mlm_positions, output_weights):
    """From run_pretraining.py."""
    input_tensor = gather_indexes(input_tensor, mlm_positions)
    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=albert_config.embedding_size,
                activation=modeling.get_activation(albert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    albert_config.initializer_range),
            )
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[albert_config.vocab_size],
            initializer=tf.zeros_initializer(),
        )
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
    return logits

def add_embeddings(self):
    with tf.name_scope("embedding"):
        if self.is_Embedding_Needed:
            W = tf.Variable(np.array(self.embeddings),
                            name="word_embed",
                            dtype="float32",
                            trainable=self.trainable)
        else:
            W = tf.get_variable(
                name='word_embed',
                shape=[self.vocab_size, self.embedding_size],
                initializer=modeling.create_initializer(0.02),
                trainable=True)
        self.embedding_W = W
        self.embedded_chars_q_pos = self.get_timing_signal_1d(
            self.max_input_left, self.embedding_size)
        if 'adding_problem' not in self.dataset:
            self.embedded_chars_q = tf.nn.embedding_lookup(
                self.embedding_W, self.question)
        else:
            # Map the 2-dim adding-problem inputs into a higher dimension.
            if self.embedding_size == 2:
                self.embedded_chars_q = self.question
            else:
                self.embedded_chars_q = tf.layers.dense(
                    self.question, self.embedding_size)
        print('embedded_chars_q:', self.embedded_chars_q)
        if 'adding_problem' not in self.dataset:
            self.embedded_chars_q = modeling.layer_norm(
                tf.nn.dropout(self.embedded_chars_q,
                              keep_prob=1.0 - self.input_dropout_prob))
        # Adding the position embedding may lead to poor performance...
        self.embedded_chars_q = self.embedded_chars_q + self.embedded_chars_q_pos
        print('embedded_chars_q:', self.embedded_chars_q)

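# get_timing_signal_1d is assumed but not shown; a minimal sketch following the
# tensor2tensor sinusoidal position encoding (the method body and the
# min/max timescale defaults are assumptions, not taken from the source):
import math

def get_timing_signal_1d(self, length, channels,
                         min_timescale=1.0, max_timescale=1.0e4):
    """Returns a [1, length, channels] sinusoidal position signal."""
    position = tf.to_float(tf.range(length))
    num_timescales = channels // 2
    log_timescale_increment = (
        math.log(float(max_timescale) / float(min_timescale)) /
        tf.maximum(tf.to_float(num_timescales) - 1, 1))
    inv_timescales = min_timescale * tf.exp(
        tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
    scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
    signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
    signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]])  # odd channels
    return tf.reshape(signal, [1, length, channels])
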
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        one_hot_labels = tf.one_hot(
            label_ids, depth=bert_config.vocab_size, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        loss = tf.reshape(per_example_loss, [-1, tf.shape(positions)[1]])
        # TODO: dynamic gather from per_example_loss

    return loss

def get_masked_lm_output(self, bert_config, input_tensor, output_weights,
                         positions, label_ids):
    input_tensor, size, max_len = self.gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        loss = tf.reshape(per_example_loss, [-1, tf.shape(positions)[1]])

    return loss

def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope('cls/predictions'):
        with tf.variable_scope('transform'):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        output_bias = tf.get_variable('output_bias',
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-05
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)

def add_embeddings(self):
    with tf.name_scope("embedding"):
        if self.is_Embedding_Needed:
            W = tf.Variable(np.array(self.embeddings),
                            name="word_embed",
                            dtype="float32",
                            trainable=self.trainable)
        else:
            W = tf.get_variable(
                name='word_embed',
                shape=[self.vocab_size, self.embedding_size],
                initializer=modeling.create_initializer(0.02),
                trainable=True)
        if 'adding_problem' not in self.dataset:
            self.embedding_W = W
            self.embedded_chars_q = tf.nn.embedding_lookup(
                self.embedding_W, self.question)
        else:
            if self.embedding_size == 2:
                self.embedded_chars_q = self.question
            else:
                self.embedded_chars_q = tf.layers.dense(
                    self.question, self.embedding_size)
        print('embedded_chars_q:', self.embedded_chars_q)
        if 'adding_problem' not in self.dataset:
            self.embedded_chars_q = modeling.layer_norm(
                tf.nn.dropout(self.embedded_chars_q,
                              keep_prob=1.0 - self.input_dropout_prob))

def build_model(self):
    with tf.variable_scope("inferring_module"), tf.device("/device:GPU:0"):
        rdim = 768
        update_num = 3
        batch_size = tf.shape(self.sent1)[0]
        dim = self.sent1.get_shape().as_list()[-1]

        gru_layer = BiGRU(num_layers=1,
                          num_units=rdim,
                          batch_size=batch_size,
                          input_size=dim,
                          keep_prob=0.9,
                          is_train=self.is_training,
                          activation=tf.nn.tanh)
        seq_len = tf.reduce_sum(self.input_mask, axis=1)
        gru_output = gru_layer(self.all_sent, seq_len=seq_len)

        with tf.variable_scope("att"):
            all_seq_len = self.all_sent.get_shape().as_list()[1]
            cls = tf.tile(tf.expand_dims(self.mark0, axis=1),
                          [1, all_seq_len, 1])
            cat_att = tf.concat([cls, gru_output], axis=2)
            res = tf.layers.dense(cat_att, units=512, activation=tf.nn.relu)
            res = tf.layers.dense(res, units=1, use_bias=False)
            res_mask = tf.expand_dims(tf.cast(self.input_mask, tf.float32),
                                      axis=2)
            res = res - (1 - res_mask) * 10000.0
            alpha = tf.nn.softmax(res, 1)
            gru_vec = tf.reduce_sum(alpha * gru_output, axis=1)

        # gru_vec = dropout(gru_vec, self.dropout_rate)
        gru_vec = tf.layers.dense(gru_vec,
                                  768,
                                  activation=gelu,
                                  kernel_initializer=create_initializer(0.02))
        gru_vec = dropout(gru_vec, self.dropout_rate)
        gru_vec = layer_norm(gru_vec + self.mark0)
        gru_vec = tf.layers.dense(gru_vec,
                                  768,
                                  activation=tf.tanh,
                                  kernel_initializer=create_initializer(0.02))
        # gate = tf.layers.dense(tf.concat([gru_vec, self.mark0], axis=1),
        #                        rdim, activation=tf.sigmoid,
        #                        kernel_initializer=create_initializer(0.02))
        # with tf.variable_scope("merge"):
        #     # refer_output = self.mark0 * gate + (1 - gate) * gru_vec
        #     vec_cat = tf.concat([self.mark0, gru_vec], axis=1)
        #     vec_cat = dropout(vec_cat, self.dropout_rate)
        #     pooled_output = tf.layers.dense(vec_cat, 768,
        #                                     activation=tf.tanh,
        #                                     kernel_initializer=create_initializer(0.02))
        return gru_vec

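# BiGRU is assumed but not shown; a minimal sketch consistent with the call
# above. The constructor arguments and __call__ signature are inferred from
# the call site; the internals are an assumption (dropout/is_train handling
# is omitted here for brevity, so keep_prob and is_train are merely stored).
class BiGRU(object):
    def __init__(self, num_layers, num_units, batch_size, input_size,
                 keep_prob=1.0, is_train=None, activation=tf.tanh):
        self.num_layers = num_layers
        self.num_units = num_units
        self.keep_prob = keep_prob
        self.is_train = is_train
        self.activation = activation

    def __call__(self, inputs, seq_len):
        outputs = inputs
        for layer in range(self.num_layers):
            with tf.variable_scope("bigru_%d" % layer):
                fw = tf.nn.rnn_cell.GRUCell(self.num_units,
                                            activation=self.activation)
                bw = tf.nn.rnn_cell.GRUCell(self.num_units,
                                            activation=self.activation)
                (out_fw, out_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                    fw, bw, outputs, sequence_length=seq_len, dtype=tf.float32)
                outputs = tf.concat([out_fw, out_bw], axis=2)  # [B, T, 2*num_units]
        return outputs
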
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Downstream masked-LM head: returns the masked-LM loss and log probs.

    :param bert_config: BERT config describing the network structure
    :param input_tensor: input tensor [batch_size, seq_len, embedding_dim]
    :param output_weights: output weights (BERT's word-embedding table)
        [voc_size, embedding_dim]
    :param positions: masked positions [batch_size, masked_len]
    :param label_ids: masked labels [batch_size, masked_len]
    :param label_weights: mask weights [batch_size, masked_len]
    :return: overall loss; per-example loss [batch_size*masked_len];
        predicted log probs [batch_size*masked_len, voc_size]
    """
    # Gather the values at the masked positions, [batch_size * masked_len, embedding_dim].
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # One more non-linear transformation before the output layer; after
        # pre-training this mapping is no longer used.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(
                input_tensor)  # layer norm, [batch_size * masked_len, embedding_dim]

        # The output weights mirror the input embeddings; only an output bias is added.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(
            input_tensor, output_weights,
            transpose_b=True)  # [batch_size * masked_len, voc_size]
        logits = tf.nn.bias_add(
            logits, output_bias)  # [batch_size * masked_len, voc_size]
        log_probs = tf.nn.log_softmax(
            logits, axis=-1)  # [batch_size * masked_len, voc_size]

        label_ids = tf.reshape(label_ids, [-1])  # [batch_size * masked_len]
        label_weights = tf.reshape(label_weights,
                                   [-1])  # [batch_size * masked_len]
        # [batch_size * masked_len, voc_size]
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)
        # If fewer positions than the maximum were predicted, the batch is
        # padded up to the maximum; padded entries have weight 0, real ones 1.
        # Per-example loss, [batch_size * masked_len].
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        # Total loss weighted by label_weights.
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator  # average loss

    return (loss, per_example_loss, log_probs)

def add_embeddings(self):
    with tf.name_scope("embedding"):
        if self.is_Embedding_Needed:
            W = tf.get_variable(name="embeddings",
                                dtype="float32",
                                initializer=np.array(self.embeddings,
                                                     np.float32),
                                trainable=self.trainable)
        else:
            # I think we need to utilize a more fine-grained word embedding~
            W = tf.get_variable(
                name='word_embed',
                shape=[self.vocab_size, self.embedding_size],
                initializer=modeling.create_initializer(0.02),
                trainable=True)
        if 'adding_problem' not in self.dataset:
            self.embedding_W = W
            self.embedded_chars_q = tf.nn.embedding_lookup(
                self.embedding_W, self.question)
        else:
            # Map the 2-dim adding-problem inputs into a higher dimension.
            if self.embedding_size == 2:
                self.embedded_chars_q = self.question
            else:
                self.embedded_chars_q = tf.layers.dense(
                    self.question, self.embedding_size)
        print('embedded_chars_q:', self.embedded_chars_q)
        if 'adding_problem' not in self.dataset:
            self.embedded_chars_q = modeling.layer_norm(
                tf.nn.dropout(self.embedded_chars_q,
                              keep_prob=1.0 - self.input_dropout_prob))

        # The sigmoid squashes the learned bucket matrix into (0, 1).
        self.soft_t5_rd_bucket_mat = tf.sigmoid(
            tf.get_variable(
                't5_rd_bucket_mat',
                [2 * self.max_input_left, self.config.num_attention_heads],
                initializer=modeling.create_initializer(0.1),
                trainable=True))
        self.single_t5_att_bias = compute_bias(
            self.config.num_attention_heads,
            self.max_input_left,
            self.max_input_left,
            self.soft_t5_rd_bucket_mat,
            l1_width=self.config.l1_width,
            l2_width=self.config.l2_width,
            stddev=self.config.stddev,
            bidirectional=True)
        self.t5_att_bias = tf.tile(self.single_t5_att_bias,
                                   [tf.shape(self.question)[0], 1, 1, 1],
                                   name='t5_att_bias')
        print('[!!!--t5_bias:]', self.t5_att_bias)

def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    # input_tensor=model.get_sequence_output()
    # output_weights=model.get_embedding_table()
    # positions=masked_lm_positions
    # label_ids=masked_lm_ids
    # label_weights=masked_lm_weights
    # input_tensor is the final-layer output returned by the model,
    # [batch_size, seq_length, hidden_size].
    # output_weights is the word-embedding table, [vocab_size, embedding_size].
    """Get loss and log probs for the masked LM."""
    # Fetch the encoder outputs at `positions` (the positions to predict).
    input_tensor = gather_indexes(
        input_tensor, positions)  # [batch_size*max_pred_per_seq, hidden_size]

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            # A fully connected layer; output shape
            # [batch_size*max_pred_per_seq, hidden_size].
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        # output_weights is the embedding table, transposed for the matmul.
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        # [batch_size*max_pred_per_seq, vocab_size]: each masked token's score
        # over the whole vocabulary.
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)

def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM.

    Called as:
        (masked_lm_loss, masked_lm_example_loss,
         masked_lm_log_probs) = get_masked_lm_output(
             bert_config, model.get_sequence_output(),
             model.get_embedding_table(), masked_lm_positions,
             masked_lm_ids, masked_lm_weights)
    """
    # Positions masked out by BERT; the output is [bs*L, hidden_size].
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(
                input_tensor)  # normalize the gathered sequence states

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(
            input_tensor, output_weights,
            transpose_b=True)  # the word embedding acts as a label embedding
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])  # true ids of the masked tokens
        label_weights = tf.reshape(label_weights, [-1])
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        # The weights account for mask positions that were padded to fill the batch.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])  # shape = [bs*L]
        numerator = tf.reduce_sum(label_weights * per_example_loss)  # scalar
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)

def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        # input_tensor.shape = (160, 768); output_weights.shape = (21128 (vocab_size), 768)
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        # logits.shape = (160, 21128)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # label_ids.shape = (8, 20)
        label_ids = tf.reshape(label_ids, [-1])  # label_ids.shape = (160,)
        # label_weights.shape = (8, 20)
        label_weights = tf.reshape(label_weights, [-1])
        # label_weights are the mask weights; in this program they are all 1.
        # label_weights.shape = (160,)
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)
        # one_hot_labels.shape = (160, 21128): 160 tokens in total, each encoded
        # as a vocab_size one-hot vector in preparation for the loss below.

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)

def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    # [batch_size*label_size, dim]
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            # tf.layers.dense adds a single layer to the network; the `units`
            # argument is the number of neurons/nodes of the layer.
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token (equation 7).
        output_bias = tf.get_variable("output_bias",
                                      shape=[output_weights.shape[0]],
                                      initializer=tf.zeros_initializer())
        # Raw scores, not probabilities (they do not sum to 1).
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)  # (bs*label_size, vocab_size)
        # Log of softmax, used to compute the log likelihood (Eq. 8);
        # -1 indicates the last dimension.
        log_probs = tf.nn.log_softmax(logits, -1)

        label_ids = tf.reshape(label_ids, [-1])  # shape [-1] flattens into 1-D
        label_weights = tf.reshape(label_weights, [-1])
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=output_weights.shape[0],
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(
            log_probs * one_hot_labels,
            axis=[-1])  # loss per masked position in the sequence
        numerator = tf.reduce_sum(
            label_weights * per_example_loss)  # loss over the whole serialized sequence
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)

def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        # (These extra parameters exist only for pre-training and are dropped
        # for fine-tuning.)
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        # output_weights reuses the input word embedding, which is why it is
        # passed in; only an extra bias is added here.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # label_ids has length 20, the maximum number of masked tokens;
        # it stores the ids of the masked tokens.
        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(
            label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        # E.g. if only 18 tokens were actually masked, label_ids ends with two
        # 0s (padding) and label_weights = [1, 1, ..., 0, 0], marking the last
        # two label_ids as padding to be excluded from the loss.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)

def get_masked_lm_loss(model_config, seq_output, embedding_table, positions,
                       label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    sequence_shape = modeling.get_shape_list(seq_output, expected_rank=[3])
    batch_size = sequence_shape[0]
    seq_length = sequence_shape[1]
    width = sequence_shape[2]
    flat_offsets = tf.reshape(
        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence_tensor = tf.reshape(seq_output,
                                      [batch_size * seq_length, width])
    seq_output = tf.gather(flat_sequence_tensor, flat_positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                seq_output,
                units=model_config.embedding_size,
                activation=modeling.get_activation(model_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    model_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[model_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, embedding_table, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=model_config.vocab_size,
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return loss, per_example_loss, log_probs

def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    '''Get loss and log probs for the masked LM'''
    input_tensor = gather_indexes(input_tensor, positions)

    # with tf.variable_scope('cls/predictions'):
    with tf.name_scope('cls/predictions'):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        # with tf.variable_scope('transform'):
        with tf.name_scope('transform'):
            # input_tensor = tf.layers.dense(
            input_tensor = tf.keras.layers.Dense(
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))(input_tensor)
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        # output_bias = tf.get_variable(
        #     'output_bias',
        #     shape=[bert_config.vocab_size],
        #     initializer=tf.zeros_initializer()
        # )
        output_bias = tf.Variable(
            initial_value=tf.zeros([bert_config.vocab_size]),
            name='output_bias',
            shape=[bert_config.vocab_size],
            # initializer=tf.zeros_initializer()
        )
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)

def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    # Returns the transformer output vectors at all masked positions,
    # [batch*mask_length, hidden_size].
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            # [batch*mask_length, hidden_size]
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(
                    bert_config.hidden_act),  # gelu in the Chinese config
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        # [batch*mask_length, hidden_size] matmul [vocab_size, hidden_size]^T
        # -> [batch*mask_length, vocab_size]
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)  # [batch*mask_length, vocab_size]
        log_probs = tf.nn.log_softmax(
            logits, axis=-1)  # [batch*mask_length, vocab_size]

        label_ids = tf.reshape(label_ids, [-1])  # [batch*mask_length]
        label_weights = tf.reshape(label_weights, [-1])  # [batch*mask_length]
        one_hot_labels = tf.one_hot(
            label_ids, depth=bert_config.vocab_size,
            dtype=tf.float32)  # [batch*mask_length, vocab_size]

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])  # [batch*mask_length]
        numerator = tf.reduce_sum(
            label_weights * per_example_loss)  # scalar masked-LM loss
        # The epsilon keeps the denominator non-zero when every weight is 0.
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator  # average loss

    return (loss, per_example_loss, log_probs)

def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)

        if bert_config.loss == "original":
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            # The `positions` tensor might be zero-padded (if the sequence is
            # too short to have the maximum number of predictions). The
            # `label_weights` tensor has a value of 1.0 for every real
            # prediction and 0.0 for the padding predictions.
            per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                              axis=[-1])
            numerator = tf.reduce_sum(label_weights * per_example_loss)
            denominator = tf.reduce_sum(label_weights) + 1e-5
            loss = tf.identity(numerator / denominator, name="loss")
        elif bert_config.loss == "focal":
            # Note: despite the name, `log_probs` holds plain softmax
            # probabilities p in this branch; it implements focal loss with
            # gamma = 2 and no alpha balancing: FL(p) = -(1 - p)^2 * log(p).
            log_probs = tf.nn.softmax(logits, axis=-1)
            per_example_loss = -tf.reduce_sum(
                one_hot_labels * ((1 - log_probs)**2) * tf.log(log_probs),
                axis=[-1])
            numerator = tf.reduce_sum(label_weights * per_example_loss)
            denominator = tf.reduce_sum(label_weights) + 1e-5
            loss = tf.identity(numerator / denominator, name="loss")

    return (loss, per_example_loss, log_probs)

def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        # Note: output_weights is the embedding table from modeling, with shape
        # [vocab_size, hidden_size]; it only needs transposing for the matmul,
        # so the second dimension of the resulting logits is vocab_size.
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        # Note that softmax and log are fused into one op here.
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)

        # The comment below is the key to understanding label_weights:
        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        # per_example_loss is -log P, where P is the probability assigned to the label.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        # label_weights zeroes out the loss at padded mask positions.
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)

def add_embeddings(self):
    with tf.name_scope("embedding"):
        if self.is_Embedding_Needed:
            W = tf.Variable(np.array(self.embeddings),
                            name="word_embed",
                            dtype="float32",
                            trainable=self.trainable)
        else:
            W = tf.get_variable(
                name='word_embed',
                shape=[self.vocab_size, self.embedding_size],
                initializer=modeling.create_initializer(0.02),
                trainable=True)
        if 'adding_problem' not in self.dataset:
            self.embedding_W = W
            self.embedded_chars_q = tf.nn.embedding_lookup(
                self.embedding_W, self.question)
        else:
            # Map the 2-dim adding-problem inputs into a higher dimension.
            if self.embedding_size == 2:
                self.embedded_chars_q = self.question
            else:
                self.embedded_chars_q = tf.layers.dense(
                    self.question, self.embedding_size)
        print('embedded_chars_q:', self.embedded_chars_q)
        if 'adding_problem' not in self.dataset:
            self.embedded_chars_q = modeling.layer_norm(
                tf.nn.dropout(self.embedded_chars_q,
                              keep_prob=1.0 - self.input_dropout_prob))

        context_position = tf.range(self.max_input_left, dtype=tf.int32)[:, None]
        memory_position = tf.range(self.max_input_left, dtype=tf.int32)[None, :]
        relative_position = (memory_position - context_position +
                             self.max_input_left)
        # Why is this embedding so sensitive...
        self.t5_pos_embedding = tf.get_variable(
            't5_pos_mat',
            [2 * self.max_input_left, self.config.num_attention_heads],
            initializer=modeling.create_initializer(0.02),
            trainable=True)
        self.single_t5_att_bias = compute_bias(relative_position,
                                               self.t5_pos_embedding)
        # [batch, num_heads, query_length, memory_length]
        self.t5_att_bias = tf.tile(self.single_t5_att_bias,
                                   [tf.shape(self.question)[0], 1, 1, 1])
        print('t5_bias:', self.t5_att_bias)

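# compute_bias is assumed but not shown; a minimal sketch matching the
# two-argument call directly above (a differently parameterized variant
# appears in an earlier snippet). It follows the T5 relative-position-bias
# pattern: gather one learned scalar per head for each relative offset and
# broadcast as [1, num_heads, query_length, memory_length]. The internals are
# an assumption, not taken from the source.
def compute_bias(relative_position, relative_embedding):
    # relative_position: [query_len, memory_len] int32 indices in [0, 2*max_len)
    # relative_embedding: [2*max_len, num_heads] learned bias table
    values = tf.gather(relative_embedding, relative_position)  # [q, m, heads]
    values = tf.transpose(values, [2, 0, 1])                   # [heads, q, m]
    return tf.expand_dims(values, axis=0)                      # [1, heads, q, m]
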
def get_hydrophobicity_output(bert_config, input_tensor, positions,
                              label_hydrophobicities, label_weights, k=3,
                              log=False):
    """Get loss and log probs for the hydrophobicity prediction."""
    input_tensor = gather_indexes(input_tensor, positions)
    hydrophobicity_range = 155 * k + 1

    with tf.variable_scope("cls/hydrophobicity"):
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        output_weights = tf.get_variable(
            "output_weights",
            shape=[hydrophobicity_range, bert_config.hidden_size],
            initializer=modeling.create_initializer(
                bert_config.initializer_range))
        output_bias = tf.get_variable("output_bias",
                                      shape=[hydrophobicity_range],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_hydrophobicities = tf.reshape(label_hydrophobicities, [-1])
        label_weights = tf.reshape(label_weights, [-1])
        one_hot_labels = tf.one_hot(label_hydrophobicities,
                                    depth=hydrophobicity_range,
                                    dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)

def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # The weight matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_dims,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_std))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # a bias vector for the output of each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        # log_softmax applies the logarithm after softmax:
        #   softmax:     exp(x_i) / exp(x).sum()
        #   log_softmax: log( exp(x_i) / exp(x).sum() )
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)

        # The `label_weights` tensor has a value of 1.0 for real predictions
        # and 0.0 for the padding predictions. Negating the summed log prob
        # gives the negative log likelihood per example.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        # Compute the loss only from the real predictions.
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)

def get_lm_output(config, input_tensor, output_weights, label_ids, label_mask):
    """Get loss and log probs for the LM."""
    input_shape = modeling.get_shape_list(input_tensor, expected_rank=3)
    input_tensor = tf.reshape(
        input_tensor, [input_shape[0] * input_shape[1], input_shape[2]])

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=config.hidden_size,
                activation=modeling.get_activation(config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=config.vocab_size,
                                    dtype=tf.float32)

        # Every position of the flattened sequence contributes a loss term;
        # `label_mask` (cast to float) zeroes out the positions that should
        # not be predicted.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        label_mask = tf.reshape(label_mask, [input_shape[0] * input_shape[1]])
        loss_mask = tf.dtypes.cast(label_mask, tf.float32)
        per_example_loss = tf.math.multiply(per_example_loss, loss_mask)
        loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, log_probs)

def bert_crf(bert_config, is_training, input_ids, segment_ids, input_mask,
             label_ids, sequence_length, num_labels, use_one_hot_embeddings):
    batch_size = tf.shape(input_ids)[0]
    bert_out = bert(bert_config, is_training, input_ids, input_mask,
                    segment_ids, use_one_hot_embeddings)
    # hidden_size = tf.shape(bert_out)[-1]
    hidden_size = 768
    if is_training:
        bert_out = layer_norm_and_dropout(bert_out, 0.5)
    else:
        bert_out = layer_norm(bert_out)
    bert_out = tf.reshape(bert_out, [-1, hidden_size])
    linear_out = linear_layer(bert_out, hidden_size, num_labels, "linear")
    # max_seq_length is expected to be defined at module scope (e.g. derived
    # from FLAGS); it is the static maximum sequence length used to reshape
    # the logits inside crf_layer.
    crf_out = crf_layer(linear_out, label_ids, batch_size, sequence_length,
                        num_labels, max_seq_length, "crf")
    return crf_out

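# linear_layer and crf_layer are assumed but not shown; a minimal sketch built
# on tf.contrib.crf (TF 1.x). The names, argument order, and return values are
# assumptions inferred from the call sites above.
def linear_layer(inputs, hidden_size, num_labels, scope):
    with tf.variable_scope(scope):
        w = tf.get_variable("w", [hidden_size, num_labels],
                            initializer=tf.truncated_normal_initializer(stddev=0.02))
        b = tf.get_variable("b", [num_labels],
                            initializer=tf.zeros_initializer())
        return tf.nn.xw_plus_b(inputs, w, b)

def crf_layer(logits, label_ids, batch_size, sequence_length, num_labels,
              max_seq_length, scope):
    with tf.variable_scope(scope):
        # Restore the [batch, time, labels] layout flattened by linear_layer.
        logits = tf.reshape(logits, [batch_size, max_seq_length, num_labels])
        trans = tf.get_variable("transitions", [num_labels, num_labels])
        log_likelihood, trans = tf.contrib.crf.crf_log_likelihood(
            inputs=logits, tag_indices=label_ids,
            transition_params=trans, sequence_lengths=sequence_length)
        pred_ids, _ = tf.contrib.crf.crf_decode(logits, trans, sequence_length)
        loss = -tf.reduce_mean(log_likelihood)
        return loss, pred_ids
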
def get_mlm_logits(input_tensor, albert_config, mlm_positions, output_weights):
    input_tensor = gather_indexes(input_tensor, mlm_positions)
    with tf.variable_scope("cls/predictions"):
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=albert_config.embedding_size,
                activation=modeling.get_activation(albert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    albert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)
        output_bias = tf.get_variable("output_bias",
                                      shape=[albert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
    return logits

def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])
        one_hot_labels = tf.one_hot(
            label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)

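# For context, this is how the head above is wired up in BERT's
# run_pretraining.py; `model`, `bert_config` and the masked_lm_* tensors come
# from the surrounding model-building code, which is not shown here:
# (masked_lm_loss, masked_lm_example_loss,
#  masked_lm_log_probs) = get_masked_lm_output(
#      bert_config, model.get_sequence_output(), model.get_embedding_table(),
#      masked_lm_positions, masked_lm_ids, masked_lm_weights)
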