def get_masked_lm_output(albert_config, input_tensor, output_weights,
                         positions, label_ids, label_weights):
    """Build the masked-LM head and return its weighted cross-entropy loss.

    The states picked out by `gather_indexes(input_tensor, positions)` go
    through a dense + layer-norm transform, are multiplied by the transpose
    of `output_weights`, receive a per-token bias and are scored against
    `label_ids`.

    Args:
        albert_config: config object; `embedding_size`, `hidden_act`,
            `initializer_range` and `vocab_size` are read.
        input_tensor: encoder output passed to `gather_indexes`.
        output_weights: projection matrix (used transposed); per the
            original note it is the input embedding table.
        positions: indices passed to `gather_indexes`.
        label_ids: target ids, flattened to rank 1 here.
        label_weights: per-prediction weights, flattened to rank 1 here.

    Returns:
        Tuple `(loss, per_example_loss, log_probs)`.
    """
    masked_states = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # One extra non-linear projection ahead of the output layer; the
        # original note says this matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            activation_fn = modeling.get_activation(albert_config.hidden_act)
            kernel_init = modeling.create_initializer(
                albert_config.initializer_range)
            masked_states = tf.layers.dense(
                masked_states,
                units=albert_config.embedding_size,
                activation=activation_fn,
                kernel_initializer=kernel_init)
            masked_states = modeling.layer_norm(masked_states)

        # Only the bias is created here; the projection weights are the
        # caller-supplied `output_weights`.
        vocab_size = albert_config.vocab_size
        output_bias = tf.get_variable(
            "output_bias",
            shape=[vocab_size],
            initializer=tf.zeros_initializer())
        projected = tf.matmul(masked_states, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(projected, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        flat_label_ids = tf.reshape(label_ids, [-1])
        flat_label_weights = tf.reshape(label_weights, [-1])
        one_hot_labels = tf.one_hot(
            flat_label_ids, depth=vocab_size, dtype=tf.float32)

        # `positions` may be zero-padded when a sequence has fewer than the
        # maximum number of predictions; `label_weights` is 1.0 for real
        # predictions and 0.0 for padding, so padding drops out of the mean.
        picked_log_probs = tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        per_example_loss = -picked_log_probs
        weighted_sum = tf.reduce_sum(flat_label_weights * per_example_loss)
        weight_total = tf.reduce_sum(flat_label_weights) + 1e-5
        loss = weighted_sum / weight_total

    return (loss, per_example_loss, log_probs)
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Compute the masked-LM loss, per-prediction loss and log-probabilities.

    Args:
        bert_config: config object; `hidden_size`, `hidden_act`,
            `initializer_range` and `vocab_size` are read.
        input_tensor: encoder output passed to `gather_indexes`.
        output_weights: matrix the transformed states are multiplied with
            (transposed); per the original note, the input embeddings.
        positions: indices passed to `gather_indexes`.
        label_ids: target ids; reshaped to rank 1.
        label_weights: per-prediction weights; reshaped to rank 1.

    Returns:
        `(loss, per_example_loss, log_probs)` where `loss` is the
        weight-normalised sum of `per_example_loss`.
    """
    transformed = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # Extra non-linear transformation before the output layer (noted in
        # the original as unused after pre-training).
        with tf.variable_scope("transform"):
            transform_act = modeling.get_activation(bert_config.hidden_act)
            transform_init = modeling.create_initializer(
                bert_config.initializer_range)
            transformed = modeling.layer_norm(
                tf.layers.dense(
                    transformed,
                    units=bert_config.hidden_size,
                    activation=transform_act,
                    kernel_initializer=transform_init))

        # The output layer reuses `output_weights`; only a per-token bias is
        # created here.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(transformed, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        target_ids = tf.reshape(label_ids, [-1])
        target_weights = tf.reshape(label_weights, [-1])
        targets_one_hot = tf.one_hot(
            target_ids, depth=bert_config.vocab_size, dtype=tf.float32)

        # Padding predictions (from zero-padded `positions`) carry weight
        # 0.0 and real predictions weight 1.0, per the original note.
        per_example_loss = -tf.reduce_sum(
            log_probs * targets_one_hot, axis=[-1])
        total_weighted_loss = tf.reduce_sum(target_weights * per_example_loss)
        total_weight = tf.reduce_sum(target_weights) + 1e-5
        loss = total_weighted_loss / total_weight

    return (loss, per_example_loss, log_probs)
def __init__(self, config, input_embedding, attention_mask):
    """Run the transformer encoder and keep the last layer's output.

    Args:
        config: object providing `hidden_size`, `num_hidden_layers`,
            `num_attention_heads`, `intermediate_size`, `hidden_act`,
            `hidden_dropout_prob`, `attention_probs_dropout_prob` and
            `initializer_range`.
        input_embedding: tensor passed as `input_tensor` to
            `modeling.transformer_model`.
        attention_mask: mask forwarded unchanged to the encoder.

    Sets:
        self.sequence_output: last element of the list of encoder layers.
    """
    # The "bert"/"encoder" scopes keep variable names the same as BERT.
    with tf.variable_scope("bert"):
        with tf.variable_scope("encoder"):
            encoder_kwargs = dict(
                input_tensor=input_embedding,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(
                    config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    config.attention_probs_dropout_prob),
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
            encoder_layers = modeling.transformer_model(**encoder_kwargs)

    self.sequence_output = encoder_layers[-1]
def get_mlm_output(input_tensor, albert_config, mlm_positions, output_weights,
                   label_ids, label_weights):
    """Build the masked-LM head and return its loss (from run_pretraining.py).

    Args:
        input_tensor: encoder output passed to `gather_indexes`.
        albert_config: config object; `embedding_size`, `hidden_act`,
            `initializer_range` and `vocab_size` are read.
        mlm_positions: indices passed to `gather_indexes`.
        output_weights: projection matrix (used transposed); per the
            original note it is the input embedding table.
        label_ids: target ids; flattened to rank 1.
        label_weights: per-prediction weights; flattened to rank 1.

    Returns:
        Tuple `(loss, per_example_loss)`.
    """
    input_tensor = gather_indexes(input_tensor, mlm_positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output
        # layer. This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=albert_config.embedding_size,
                activation=modeling.get_activation(albert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    albert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there
        # is an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[albert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        # Flattened to rank 1 so it lines up element-wise with
        # `per_example_loss` instead of relying on [1, N] x [N] broadcasting;
        # the two reductions below give the same values either way.
        label_weights = tf.reshape(label_weights, [-1])
        one_hot_labels = tf.one_hot(
            label_ids, depth=albert_config.vocab_size, dtype=tf.float32)

        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        # Small epsilon keeps the division finite when all weights are zero.
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    # The arg-max prediction ops that used to be built here were never
    # returned or otherwise used, so they are no longer added to the graph.
    return loss, per_example_loss
def get_solubility_output(bert_config, input_tensor, positions,
                          label_solubilities, label_weights, k=3, log=False):
    """Build a classification head over `100*k + 1` solubility classes.

    Args:
        bert_config: config object; `hidden_size`, `hidden_act` and
            `initializer_range` are read.
        input_tensor: encoder output passed to `gather_indexes`.
        positions: indices passed to `gather_indexes`.
        label_solubilities: integer class labels; flattened to rank 1.
        label_weights: per-prediction weights; flattened to rank 1.
        k: sets the number of output classes to `100*k + 1`.
        log: accepted for interface compatibility; not read by this function.

    Returns:
        Tuple `(loss, per_example_loss, log_probs)`.
    """
    features = gather_indexes(input_tensor, positions)
    num_classes = 100*k + 1

    with tf.variable_scope("cls/solubility"):
        with tf.variable_scope("transform"):
            act_fn = modeling.get_activation(bert_config.hidden_act)
            dense_init = modeling.create_initializer(
                bert_config.initializer_range)
            features = tf.layers.dense(
                features,
                units=bert_config.hidden_size,
                activation=act_fn,
                kernel_initializer=dense_init)
            features = modeling.layer_norm(features)

        # Unlike the masked-LM heads, this head owns its projection matrix.
        output_weights = tf.get_variable(
            "output_weights",
            shape=[num_classes, bert_config.hidden_size],
            initializer=modeling.create_initializer(
                bert_config.initializer_range))
        output_bias = tf.get_variable(
            "output_bias",
            shape=[num_classes],
            initializer=tf.zeros_initializer())

        class_scores = tf.matmul(features, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(class_scores, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        flat_labels = tf.reshape(label_solubilities, [-1])
        flat_weights = tf.reshape(label_weights, [-1])
        one_hot_labels = tf.one_hot(
            flat_labels, depth=num_classes, dtype=tf.float32)

        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        weighted_loss_sum = tf.reduce_sum(flat_weights * per_example_loss)
        weight_sum = tf.reduce_sum(flat_weights) + 1e-5
        loss = weighted_loss_sum / weight_sum

    return (loss, per_example_loss, log_probs)
def __init__(self, config, tf_dtype, input_hidden, embedding_table):
    """Build the LM prediction head on top of `input_hidden`.

    Args:
        config: object providing `hidden_size`, `hidden_act`,
            `initializer_range` and `vocab_size`.
        tf_dtype: dtype used for the output bias variable.
        input_hidden: tensor fed to the dense transform.
        embedding_table: matrix whose transpose projects the transformed
            states to vocabulary scores.

    Sets:
        self.transformed_output: dense + layer-norm output.
        self.final_output: vocabulary scores plus bias.
        self.probs: softmax of `self.final_output` (op name 'token_probs').
    """
    # Scope names keep variable names the same as BERT.
    # NOTE(review): the source's indentation was lost; the nesting below is
    # reconstructed so that the bias lands in "cls/predictions" — confirm.
    with tf.variable_scope("cls"):
        with tf.variable_scope("predictions"):
            with tf.variable_scope("transform"):
                act_fn = modeling.get_activation(config.hidden_act)
                dense_init = modeling.create_initializer(
                    config.initializer_range)
                dense_out = tf.layers.dense(
                    input_hidden,
                    config.hidden_size,
                    activation=act_fn,
                    kernel_initializer=dense_init)
                self.transformed_output = modeling.layer_norm(dense_out)

            bias_init_value = tf.zeros([config.vocab_size], dtype=tf_dtype)
            output_bias = tf.Variable(
                bias_init_value, name="output_bias", dtype=tf_dtype)

            vocab_scores = tf.matmul(
                self.transformed_output, tf.transpose(embedding_table))
            self.final_output = tf.add(vocab_scores, output_bias)
            self.probs = tf.nn.softmax(self.final_output, name='token_probs')
def get_mlm_logits(input_tensor, albert_config, mlm_positions, output_weights):
    """Return masked-LM logits for the gathered positions.

    Adapted from run_pretraining.py (per the original docstring).

    Args:
        input_tensor: encoder output passed to `gather_indexes`.
        albert_config: config object; `embedding_size`, `hidden_act`,
            `initializer_range` and `vocab_size` are read.
        mlm_positions: indices passed to `gather_indexes`.
        output_weights: projection matrix (used transposed); per the
            original note it is the input embedding table.

    Returns:
        The biased projection of the transformed states onto
        `output_weights`.
    """
    hidden = gather_indexes(input_tensor, mlm_positions)

    with tf.variable_scope("cls/predictions"):
        # Extra non-linear transformation before the output layer; the
        # original note says it is not used after pre-training.
        with tf.variable_scope("transform"):
            act_fn = modeling.get_activation(albert_config.hidden_act)
            init_fn = modeling.create_initializer(
                albert_config.initializer_range)
            hidden = tf.layers.dense(
                hidden,
                units=albert_config.embedding_size,
                activation=act_fn,
                kernel_initializer=init_fn)
            hidden = modeling.layer_norm(hidden)

        # Output-only bias, one entry per vocabulary token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[albert_config.vocab_size],
            initializer=tf.zeros_initializer())
        unbiased = tf.matmul(hidden, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(unbiased, output_bias)

    return logits
def get_shuffle_loss(model_config, seq_output, label_ids, label_weights):
    """Build the "cls/shuffle" head and return its weighted loss.

    Every row of the flattened sequence output is mapped to `seq_length`
    scores (dense + layer norm + bias) and scored against `label_ids` with
    cross-entropy.

    Args:
        model_config: config object; `hidden_act` and `initializer_range`
            are read.
        seq_output: rank-3 tensor (rank enforced via
            `modeling.get_shape_list(..., expected_rank=[3])`).
        label_ids: integer targets; flattened to rank 1.
        label_weights: per-row weights; cast to float32 and flattened.

    Returns:
        Tuple `(loss, per_example_loss, log_probs)`.
    """
    _, seq_length, width = modeling.get_shape_list(
        seq_output, expected_rank=[3])
    flat_output = tf.reshape(seq_output, [-1, width])

    with tf.variable_scope("cls/shuffle"):
        with tf.variable_scope("transform"):
            act_fn = modeling.get_activation(model_config.hidden_act)
            init_fn = modeling.create_initializer(
                model_config.initializer_range)
            flat_output = tf.layers.dense(
                flat_output,
                units=seq_length,
                activation=act_fn,
                kernel_initializer=init_fn)
            flat_output = modeling.layer_norm(flat_output)

        output_bias = tf.get_variable(
            "output_bias",
            shape=[seq_length],
            initializer=tf.zeros_initializer())
        logits = tf.nn.bias_add(flat_output, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        flat_labels = tf.reshape(label_ids, [-1])
        float_weights = tf.cast(label_weights, tf.float32)
        flat_weights = tf.reshape(float_weights, [-1])
        one_hot_labels = tf.one_hot(
            flat_labels, depth=seq_length, dtype=tf.float32)

        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        weighted_loss_sum = tf.reduce_sum(flat_weights * per_example_loss)
        weight_sum = tf.reduce_sum(flat_weights) + 1e-5
        loss = weighted_loss_sum / weight_sum

    return loss, per_example_loss, log_probs
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids):
    """Return the unweighted masked-LM loss for each predicted position.

    Args:
        bert_config: config object; `hidden_size`, `hidden_act`,
            `initializer_range` and `vocab_size` are read.
        input_tensor: encoder output passed to `gather_indexes`.
        output_weights: projection matrix (used transposed); per the
            original note it is the input embedding table.
        positions: indices passed to `gather_indexes`; its second dimension
            sets the width of the returned tensor.
        label_ids: target ids; flattened to rank 1.

    Returns:
        The per-prediction cross-entropy reshaped to
        `[-1, tf.shape(positions)[1]]`.
    """
    hidden = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # Extra non-linear transformation before the output layer; the
        # original note says it is not used after pre-training.
        with tf.variable_scope("transform"):
            act_fn = modeling.get_activation(bert_config.hidden_act)
            init_fn = modeling.create_initializer(
                bert_config.initializer_range)
            hidden = tf.layers.dense(
                hidden,
                units=bert_config.hidden_size,
                activation=act_fn,
                kernel_initializer=init_fn)
            hidden = modeling.layer_norm(hidden)

        # Output-only bias, one entry per vocabulary token.
        vocab_size = bert_config.vocab_size
        output_bias = tf.get_variable(
            "output_bias",
            shape=[vocab_size],
            initializer=tf.zeros_initializer())
        scores = tf.matmul(hidden, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(scores, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        flat_label_ids = tf.reshape(label_ids, [-1])
        one_hot_labels = tf.one_hot(
            flat_label_ids, depth=vocab_size, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])

        predictions_per_row = tf.shape(positions)[1]
        loss = tf.reshape(per_example_loss, [-1, predictions_per_row])
        # TODO: dynamic gather from per_example_loss

    return loss
def get_logits(bert_config, input_tensor, output_weights, positions):
    """Return masked-LM logits, widening the projection when sizes differ.

    When `hidden_size` differs from `embedding_size`, an extra trainable
    matrix of shape `[vocab_size, hidden_size - embedding_size]` is
    concatenated to `output_weights` along axis 1 before the projection.

    Args:
        bert_config: config object; `hidden_size`, `embedding_size`,
            `hidden_act`, `initializer_range` and `vocab_size` are read.
        input_tensor: encoder output passed to `gather_indexes`.
        output_weights: projection matrix (used transposed); per the
            original note it is the input embedding table.
        positions: indices passed to `gather_indexes`.

    Returns:
        The logits tensor.
    """
    hidden = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # Extra non-linear transformation before the output layer; the
        # original note says it is not used after pre-training.
        with tf.variable_scope("transform"):
            act_fn = modeling.get_activation(bert_config.hidden_act)
            init_fn = modeling.create_initializer(
                bert_config.initializer_range)
            hidden = tf.layers.dense(
                hidden,
                units=bert_config.hidden_size,
                activation=act_fn,
                kernel_initializer=init_fn)
            hidden = modeling.layer_norm(hidden)

        # Output-only bias, one entry per vocabulary token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())

        projection = output_weights
        if bert_config.hidden_size != bert_config.embedding_size:
            extra_width = bert_config.hidden_size - bert_config.embedding_size
            extra_output_weights = tf.get_variable(
                name="extra_output_weights",
                shape=[bert_config.vocab_size, extra_width],
                initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            projection = tf.concat(
                [output_weights, extra_output_weights], axis=1)

        logits = tf.matmul(hidden, projection, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

    return logits
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions):
    """Return masked-LM log-probabilities for the gathered positions.

    Args:
        bert_config: config object; `hidden_size`, `hidden_act`,
            `initializer_range` and `vocab_size` are read.
        input_tensor: encoder output passed to `gather_indexes`.
        output_weights: projection matrix (used transposed).
        positions: indices passed to `gather_indexes`.

    Returns:
        `log_softmax` over the last axis of the biased projection.
    """
    hidden = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # Extra non-linear transformation before the output layer; the
        # original note says it is not used after pre-training.
        with tf.variable_scope("transform"):
            act_fn = modeling.get_activation(bert_config.hidden_act)
            init_fn = modeling.create_initializer(
                bert_config.initializer_range)
            dense_out = tf.layers.dense(
                hidden,
                units=bert_config.hidden_size,
                activation=act_fn,
                kernel_initializer=init_fn)
            hidden = modeling.layer_norm(dense_out)

        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        scores = tf.matmul(hidden, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(scores, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

    return log_probs
def feed_neural_work(self):
    """Build the encoder, pooling, output layer, loss and accuracy ops.

    Reads the model inputs and hyper-parameters from `self` and stores the
    resulting tensors back on `self` (`all_encoder_layers`, `context_bias`,
    `sequence_output`, `pooled_output`, `scores`, `predictions`, `l2_loss`,
    `loss`, `accuracy`).

    The earlier docstring listed the `transformer_model` parameters as:
    input_tensor, attention_mask=None, hidden_size=768,
    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072,
    intermediate_act_fn=gelu, hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1, initializer_range=0.02,
    do_return_all_layers=False.

    Raises:
        ValueError: if `self.transformer_ret_pooling` is not "mean", "last"
            or "max". (Previously this printed a message and called
            `exit(0)`, which reported success to the calling shell.)
    """
    # NOTE(review): the source's indentation was lost, so the extent of the
    # "pooler" variable scope and of the name scopes below is a best-effort
    # reconstruction — confirm against checkpoints before relying on the
    # resulting variable names.

    # `sequence_output` shape = [batch_size, seq_length, hidden_size].
    self.all_encoder_layers, self.context_bias = modeling.transformer_model(
        self.embedded_chars_q,
        attention_mask=self.attention_mask,
        hidden_size=self.config.hidden_size,
        num_hidden_layers=self.config.num_hidden_layers,
        num_attention_heads=self.config.num_attention_heads,
        intermediate_size=self.config.intermediate_size,
        intermediate_act_fn=modeling.get_activation(self.config.hidden_act),
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        initializer_range=self.config.initializer_range,
        do_return_all_layers=True,
        t5_relative_bias=self.t5_att_bias)
    self.sequence_output = self.all_encoder_layers[-1]

    # The "pooler" converts the encoded sequence tensor of shape
    # [batch_size, seq_length, hidden_size] to a tensor of shape
    # [batch_size, hidden_size], giving a fixed-size representation for
    # segment-level classification.
    with tf.variable_scope("pooler"):
        if self.transformer_ret_pooling == "mean":
            print('self.seq_lent:', self.seq_lent)
            print('tf.reduce_sum(self.sequence_output,axis=1):',
                  tf.reduce_sum(self.sequence_output, axis=1))
            # presumably `seq_lent` holds reciprocal sequence lengths so the
            # product is a mean — TODO confirm against the caller.
            self.pooled_output = tf.reduce_sum(
                self.sequence_output, axis=1) * self.seq_lent
        elif self.transformer_ret_pooling == "last":
            self.pooled_output = self.sequence_output[:, -1, :]
        elif self.transformer_ret_pooling == "max":
            self.pooled_output = tf.reduce_max(self.sequence_output, axis=1)
        else:
            # A misconfiguration must fail loudly with a non-zero status.
            raise ValueError('wrong transformer_ret_pooling: {}'.format(
                self.transformer_ret_pooling))

        if 'adding_problem' not in self.dataset:
            # Dropout followed by layer norm on the pooled output.
            self.pooled_output = modeling.layer_norm(
                tf.nn.dropout(self.pooled_output,
                              keep_prob=1.0 - self.input_dropout_prob))

    # Final (unnormalized) scores and predictions
    with tf.name_scope("output"):
        W = tf.get_variable(
            "W",
            shape=[self.config.hidden_size, self.max_input_right],
            initializer=initializer())
        b = tf.Variable(tf.constant(0.1, shape=[self.max_input_right]),
                        name="b")
        l2_loss = tf.constant(0.0)
        l2_loss += tf.nn.l2_loss(W)
        self.scores = tf.nn.xw_plus_b(self.pooled_output, W, b, name="scores")
        print(self.scores)
        self.predictions = tf.argmax(self.scores, 1, name="predictions")

    if 'adding_problem' not in self.dataset:
        # Classification: mean softmax cross-entropy plus scaled L2 term.
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.scores, labels=self.input_y)
            self.l2_loss = l2_loss * self.l2_reg_lambda
            self.loss = tf.reduce_mean(losses) + self.l2_loss

        # Accuracy: arg-max of the scores vs. arg-max of the labels.
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions,
                                           tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(
                tf.cast(correct_predictions, "float"), name="accuracy")
    else:
        # "adding_problem": regression on the first score column.
        with tf.name_scope("loss"):
            losses = tf.nn.l2_loss(
                self.scores - tf.expand_dims(self.input_y, -1))
            print('losses:', losses)
            self.l2_loss = self.l2_reg_lambda * l2_loss
            self.loss = tf.reduce_mean(losses) + self.l2_loss * 1e-3

        # A prediction counts as correct when within 0.04 of the target.
        with tf.name_scope("accuracy"):
            correct_predictions = tf.less_equal(
                tf.abs(self.scores[:, 0] - self.input_y),
                tf.constant([0.04]))
            self.accuracy = tf.reduce_mean(
                tf.cast(correct_predictions, "float"), name="accuracy")
def gec_create_model(bert_config, is_training, input_sequence, input_mask,
                     segment_ids, edit_sequence, use_one_hot_embeddings, mode,
                     copy_weight, use_bert_more, insert_ids,
                     multitoken_insert_ids,
                     subtract_replaced_from_replacement):
    """Creates the per-token edit classification model.

    Each input token is classified into one of `num_labels` edits. With
    `use_bert_more` the logits are the sum of three terms (the comments
    below refer to them as the terms of eq. 3 in the paper); otherwise only
    the first, edit-specific term is used.

    Args:
        bert_config: BERT config; passed to the model constructor, and
            `hidden_size`, `hidden_act`, `initializer_range` and
            `vocab_size` are read here.
        is_training: passed to the model; when true, dropout with
            keep_prob 0.9 is applied to the edit representation.
        input_sequence: rank-2 tensor of input ids (rank is asserted via
            `modeling.get_shape_list(input_sequence,2)`).
        input_mask: passed to the model and multiplied into the per-token
            loss.
        segment_ids: passed to the model as `token_type_ids`.
        edit_sequence: integer edit labels, one-hot encoded over
            `num_labels`.
        use_one_hot_embeddings: passed to the model and to the edit
            embedding look-ups.
        mode: not read by this function.
        copy_weight: loss weight applied to tokens whose label is 3 (copy);
            all other tokens get weight 1.
        use_bert_more: if true, uses `modified_modeling.BertModel` and adds
            the in-place and append/replace logit terms.
        insert_ids: word ids of unigram inserts (list).
        multitoken_insert_ids: word ids of bigram inserts (list of tuples
            of length 2).
        subtract_replaced_from_replacement: if true, replace logits come
            from `replacement_minus_replaced_logits` instead of a plain
            matmul.

    Returns:
        Tuple `(loss, per_example_loss, logits, probs)`.
    """
    # NOTE(review): the source's indentation was lost; the block nesting
    # below (in particular what sits inside `if use_bert_more:` and inside
    # the "loss" scope) is a best-effort reconstruction — confirm.

    # Defining the space of all possible edits:
    # unk, sos and eos are dummy edits mapped to 0, 1 and 2 respectively
    # copy is mapped to 3
    # del is mapped to 4
    num_appends = len(insert_ids) + len(multitoken_insert_ids)
    num_replaces = num_appends # appends and replacements come from the same set (inserts and multitoken_inserts)
    append_begin = 5 # First append edit (mapped to 5)
    append_end = append_begin + num_appends - 1 #Last append edit
    rep_begin = append_end + 1 # First replace edit
    rep_end = rep_begin + num_replaces - 1 #Last replace edit
    num_suffix_transforms = 58 #num of transformation edits
    num_labels = 5 + num_appends + num_replaces + num_suffix_transforms # total number of edits
    print("************ num of labels : {} ***************".format(num_labels))

    config = bert_config
    input_sequence_shape = modeling.get_shape_list(input_sequence,2)
    batch_size = input_sequence_shape[0]
    seq_len = input_sequence_shape[1]

    if not use_bert_more: #default use of bert (without logit factorisation)
        model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_sequence,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)
        output_layer = model.get_sequence_output()
    else: # LOGIT FACTORISATION is On!
        model = modified_modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_sequence,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)
        output_layer = model.get_sequence_output()
        # The sequence output is sliced into three consecutive chunks of
        # length seq_len along axis 1: tokens, replace slots, append slots.
        replace_layer = output_layer[:,seq_len:2*seq_len,:] #representation of replacement slots as described in paper
        append_layer = output_layer[:,2*seq_len:3*seq_len,:] #representation of append slots as described in paper
        output_layer = output_layer[:,0:seq_len,:]

    output_layer_shape = modeling.get_shape_list(output_layer,3)
    hidden_size = output_layer_shape[-1]

    # Flatten to one row per token position.
    flattened_output_layer = tf.reshape(output_layer,[-1, hidden_size])
    h_edit = flattened_output_layer

    if use_bert_more:
        h_word = flattened_output_layer
        flattened_replace_layer = tf.reshape(replace_layer,[-1, hidden_size])
        flattened_append_layer = tf.reshape(append_layer, [-1, hidden_size])
        m_replace = flattened_replace_layer
        m_append = flattened_append_layer

        # The same "cls/predictions/transform" variables are applied to all
        # three representations: created on the first use, then re-entered
        # with reuse=True for the other two.
        with tf.variable_scope("cls/predictions"):
            with tf.variable_scope("transform"):
                h_word = tf.layers.dense(
                    h_word,
                    units=bert_config.hidden_size,
                    activation=modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        bert_config.initializer_range))
                h_word = modeling.layer_norm(h_word)

        with tf.variable_scope("cls/predictions",reuse=True):
            with tf.variable_scope("transform",reuse=True):
                m_replace = tf.layers.dense(
                    m_replace,
                    units=bert_config.hidden_size,
                    activation=modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        bert_config.initializer_range))
                m_replace = modeling.layer_norm(m_replace)

        with tf.variable_scope("cls/predictions",reuse=True):
            with tf.variable_scope("transform",reuse=True):
                m_append = tf.layers.dense(
                    m_append,
                    units=bert_config.hidden_size,
                    activation=modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        bert_config.initializer_range))
                m_append = modeling.layer_norm(m_append)

        word_embedded_input = model.word_embedded_input
        flattened_word_embedded_input = tf.reshape(word_embedded_input, [-1, hidden_size])

    labels = edit_sequence

    # One weight vector per edit label (first term of the logits).
    edit_weights = tf.get_variable(
        "edit_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    if is_training:
        h_edit = tf.nn.dropout(h_edit, keep_prob=0.9)

    if use_bert_more:
        # append/replace weight vector for a given append or replace operation
        # correspond to word embedding for its token argument
        # for multitoken append/replace (e.g. has been)
        # weight vector is sum of word embeddings of token arguments
        append_weights = edit_word_embedding_lookup(model.embedding_table, insert_ids,
                                                    use_one_hot_embeddings, config.vocab_size, config.hidden_size)
        replace_weights = append_weights #tokens in replace and append vocab are same
                                         #(i.e. inserts and multitoken_inserts)

        multitoken_append_weights = wem_utils.edit_embedding_loopkup(model.embedding_table, multitoken_insert_ids,
                                                                     use_one_hot_embeddings, config.vocab_size, config.hidden_size)
        multitoken_replace_weights = multitoken_append_weights #tokens in replace and append vocab are same
                                                               #(i.e. inserts and multitoken_inserts)

        # Unigram rows first, then multitoken rows.
        append_weights = tf.concat([append_weights, multitoken_append_weights],0)
        replace_weights = tf.concat([replace_weights, multitoken_replace_weights],0)

    with tf.variable_scope("loss"):
        edit_logits = tf.matmul(h_edit, edit_weights, transpose_b=True) #first term in eq3 in paper
        logits = edit_logits
        if use_bert_more:
            #=============== inplace_word_logits==============# #2nd term in eq3 in paper
            inplace_logit = tf.reduce_sum(h_word * flattened_word_embedded_input, axis=1, keepdims=True) #copy
            #inplace_logit = tf.reduce_sum(m_replace * flattened_word_embedded_input, axis=1, keepdims=True) #copy
            inplace_logit_appends = tf.tile(inplace_logit,[1,num_appends])
            inplace_logit_transforms = tf.tile(inplace_logit,[1,num_suffix_transforms])
            zero_3_logits = tf.zeros([batch_size*seq_len,3]) #unk sos eos
            zero_1_logits = tf.zeros([batch_size*seq_len,1]) # del
            zero_replace_logits = tf.zeros([batch_size*seq_len,num_replaces])

            # Column layout matches the label space: 3 dummy edits, copy,
            # del, appends, replaces, suffix transforms.
            concat_list = [zero_3_logits, inplace_logit, zero_1_logits]\
                        + [inplace_logit_appends]\
                        + [zero_replace_logits]\
                        + [inplace_logit_transforms]
            inplace_word_logits = tf.concat(concat_list,1)

            #======additional (insert,replace) logits ====# #3rd term in eqn3 in paper
            zero_5_logits = tf.zeros([batch_size*seq_len,5])
            append_logits = tf.matmul(m_append, append_weights, transpose_b=True)

            if subtract_replaced_from_replacement:
                replace_logits = replacement_minus_replaced_logits(m_replace,
                    flattened_word_embedded_input, replace_weights)
            else:
                replace_logits = tf.matmul(m_replace, replace_weights, transpose_b=True)

            suffix_logits = tf.zeros([batch_size*seq_len,num_suffix_transforms])

            concat_list = [zero_5_logits, append_logits, replace_logits, suffix_logits]
            additional_logits = tf.concat(concat_list,1)
            #====================================================#

            logits = edit_logits + inplace_word_logits + additional_logits

        logits_bias = tf.get_variable("output_bias", shape=[num_labels], initializer=tf.zeros_initializer())
        logits += logits_bias

        # Restore the leading two dimensions of the (sliced) sequence output.
        logits = tf.reshape(logits, [output_layer_shape[0], output_layer_shape[1], num_labels])
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        probs = tf.nn.softmax(logits,axis=-1)
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
        per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        # Zero the loss wherever input_mask is 0.
        per_token_loss = per_token_loss * tf.to_float(input_mask)
        # Tokens labelled copy (3) are scaled by copy_weight; others by 1.
        mask = copy_weight*tf.to_float(tf.equal(labels,3)) + tf.to_float(tf.not_equal(labels,3))
        masked_per_token_loss = per_token_loss * mask
        per_example_loss = tf.reduce_sum(masked_per_token_loss, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

        return (loss, per_example_loss, logits, probs)
def body(self, features):
    """Encode the input and predict a theorem plus its premises.

    Without 'targets' in `features` the method delegates to
    `self.greedy_decode_8steps`. Otherwise it builds a teacher-forced
    decoder over [cls vector, theorem embedding, premise vectors] and
    returns the theorem and premise cross-entropy losses.

    Args:
        features: dict-like; 'targets', 'theorem' and 'depth' are read when
            'targets' is present, and the whole dict is passed to
            `self.build_encoder`.

    Returns:
        Tuple `(logits, losses)` of dicts. In the teacher-forced path
        `logits` holds the theorem/premise logits and labels, and `losses`
        holds 'training', 'theorem_loss' and 'premise_loss'.
    """
    # NOTE(review): the source's indentation was lost; the variable-scope
    # nesting below is a best-effort reconstruction — confirm against
    # existing checkpoints.
    hparams = self.hparams
    if not self.is_training:
        # This mutates self.hparams itself (same object), which the
        # assertion in the decode path below relies on.
        hparams.dropout_prob = 0.0

    with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE):
        # attention_weights: [batch, n_head, from_len, to_len]
        sequence_output, cls_vector, attention_weights = self.build_encoder(
            features)

    if 'targets' not in features:
        # No targets: decode greedily; dropout must already be disabled.
        assert self.hparams.dropout_prob == 0.0
        logits, losses = self.greedy_decode_8steps(cls_vector, sequence_output)
        # Expose the attention of the first query position over all keys.
        logits.update(attention_weights=attention_weights[:, :, 0, :])
        return logits, losses

    with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
        with tf.variable_scope('embeddings', reuse=tf.AUTO_REUSE):
            premise = features[
                'targets']  # [batch, premise_len=8] -bad naming:(
            # [batch, premise_len, hid_size]
            premise_vecs = premise_gather_nd(sequence_output, premise)
            batch_size = tf.shape(premise)[0]
            premise_len = premise.shape.as_list()[-1]

            theorem = features['theorem']  # batch, 1
            # [batch, 1, hid_size] and [num_theorems, hid_size]
            # (theorem_emb_table is not used further in this method)
            theorem_vec, theorem_emb_table = modeling.embedding_lookup(
                input_ids=theorem,  # [batch, 1]
                vocab_size=hparams.num_theorems,
                embedding_size=hparams.hidden_size,
                initializer_range=hparams.initializer_range,
                word_embedding_name='theorem_embedding',
            )
            # Read but not used further in this method.
            depth = features['depth']  # batch, 1

            # Teacher forcing: the last premise vector is dropped because
            # there is nothing left to predict after it.
            decoder_input = tf.concat(
                [
                    cls_vector,  # [batch, 1, hid_size]
                    theorem_vec,  # [batch, 1, hid_size]
                    premise_vecs[:, : -1, :]  # [batch, premise_len-1, hid_size]
                ],
                axis=1)  # [batch, premise_len + 1, hid_size]
            decode_length = decoder_input.shape.as_list()[1]
            assert decode_length == premise_len + 1

            # [decode_length, hid_size]
            pos_embedding, _ = modeling.embedding_lookup(
                input_ids=tf.range(decode_length),  # [decode_length]
                vocab_size=hparams.max_premise,  # >= premise_len
                embedding_size=hparams.hidden_size,
                initializer_range=hparams.initializer_range,
                word_embedding_name='positional_embedding',
            )
            pos_embedding = tf.reshape(
                pos_embedding, [1, decode_length, hparams.hidden_size])

            decoder_input = modeling.layer_norm_and_dropout(
                decoder_input +  # [batch, decode_length, hid_size]
                pos_embedding,  # [1, decode_length, hid_size]
                hparams.dropout_prob)  # [batch, decode_length, hid_size]

        with tf.variable_scope('transformer', reuse=tf.AUTO_REUSE):
            # Lower-triangular mask: each step sees itself and earlier steps.
            causal_attention_mask = t2t_model.common_layers.ones_matrix_band_part(
                rows=decode_length,
                cols=decode_length,
                num_lower=-1,  # attend to everything before
                num_upper=0,  # attend to nothing after
                out_shape=[1, decode_length, decode_length
                          ])  # 1, decode_length, decode_length

            # [batch, decode_length, decode_length]
            causal_attention_mask = tf.tile(causal_attention_mask,
                                            [batch_size, 1, 1])

            all_decoder_layers = modeling.transformer_model(
                input_tensor=decoder_input,
                attention_mask=causal_attention_mask,
                hidden_size=hparams.hidden_size,
                num_hidden_layers=hparams.num_decode_layers,
                num_attention_heads=hparams.num_attention_heads,
                intermediate_size=hparams.intermediate_size,
                intermediate_act_fn=modeling.get_activation(
                    hparams.hidden_act),
                hidden_dropout_prob=hparams.dropout_prob,
                attention_probs_dropout_prob=hparams.dropout_prob,
                initializer_range=hparams.initializer_range,
                do_return_all_layers=True,
                attention_top_k=hparams.attention_top_k)

            # Each layer entry is unpacked as a pair; only its first
            # element is used.
            decoder_output, _ = all_decoder_layers[
                -1]  # [batch, dec_len, hid_size]
            # Step 0 predicts the theorem; the remaining steps predict
            # premises.
            theorem_feature = decoder_output[:, 0, :]  # [batch, hid_size]
            premise_feature = decoder_output[:, 1:, :]  # [batch, tar_len, hid_size]

        with tf.variable_scope('prediction', reuse=tf.AUTO_REUSE):
            theorem_logits = tf.keras.layers.Dense(  # [batch, num_theorems]
                name='theorem',
                units=hparams.num_theorems,
                use_bias=True,
                kernel_initializer=modeling.create_initializer(
                    hparams.initializer_range))(theorem_feature)

            # Premises are scored by dot product against every encoder
            # position.
            premise_logits = tf.matmul(
                a=premise_feature,  # [batch, premise_len, hid_size]
                b=sequence_output,  # [batch, sequence_len, hid_size]
                transpose_b=True,
            )  # [batch, premise_len, sequence_len]

            # [batch * premise_len, sequence_len]
            seq_len = premise_logits.shape.as_list()[-1]
            premise_logits = tf.reshape(premise_logits, [-1, seq_len])

            # Premise entries that are not > 0 get zero loss weight.
            premise_weights = tf.cast(premise > 0,
                                      tf.float32)  # [batch, prem_len]
            premise_weights = tf.reshape(premise_weights,
                                         [-1])  # [batch * prem_len]
            premise = tf.reshape(premise, [-1, 1])  # [batch * prem_len, 1]

            theorem_loss = tf.losses.sparse_softmax_cross_entropy(
                labels=theorem,  # [batch, 1]
                logits=theorem_logits  # [batch, num_theorems]
            )
            premise_loss = tf.losses.sparse_softmax_cross_entropy(
                labels=premise,  # [batch * premise_len, 1]
                logits=premise_logits,  # [batch * premise_len, sequence_len]
                weights=premise_weights  # [batch * premise_len]
            )

    logits = dict(theorem_logits=theorem_logits,
                  theorem_labels=theorem,
                  premise_logits=premise_logits,
                  premise_labels=premise)
    losses = dict(training=theorem_loss + premise_loss,
                  theorem_loss=theorem_loss,
                  premise_loss=premise_loss)
    return logits, losses
def feed_neural_work(self):
    """Build the encoder, pooling, output layer, loss and accuracy ops.

    Reads the model inputs and hyper-parameters from `self` and stores the
    resulting tensors back on `self` (`all_encoder_layers`, `context_bias`,
    `sequence_output`, `pooled_output`, `scores`, `predictions`, `l2_loss`,
    `loss`, `accuracy`). In this variant `self.l2_loss` is computed but is
    not added to `self.loss`.

    The earlier docstring listed the `transformer_model` parameters as:
    input_tensor, attention_mask=None, hidden_size=768,
    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072,
    intermediate_act_fn=gelu, hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1, initializer_range=0.02,
    do_return_all_layers=False.

    Raises:
        ValueError: if `self.transformer_ret_pooling` is not "mean", "last"
            or "max". (Previously this printed a message and called
            `exit(0)`, which reported success to the calling shell.)
    """
    # NOTE(review): the source's indentation was lost, so the extent of the
    # "pooler" variable scope and of the name scopes below is a best-effort
    # reconstruction — confirm against checkpoints before relying on the
    # resulting variable names.

    # `sequence_output` shape = [batch_size, seq_length, hidden_size].
    self.all_encoder_layers, self.context_bias = modeling.transformer_model(
        self.embedded_chars_q,
        attention_mask=self.attention_mask,
        hidden_size=self.config.hidden_size,
        num_hidden_layers=self.config.num_hidden_layers,
        num_attention_heads=self.config.num_attention_heads,
        intermediate_size=self.config.intermediate_size,
        intermediate_act_fn=modeling.get_activation(self.config.hidden_act),
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        initializer_range=self.config.initializer_range,
        do_return_all_layers=True,
        t5_relative_bias=self.t5_att_bias)
    self.sequence_output = self.all_encoder_layers[-1]

    with tf.variable_scope("pooler"):
        if self.transformer_ret_pooling == "mean":
            print('self.seq_lent:', self.seq_lent)
            print('tf.reduce_sum(self.sequence_output,axis=1):',
                  tf.reduce_sum(self.sequence_output, axis=1))
            # presumably `seq_lent` holds reciprocal sequence lengths so the
            # product is a mean — TODO confirm against the caller.
            self.pooled_output = tf.reduce_sum(
                self.sequence_output, axis=1) * self.seq_lent
        elif self.transformer_ret_pooling == "last":
            self.pooled_output = self.sequence_output[:, -1, :]
        elif self.transformer_ret_pooling == "max":
            self.pooled_output = tf.reduce_max(self.sequence_output, axis=1)
        else:
            # A misconfiguration must fail loudly with a non-zero status.
            raise ValueError('wrong transformer_ret_pooling: {}'.format(
                self.transformer_ret_pooling))

        # Dropout followed by layer norm on the pooled output.
        if 'adding_problem' not in self.dataset:
            self.pooled_output = modeling.layer_norm(
                tf.nn.dropout(self.pooled_output,
                              keep_prob=1.0 - self.input_dropout_prob))

    # Final (unnormalized) scores and predictions
    with tf.name_scope("output"):
        W = tf.get_variable(
            "W",
            shape=[self.config.hidden_size, self.max_input_right],
            initializer=initializer(),
        )
        b = tf.Variable(tf.constant(0.1, shape=[self.max_input_right]),
                        name="b")
        l2_loss = tf.constant(0.0)
        l2_loss += tf.nn.l2_loss(W)
        self.scores = tf.nn.xw_plus_b(self.pooled_output, W, b, name="scores")
        self.predictions = tf.argmax(self.scores, 1, name="predictions")

    if 'adding_problem' not in self.dataset:
        # Classification: mean softmax cross-entropy. The L2 term is stored
        # on self but deliberately left out of the loss.
        with tf.name_scope("loss"):
            self.l2_loss = self.l2_reg_lambda * l2_loss
            losses = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses)

        # Accuracy: arg-max of the scores vs. arg-max of the labels.
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions,
                                           tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(
                tf.cast(correct_predictions, "float"), name="accuracy")
    else:
        # "adding_problem": regression on the first score column; the L2
        # term is again stored but not added to the loss.
        with tf.name_scope("loss"):
            self.l2_loss = self.l2_reg_lambda * l2_loss
            losses = tf.nn.l2_loss(
                self.scores - tf.expand_dims(self.input_y, -1))
            print('losses:', losses)
            self.loss = tf.reduce_mean(losses)

        # A prediction counts as correct when within 0.04 of the target.
        with tf.name_scope("accuracy"):
            correct_predictions = tf.less_equal(
                tf.abs(self.scores[:, 0] - self.input_y),
                tf.constant([0.04]))
            self.accuracy = tf.reduce_mean(
                tf.cast(correct_predictions, "float"), name="accuracy")
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Compute the weighted masked-LM loss and the per-position log-probs.

    Shape notes below come from the original inline annotations and are not
    checked by this function:
      input_tensor:   [batch_size, seq_length, hidden_size]
      positions:      [batch_size, mask_num]
      output_weights: [vocab_size, embedding_size]

    Returns:
      A tuple ``(loss, per_example_loss, log_probs)``: the weight-normalised
      scalar loss, the unweighted loss for every gathered position, and the
      log-softmax over the vocabulary for every gathered position.
    """
    # Select the hidden states at the masked positions; annotated as
    # [batch_size * mask_num, hidden_size] in the original.
    masked_states = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # One extra non-linear projection before the output layer.  This
        # matrix is only used during pre-training.
        with tf.variable_scope("transform"):
            activation_fn = modeling.get_activation(bert_config.hidden_act)
            kernel_init = modeling.create_initializer(
                bert_config.initializer_range)
            masked_states = tf.layers.dense(
                masked_states,
                units=bert_config.hidden_size,
                activation=activation_fn,
                kernel_initializer=kernel_init)
            masked_states = modeling.layer_norm(masked_states)

        shape_message = "input tensor shape after transform:{}".format(
            masked_states.shape)
        tf.logging.info(shape_message)

        # The output projection reuses the input embedding table; only the
        # per-token bias is specific to the output layer.
        vocab_size = bert_config.vocab_size
        output_bias = tf.get_variable(
            "output_bias",
            shape=[vocab_size],
            initializer=tf.zeros_initializer())

        # [batch_size * mask_num, vocab_size]
        projected = tf.matmul(masked_states, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(projected, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # Both label tensors are flattened to rank 1, one entry per gathered
        # position.
        flat_label_ids = tf.reshape(label_ids, [-1])
        flat_label_weights = tf.reshape(label_weights, [-1])
        target_one_hot = tf.one_hot(
            flat_label_ids, depth=vocab_size, dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The
        # `label_weights` tensor has a value of 1.0 for every real prediction
        # and 0.0 for the padding predictions.
        # Cross-entropy per gathered position.
        per_example_loss = -tf.reduce_sum(log_probs * target_one_hot,
                                          axis=[-1])

        # Weighted mean; the small constant keeps the division finite when
        # every weight is zero.
        weighted_loss_sum = tf.reduce_sum(flat_label_weights *
                                          per_example_loss)
        weight_sum = tf.reduce_sum(flat_label_weights) + 1e-5
        loss = weighted_loss_sum / weight_sum

    return (loss, per_example_loss, log_probs)
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM.

    Args (call-site mapping recorded by the original author):
      bert_config: the model configuration.
      input_tensor: ``model.get_sequence_output()``.
      output_weights: ``model.get_embedding_table()``.
      positions: ``masked_lm_positions`` (see create_pretraining_data.py).
      label_ids: ``masked_lm_ids`` (``masked_lm_labels`` in
        create_pretraining_data.py).
      label_weights: ``masked_lm_weights``.

    Returns:
      A tuple ``(loss, per_example_loss, log_probs)``.
    """
    # Background on the masking scheme (original author's notes, translated):
    # the encoder output for the whole sequence is computed first, then only
    # the vectors at the ~15% selected positions are gathered and scored.
    # - 10% of the selected tokens are left unchanged; otherwise the model
    #   would never see the correct token at a scored position, because the
    #   remaining 85% of tokens only provide context and do not contribute
    #   to the loss.
    # - 10% are replaced by another token, probably to improve the encoder's
    #   ability to correct errors, since real text can contain wrong words.
    # - 80% are replaced by the mask token, which mainly trains the model to
    #   infer meaning from context.
    #
    # A leftover `import ipdb; ipdb.set_trace()` breakpoint used to sit here;
    # it stopped every graph build and required a third-party debugger.
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there
        # is an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())

        # Example shapes seen in the author's run: input_tensor (160, 768),
        # output_weights (21128, 768) where 21128 is vocab_size, giving
        # logits of shape (160, 21128).
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # Example from the same run: label_ids (8, 20) flattened to (160,).
        label_ids = tf.reshape(label_ids, [-1])
        # label_weights (8, 20) flattened to (160,).  The author observed all
        # weights equal to 1 in their run; see the padding note below for the
        # general case.
        label_weights = tf.reshape(label_weights, [-1])

        # One row per gathered token, one-hot over the vocabulary, e.g.
        # (160, 21128); used to pick the target log-probability below.
        one_hot_labels = tf.one_hot(
            label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The
        # `label_weights` tensor has a value of 1.0 for every real prediction
        # and 0.0 for the padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        # The small constant keeps the division finite when every weight is 0.
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
def main(args):
    """Compare the stock transformer with ``effective_transformer``.

    Builds random inputs with random per-example lengths, runs
    ``modeling.transformer_model`` and
    ``effective_transformer.get_sequence_output`` on the same inputs with the
    same weights, then prints the max / mean absolute difference over the
    valid (unpadded) positions and the mean latency of each implementation.

    Args:
        args: parsed options; reads ``config``, ``batch_size``,
            ``avg_seq_length``, ``max_seq_length`` and ``precision``.
    """
    bert_config = modeling.BertConfig.from_json_file(args.config)
    # Dropout is switched off; the two outputs are compared element-wise
    # further down.
    bert_config.hidden_dropout_prob = 0.0
    bert_config.attention_probs_dropout_prob = 0.0

    batch_size = args.batch_size
    avg_seq_len = args.avg_seq_length
    max_seq_len = args.max_seq_length
    hidden_size = bert_config.hidden_size
    tf_dtype = tf.float16 if args.precision == 'fp16' else tf.float32

    # fake input array length: uniform over
    # [2 * avg_seq_len - max_seq_len, max_seq_len], whose mean is avg_seq_len.
    input_len = np.random.randint(low=2 * avg_seq_len - max_seq_len,
                                  high=max_seq_len + 1,
                                  size=(batch_size),
                                  dtype=np.int32)
    valid_word_num = sum(input_len)

    # fake input id and mask
    input_ids = np.random.randint(low=0,
                                  high=bert_config.vocab_size,
                                  size=(batch_size, max_seq_len),
                                  dtype=np.int32)
    input_mask = np.zeros((batch_size, max_seq_len), dtype=np.int32)
    for b_idx, s_len in enumerate(input_len):
        input_mask[b_idx][:s_len] = 1
    input_ids_tensor = tf.convert_to_tensor(input_ids, dtype=tf.int32)
    input_mask_tensor = tf.convert_to_tensor(input_mask, dtype=tf.int32)

    # fake embedding output
    embed_output = np.random.randn(batch_size, max_seq_len, hidden_size)
    input_tensor = tf.convert_to_tensor(embed_output, dtype=tf_dtype)

    # keep attention_mask for compatible reason: every query row of an
    # example gets that example's key mask.
    att_mask = np.tile(input_mask, max_seq_len)
    att_mask = att_mask.reshape(batch_size, max_seq_len, max_seq_len)
    attention_mask = tf.convert_to_tensor(att_mask, dtype=tf_dtype)

    # input info
    print("Valid word num : {}/{}, avg sequence length : {:.6} ".format(
        valid_word_num, batch_size * max_seq_len,
        valid_word_num / batch_size))

    # bert with standard transformer
    std_bert = modeling.transformer_model(
        input_tensor=input_tensor,
        attention_mask=attention_mask,
        hidden_size=bert_config.hidden_size,
        num_hidden_layers=bert_config.num_hidden_layers,
        num_attention_heads=bert_config.num_attention_heads,
        intermediate_size=bert_config.intermediate_size,
        intermediate_act_fn=modeling.get_activation(bert_config.hidden_act),
        hidden_dropout_prob=bert_config.hidden_dropout_prob,
        attention_probs_dropout_prob=bert_config.attention_probs_dropout_prob,
        initializer_range=bert_config.initializer_range,
        do_return_all_layers=False)

    config = tf.ConfigProto()
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    with tf.Session(config=config) as sess:
        # init weights
        sess.run(tf.global_variables_initializer())

        # get transformer weights so both implementations share them
        all_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        transformer_vars = [v for v in all_vars if v.name.startswith('layer')]
        weights_value = sess.run(transformer_vars)

        # bert with effective transformer
        et_bert = effective_transformer.get_sequence_output(
            max_batch_size=batch_size,
            max_seq_length=max_seq_len,
            config=bert_config,
            attention_mask=attention_mask,
            input_mask=input_mask_tensor,
            from_tensor=input_tensor,
            weights_value=weights_value,
        )

        # diff: flatten to one row per token.  The row width must follow the
        # configured hidden size; the previous hard-coded 768 mis-shaped the
        # outputs for any other model size.
        val1 = sess.run(std_bert).reshape(-1, hidden_size)
        val2 = sess.run(et_bert).reshape(-1, hidden_size)
        diff = []
        for b_idx, s_len in enumerate(input_len):
            # Only the first s_len tokens of each example are real input.
            for w_idx in range(s_len):
                idx = b_idx * max_seq_len + w_idx
                diff.append(np.fabs(val1[idx] - val2[idx]).max())
        print("max diff : {:.6}, avg diff : {:.6}.".format(
            max(diff), sum(diff) / len(diff)))

        def time_inference(output_tensor):
            """Return the mean wall-clock time of one run, in milliseconds."""
            iter_num = 128
            # warm up
            for _ in range(10):
                sess.run(output_tensor)
            beg = datetime.now()
            for _ in range(iter_num):
                sess.run(output_tensor)
            end = datetime.now()
            return (end - beg).total_seconds() * 1000 / iter_num  # ms

        print("xla cost : {:.6} ms".format(time_inference(std_bert)))
        print("et  cost : {:.6} ms".format(time_inference(et_bert)))
# NOTE(review): `return model_fn` is the tail of a preceding definition whose
# header is not visible in this excerpt.
return model_fn


def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    # Keep only the hidden states at the requested positions.
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there
        # is an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        # Project onto the vocabulary and normalise to log-probabilities.
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # Flatten the labels to rank 1.
        label_ids = tf.reshape(label_ids, [-1])
        # NOTE(review): the visible definition ends here -- no loss is
        # computed and nothing is returned, despite the docstring.  The text
        # looks truncated; confirm against the complete file.
def create_model(self,
                 model_input,
                 vocab_size,
                 num_frames,
                 mix_number=None,
                 cluster_size=None,
                 hidden_size=None,
                 is_training=True,
                 groups=None,
                 expansion=None,
                 drop_rate=None,
                 gating_reduction=None,
                 **unused_params):
    """Encode frames with a transformer, then mix several NeXtVLAD branches.

    ``model_input`` is passed through ``modeling.transformer_model``
    (configured from ``FLAGS.bert_config_file`` with the layer count, head
    count and dropout overridden by flags), frames are sampled from the last
    encoder layer, and ``mix_number`` calls to ``self.nextvlad_model`` are
    combined with softmax mixture weights.

    Args:
        model_input: frame features fed to the encoder.  The last axis is
            split at index 1024: the first part is passed on as video
            features, the rest as audio features.
        vocab_size: forwarded to each branch.
        num_frames: per-example frame counts, used by the frame sampler.
        mix_number, cluster_size, hidden_size, groups, expansion,
        gating_reduction: optional overrides; a falsy value selects the
            corresponding flag.
        is_training: enables encoder dropout and batch-norm training mode,
            and adds the regularisation outputs.
        drop_rate: optional override; ``None`` selects ``FLAGS.drop_rate``.
            An explicit ``0.0`` is honoured.
        **unused_params: forwarded unchanged to ``self.nextvlad_model``.

    Returns:
        A dict with ``"predictions"``; when ``is_training`` is true it also
        holds ``"regularization_loss"`` and ``"aux_predictions"``.
    """
    # NOTE(review): this excerpt lost its original indentation; only the
    # transformer call is assumed to sit inside the "encoder" scope --
    # confirm against the variable names in existing checkpoints.
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)

    config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    config = copy.deepcopy(config)
    config.num_hidden_layers = FLAGS.bert_hidden_layer
    config.num_attention_heads = FLAGS.bert_attention_heads
    config.hidden_dropout_prob = FLAGS.bert_dropout_prob
    config.attention_probs_dropout_prob = FLAGS.bert_dropout_prob
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    with tf.variable_scope("encoder"):
        self.all_encoder_layers = modeling.transformer_model(
            input_tensor=model_input,
            attention_mask=None,
            hidden_size=config.hidden_size,
            num_hidden_layers=config.num_hidden_layers,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            intermediate_act_fn=modeling.get_activation(config.hidden_act),
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            initializer_range=config.initializer_range,
            do_return_all_layers=True)
    model_input = self.all_encoder_layers[-1]

    if FLAGS.sample_random_frames:
        model_input = utils.SampleRandomFrames(model_input, num_frames,
                                               FLAGS.iterations)
    else:
        model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                 FLAGS.iterations)

    cluster_size = cluster_size or FLAGS.nextvlad_cluster_size
    hidden1_size = hidden_size or FLAGS.nextvlad_hidden_size
    gating_reduction = gating_reduction or FLAGS.gating_reduction
    groups = groups or FLAGS.groups
    # `drop_rate or FLAGS.drop_rate` would discard an explicit 0.0 (falsy)
    # and silently re-enable dropout from the flag.  Zero is not a usable
    # value for the other overrides, so they keep the `or` fallback.
    if drop_rate is None:
        drop_rate = FLAGS.drop_rate
    mix_number = mix_number or FLAGS.mix_number
    expansion = expansion or FLAGS.expansion

    max_frames = model_input.get_shape().as_list()[1]
    mask = tf.sequence_mask(FLAGS.iterations, max_frames, dtype=tf.float32)

    # Mixture weights are predicted from the per-frame feature mean.
    ftr_mean = tf.reduce_mean(model_input, axis=-1)
    ftr_mean = slim.batch_norm(ftr_mean,
                               center=True,
                               scale=True,
                               fused=True,
                               is_training=is_training,
                               scope="mix_weights_bn")
    mix_weights = slim.fully_connected(
        ftr_mean,
        mix_number,
        activation_fn=None,
        weights_initializer=slim.variance_scaling_initializer(),
        scope="mix_weights")
    mix_weights = tf.nn.softmax(mix_weights, axis=-1)
    tf.summary.histogram("mix_weights", mix_weights)

    results = []
    for n in range(mix_number):
        with tf.variable_scope("branch_%d" % n):
            res = self.nextvlad_model(video_ftr=model_input[:, :, 0:1024],
                                      audio_ftr=model_input[:, :, 1024:],
                                      vocab_size=vocab_size,
                                      max_frames=max_frames,
                                      cluster_size=cluster_size,
                                      groups=groups,
                                      expansion=expansion,
                                      drop_rate=drop_rate,
                                      hidden1_size=hidden1_size,
                                      is_training=is_training,
                                      gating_reduction=gating_reduction,
                                      mask=mask,
                                      **unused_params)
            results.append(res)

    aux_preds = [res["predictions"] for res in results]
    logits = [res["logits"] for res in results]
    logits = tf.stack(logits, axis=1)

    # Weighted sum of the branch logits along the branch axis.
    mix_logit = tf.reduce_sum(tf.multiply(tf.expand_dims(mix_weights, -1),
                                          logits),
                              axis=1)
    pred = tf.nn.sigmoid(mix_logit)

    if is_training:
        # KL term between the temperature-softened mixture distribution and
        # each branch's softened distribution.
        rank_pred = tf.expand_dims(tf.nn.softmax(tf.div(
            mix_logit, FLAGS.cl_temperature),
                                                 axis=-1),
                                   axis=1)
        aux_rank_preds = tf.nn.softmax(tf.div(logits, FLAGS.cl_temperature),
                                       axis=-1)
        # epsilon keeps the logs finite for zero probabilities.
        epsilon = 1e-8
        kl_loss = tf.reduce_sum(
            rank_pred *
            (tf.log(rank_pred + epsilon) - tf.log(aux_rank_preds + epsilon)),
            axis=-1)
        regularization_loss = FLAGS.cl_lambda * tf.reduce_mean(
            tf.reduce_sum(kl_loss, axis=-1), axis=-1)
        return {
            "predictions": pred,
            "regularization_loss": regularization_loss,
            "aux_predictions": aux_preds
        }
    else:
        return {"predictions": pred}
def get_masked_span_output(bert_config, input_tensor, input_mask, positions,
                           start_labels, end_labels, label_weights):
    """Compute the recurring-span-masking loss.

    For each gathered query position the function scores every sequence
    position as a span start and as a span end, then averages the two
    cross-entropy losses.

    Returns:
      A tuple ``(loss, per_example_loss)``: the weight-normalised scalar loss
      and the unweighted loss for every query position.
    """
    batch_size, seq_length, width = modeling.get_shape_list(
        input_tensor, expected_rank=3)
    num_positions = modeling.get_shape_list(positions, expected_rank=2)[1]

    # [batch_size * num_positions, width]
    query_tensor = gather_indexes(input_tensor, positions)

    def _transform(scope_name, tensor):
        # Dense projection followed by layer norm, in its own variable scope.
        with tf.variable_scope(scope_name):
            projected = tf.layers.dense(
                tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            return modeling.layer_norm(projected)

    def _classifier(var_name):
        # Square [hidden_size, hidden_size] bilinear weight.
        return tf.get_variable(
            var_name,
            shape=[bert_config.hidden_size, bert_config.hidden_size],
            initializer=modeling.create_initializer(
                bert_config.initializer_range))

    def _span_logits(queries, classifier, keys, mask_bias):
        # Score every sequence position against every query position.
        scored = tf.matmul(queries, classifier)
        # [batch_size, num_positions, width]
        scored = tf.reshape(scored, [batch_size, num_positions, width])
        # [batch_size, width, seq_length]
        keys_t = tf.transpose(keys, perm=[0, 2, 1])
        # [batch_size, num_positions, seq_length]
        scores = tf.matmul(scored, keys_t)
        scores = scores + mask_bias
        return tf.reshape(scores, [batch_size * num_positions, seq_length])

    def _cross_entropy(span_logits, labels):
        # Negative log-probability of the labelled position for every query.
        span_log_probs = tf.nn.log_softmax(span_logits, axis=-1)
        flat_labels = tf.reshape(labels, [-1])
        targets = tf.one_hot(flat_labels, depth=seq_length, dtype=tf.float32)
        return -tf.reduce_sum(span_log_probs * targets, axis=[-1])

    with tf.variable_scope("cls/span_predictions"):
        # One extra non-linear projection per role before scoring.  These
        # matrices are only used during pre-training.
        query_start = _transform("query_start_transform", query_tensor)
        query_end = _transform("query_end_transform", query_tensor)
        key_start = _transform("start_transform", input_tensor)
        key_end = _transform("end_transform", input_tensor)

        start_classifier = _classifier("start_classifier")
        end_classifier = _classifier("end_classifier")

        # [batch_size, 1, seq_length]; masked-out positions receive a large
        # negative bias before the softmax.
        expanded_mask = tf.expand_dims(input_mask, axis=1)
        mask_bias = (1.0 - tf.cast(expanded_mask, tf.float32)) * -10000.0

        start_logits = _span_logits(query_start, start_classifier, key_start,
                                    mask_bias)
        end_logits = _span_logits(query_end, end_classifier, key_end,
                                  mask_bias)

        # [batch_size * num_positions]
        flat_weights = tf.reshape(label_weights, [-1])

        start_per_example_loss = _cross_entropy(start_logits, start_labels)
        end_per_example_loss = _cross_entropy(end_logits, end_labels)
        per_example_loss = (start_per_example_loss +
                            end_per_example_loss) / 2

        # Weighted mean; the small constant keeps the division finite when
        # every weight is zero.
        weighted_sum = tf.reduce_sum(flat_weights * per_example_loss)
        weight_total = tf.reduce_sum(flat_weights) + 1e-5
        loss = weighted_sum / weight_total

    return loss, per_example_loss
def get_masked_lm_output(bert_config, input_tensor, output_weights,
                         output_type_weights, positions, label_ids,
                         masked_type_ids, label_weights):
    """Get losses and log probs for the type-aware masked LM.

    A token type is predicted first for every gathered position.  The
    embedding of the predicted (argmax) type is then concatenated with the
    transformed hidden state and used to predict the token itself.

    Returns:
      A tuple ``(loss, type_loss, per_example_loss, log_probs)``: the
      weight-normalised token loss, the weight-normalised type loss, the
      unweighted per-position token loss, and the token log-probabilities.
    """
    # NOTE(review): this excerpt lost its original indentation; the scope
    # nesting below (in particular which statements sit under
    # "cls/predictions/addtype") was reconstructed -- confirm against the
    # variable names in existing checkpoints.
    input_tensor = gather_indexes(input_tensor, positions)

    # One extra non-linear projection before the output layers.
    with tf.variable_scope("transform"):
        input_tensor = tf.layers.dense(
            input_tensor,
            units=bert_config.hidden_size,
            activation=modeling.get_activation(bert_config.hidden_act),
            kernel_initializer=modeling.create_initializer(
                bert_config.initializer_range))
        input_tensor = modeling.layer_norm(input_tensor)

    with tf.variable_scope("cls/predictions"):
        output_bias_type = tf.get_variable(
            "output_bias_type",
            shape=[bert_config.vocab_type_size],
            initializer=tf.zeros_initializer())
        logits_type = tf.matmul(input_tensor,
                                output_type_weights,
                                transpose_b=True)
        logits_type = tf.nn.bias_add(logits_type, output_bias_type)
        log_probs_type = tf.nn.log_softmax(logits_type, axis=-1)

        type_label_ids = tf.reshape(masked_type_ids, [-1])
        type_label_weights = tf.reshape(label_weights, [-1])
        # Predicted type id per position, as a column so it can be fed to the
        # embedding lookup below.
        type_pre = tf.reshape(tf.argmax(log_probs_type, -1), [-1, 1])

        one_hot_type_labels = tf.one_hot(type_label_ids,
                                         depth=bert_config.vocab_type_size,
                                         dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The
        # `label_weights` tensor has a value of 1.0 for every real prediction
        # and 0.0 for the padding predictions.
        type_per_example_loss = -tf.reduce_sum(
            log_probs_type * one_hot_type_labels, axis=[-1])
        type_numerator = tf.reduce_sum(type_label_weights *
                                       type_per_example_loss)
        type_denominator = tf.reduce_sum(type_label_weights) + 1e-5
        type_loss = type_numerator / type_denominator

    (type_pre_embedding_output, _) = modeling.embedding_lookup(
        input_ids=type_pre,
        vocab_size=bert_config.vocab_type_size,
        embedding_size=bert_config.hidden_size,
        initializer_range=bert_config.initializer_range,
        word_embedding_name="type_word_embeddings",
        use_one_hot_embeddings=FLAGS.use_tpu,
        scope="bert/embeddings",
        reuse=True)

    with tf.variable_scope("cls/predictions/addtype"):
        # Collapse the lookup result to one hidden_size row per position.
        # The original used tf.squeeze() without an axis, which also drops
        # the leading dimension when exactly one position is gathered and
        # then breaks the concat below.  An explicit reshape only depends on
        # the embedding size requested above.
        type_embedding = tf.reshape(type_pre_embedding_output,
                                    [-1, bert_config.hidden_size])
        # The type embedding is concatenated with (not added to) the hidden
        # state and projected back to hidden_size.
        concat_input_tensor = tf.layers.dense(
            tf.concat([input_tensor, type_embedding], -1),
            units=bert_config.hidden_size,
            activation=modeling.get_activation(bert_config.hidden_act),
            kernel_initializer=modeling.create_initializer(
                bert_config.initializer_range))

        # The output weights are the same as the input embeddings, but there
        # is an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(concat_input_tensor,
                           output_weights,
                           transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)

        # Same padding convention as for the type loss above: padded
        # predictions carry a weight of 0.0.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, type_loss, per_example_loss, log_probs)