def get_masked_lm_output(config, input_tensor, output_weights, positions,
                         label_ids, label_weights, reuse=None):
    """Get loss and log probs for the masked LM."""
    input_tensor = tf.cast(input_tensor, tf.float32)
    positions = tf.cast(positions, tf.int32)
    label_ids = tf.cast(label_ids, tf.int32)
    label_weights = tf.cast(label_weights, tf.float32)

    # Flatten the masked LM hidden states by gathering them at `positions`.
    input_tensor = bert_utils.gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions", reuse=reuse):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=config.hidden_size,
                activation=bert_modules.get_activation(config.hidden_act),
                kernel_initializer=bert_modules.create_initializer(
                    config.initializer_range))
            input_tensor = bert_modules.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        # Dense alternative:
        # one_hot_labels = tf.one_hot(
        #     label_ids, depth=config.vocab_size, dtype=tf.float32)
        # per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=label_ids, logits=logits)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
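# A minimal, self-contained sketch of the masked-LM loss pattern above:
# gather hidden states at masked positions, score them against the (tied)
# embedding table, and average the per-position losses with `label_weights`
# so padded prediction slots contribute nothing. All shapes and names here
# are illustrative, not taken from the project config.
#
# import tensorflow as tf
#
# batch, seq_len, hidden, vocab, max_preds = 2, 8, 16, 32, 3
#
# hidden_states = tf.random.normal([batch, seq_len, hidden])
# embedding_table = tf.random.normal([vocab, hidden])        # plays `output_weights`
# positions = tf.constant([[1, 4, 0], [2, 5, 0]], tf.int32)  # last slot is padding
# label_ids = tf.constant([[7, 9, 0], [3, 11, 0]], tf.int32)
# label_weights = tf.constant([[1., 1., 0.], [1., 1., 0.]])
#
# # Equivalent of bert_utils.gather_indexes: flatten to [batch * max_preds, hidden].
# flat_offsets = tf.reshape(tf.range(batch) * seq_len, [-1, 1])
# flat_positions = tf.reshape(positions + flat_offsets, [-1])
# flat_states = tf.reshape(hidden_states, [batch * seq_len, hidden])
# gathered = tf.gather(flat_states, flat_positions)          # [batch*max_preds, hidden]
#
# logits = tf.matmul(gathered, embedding_table, transpose_b=True)
# per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
#     labels=tf.reshape(label_ids, [-1]), logits=logits)
#
# weights = tf.reshape(label_weights, [-1])
# loss = tf.reduce_sum(weights * per_example_loss) / (tf.reduce_sum(weights) + 1e-5)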
def multi_position_crf_classifier(config, features, model_dict, num_labels,
                                  dropout_prob):
    batch_size = features['batch_size']
    total_length_a = features['total_length_a']
    total_length_b = features['total_length_b']

    # [batch x 10, 130, 768]
    sequence_output_a = model_dict["a"].get_sequence_output()
    shape_lst = bert_utils.get_shape_list(sequence_output_a, expected_rank=3)
    # [batch, 10 x 130, 768]
    sequence_output_a = tf.reshape(sequence_output_a,
                                   [-1, total_length_a, shape_lst[-1]])

    answer_pos = tf.cast(features['label_positions'], tf.int32)
    # [batch x 10, 768]
    sequence_output_a = bert_utils.gather_indexes(sequence_output_a, answer_pos)
    # [batch, 10, 768]
    sequence_output_a = tf.reshape(
        sequence_output_a, [-1, config.max_predictions_per_seq, shape_lst[-1]])

    # [batch x 10, 768]
    sequence_output_b = model_dict["b"].get_pooled_output()
    # [batch, 10, 768]
    sequence_output_b = tf.reshape(sequence_output_b,
                                   [-1, num_labels, shape_lst[-1]])
    seq_b_shape = bert_utils.get_shape_list(sequence_output_b, expected_rank=3)

    cross_matrix = tf.get_variable(
        "output_weights", [shape_lst[-1], shape_lst[-1]],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    # [batch, 10, 768]
    sequence_output_a_proj = tf.einsum("abc,cd->abd", sequence_output_a,
                                       cross_matrix)
    # Bilinear score between answer positions and answer fields:
    # [batch, 10(ans_pos), 11(ans_field)]
    logits = tf.einsum("abd,acd->abc", sequence_output_a_proj,
                       sequence_output_b)
    logits = tf.multiply(
        logits, 1.0 / tf.math.sqrt(tf.cast(shape_lst[-1], tf.float32)))

    label_ids = tf.cast(features['label_ids'], tf.int32)
    label_weights = tf.cast(features['label_weights'], tf.int32)
    label_seq_length = tf.reduce_sum(label_weights, axis=-1)

    # Linear-chain CRF over the per-position logits. This block is restored
    # from the previously commented-out code so that the returned tensors are
    # actually defined; `zero_transition` is a project-local helper that
    # builds the (frozen) transition matrix.
    transition = zero_transition(seq_b_shape)
    log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
        inputs=logits,
        tag_indices=label_ids,
        sequence_lengths=label_seq_length,
        transition_params=transition)
    transition_params = tf.stop_gradient(transition_params)

    per_example_loss = -log_likelihood
    loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits, transition_params)
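# A small sketch of the bilinear scoring used above: each answer-position
# vector is projected through a [hidden, hidden] matrix and scored against
# every answer-field vector with an einsum, then scaled by 1/sqrt(hidden)
# as in scaled dot-product attention. The CRF step is omitted; shapes are
# toy values (batch=2, 10 positions, 11 fields, hidden=16), not the
# project's real ones.
#
# import tensorflow as tf
#
# batch, num_pos, num_fields, hidden = 2, 10, 11, 16
#
# pos_repr = tf.random.normal([batch, num_pos, hidden])       # like sequence_output_a
# field_repr = tf.random.normal([batch, num_fields, hidden])  # like sequence_output_b
# cross = tf.random.normal([hidden, hidden])                  # like `cross_matrix`
#
# proj = tf.einsum("abc,cd->abd", pos_repr, cross)            # [batch, num_pos, hidden]
# scores = tf.einsum("abd,acd->abc", proj, field_repr)        # [batch, num_pos, num_fields]
# scores /= tf.math.sqrt(tf.cast(hidden, tf.float32))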
def get_masked_lm_output(config, input_tensor, output_weights, positions,
                         label_ids, label_weights, **kargs):
    """Get loss and log probs for the masked LM."""
    reuse = kargs.get('reuse', False)
    embedding_projection = kargs.get('embedding_projection', None)

    input_tensor = tf.cast(input_tensor, tf.float32)
    positions = tf.cast(positions, tf.int32)
    label_ids = tf.cast(label_ids, tf.int32)
    label_weights = tf.cast(label_weights, tf.float32)

    # Flatten the masked LM hidden states by gathering them at `positions`.
    input_tensor = bert_utils.gather_indexes(input_tensor, positions)

    scope = kargs.get('scope', None)
    if scope:
        scope = scope + '/' + 'cls/predictions'
    else:
        scope = 'cls/predictions'

    tf.logging.info("**** mlm scope **** %s", str(scope))

    # with tf.variable_scope("cls/predictions", reuse=reuse):
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        # Pre-LN applies layer norm before the dense transform; post-LN (the
        # default) applies it after.
        if config.get('ln_type', 'postln') == 'preln':
            input_tensor = albert_modules.layer_norm(input_tensor)

        if config.get("embedding", "factorized") == "factorized":
            projection_width = config.hidden_size
        else:
            projection_width = config.embedding_size

        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=projection_width,
                activation=albert_modules.get_activation(config.hidden_act),
                kernel_initializer=albert_modules.create_initializer(
                    config.initializer_range))

            if config.get('ln_type', 'postln') != 'preln':
                input_tensor = albert_modules.layer_norm(input_tensor)

        if embedding_projection is not None:
            input_tensor = tf.matmul(input_tensor, embedding_projection,
                                     transpose_b=True)
        else:
            print("==no need for embedding projection==")

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        # Dense alternative:
        # one_hot_labels = tf.one_hot(
        #     label_ids, depth=config.vocab_size, dtype=tf.float32)
        # per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=tf.stop_gradient(label_ids), logits=logits)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs, label_weights)
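# A sketch of the optional embedding-projection path handled above: when the
# encoder width differs from the embedding width (ALBERT-style factorized
# embeddings), the hidden states are first mapped back to the embedding width
# and only then scored against the tied word-embedding table. This assumes
# `embedding_projection` is stored as [embedding_size, hidden_size], which is
# what the transpose_b matmul above implies; all sizes are illustrative.
#
# import tensorflow as tf
#
# hidden, emb, vocab, n = 16, 8, 32, 6
#
# states = tf.random.normal([n, hidden])                      # gathered masked positions
# embedding_table = tf.random.normal([vocab, emb])            # plays `output_weights`
# embedding_projection = tf.random.normal([emb, hidden])      # [embedding, hidden] factor
#
# # Map [n, hidden] -> [n, emb], then score against the tied embedding table.
# projected = tf.matmul(states, embedding_projection, transpose_b=True)
# logits = tf.matmul(projected, embedding_table, transpose_b=True)  # [n, vocab]
# log_probs = tf.nn.log_softmax(logits, axis=-1)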
def multi_position_classifier(config, features, sequence_output, num_labels,
                              dropout_prob):
    final_hidden_shape = bert_utils.get_shape_list(sequence_output,
                                                   expected_rank=3)
    print(final_hidden_shape, "====multi-choice shape====")

    answer_pos = tf.cast(features['label_positions'], tf.int32)
    cls_pos = tf.zeros_like(answer_pos)
    input_tensor = bert_utils.gather_indexes(sequence_output, answer_pos)
    cls_tensor = bert_utils.gather_indexes(sequence_output, cls_pos)

    # Concatenate the [CLS] representation with each answer-position
    # representation before the classification transform.
    answer_cls_tensor = tf.concat([cls_tensor, input_tensor], axis=-1)

    input_tensor = tf.layers.dense(
        answer_cls_tensor,
        units=config.hidden_size,
        activation=bert_modules.get_activation(config.hidden_act),
        kernel_initializer=bert_modules.create_initializer(
            config.initializer_range))
    input_tensor = bert_modules.layer_norm(input_tensor)

    output_weights = tf.get_variable(
        "output_weights", [num_labels, final_hidden_shape[-1]],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "output_bias", shape=[num_labels], initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)

    label_ids = tf.reshape(tf.cast(features['label_ids'], tf.int32), [-1])
    label_weights = tf.reshape(tf.cast(features['label_weights'], tf.float32),
                               [-1])

    # `class_weights` is only required by the class-balanced focal loss below;
    # default to None so the branch does not hit an undefined name when the
    # config omits it.
    class_weights = None
    if config.get('class_weights', None):
        class_weights = tf.constant(
            np.array(config.class_weights).astype(np.float32))

    if config.get("loss", "entropy") == "focal_loss":
        per_example_loss, _ = loss_utils.focal_loss_multi_v1(
            config, logits=logits, labels=tf.stop_gradient(label_ids))
    elif config.get("loss", "smoothed_ce") == 'smoothed_ce':
        per_example_loss = loss_utils.ce_label_smoothing(
            config, logits=logits, labels=tf.stop_gradient(label_ids))
    elif config.get('loss', 'class_balanced_focal') == 'class_balanced_focal':
        per_example_loss, _ = loss_utils.class_balanced_focal_loss_multi_v1(
            config, logits=logits, labels=label_ids,
            label_weights=class_weights)
    else:
        per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=tf.stop_gradient(label_ids), logits=logits)

    numerator = tf.reduce_sum(label_weights * per_example_loss)
    denominator = tf.reduce_sum(label_weights) + 1e-5
    loss = numerator / denominator

    return (loss, per_example_loss, logits)
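# A toy sketch of the classifier head above: the [CLS] vector (position 0)
# is concatenated with the vector at each labeled answer position, passed
# through a dense transform, and scored against `num_labels` output weights.
# Layer norm and the project's loss variants are omitted, and a plain
# relu matmul stands in for the config-driven dense + activation; all sizes
# are made up.
#
# import tensorflow as tf
#
# batch, seq_len, hidden, num_labels, preds = 2, 8, 16, 5, 3
#
# seq_out = tf.random.normal([batch, seq_len, hidden])
# answer_pos = tf.constant([[1, 4, 0], [2, 6, 0]], tf.int32)
# cls_pos = tf.zeros_like(answer_pos)
#
# def gather_positions(states, positions):
#     # Same idea as bert_utils.gather_indexes: flatten the batch, gather rows.
#     offsets = tf.reshape(tf.range(batch) * seq_len, [-1, 1])
#     flat = tf.reshape(states, [batch * seq_len, hidden])
#     return tf.gather(flat, tf.reshape(positions + offsets, [-1]))
#
# answer_vec = gather_positions(seq_out, answer_pos)          # [batch*preds, hidden]
# cls_vec = gather_positions(seq_out, cls_pos)                # [batch*preds, hidden]
# pair_repr = tf.concat([cls_vec, answer_vec], axis=-1)       # [batch*preds, 2*hidden]
#
# w = tf.random.normal([2 * hidden, hidden])
# transformed = tf.nn.relu(tf.matmul(pair_repr, w))           # stand-in for dense + hidden_act
# output_weights = tf.random.normal([num_labels, hidden])
# logits = tf.matmul(transformed, output_weights, transpose_b=True)  # [batch*preds, num_labels]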