Example #1
def get_masked_lm_output(config,
                         input_tensor,
                         output_weights,
                         positions,
                         label_ids,
                         label_weights,
                         reuse=None):
    """Get loss and log probs for the masked LM."""
    input_tensor = tf.cast(input_tensor, tf.float32)
    positions = tf.cast(positions, tf.int32)
    label_ids = tf.cast(label_ids, tf.int32)
    label_weights = tf.cast(label_weights, tf.float32)

    # Gather the hidden states at the masked positions, flattening them to
    # [batch_size * max_predictions_per_seq, hidden_size].
    input_tensor = bert_utils.gather_indexes(input_tensor, positions)
    with tf.variable_scope("cls/predictions", reuse=reuse):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=config.hidden_size,
                activation=bert_modules.get_activation(config.hidden_act),
                kernel_initializer=bert_modules.create_initializer(
                    config.initializer_range))
            input_tensor = bert_modules.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        # Alternative dense formulation:
        # one_hot_labels = tf.one_hot(
        #     label_ids, depth=config.vocab_size, dtype=tf.float32)
        # per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])

        per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=label_ids, logits=logits)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions, so padded positions do not contribute to the loss.
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
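
A minimal, self-contained sketch of what `bert_utils.gather_indexes` presumably does (it mirrors the reference BERT implementation); the function name and variable names here are illustrative, not taken from this repository:

import tensorflow as tf

def gather_indexes_sketch(sequence_tensor, positions):
    """Gather the hidden vectors at `positions` from a [batch, seq_len, hidden] tensor.

    Returns a [batch * num_positions, hidden] tensor, which is the flattened
    layout the masked-LM head above expects.
    """
    hidden_size = sequence_tensor.shape.as_list()[-1]
    dyn_shape = tf.shape(sequence_tensor)
    batch_size, seq_length = dyn_shape[0], dyn_shape[1]

    # Turn per-example positions into indices into the flattened batch.
    flat_offsets = tf.reshape(tf.range(batch_size) * seq_length, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence = tf.reshape(sequence_tensor, [-1, hidden_size])
    return tf.gather(flat_sequence, flat_positions)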
Example #2
def multi_position_crf_classifier(config, features, model_dict, num_labels,
                                  dropout_prob):

    batch_size = features['batch_size']
    total_length_a = features['total_length_a']
    total_length_b = features['total_length_b']

    sequence_output_a = model_dict["a"].get_sequence_output(
    )  # [batch x 10, 130, 768]
    shape_lst = bert_utils.get_shape_list(sequence_output_a, expected_rank=3)

    sequence_output_a = tf.reshape(
        sequence_output_a,
        [-1, total_length_a, shape_lst[-1]])  # [batch, 10 x 130, 768]
    answer_pos = tf.cast(features['label_positions'], tf.int32)
    sequence_output_a = bert_utils.gather_indexes(
        sequence_output_a, answer_pos)  # [batch*10, 768]

    sequence_output_a = tf.reshape(
        sequence_output_a, [-1, config.max_predictions_per_seq, shape_lst[-1]
                            ])  # [batch, 10, 768]

    sequence_output_b = model_dict["b"].get_pooled_output()  # [batch x 10,768]
    sequence_output_b = tf.reshape(
        sequence_output_b, [-1, num_labels, shape_lst[-1]])  # [batch, 10, 768]
    seq_b_shape = bert_utils.get_shape_list(sequence_output_b, expected_rank=3)

    cross_matrix = tf.get_variable(
        "output_weights", [shape_lst[-1], shape_lst[-1]],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    # batch x 10 x 768
    sequence_output_a_proj = tf.einsum("abc,cd->abd", sequence_output_a,
                                       cross_matrix)

    # batch x 10 x 768. batch x 10 x 768
    # batch x 10(ans_pos) x 11(ans_field)
    logits = tf.einsum("abd,acd->abc", sequence_output_a_proj,
                       sequence_output_b)
    logits = tf.multiply(
        logits, 1.0 / tf.math.sqrt(tf.cast(shape_lst[-1], tf.float32)))

    # label_ids / label_weights come in padded to `max_predictions_per_seq`;
    # the per-example CRF sequence length is the number of real labels.
    label_ids = tf.cast(features['label_ids'], tf.int32)
    label_weights = tf.cast(features['label_weights'], tf.int32)
    label_seq_length = tf.reduce_sum(label_weights, axis=-1)

    # `zero_transition` is assumed (defined elsewhere in this module) to build
    # an all-zero CRF transition matrix from the shape of `sequence_output_b`.
    transition = zero_transition(seq_b_shape)

    log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
        inputs=logits,
        tag_indices=label_ids,
        sequence_lengths=label_seq_length,
        transition_params=transition)

    transition_params = tf.stop_gradient(transition_params)
    per_example_loss = -log_likelihood
    loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits, transition_params)
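
For reference, a minimal sketch of how `tf.contrib.crf.crf_log_likelihood` (TF 1.x) turns per-position logits like the ones above into a sequence-level loss. The all-zero transition matrix mirrors what the `zero_transition` helper is assumed to build; shapes and names are illustrative:

import tensorflow as tf

def crf_loss_sketch(logits, label_ids, label_weights):
    """Toy CRF loss over [batch, num_positions, num_tags] logits.

    `label_weights` is 1.0 for real positions and 0.0 for padding, so the
    per-example sequence length is simply its row sum.
    """
    num_tags = logits.shape.as_list()[-1]
    sequence_lengths = tf.reduce_sum(tf.cast(label_weights, tf.int32), axis=-1)

    # With a fixed all-zero transition matrix the CRF reduces to a
    # length-masked per-position softmax over the tags.
    transition = tf.zeros([num_tags, num_tags], dtype=tf.float32)

    log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
        inputs=logits,
        tag_indices=label_ids,
        sequence_lengths=sequence_lengths,
        transition_params=transition)
    per_example_loss = -log_likelihood
    return tf.reduce_mean(per_example_loss), per_example_loss, transition_params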
Example #3
def get_masked_lm_output(config, input_tensor, output_weights, positions,
							label_ids, label_weights,
							**kargs):

	"""Get loss and log probs for the masked LM."""
	reuse = kargs.get('reuse', False)
	embedding_projection = kargs.get('embedding_projection', None)
	input_tensor = tf.cast(input_tensor, tf.float32)
	positions = tf.cast(positions, tf.int32)
	label_ids = tf.cast(label_ids, tf.int32)
	label_weights = tf.cast(label_weights, tf.float32)

	# Gather the hidden states at the masked positions, flattening them to
	# [batch_size * max_predictions_per_seq, hidden_size].
	input_tensor = bert_utils.gather_indexes(input_tensor, positions)

	scope = kargs.get('scope', None)
	if scope:
		scope = scope + '/' + 'cls/predictions'
	else:
		scope = 'cls/predictions'

	tf.logging.info("**** mlm scope **** %s", str(scope))

	# with tf.variable_scope("cls/predictions", reuse=reuse):
	with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
		# We apply one more non-linear transformation before the output layer.
		# This matrix is not used after pre-training.
		# Pre-LN variants normalize before the transform; the default
		# post-LN variant normalizes after the dense layer instead.
		if config.get('ln_type', 'postln') == 'preln':
			input_tensor = albert_modules.layer_norm(input_tensor)

		if config.get("embedding", "factorized") == "factorized":
			projection_width = config.hidden_size
		else:
			projection_width = config.embedding_size

		with tf.variable_scope("transform"):
			input_tensor = tf.layers.dense(
					input_tensor,
					units=projection_width,
					activation=albert_modules.get_activation(config.hidden_act),
					kernel_initializer=albert_modules.create_initializer(
							config.initializer_range))
			if config.get('ln_type', 'postln') != 'preln':
				# Post-LN (the default): normalize after the dense transform.
				input_tensor = albert_modules.layer_norm(input_tensor)

		if embedding_projection is not None:
			input_tensor = tf.matmul(input_tensor, 
								embedding_projection,
								transpose_b=True)
		else:
			print("==no need for embedding projection==")

		# The output weights are the same as the input embeddings, but there is
		# an output-only bias for each token.
		output_bias = tf.get_variable(
				"output_bias",
				shape=[config.vocab_size],
				initializer=tf.zeros_initializer())
		logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
		# logits = tf.multiply(logits,
		# 						1.0 / math.sqrt(float(config.hidden_size)))
		# logits *= 2
		
		logits = tf.nn.bias_add(logits, output_bias)
		log_probs = tf.nn.log_softmax(logits, axis=-1)

		label_ids = tf.reshape(label_ids, [-1])
		label_weights = tf.reshape(label_weights, [-1])

		# Alternative dense formulation:
		# one_hot_labels = tf.one_hot(
		#     label_ids, depth=config.vocab_size, dtype=tf.float32)
		# per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])

		per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
											labels=tf.stop_gradient(label_ids),
											logits=logits)

		# The `positions` tensor might be zero-padded (if the sequence is too
		# short to have the maximum number of predictions). The `label_weights`
		# tensor has a value of 1.0 for every real prediction and 0.0 for the
		# padding predictions, so padded positions do not contribute to the loss.
		numerator = tf.reduce_sum(label_weights * per_example_loss)
		denominator = tf.reduce_sum(label_weights) + 1e-5
		loss = numerator / denominator

	return (loss, per_example_loss, log_probs, label_weights)
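
Compared with Example #1, the distinguishing step here is the optional `embedding_projection`: with ALBERT-style factorized embeddings the transformer hidden states have width `hidden_size` while the tied output table `output_weights` has width `embedding_size`, so the hidden states are projected down before the tied-weight matmul. A minimal sketch of that step, with illustrative names and shapes:

import tensorflow as tf

def tied_output_logits_sketch(hidden, embedding_table, embedding_projection=None):
    """Compute vocabulary logits by reusing the input embedding table.

    hidden:               [num_predictions, hidden_size]
    embedding_table:      [vocab_size, embedding_size]
    embedding_projection: optional [embedding_size, hidden_size] factor, used
                          when hidden_size != embedding_size (ALBERT-style).
    """
    if embedding_projection is not None:
        # Map hidden_size -> embedding_size so the tied matmul shapes line up.
        hidden = tf.matmul(hidden, embedding_projection, transpose_b=True)
    # Weight tying: same matrix as the input embeddings, transposed.
    return tf.matmul(hidden, embedding_table, transpose_b=True)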
Example #4
def multi_position_classifier(config, features, sequence_output, num_labels,
                              dropout_prob):

    final_hidden_shape = bert_utils.get_shape_list(sequence_output,
                                                   expected_rank=3)

    print(final_hidden_shape, "====multi-choice shape====")

    answer_pos = tf.cast(features['label_positions'], tf.int32)
    cls_pos = tf.zeros_like(answer_pos)
    input_tensor = bert_utils.gather_indexes(sequence_output, answer_pos)
    cls_tensor = bert_utils.gather_indexes(sequence_output, cls_pos)

    answer_cls_tensor = tf.concat([cls_tensor, input_tensor], axis=-1)

    input_tensor = tf.layers.dense(
        answer_cls_tensor,
        units=config.hidden_size,
        activation=bert_modules.get_activation(config.hidden_act),
        kernel_initializer=bert_modules.create_initializer(
            config.initializer_range))
    input_tensor = bert_modules.layer_norm(input_tensor)

    output_weights = tf.get_variable(
        "output_weights", [num_labels, final_hidden_shape[-1]],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable("output_bias",
                                  shape=[num_labels],
                                  initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)

    label_ids = tf.reshape(tf.cast(features['label_ids'], tf.int32), [-1])
    label_weights = tf.reshape(tf.cast(features['label_weights'], tf.float32),
                               [-1])

    # Optional class weights, only needed by the class-balanced focal loss.
    class_weights = None
    if config.get('class_weights', None):
        class_weights = tf.constant(
            np.array(config.class_weights).astype(np.float32))

    loss_type = config.get("loss", "entropy")
    if loss_type == "focal_loss":
        per_example_loss, _ = loss_utils.focal_loss_multi_v1(
            config, logits=logits, labels=tf.stop_gradient(label_ids))
    elif loss_type == "smoothed_ce":
        per_example_loss = loss_utils.ce_label_smoothing(
            config, logits=logits, labels=tf.stop_gradient(label_ids))
    elif loss_type == "class_balanced_focal":
        per_example_loss, _ = loss_utils.class_balanced_focal_loss_multi_v1(
            config,
            logits=logits,
            labels=label_ids,
            label_weights=class_weights)
    else:
        per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=tf.stop_gradient(label_ids), logits=logits)

    numerator = tf.reduce_sum(label_weights * per_example_loss)
    denominator = tf.reduce_sum(label_weights) + 1e-5
    loss = numerator / denominator

    return (loss, per_example_loss, logits)
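
`loss_utils.focal_loss_multi_v1` is not shown here; a common multi-class focal-loss formulation that it presumably resembles is sketched below, where the gamma value and function name are illustrative:

import tensorflow as tf

def focal_loss_sketch(logits, labels, gamma=2.0):
    """Multi-class focal loss: down-weight easy examples by (1 - p_t)^gamma.

    logits: [batch, num_labels] float32, labels: [batch] int32.
    Reduces to plain cross-entropy when gamma == 0.
    """
    num_labels = logits.shape.as_list()[-1]
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    one_hot = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
    log_p_t = tf.reduce_sum(one_hot * log_probs, axis=-1)  # log-prob of the true class
    p_t = tf.exp(log_p_t)
    return -tf.pow(1.0 - p_t, gamma) * log_p_t  # per-example loss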