def _get_discriminator_output(self, inputs, discriminator, labels):
  """Discriminator binary classifier."""
  with tf.variable_scope("discriminator_predictions"):
    hidden = tf.layers.dense(
        discriminator.get_sequence_output(),
        units=self._bert_config.hidden_size,
        activation=modeling.get_activation(self._bert_config.hidden_act),
        kernel_initializer=modeling.create_initializer(
            self._bert_config.initializer_range))
    logits = tf.squeeze(tf.layers.dense(hidden, units=1), -1)
    weights = tf.cast(inputs.input_mask, tf.float32)
    labelsf = tf.cast(labels, tf.float32)
    losses = tf.nn.sigmoid_cross_entropy_with_logits(
        logits=logits, labels=labelsf) * weights
    per_example_loss = (tf.reduce_sum(losses, axis=-1) /
                        (1e-6 + tf.reduce_sum(weights, axis=-1)))
    loss = tf.reduce_sum(losses) / (1e-6 + tf.reduce_sum(weights))
    probs = tf.nn.sigmoid(logits)
    preds = tf.cast(tf.round((tf.sign(logits) + 1) / 2), tf.int32)
    DiscOutput = collections.namedtuple(
        "DiscOutput", ["loss", "per_example_loss", "probs", "preds",
                       "labels"])
    return DiscOutput(
        loss=loss, per_example_loss=per_example_loss, probs=probs,
        preds=preds, labels=labels)
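# A minimal NumPy sketch, not part of the original module, of the masked
# sigmoid cross-entropy above: padded positions get weight 0.0, so they
# contribute nothing to either the per-example or the overall loss. The toy
# values below are illustrative only.
import numpy as np

def masked_sigmoid_xent(logits, labels, mask, eps=1e-6):
  # Numerically stable sigmoid cross-entropy:
  # max(x, 0) - x*z + log(1 + exp(-|x|)).
  losses = (np.maximum(logits, 0) - logits * labels +
            np.log1p(np.exp(-np.abs(logits))))
  losses *= mask
  per_example = losses.sum(-1) / (eps + mask.sum(-1))
  overall = losses.sum() / (eps + mask.sum())
  return per_example, overall

logits = np.array([[2.0, -1.0, 0.5]])
labels = np.array([[1.0, 0.0, 1.0]])
mask = np.array([[1.0, 1.0, 0.0]])  # last position is padding
print(masked_sigmoid_xent(logits, labels, mask))  # (~[0.220], ~0.220)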
def _get_entropy_output(self, inputs: pretrain_data.Inputs, model):
  """Per-token entropy of the masked LM output distribution."""
  with tf.variable_scope("cls/predictions", reuse=tf.AUTO_REUSE):
    hidden = tf.layers.dense(
        model.get_sequence_output(),
        units=modeling.get_shape_list(model.get_embedding_table())[-1],
        activation=modeling.get_activation(self._bert_config.hidden_act),
        kernel_initializer=modeling.create_initializer(
            self._bert_config.initializer_range))
    hidden = modeling.layer_norm(hidden)
    output_bias = tf.get_variable(
        "output_bias",
        shape=[self._bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(hidden, model.get_embedding_table(), transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    probs = tf.nn.softmax(logits)
    log_probs = tf.nn.log_softmax(logits)
    entropy = -tf.reduce_sum(log_probs * probs, axis=[2])
    EntropyOutput = collections.namedtuple(
        "EntropyOutput", ["logits", "probs", "log_probs", "entropy"])
    return EntropyOutput(
        logits=logits, probs=probs, log_probs=log_probs, entropy=entropy)
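# Toy NumPy check, illustrative and not from the original code, that the
# entropy line above computes the Shannon entropy of each position's softmax
# distribution: H = -sum_v p_v * log p_v.
import numpy as np

logits = np.array([[[0.0, 0.0], [4.0, 0.0]]])   # [batch=1, seq=2, vocab=2]
probs = np.exp(logits) / np.exp(logits).sum(-1, keepdims=True)
entropy = -(probs * np.log(probs)).sum(-1)
print(entropy)  # ~[[0.693, 0.090]]: uniform -> ln 2, peaked -> near 0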
def _build_teacher(self,
                   states,
                   inputs: pretrain_data.Inputs,
                   is_training,
                   name="teacher",
                   reuse=False,
                   **kwargs):
  """Build teacher network to estimate per-token scores."""
  get_shape_list(states, expected_rank=3)  # validates [batch, seq, depth]
  prev_output = states
  hidden_size = self._teacher_config.hidden_size
  num_hidden_layers = self._teacher_config.num_hidden_layers
  if is_training:
    hidden_dropout_prob = self._teacher_config.hidden_dropout_prob
  else:
    hidden_dropout_prob = 0.0
  with tf.variable_scope("teacher", reuse=reuse):
    # Stack of dense -> dropout -> layer-norm blocks over the input states.
    for layer_idx in range(num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer_idx):
        layer_output = tf.layers.dense(
            prev_output,
            hidden_size,
            activation=get_activation("gelu"),
            kernel_initializer=create_initializer(
                self._teacher_config.initializer_range))
        layer_output = dropout(layer_output, hidden_dropout_prob)
        layer_output = layer_norm(layer_output)
        prev_output = layer_output
    sequence_output = prev_output
    with tf.variable_scope("bernoulli"):
      with tf.variable_scope("transform"):
        logits = tf.layers.dense(
            sequence_output,
            units=1,
            kernel_initializer=create_initializer(
                self._teacher_config.initializer_range))
      action_probs = tf.nn.sigmoid(logits)
      # Squeeze only the final size-1 dimension; an axis-less squeeze would
      # also drop the batch dimension when batch_size == 1.
      action_probs = tf.squeeze(action_probs, -1)
  TeacherOutput = collections.namedtuple("TeacherOutput", ["action_probs"])
  return TeacherOutput(action_probs=action_probs)
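# Hedged NumPy sketch of how the "bernoulli" head's per-token probabilities
# could drive sampling decisions; the interpretation (each token selected
# independently with probability action_probs) is inferred from the variable
# names and is not confirmed by this file. Values are toy.
import numpy as np

rng = np.random.default_rng(0)
action_probs = np.array([[0.9, 0.1, 0.5]])   # teacher scores per token
selected = rng.random(action_probs.shape) < action_probs
print(selected.astype(int))                  # e.g. [[1 0 1]]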
def _get_masked_lm_output(self, inputs: pretrain_data.Inputs, model):
  """Masked language modeling softmax layer."""
  masked_lm_weights = inputs.masked_lm_weights
  with tf.variable_scope("generator_predictions"):
    # `logits_embed` is only produced by the non-uniform generator; default
    # it to None so the final return never hits an undefined name.
    logits_embed = None
    if self._config.uniform_generator:
      # A uniform generator scores every vocabulary item equally.
      logits = tf.zeros(self._bert_config.vocab_size)
      logits_tiled = tf.zeros(
          modeling.get_shape_list(inputs.masked_lm_ids) +
          [self._bert_config.vocab_size])
      logits_tiled += tf.reshape(logits, [1, 1, self._bert_config.vocab_size])
      logits = logits_tiled
    else:
      relevant_hidden = pretrain_helpers.gather_positions(
          model.get_sequence_output(), inputs.masked_lm_positions)
      hidden = tf.layers.dense(
          relevant_hidden,
          units=modeling.get_shape_list(model.get_embedding_table())[-1],
          activation=modeling.get_activation(self._bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              self._bert_config.initializer_range))
      hidden = modeling.layer_norm(hidden)
      output_bias = tf.get_variable(
          "output_bias",
          shape=[self._bert_config.vocab_size],
          initializer=tf.zeros_initializer())
      logits_embed = tf.matmul(
          hidden, model.get_embedding_table(), transpose_b=True)
      logits = tf.nn.bias_add(logits_embed, output_bias)
    oh_labels = tf.one_hot(
        inputs.masked_lm_ids, depth=self._bert_config.vocab_size,
        dtype=tf.float32)
    probs = tf.nn.softmax(logits)
    log_probs = tf.nn.log_softmax(logits)
    label_log_probs = -tf.reduce_sum(log_probs * oh_labels, axis=-1)
    numerator = tf.reduce_sum(inputs.masked_lm_weights * label_log_probs)
    denominator = tf.reduce_sum(masked_lm_weights) + 1e-6
    loss = numerator / denominator
    preds = tf.argmax(log_probs, axis=-1, output_type=tf.int32)
    MLMOutput = collections.namedtuple(
        "MLMOutput", ["logits", "probs", "loss", "per_example_loss", "preds"])
    return MLMOutput(
        logits=logits, probs=probs, per_example_loss=label_log_probs,
        loss=loss, preds=preds), logits_embed
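# Illustrative NumPy stand-in, not the library implementation, for what
# pretrain_helpers.gather_positions does above: pull out the hidden vectors
# at the masked positions so the softmax only runs over those tokens.
import numpy as np

def gather_positions_np(sequence, positions):
  # sequence: [batch, seq_len, depth]; positions: [batch, num_preds]
  batch = np.arange(sequence.shape[0])[:, None]
  return sequence[batch, positions]  # [batch, num_preds, depth]

seq = np.arange(24, dtype=np.float32).reshape(2, 4, 3)
print(gather_positions_np(seq, np.array([[0, 2], [1, 3]])))  # (2, 2, 3)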
def get_masked_regression_output(bert_config, input_tensor, positions,
                                 label_values, label_weights):
  """Get loss and predictions for masked-position regression."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/regression"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    output_weights = tf.get_variable(
        "output_weights",
        shape=[1, bert_config.hidden_size],
        initializer=modeling.create_initializer(
            bert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[1], initializer=tf.zeros_initializer())
    outputs = tf.matmul(input_tensor, output_weights, transpose_b=True)
    outputs = tf.nn.bias_add(outputs, output_bias)

    output_values = tf.reshape(outputs, [-1])
    label_values = tf.reshape(label_values, [-1])
    label_weights = tf.reshape(label_weights, [-1])
    per_example_loss = (output_values - label_values)**2
    numerator = tf.reduce_sum(label_weights * per_example_loss)
    denominator = tf.reduce_sum(label_weights) + 1e-5
    loss = numerator / denominator
  return (loss, outputs)
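# Small NumPy sketch, with illustrative values, of the weighted regression
# loss above: squared error per position, averaged over the non-padding
# weights, with a small epsilon guarding against an all-zero denominator.
import numpy as np

preds = np.array([1.0, 2.0, 0.0])
targets = np.array([1.5, 2.0, 0.0])
weights = np.array([1.0, 1.0, 0.0])   # last slot is padding
per_example = (preds - targets) ** 2
loss = (weights * per_example).sum() / (weights.sum() + 1e-5)
print(loss)  # ~0.125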
def get_token_logits(input_reprs, embedding_table, bert_config):
  hidden = tf.layers.dense(
      input_reprs,
      units=modeling.get_shape_list(embedding_table)[-1],
      activation=modeling.get_activation(bert_config.hidden_act),
      kernel_initializer=modeling.create_initializer(
          bert_config.initializer_range))
  hidden = modeling.layer_norm(hidden)
  output_bias = tf.get_variable(
      "output_bias",
      shape=[bert_config.vocab_size],
      initializer=tf.zeros_initializer())
  logits = tf.matmul(hidden, embedding_table, transpose_b=True)
  logits = tf.nn.bias_add(logits, output_bias)
  return logits
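# Hedged NumPy sketch of the weight tying used by get_token_logits: the
# output projection reuses the [vocab, emb] input embedding table (via
# transpose_b=True), so no separate output matrix is trained. Shapes and
# values below are toy.
import numpy as np

emb = np.random.default_rng(0).normal(size=(5, 3))      # [vocab, emb] table
hidden = np.random.default_rng(1).normal(size=(2, 4, 3))  # [batch, seq, emb]
logits = hidden @ emb.T                                  # tied projection
print(logits.shape)  # (2, 4, 5)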
def _get_discriminator_output(self, inputs, discriminator, labels,
                              cloze_output=None):
  """Discriminator binary classifier."""
  with tf.variable_scope("discriminator_predictions"):
    with tf.tpu.bfloat16_scope():
      hidden = tf.layers.dense(
          discriminator.get_sequence_output(),
          units=self._bert_config.hidden_size,
          activation=modeling.get_activation(self._bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              self._bert_config.initializer_range))
      logits = tf.squeeze(tf.layers.dense(hidden, units=1), -1)
    logits = tf.cast(logits, dtype=tf.float32)
    if self._config.electric_objective:
      # Electric objective: shift the binary logit by the cloze model's
      # (stop-gradient) log-likelihood of the observed token and by the
      # prior log-odds that a position holds a replaced token.
      log_q = tf.reduce_sum(
          tf.nn.log_softmax(cloze_output.logits) * tf.one_hot(
              inputs.input_ids, depth=self._bert_config.vocab_size,
              dtype=tf.float32), -1)
      log_q = tf.stop_gradient(log_q)
      logits += log_q
      logits += tf.log(self._config.mask_prob / (1 - self._config.mask_prob))
    weights = tf.cast(inputs.input_mask, tf.float32)
    labelsf = tf.cast(labels, tf.float32)
    losses = tf.nn.sigmoid_cross_entropy_with_logits(
        logits=logits, labels=labelsf) * weights
    per_example_loss = (tf.reduce_sum(losses, axis=-1) /
                        (1e-6 + tf.reduce_sum(weights, axis=-1)))
    loss = tf.reduce_sum(losses) / (1e-6 + tf.reduce_sum(weights))
    probs = tf.nn.sigmoid(logits)
    preds = tf.cast(tf.round((tf.sign(logits) + 1) / 2), tf.int32)
    DiscOutput = collections.namedtuple(
        "DiscOutput", ["loss", "per_example_loss", "probs", "preds",
                       "labels"])
    return DiscOutput(
        loss=loss, per_example_loss=per_example_loss, probs=probs,
        preds=preds, labels=labels)
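# Hedged numeric sketch, toy values only, of the Electric-style adjustment
# above: the discriminator's raw logit is shifted by the cloze log-likelihood
# log q of the observed token and by the prior log-odds log(p / (1 - p)) of a
# position being replaced, before the sigmoid is applied.
import numpy as np

raw_logit = 1.2       # discriminator score for one token
log_q = -3.0          # cloze model log-prob of the observed token
mask_prob = 0.15
logit = raw_logit + log_q + np.log(mask_prob / (1 - mask_prob))
prob_original = 1 / (1 + np.exp(-logit))
print(prob_original)  # probability the token is original, ~0.028 here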
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    label_ids = tf.reshape(label_ids, [-1])
    label_weights = tf.reshape(label_weights, [-1])
    one_hot_labels = tf.one_hot(
        label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.
    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
    numerator = tf.reduce_sum(label_weights * per_example_loss)
    denominator = tf.reduce_sum(label_weights) + 1e-5
    loss = numerator / denominator
  return (loss, per_example_loss, log_probs)
def _get_autoencoder_output(self, inputs: pretrain_data.Inputs, model):
  """Auto-encoder softmax layer."""
  with tf.variable_scope("autoencoder_predictions"):
    relevant_hidden = model.get_sequence_output()
    hidden = tf.layers.dense(
        relevant_hidden,
        units=modeling.get_shape_list(model.get_embedding_table())[-1],
        activation=modeling.get_activation(self._bert_config.hidden_act),
        kernel_initializer=modeling.create_initializer(
            self._bert_config.initializer_range))
    hidden = modeling.layer_norm(hidden)
    output_bias = tf.get_variable(
        "output_bias",
        shape=[self._bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(hidden, model.get_embedding_table(), transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    oh_labels = tf.one_hot(
        inputs.input_ids, depth=self._bert_config.vocab_size,
        dtype=tf.float32)
    probs = tf.nn.softmax(logits)
    log_probs = tf.nn.log_softmax(logits)
    label_log_probs = -tf.reduce_sum(log_probs * oh_labels, axis=-1)
    # Cast the integer input mask to float so it can weight the float losses.
    weights = tf.cast(inputs.input_mask, tf.float32)
    numerator = tf.reduce_sum(weights * label_log_probs)
    denominator = tf.reduce_sum(weights) + 1e-6
    loss = numerator / denominator
    preds = tf.argmax(log_probs, axis=-1, output_type=tf.int32)
    AEOutput = collections.namedtuple(
        "AEOutput", ["logits", "probs", "loss", "per_example_loss", "preds"])
    return AEOutput(
        logits=logits, probs=probs, per_example_loss=label_log_probs,
        loss=loss, preds=preds)
def get_next_sentence_output(bert_config, input_tensor, labels):
  """Get loss and log probs for the next sentence prediction."""
  # Simple binary classification. Note that 0 is "next sentence" and 1 is
  # "random sentence". This weight matrix is not used after pre-training.
  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights",
        shape=[2, bert_config.hidden_size],
        initializer=modeling.create_initializer(
            bert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[2], initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    labels = tf.reshape(labels, [-1])
    one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, per_example_loss, log_probs)
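# Toy NumPy version, illustrative only, of the next-sentence loss above:
# softmax cross-entropy over the two classes, averaged across the batch.
import numpy as np

logits = np.array([[2.0, -1.0], [0.5, 0.5]])
labels = np.array([0, 1])
log_probs = logits - np.log(np.exp(logits).sum(-1, keepdims=True))
per_example = -log_probs[np.arange(len(labels)), labels]
print(per_example.mean())  # ~0.371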
def _get_discriminator_output(self, inputs, discriminator, labels):
  """Discriminator binary classifier."""
  with tf.variable_scope("discriminator_predictions"):
    hidden = tf.layers.dense(
        # The discriminator's final-layer hidden states, used here for the
        # sigmoid binary classification.
        discriminator.get_sequence_output(),
        units=self._bert_config.hidden_size,
        activation=modeling.get_activation(self._bert_config.hidden_act),
        kernel_initializer=modeling.create_initializer(
            self._bert_config.initializer_range))
    # The discriminator output passes through two dense layers before the
    # result is fed to the sigmoid.
    logits = tf.squeeze(tf.layers.dense(hidden, units=1), -1)
    # The weights come from the input mask: 1.0 for real tokens, 0.0 for
    # padding.
    weights = tf.cast(inputs.input_mask, tf.float32)
    labelsf = tf.cast(labels, tf.float32)
    # Note: the cross-entropy op takes the raw logits x directly, not a
    # precomputed sigmoid(x).
    losses = tf.nn.sigmoid_cross_entropy_with_logits(
        logits=logits, labels=labelsf) * weights
    per_example_loss = (tf.reduce_sum(losses, axis=-1) /
                        (1e-6 + tf.reduce_sum(weights, axis=-1)))
    loss = tf.reduce_sum(losses) / (1e-6 + tf.reduce_sum(weights))
    # Sigmoid layer; the input logits can be viewed as the network's final
    # hidden state.
    probs = tf.nn.sigmoid(logits)
    # Threshold at 0.5: predict 1 when the probability exceeds 0.5, else 0.
    preds = tf.cast(tf.round((tf.sign(logits) + 1) / 2), tf.int32)
    DiscOutput = collections.namedtuple(
        "DiscOutput", ["loss", "per_example_loss", "probs", "preds",
                       "labels"])
    return DiscOutput(
        loss=loss, per_example_loss=per_example_loss, probs=probs,
        preds=preds, labels=labels)
def _get_masked_lm_output(self, inputs: pretrain_data.Inputs, model):
  """Masked language modeling softmax layer."""
  masked_lm_weights = inputs.masked_lm_weights
  with tf.variable_scope("generator_predictions"):
    if (self._config.uniform_generator or self._config.identity_generator
        or self._config.heuristic_generator):
      logits = tf.zeros(self._bert_config.vocab_size)
      logits_tiled = tf.zeros(
          modeling.get_shape_list(inputs.masked_lm_ids) +
          [self._bert_config.vocab_size])
      logits_tiled += tf.reshape(logits, [1, 1, self._bert_config.vocab_size])
      logits = logits_tiled
    else:
      relevant_hidden = pretrain_helpers.gather_positions(
          model.get_sequence_output(), inputs.masked_lm_positions)
      hidden = tf.layers.dense(
          relevant_hidden,
          units=modeling.get_shape_list(model.get_embedding_table())[-1],
          activation=modeling.get_activation(self._bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              self._bert_config.initializer_range))
      hidden = modeling.layer_norm(hidden)
      output_bias = tf.get_variable(
          "output_bias",
          shape=[self._bert_config.vocab_size],
          initializer=tf.zeros_initializer())
      logits = tf.matmul(hidden, model.get_embedding_table(),
                         transpose_b=True)
      logits = tf.nn.bias_add(logits, output_bias)

    oh_labels = tf.one_hot(
        inputs.masked_lm_ids, depth=self._bert_config.vocab_size,
        dtype=tf.float32)
    probs = tf.nn.softmax(logits)

    if self._config.identity_generator:
      # Mix the generator distribution with one sharply peaked (logit 25.0)
      # on the original token; the mixing weight ramps up linearly over
      # training.
      identity_logits = tf.zeros(self._bert_config.vocab_size)
      identity_logits_tiled = tf.zeros(
          modeling.get_shape_list(inputs.masked_lm_ids) +
          [self._bert_config.vocab_size])
      masked_identity_weights = tf.one_hot(
          inputs.masked_lm_ids, depth=self._bert_config.vocab_size,
          dtype=tf.float32)
      identity_logits_tiled += 25.0 * masked_identity_weights
      identity_logits_tiled += tf.reshape(
          identity_logits, [1, 1, self._bert_config.vocab_size])
      identity_probs = tf.nn.softmax(identity_logits_tiled)
      identity_weight = (
          tf.cast(self.global_step, tf.float32) /
          tf.cast(self._config.num_train_steps, tf.float32)
      ) * self._config.max_identity_weight
      probs = probs * (1 - identity_weight) + identity_probs * identity_weight
      logits = tf.math.log(probs)  # softmax(log(probs)) == probs
    elif self._config.heuristic_generator:
      # Same scheme, but the peaked distribution covers the synonyms of the
      # masked-out token instead of the token itself.
      synonym_logits = tf.zeros(self._bert_config.vocab_size)
      synonym_logits_tiled = tf.zeros(
          modeling.get_shape_list(inputs.masked_lm_ids) +
          [self._bert_config.vocab_size])
      masked_synonym_weights = tf.reduce_sum(
          tf.one_hot(inputs.masked_synonym_ids,
                     depth=self._bert_config.vocab_size, dtype=tf.float32),
          -2)
      # Zero out the padding id (index 0) in the synonym weights.
      padded_synonym_mask = tf.concat(
          [tf.zeros([1]), tf.ones([self._bert_config.vocab_size - 1])], 0)
      masked_synonym_weights *= tf.expand_dims(
          tf.expand_dims(padded_synonym_mask, 0), 0)
      synonym_logits_tiled += 25.0 * masked_synonym_weights
      synonym_logits_tiled += tf.reshape(
          synonym_logits, [1, 1, self._bert_config.vocab_size])
      synonym_probs = tf.nn.softmax(synonym_logits_tiled)
      if self._config.synonym_scheduler_type == "linear":
        synonym_weight = (
            tf.cast(self.global_step, tf.float32) /
            tf.cast(self._config.num_train_steps, tf.float32)
        ) * self._config.max_synonym_weight
        probs = probs * (1 - synonym_weight) + synonym_probs * synonym_weight
        logits = tf.math.log(probs)  # softmax(log(probs)) == probs

    log_probs = tf.nn.log_softmax(logits)
    label_log_probs = -tf.reduce_sum(log_probs * oh_labels, axis=-1)
    numerator = tf.reduce_sum(inputs.masked_lm_weights * label_log_probs)
    denominator = tf.reduce_sum(masked_lm_weights) + 1e-6
    loss = numerator / denominator
    preds = tf.argmax(log_probs, axis=-1, output_type=tf.int32)
    MLMOutput = collections.namedtuple(
        "MLMOutput", ["logits", "probs", "loss", "per_example_loss", "preds"])
    return MLMOutput(
        logits=logits, probs=probs, per_example_loss=label_log_probs,
        loss=loss, preds=preds)
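# Hedged NumPy sketch of the linear mixing schedule above: as training
# progresses, probability mass shifts from the learned generator distribution
# toward the peaked identity/synonym distribution. Values are toy.
import numpy as np

gen_probs = np.array([0.7, 0.2, 0.1])   # generator softmax for one position
peaked = np.array([0.0, 1.0, 0.0])      # ~one-hot from the +25.0 logit bump
max_weight = 0.8
for step, total in [(0, 100), (50, 100), (100, 100)]:
  w = (step / total) * max_weight
  mixed = gen_probs * (1 - w) + peaked * w
  print(step, mixed)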