Example #1
 def _get_discriminator_output(self, inputs, discriminator, labels):
     """Discriminator binary classifier."""
     with tf.variable_scope("discriminator_predictions"):
         hidden = tf.layers.dense(
             discriminator.get_sequence_output(),
             units=self._bert_config.hidden_size,
             activation=modeling.get_activation(
                 self._bert_config.hidden_act),
             kernel_initializer=modeling.create_initializer(
                 self._bert_config.initializer_range))
         logits = tf.squeeze(tf.layers.dense(hidden, units=1), -1)
         weights = tf.cast(inputs.input_mask, tf.float32)
         labelsf = tf.cast(labels, tf.float32)
         losses = tf.nn.sigmoid_cross_entropy_with_logits(
             logits=logits, labels=labelsf) * weights
         per_example_loss = (tf.reduce_sum(losses, axis=-1) /
                             (1e-6 + tf.reduce_sum(weights, axis=-1)))
         loss = tf.reduce_sum(losses) / (1e-6 + tf.reduce_sum(weights))
         probs = tf.nn.sigmoid(logits)
         preds = tf.cast(tf.round((tf.sign(logits) + 1) / 2), tf.int32)
         DiscOutput = collections.namedtuple(
             "DiscOutput",
             ["loss", "per_example_loss", "probs", "preds", "labels"])
         return DiscOutput(
             loss=loss,
             per_example_loss=per_example_loss,
             probs=probs,
             preds=preds,
             labels=labels,
         )
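The loss here is token-level sigmoid cross-entropy, masked and averaged over the non-padding tokens, with hard predictions recovered from the sign of the logits. A minimal NumPy sketch of that arithmetic (toy values, independent of the ELECTRA code):

import numpy as np

# Toy batch: 2 sequences of length 4; the last token of each is padding.
logits = np.array([[2.0, -1.0, 0.5, 0.0],
                   [-3.0, 4.0, -0.5, 0.0]])
labels = np.array([[1.0, 0.0, 1.0, 0.0],
                   [0.0, 1.0, 0.0, 0.0]])
mask = np.array([[1.0, 1.0, 1.0, 0.0],
                 [1.0, 1.0, 1.0, 0.0]])

# Numerically stable sigmoid cross-entropy, the same formula TF uses:
# max(x, 0) - x*z + log(1 + exp(-|x|))
losses = np.maximum(logits, 0) - logits * labels + np.log1p(np.exp(-np.abs(logits)))
losses *= mask

per_example_loss = losses.sum(-1) / (1e-6 + mask.sum(-1))  # [batch]
loss = losses.sum() / (1e-6 + mask.sum())                  # scalar

# (sign(logits) + 1) / 2 rounds to 1 for positive logits, 0 otherwise.
preds = np.round((np.sign(logits) + 1) / 2).astype(np.int32)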
Example #2
    def _get_entropy_output(self, inputs: pretrain_data.Inputs, model):
        """Masked language modeling softmax layer."""
        with tf.variable_scope("cls/predictions", reuse=tf.AUTO_REUSE):
            hidden = tf.layers.dense(
                model.get_sequence_output(),
                units=modeling.get_shape_list(model.get_embedding_table())[-1],
                activation=modeling.get_activation(
                    self._bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    self._bert_config.initializer_range))
            hidden = modeling.layer_norm(hidden)
            output_bias = tf.get_variable("output_bias",
                                          shape=[self._bert_config.vocab_size],
                                          initializer=tf.zeros_initializer())
            logits = tf.matmul(hidden,
                               model.get_embedding_table(),
                               transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

            probs = tf.nn.softmax(logits)
            log_probs = tf.nn.log_softmax(logits)
            entropy = -tf.reduce_sum(log_probs * probs, axis=[2])

            EntropyOutput = collections.namedtuple(
                "EntropyOutput", ["logits", "probs", "log_probs", "entropy"])
            return EntropyOutput(logits=logits,
                                 probs=probs,
                                 log_probs=log_probs,
                                 entropy=entropy)
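The entropy computed above is the Shannon entropy of each token's softmax distribution over the vocabulary. A small NumPy sketch with made-up logits, just to pin down the reduction:

import numpy as np

logits = np.random.randn(2, 4, 8)        # toy [batch, seq_len, vocab_size]

# Numerically stable log-softmax over the vocab axis.
shifted = logits - logits.max(-1, keepdims=True)
log_probs = shifted - np.log(np.exp(shifted).sum(-1, keepdims=True))
probs = np.exp(log_probs)

entropy = -(probs * log_probs).sum(-1)   # [batch, seq_len], matches axis=[2] above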
Example #3
    def _build_teacher(self,
                       states,
                       inputs: pretrain_data.Inputs,
                       is_training,
                       name="teacher",
                       reuse=False,
                       **kwargs):
        """Build teacher network to estimate token score."""
        input_shape = get_shape_list(states, expected_rank=3)
        prev_output = states
        hidden_size = self._teacher_config.hidden_size
        num_hidden_layers = self._teacher_config.num_hidden_layers
        if is_training:
            hidden_dropout_prob = self._teacher_config.hidden_dropout_prob
        else:
            hidden_dropout_prob = 0.0
        with tf.variable_scope("teacher", reuse=reuse):
            for layer_idx in range(num_hidden_layers):
                with tf.variable_scope("layer_%d" % layer_idx):
                    layer_input = prev_output
                    layer_output = tf.layers.dense(
                        layer_input,
                        hidden_size,
                        activation=get_activation("gelu"),
                        kernel_initializer=create_initializer(
                            self._teacher_config.initializer_range))
                    layer_output = dropout(layer_output, hidden_dropout_prob)
                    layer_output = layer_norm(layer_output)
                    prev_output = layer_output

            sequence_output = prev_output
            with tf.variable_scope("bernoulli"):
                with tf.variable_scope("transform"):
                    logits = tf.layers.dense(
                        sequence_output,
                        units=1,
                        kernel_initializer=create_initializer(
                            self._teacher_config.initializer_range))
                    action_probs = tf.nn.sigmoid(logits)
                    # Only drop the trailing unit dimension so a batch of
                    # size 1 is not squeezed away as well.
                    action_probs = tf.squeeze(action_probs, axis=-1)

            TeacherOutput = collections.namedtuple("TeacherOutput",
                                                   ["action_probs"])
        return TeacherOutput(action_probs=action_probs)
Example #4
    def _get_masked_lm_output(self, inputs: pretrain_data.Inputs, model):
        """Masked language modeling softmax layer."""
        masked_lm_weights = inputs.masked_lm_weights
        with tf.variable_scope("generator_predictions"):
            if self._config.uniform_generator:
                logits = tf.zeros(self._bert_config.vocab_size)
                logits_tiled = tf.zeros(
                    modeling.get_shape_list(inputs.masked_lm_ids) +
                    [self._bert_config.vocab_size])
                logits_tiled += tf.reshape(
                    logits, [1, 1, self._bert_config.vocab_size])
                logits = logits_tiled
                # The uniform branch has no embedding-tied projection; reuse
                # the uniform logits so `logits_embed`, returned below, is
                # always defined.
                logits_embed = logits_tiled
            else:
                relevant_hidden = pretrain_helpers.gather_positions(
                    model.get_sequence_output(), inputs.masked_lm_positions)
                hidden = tf.layers.dense(
                    relevant_hidden,
                    units=modeling.get_shape_list(
                        model.get_embedding_table())[-1],
                    activation=modeling.get_activation(
                        self._bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        self._bert_config.initializer_range),
                )
                hidden = modeling.layer_norm(hidden)
                output_bias = tf.get_variable(
                    "output_bias",
                    shape=[self._bert_config.vocab_size],
                    initializer=tf.zeros_initializer())
                logits_embed = tf.matmul(hidden,
                                         model.get_embedding_table(),
                                         transpose_b=True)
                logits = tf.nn.bias_add(logits_embed, output_bias)

            oh_labels = tf.one_hot(inputs.masked_lm_ids,
                                   depth=self._bert_config.vocab_size,
                                   dtype=tf.float32)

            probs = tf.nn.softmax(logits)
            log_probs = tf.nn.log_softmax(logits)
            label_log_probs = -tf.reduce_sum(log_probs * oh_labels, axis=-1)

            numerator = tf.reduce_sum(inputs.masked_lm_weights *
                                      label_log_probs)
            denominator = tf.reduce_sum(masked_lm_weights) + 1e-6
            loss = numerator / denominator
            preds = tf.argmax(log_probs, axis=-1, output_type=tf.int32)

            MLMOutput = collections.namedtuple(
                "MLMOutput",
                ["logits", "probs", "loss", "per_example_loss", "preds"])
            return MLMOutput(logits=logits,
                             probs=probs,
                             per_example_loss=label_log_probs,
                             loss=loss,
                             preds=preds), logits_embed
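The generator loss above is the weighted average negative log-likelihood of the gold token ids at the masked positions. A NumPy sketch of that reduction (toy shapes, not the ELECTRA input pipeline):

import numpy as np

vocab_size = 10
logits = np.random.randn(2, 3, vocab_size)        # [batch, num_masked, vocab]
masked_lm_ids = np.array([[1, 4, 0], [7, 2, 0]])  # gold ids; last slot is padding
masked_lm_weights = np.array([[1.0, 1.0, 0.0],
                              [1.0, 1.0, 0.0]])   # 0.0 for padded predictions

shifted = logits - logits.max(-1, keepdims=True)
log_probs = shifted - np.log(np.exp(shifted).sum(-1, keepdims=True))

oh_labels = np.eye(vocab_size)[masked_lm_ids]           # one-hot [2, 3, vocab]
label_log_probs = -(log_probs * oh_labels).sum(-1)      # NLL per masked position

loss = (masked_lm_weights * label_log_probs).sum() / (masked_lm_weights.sum() + 1e-6)
preds = log_probs.argmax(-1)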
Example #5
def get_masked_regression_output(bert_config, input_tensor, positions,
                                 label_values, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/regression"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        output_weights = tf.get_variable(
            "output_weights",
            shape=[1, bert_config.hidden_size],
            initializer=modeling.create_initializer(
                bert_config.initializer_range))
        output_bias = tf.get_variable("output_bias",
                                      shape=[1],
                                      initializer=tf.zeros_initializer())
        outputs = tf.matmul(input_tensor, output_weights, transpose_b=True)
        outputs = tf.nn.bias_add(outputs, output_bias)

        output_values = tf.reshape(outputs, [-1])
        label_values = tf.reshape(label_values, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        per_example_loss = (output_values - label_values)**2
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, outputs)
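The regression head reduces to a weighted mean squared error over the gathered positions. A toy NumPy check of that reduction:

import numpy as np

output_values = np.array([0.9, 2.1, 0.0])   # predictions at gathered positions
label_values = np.array([1.0, 2.0, 0.0])
label_weights = np.array([1.0, 1.0, 0.0])   # 0.0 marks padded positions

per_example_loss = (output_values - label_values) ** 2
loss = (label_weights * per_example_loss).sum() / (label_weights.sum() + 1e-5)
# roughly (0.01 + 0.01) / 2 = 0.01; padded positions are ignored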
Example #6
def get_token_logits(input_reprs, embedding_table, bert_config):
    hidden = tf.layers.dense(
        input_reprs,
        units=modeling.get_shape_list(embedding_table)[-1],
        activation=modeling.get_activation(bert_config.hidden_act),
        kernel_initializer=modeling.create_initializer(
            bert_config.initializer_range))
    hidden = modeling.layer_norm(hidden)
    output_bias = tf.get_variable("output_bias",
                                  shape=[bert_config.vocab_size],
                                  initializer=tf.zeros_initializer())
    logits = tf.matmul(hidden, embedding_table, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    return logits
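get_token_logits ties the output projection to the input embedding table: hidden states are projected into the embedding dimension and then scored against the table itself, plus a per-token output bias. A shape-level NumPy sketch of that tied projection (arbitrary toy sizes):

import numpy as np

hidden = np.random.randn(2, 4, 16)           # [batch, seq, embedding_size] after dense + layer_norm
embedding_table = np.random.randn(30, 16)    # [vocab_size, embedding_size]
output_bias = np.zeros(30)

logits = hidden @ embedding_table.T + output_bias   # [batch, seq, vocab_size]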
Example #7
    def _get_discriminator_output(self,
                                  inputs,
                                  discriminator,
                                  labels,
                                  cloze_output=None):
        """Discriminator binary classifier."""
        with tf.variable_scope("discriminator_predictions"):
            with tf.tpu.bfloat16_scope():
                hidden = tf.layers.dense(
                    discriminator.get_sequence_output(),
                    units=self._bert_config.hidden_size,
                    activation=modeling.get_activation(
                        self._bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        self._bert_config.initializer_range))
                logits = tf.squeeze(tf.layers.dense(hidden, units=1), -1)
            logits = tf.cast(logits, dtype=tf.float32)
            if self._config.electric_objective:
                log_q = tf.reduce_sum(
                    tf.nn.log_softmax(cloze_output.logits) *
                    tf.one_hot(inputs.input_ids,
                               depth=self._bert_config.vocab_size,
                               dtype=tf.float32), -1)
                log_q = tf.stop_gradient(log_q)
                logits += log_q
                logits += tf.log(self._config.mask_prob /
                                 (1 - self._config.mask_prob))

            weights = tf.cast(inputs.input_mask, tf.float32)
            labelsf = tf.cast(labels, tf.float32)
            losses = tf.nn.sigmoid_cross_entropy_with_logits(
                logits=logits, labels=labelsf) * weights
            per_example_loss = (tf.reduce_sum(losses, axis=-1) /
                                (1e-6 + tf.reduce_sum(weights, axis=-1)))
            loss = tf.reduce_sum(losses) / (1e-6 + tf.reduce_sum(weights))
            probs = tf.nn.sigmoid(logits)
            preds = tf.cast(tf.round((tf.sign(logits) + 1) / 2), tf.int32)
            DiscOutput = collections.namedtuple(
                "DiscOutput",
                ["loss", "per_example_loss", "probs", "preds", "labels"])
            return DiscOutput(
                loss=loss,
                per_example_loss=per_example_loss,
                probs=probs,
                preds=preds,
                labels=labels,
            )
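Under the Electric objective, the binary logits are shifted by the (stop-gradient) cloze log-likelihood of the observed token plus the log prior odds of masking, so the sigmoid models a noise-contrastive posterior rather than a plain replaced-token classifier. A NumPy sketch of that shift with made-up numbers:

import numpy as np

mask_prob = 0.15
disc_logits = np.array([0.3, -1.2, 2.0])   # raw discriminator scores per token
log_q = np.array([-2.3, -0.1, -5.0])       # cloze log q(x_t | context), treated as a constant

adjusted = disc_logits + log_q + np.log(mask_prob / (1 - mask_prob))
probs = 1.0 / (1.0 + np.exp(-adjusted))    # sigmoid of the shifted logits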
Example #8
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
Example #9
    def _get_autoencoder_output(self, inputs: pretrain_data.Inputs, model):
        """Auto-Encoder softmax layer."""
        with tf.variable_scope("autoencoder_predictions"):
            relevant_hidden = model.get_sequence_output()
            hidden = tf.layers.dense(
                relevant_hidden,
                units=modeling.get_shape_list(model.get_embedding_table())[-1],
                activation=modeling.get_activation(
                    self._bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    self._bert_config.initializer_range))
            hidden = modeling.layer_norm(hidden)
            output_bias = tf.get_variable("output_bias",
                                          shape=[self._bert_config.vocab_size],
                                          initializer=tf.zeros_initializer())
            logits = tf.matmul(hidden,
                               model.get_embedding_table(),
                               transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

            oh_labels = tf.one_hot(inputs.input_ids,
                                   depth=self._bert_config.vocab_size,
                                   dtype=tf.float32)

            probs = tf.nn.softmax(logits)
            log_probs = tf.nn.log_softmax(logits)
            label_log_probs = -tf.reduce_sum(log_probs * oh_labels, axis=-1)

            numerator = tf.reduce_sum(inputs.input_mask * label_log_probs)
            denominator = tf.reduce_sum(inputs.input_mask) + 1e-6
            loss = numerator / denominator
            preds = tf.argmax(log_probs, axis=-1, output_type=tf.int32)

            AEOutput = collections.namedtuple(
                "AEOutput",
                ["logits", "probs", "loss", "per_example_loss", "preds"])
            return AEOutput(logits=logits,
                            probs=probs,
                            per_example_loss=label_log_probs,
                            loss=loss,
                            preds=preds)
Example #10
def get_next_sentence_output(bert_config, input_tensor, labels):
  """Get loss and log probs for the next sentence prediction."""

  # Simple binary classification. Note that 0 is "next sentence" and 1 is
  # "random sentence". This weight matrix is not used after pre-training.
  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights",
        shape=[2, bert_config.hidden_size],
        initializer=modeling.create_initializer(bert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[2], initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    labels = tf.reshape(labels, [-1])
    one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, per_example_loss, log_probs)
Example #11
 def _get_discriminator_output(self, inputs, discriminator, labels):
     """Discriminator binary classifier."""
     with tf.variable_scope("discriminator_predictions"):
         # The discriminator's final-layer hidden states are used here for
         # sigmoid binary classification.
         hidden = tf.layers.dense(
             discriminator.get_sequence_output(),
             units=self._bert_config.hidden_size,
             activation=modeling.get_activation(
                 self._bert_config.hidden_act),
             kernel_initializer=modeling.create_initializer(
                 self._bert_config.initializer_range))
         # Two dense layers (the one above and this units=1 projection) sit
         # on top of the discriminator output before the sigmoid is applied.
         logits = tf.squeeze(tf.layers.dense(hidden, units=1), -1)
         # The per-token weights come from the input mask, so padding
         # positions contribute nothing to the loss.
         weights = tf.cast(inputs.input_mask, tf.float32)
         labelsf = tf.cast(labels, tf.float32)
         # Note: the cross-entropy op takes the raw logits directly rather
         # than a precomputed sigmoid(logits).
         losses = tf.nn.sigmoid_cross_entropy_with_logits(
             logits=logits, labels=labelsf) * weights
         per_example_loss = (tf.reduce_sum(losses, axis=-1) /
                             (1e-6 + tf.reduce_sum(weights, axis=-1)))
         loss = tf.reduce_sum(losses) / (1e-6 + tf.reduce_sum(weights))
         # Sigmoid layer; the logits can be viewed as the network's final
         # hidden states.
         probs = tf.nn.sigmoid(logits)
         # Threshold at 0.5: positive logits map to 1, non-positive to 0.
         preds = tf.cast(tf.round((tf.sign(logits) + 1) / 2), tf.int32)
         DiscOutput = collections.namedtuple(
             "DiscOutput",
             ["loss", "per_example_loss", "probs", "preds", "labels"])
         return DiscOutput(
             loss=loss,
             per_example_loss=per_example_loss,
             probs=probs,
             preds=preds,
             labels=labels,
         )
Example #12
  def _get_masked_lm_output(self, inputs: pretrain_data.Inputs, model):
    """Masked language modeling softmax layer."""
    masked_lm_weights = inputs.masked_lm_weights
    with tf.variable_scope("generator_predictions"):
      if self._config.uniform_generator or self._config.identity_generator or self._config.heuristic_generator:
        logits = tf.zeros(self._bert_config.vocab_size)
        logits_tiled = tf.zeros(
            modeling.get_shape_list(inputs.masked_lm_ids) +
            [self._bert_config.vocab_size])
        logits_tiled += tf.reshape(logits, [1, 1, self._bert_config.vocab_size])
        logits = logits_tiled
      else:
        relevant_hidden = pretrain_helpers.gather_positions(
            model.get_sequence_output(), inputs.masked_lm_positions)
        hidden = tf.layers.dense(
            relevant_hidden,
            units=modeling.get_shape_list(model.get_embedding_table())[-1],
            activation=modeling.get_activation(self._bert_config.hidden_act),
            kernel_initializer=modeling.create_initializer(
                self._bert_config.initializer_range))
        hidden = modeling.layer_norm(hidden)
        output_bias = tf.get_variable(
            "output_bias",
            shape=[self._bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(hidden, model.get_embedding_table(),
                           transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

      oh_labels = tf.one_hot(
          inputs.masked_lm_ids, depth=self._bert_config.vocab_size,
          dtype=tf.float32)

      probs = tf.nn.softmax(logits)

      if self._config.identity_generator:
          identity_logits = tf.zeros(self._bert_config.vocab_size)
          identity_logits_tiled = tf.zeros(
              modeling.get_shape_list(inputs.masked_lm_ids) +
              [self._bert_config.vocab_size])
          masked_identity_weights = tf.one_hot(inputs.masked_lm_ids, depth=self._bert_config.vocab_size, dtype=tf.float32)
          identity_logits_tiled += 25.0 * masked_identity_weights
          identity_logits_tiled += tf.reshape(identity_logits, [1, 1, self._bert_config.vocab_size])
          identity_logits = identity_logits_tiled
          identity_probs = tf.nn.softmax(identity_logits)

          identity_weight = (self.global_step / tf.cast(self._config.num_train_steps, tf.float32)) * self._config.max_identity_weight
          probs = probs * (1 - identity_weight) + identity_probs * identity_weight
          logits = tf.math.log(probs)  # softmax(log(probs)) = probs
      elif self._config.heuristic_generator:
          synonym_logits = tf.zeros(self._bert_config.vocab_size)
          synonym_logits_tiled = tf.zeros(
              modeling.get_shape_list(inputs.masked_lm_ids) +
              [self._bert_config.vocab_size])
          masked_synonym_weights = tf.reduce_sum(
              tf.one_hot(inputs.masked_synonym_ids, depth=self._bert_config.vocab_size, dtype=tf.float32), -2)
          padded_synonym_mask = tf.concat([tf.zeros([1]), tf.ones([self._bert_config.vocab_size - 1])], 0)
          masked_synonym_weights *= tf.expand_dims(tf.expand_dims(padded_synonym_mask, 0), 0)
          synonym_logits_tiled += 25.0 * masked_synonym_weights
          synonym_logits_tiled += tf.reshape(synonym_logits, [1, 1, self._bert_config.vocab_size])
          synonym_logits = synonym_logits_tiled
          synonym_probs = tf.nn.softmax(synonym_logits)

          if self._config.synonym_scheduler_type == 'linear':
              synonym_weight = (self.global_step / tf.cast(self._config.num_train_steps, tf.float32)) * self._config.max_synonym_weight
              probs = probs * (1 - synonym_weight) + synonym_probs * synonym_weight
              logits = tf.math.log(probs)  # softmax(log(probs)) = probs

      log_probs = tf.nn.log_softmax(logits)
      label_log_probs = -tf.reduce_sum(log_probs * oh_labels, axis=-1)

      numerator = tf.reduce_sum(inputs.masked_lm_weights * label_log_probs)
      denominator = tf.reduce_sum(masked_lm_weights) + 1e-6
      loss = numerator / denominator
      preds = tf.argmax(log_probs, axis=-1, output_type=tf.int32)

      MLMOutput = collections.namedtuple(
          "MLMOutput", ["logits", "probs", "loss", "per_example_loss", "preds"])
      return MLMOutput(
          logits=logits, probs=probs, per_example_loss=label_log_probs,
          loss=loss, preds=preds)
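In the identity and heuristic branches above, the generator's softmax is linearly interpolated with an artificial distribution, with the mixing weight ramped up over training; taking the log of the mixed probabilities yields logits whose softmax reproduces them. A NumPy sketch of that schedule with hypothetical settings:

import numpy as np

num_train_steps = 10000.0
max_synonym_weight = 0.5
global_step = 2500.0

probs = np.array([[0.7, 0.2, 0.1]])          # generator softmax over a toy vocab
synonym_probs = np.array([[0.1, 0.8, 0.1]])  # distribution peaked on synonym ids

synonym_weight = (global_step / num_train_steps) * max_synonym_weight  # 0.125 here
mixed = probs * (1 - synonym_weight) + synonym_probs * synonym_weight
logits = np.log(mixed)   # softmax(log(p)) == p (up to an additive constant)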