Example No. 1
def get_mlm_logits(model, albert_config, mlm_positions):
    """From run_pretraining.py."""
    input_tensor = gather_indexes(model.get_sequence_output(), mlm_positions)
    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=albert_config.embedding_size,
                activation=modeling.get_activation(albert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    albert_config.initializer_range),
            )
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[albert_config.vocab_size],
            initializer=tf.zeros_initializer(),
        )
        logits = tf.matmul(input_tensor,
                           model.get_embedding_table(),
                           transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
    return logits
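
Below is a minimal usage sketch, not part of the original snippet: it shows how get_mlm_logits could be wired into a TF1 graph. The modeling module, albert_config, and the gather_indexes helper are assumptions taken to be importable from the ALBERT codebase, as in run_pretraining.py.

# Usage sketch only. `modeling`, `albert_config` and `gather_indexes` are
# assumed to come from the ALBERT codebase (run_pretraining.py); they are
# not defined in the snippet above.
import tensorflow as tf

input_ids = tf.placeholder(tf.int32, [None, None])      # token ids
input_mask = tf.placeholder(tf.int32, [None, None])      # attention mask
mlm_positions = tf.placeholder(tf.int32, [None, None])   # masked positions

model = modeling.AlbertModel(
    config=albert_config,
    is_training=False,
    input_ids=input_ids,
    input_mask=input_mask,
    use_one_hot_embeddings=False,
)

# Vocabulary logits for every masked position,
# shape [num_masked_positions_total, vocab_size].
mlm_logits = get_mlm_logits(model, albert_config, mlm_positions)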
Example No. 2
def get_masked_lm_output(
    albert_config,
    input_tensor,
    output_weights,
    positions,
    label_ids,
    label_weights,
):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope('cls/predictions'):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope('transform'):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=albert_config.embedding_size,
                activation=modeling.get_activation(albert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    albert_config.initializer_range),
            )
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            'output_bias',
            shape=[albert_config.vocab_size],
            initializer=tf.zeros_initializer(),
        )
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(
            label_ids, depth=albert_config.vocab_size, dtype=tf.float32
        )

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(
            log_probs * one_hot_labels, axis=[-1]
        )
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
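
For reference, the following is a sketch of a typical call site in the style of run_pretraining.py; the model instance and the masked_lm_* tensors are assumptions, not part of the snippet above.

# Call-site sketch (assumptions: `model` is an AlbertModel, and the
# masked_lm_* tensors come from the pre-training input pipeline).
(masked_lm_loss,
 masked_lm_example_loss,
 masked_lm_log_probs) = get_masked_lm_output(
    albert_config,
    model.get_sequence_output(),   # [batch, seq_len, hidden_size] encoder output
    model.get_embedding_table(),   # tied output weights, [vocab_size, embedding_size]
    masked_lm_positions,           # [batch, max_predictions_per_seq]
    masked_lm_ids,                 # [batch, max_predictions_per_seq]
    masked_lm_weights,             # 1.0 for real predictions, 0.0 for padding
)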
Example No. 3
    def __init__(self, bert_config, tokenizer):
        """Build an ALBERT graph that samples tokens for masked positions."""
        _graph = tf.Graph()
        with _graph.as_default():
            self.X = tf.placeholder(tf.int32, [None, None])        # input token ids
            self.top_p = tf.placeholder(tf.float32, None)          # nucleus (top-p) threshold
            self.top_k = tf.placeholder(tf.int32, None)            # top-k cutoff
            self.k = tf.placeholder(tf.int32, None)                # number of samples to draw
            self.temperature = tf.placeholder(tf.float32, None)    # softmax temperature
            self.indices = tf.placeholder(tf.int32, [None, None])  # (row, position) of masked tokens
            self.MASK = tf.placeholder(tf.int32, [None, None])     # attention mask
            self._tokenizer = tokenizer

            self.model = modeling.AlbertModel(
                config=bert_config,
                is_training=False,
                input_ids=self.X,
                input_mask=self.MASK,
                use_one_hot_embeddings=False,
            )
            self.logits = self.model.get_pooled_output()
            input_tensor = self.model.get_sequence_output()
            output_weights = self.model.get_embedding_table()

            with tf.variable_scope('cls/predictions'):
                with tf.variable_scope('transform'):
                    input_tensor = tf.layers.dense(
                        input_tensor,
                        units=bert_config.embedding_size,
                        activation=modeling.get_activation(
                            bert_config.hidden_act),
                        kernel_initializer=modeling.create_initializer(
                            bert_config.initializer_range),
                    )
                    input_tensor = modeling.layer_norm(input_tensor)

                output_bias = tf.get_variable(
                    'output_bias',
                    shape=[bert_config.vocab_size],
                    initializer=tf.zeros_initializer(),
                )
                logits = tf.matmul(input_tensor,
                                   output_weights,
                                   transpose_b=True)
                self._logits = tf.nn.bias_add(logits, output_bias)
                self._log_softmax = tf.nn.log_softmax(self._logits, axis=-1)

            logits = tf.gather_nd(self._logits, self.indices)
            logits = logits / self.temperature

            def nucleus():
                return top_p_logits(logits, self.top_p)

            def select_k():
                return top_k_logits(logits, self.top_k)

            # Nucleus (top-p) sampling when top_p > 0, otherwise top-k sampling.
            logits = tf.cond(self.top_p > 0, nucleus, select_k)
            self.samples = tf.multinomial(logits,
                                          num_samples=self.k,
                                          output_dtype=tf.int32)

            self._sess = tf.InteractiveSession()
            self._sess.run(tf.global_variables_initializer())
            var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          scope='bert')
            cls = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                    scope='cls')
            self._saver = tf.train.Saver(var_list=var_lists + cls)
            attns = _extract_attention_weights(bert_config.num_hidden_layers,
                                               tf.get_default_graph())
            self.attns = attns
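
A hypothetical feed sketch for this class (the instance name m and the input arrays are illustrative assumptions): after restoring weights with self._saver, tokens for the masked positions can be drawn from self.samples.

# Hypothetical usage, for illustration only. `m` is an instance of the class
# above; input_ids, input_mask and mask_indices are assumed numpy arrays.
samples = m._sess.run(
    m.samples,
    feed_dict={
        m.X: input_ids,           # [batch, seq_len] token ids containing masks
        m.MASK: input_mask,       # [batch, seq_len] attention mask
        m.indices: mask_indices,  # [n_masked, 2] (batch_row, position) pairs
        m.temperature: 1.0,       # softmax temperature
        m.top_p: 0.9,             # > 0 selects the nucleus (top-p) branch
        m.top_k: 0,               # used only when top_p == 0
        m.k: 1,                   # samples drawn per masked position
    },
)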