Example No. 1
def get_mlm_logits(model, albert_config, mlm_positions):
    """From run_pretraining.py."""
    input_tensor = gather_indexes(model.get_sequence_output(), mlm_positions)
    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=albert_config.embedding_size,
                activation=modeling.get_activation(albert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    albert_config.initializer_range),
            )
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[albert_config.vocab_size],
            initializer=tf.zeros_initializer(),
        )
        logits = tf.matmul(input_tensor,
                           model.get_embedding_table(),
                           transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
    return logits
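For context, a minimal sketch of how get_mlm_logits might be wired up. This is an assumption-laden illustration, not part of the example above: the config path, the placeholder names, and the availability of the ALBERT repo's modeling module and the gather_indexes helper are all assumed.

import tensorflow as tf  # TF 1.x, as in the examples on this page
import modeling          # assumed: ALBERT repo's modeling.py on the path

# Hypothetical inputs; shapes are [batch_size, seq_length] and
# [batch_size, max_predictions_per_seq].
albert_config = modeling.AlbertConfig.from_json_file("albert_config.json")
input_ids = tf.placeholder(tf.int32, [None, None])
input_mask = tf.placeholder(tf.int32, [None, None])
segment_ids = tf.placeholder(tf.int32, [None, None])
mlm_positions = tf.placeholder(tf.int32, [None, None])

model = modeling.AlbertModel(
    config=albert_config,
    is_training=False,
    input_ids=input_ids,
    input_mask=input_mask,
    token_type_ids=segment_ids,
    use_one_hot_embeddings=False)

# Shape [batch_size * max_predictions, vocab_size]; the output weights are
# tied to the input embedding table inside get_mlm_logits.
mlm_logits = get_mlm_logits(model, albert_config, mlm_positions)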
Example No. 2
def get_sentence_order_output(albert_config, input_tensor, labels):
    """Get loss and log probs for the sentence-order prediction."""

    # Simple binary classification. Note that 0 is "segments in order" and 1 is
    # "segments swapped". This weight matrix is not used after pre-training.
    with tf.variable_scope('cls/seq_relationship'):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[2, albert_config.hidden_size],
            initializer=modeling.create_initializer(
                albert_config.initializer_range
            ),
        )
        output_bias = tf.get_variable(
            'output_bias', shape=[2], initializer=tf.zeros_initializer()
        )

        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        labels = tf.reshape(labels, [-1])
        one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, per_example_loss, log_probs)
Example No. 3
def get_masked_lm_output(
    albert_config,
    input_tensor,
    output_weights,
    positions,
    label_ids,
    label_weights,
):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope('cls/predictions'):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope('transform'):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=albert_config.embedding_size,
                activation=modeling.get_activation(albert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    albert_config.initializer_range
                ),
            )
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            'output_bias',
            shape=[albert_config.vocab_size],
            initializer=tf.zeros_initializer(),
        )
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(
            label_ids, depth=albert_config.vocab_size, dtype=tf.float32
        )

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(
            log_probs * one_hot_labels, axis=[-1]
        )
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
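A toy NumPy illustration of the weighted mean at the end of this function (not from the original): padded prediction slots carry a label weight of 0.0 and therefore do not contribute to the loss.

import numpy as np

per_example_loss = np.array([2.0, 1.0, 5.0])  # last slot is a padded prediction
label_weights = np.array([1.0, 1.0, 0.0])     # 0.0 marks the padding

numerator = np.sum(label_weights * per_example_loss)  # 3.0
denominator = np.sum(label_weights) + 1e-5            # ~2.0
loss = numerator / denominator                        # ~1.5; the padded slot is ignored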
Example No. 4
def get_sentence_order_logits(input_tensor, albert_config):
  """Get loss and log probs for the next sentence prediction."""

  # Simple binary classification. Note that 0 is "next sentence" and 1 is
  # "random sentence". This weight matrix is not used after pre-training.
  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights",
        shape=[2, albert_config.hidden_size],
        initializer=modeling.create_initializer(
            albert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[2], initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    return logits
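Because output_weights has shape [2, hidden_size], input_tensor here is the pooled [CLS] representation. A one-line usage sketch, reusing the hypothetical model and albert_config from the sketch under Example No. 1 (assumed names, not from this example):

# [batch_size, 2] sentence-order logits from the pooled output.
sop_logits = get_sentence_order_logits(model.get_pooled_output(), albert_config)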
Example No. 5
def create_model(albert_config, is_training, input_ids, input_mask,
                 segment_ids, input_cdc_ids, age, sex_ids, labels, num_labels,
                 use_one_hot_embeddings):
    """Creates a classification model."""
    if not FLAGS.cdc_only:
        model = modeling.AlbertModel(
            config=albert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)

        # In the demo, we are doing a simple classification task on the entire
        # segment.
        #
        # If you want to use the token-level output, use model.get_sequence_output()
        # instead.
        if FLAGS.use_pooled_output:
            tf.logging.info("using pooled output")
            output_albert_layer = model.get_pooled_output()
        else:
            tf.logging.info("using meaned output")
            output_albert_layer = tf.reduce_mean(model.get_sequence_output(),
                                                 axis=1)

    with tf.variable_scope('cdc'):
        with tf.variable_scope("embedding"):
            embedding_table = tf.get_variable(
                name="embedding_table",
                shape=[FLAGS.cdc_vocab_size, FLAGS.cdc_embedding_size],
                initializer=modeling.create_initializer())
            embedded = tf.nn.embedding_lookup(embedding_table, input_cdc_ids)
            mask = tf.not_equal(input_cdc_ids, 0)
            embed_average = tf.keras.layers.GlobalAveragePooling1D()(embedded,
                                                                     mask)
            embed_max = tf.keras.layers.GlobalMaxPooling1D()(embedded)
            concat_max_average = tf.concat([embed_average, embed_max], axis=-1)


#            concat_sex_age = tf.concat([average, age, sex_ids], axis=-1)
#
#        with tf.variable_scope("dense_1"):
#            input_size = concat_sex_age.shape[-1].value
#            output_size = 2 * FLAGS.cdc_embedding_size
#
#            W = tf.get_variable(name="kernel",
#                                shape=[input_size, output_size],
#                                initializer=modeling.create_initializer())
#            b = tf.get_variable(name="bias",
#                                shape=[output_size],
#                                initializer=tf.zeros_initializer)
#            dense_1 = tf.matmul(concat_sex_age, W)
#            dense_1 = tf.nn.bias_add(dense_1, b)
#            dense_1 = tf.nn.relu(dense_1)
#
#        with tf.variable_scope("dense_2"):
#            input_size = dense_1.shape[-1].value
#            output_size = FLAGS.cdc_embedding_size
#            W = tf.get_variable(name="kernel",
#                                shape=[input_size, output_size],
#                                initializer=modeling.create_initializer())
#            b = tf.get_variable(name="bias",
#                                shape=[output_size],
#                                initializer=tf.zeros_initializer)
#            dense_2 = tf.matmul(dense_1, W)
#            dense_2 = tf.nn.bias_add(dense_2, b)
#            dense_2 = tf.nn.relu(dense_2)

        output_cdc_layer = tf.concat([age, sex_ids, concat_max_average],
                                     axis=-1)

    # Concatenate the output_layer with other features
    if FLAGS.cdc_only:
        output_layer = output_cdc_layer
    else:
        output_layer = tf.concat([output_albert_layer, output_cdc_layer],
                                 axis=-1)

    hidden_size = output_layer.shape[-1].value

    with tf.variable_scope("output"):
        output_weights = tf.get_variable(
            "output_weights", [num_labels, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))
        output_bias = tf.get_variable("output_bias", [num_labels],
                                      initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
        probabilities = tf.nn.softmax(logits, axis=-1)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

        return (loss, per_example_loss, probabilities, predictions)
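As an aside, the one-hot cross-entropy at the end of create_model can be written more compactly with TensorFlow's sparse form. A self-contained toy sketch (the values are illustrative and not from the example):

import tensorflow as tf  # TF 1.x

logits = tf.constant([[2.0, 0.5, -1.0]])  # toy [batch_size, num_labels] logits
labels = tf.constant([0])                 # toy integer class labels

# Equivalent to -sum(one_hot_labels * log_softmax(logits)) per example.
per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=labels, logits=logits)
loss = tf.reduce_mean(per_example_loss)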
Example No. 6
    def __init__(self, bert_config, tokenizer):
        _graph = tf.Graph()
        with _graph.as_default():
            self.X = tf.placeholder(tf.int32, [None, None])
            self.top_p = tf.placeholder(tf.float32, None)
            self.top_k = tf.placeholder(tf.int32, None)
            self.k = tf.placeholder(tf.int32, None)
            self.temperature = tf.placeholder(tf.float32, None)
            self.indices = tf.placeholder(tf.int32, [None, None])
            self.MASK = tf.placeholder(tf.int32, [None, None])
            self._tokenizer = tokenizer

            self.model = modeling.AlbertModel(
                config=bert_config,
                is_training=False,
                input_ids=self.X,
                input_mask=self.MASK,
                use_one_hot_embeddings=False,
            )
            self.logits = self.model.get_pooled_output()
            input_tensor = self.model.get_sequence_output()
            output_weights = self.model.get_embedding_table()

            with tf.variable_scope('cls/predictions'):
                with tf.variable_scope('transform'):
                    input_tensor = tf.layers.dense(
                        input_tensor,
                        units=bert_config.embedding_size,
                        activation=modeling.get_activation(
                            bert_config.hidden_act),
                        kernel_initializer=modeling.create_initializer(
                            bert_config.initializer_range),
                    )
                    input_tensor = modeling.layer_norm(input_tensor)

                output_bias = tf.get_variable(
                    'output_bias',
                    shape=[bert_config.vocab_size],
                    initializer=tf.zeros_initializer(),
                )
                logits = tf.matmul(input_tensor,
                                   output_weights,
                                   transpose_b=True)
                self._logits = tf.nn.bias_add(logits, output_bias)
                self._log_softmax = tf.nn.log_softmax(self._logits, axis=-1)

            logits = tf.gather_nd(self._logits, self.indices)
            logits = logits / self.temperature

            def nucleus():
                return top_p_logits(logits, self.top_p)

            def select_k():
                return top_k_logits(logits, self.top_k)

            # top_p > 0 selects nucleus (top-p) sampling; otherwise fall back to top-k.
            logits = tf.cond(self.top_p > 0, nucleus, select_k)
            self.samples = tf.multinomial(logits,
                                          num_samples=self.k,
                                          output_dtype=tf.int32)

            self._sess = tf.InteractiveSession()
            self._sess.run(tf.global_variables_initializer())
            var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          scope='bert')
            cls = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                    scope='cls')
            self._saver = tf.train.Saver(var_list=var_lists + cls)
            attns = _extract_attention_weights(bert_config.num_hidden_layers,
                                               tf.get_default_graph())
            self.attns = attns
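A sketch of driving the sampler built above, assuming the surrounding class is instantiated as model (after restoring a checkpoint through its saver) and that its tokenizer exposes the usual convert_tokens_to_ids; the feed values are illustrative assumptions only.

# Hypothetical three-token input with a single [MASK] to fill.
tokens = model._tokenizer.convert_tokens_to_ids(['[CLS]', '[MASK]', '[SEP]'])
samples = model._sess.run(
    model.samples,
    feed_dict={
        model.X: [tokens],
        model.MASK: [[1, 1, 1]],
        model.indices: [[0, 1]],   # gather the logits at the [MASK] position
        model.temperature: 1.0,
        model.top_p: 0.9,          # > 0 selects the nucleus (top-p) branch
        model.top_k: 0,
        model.k: 5,                # draw 5 candidate token ids
    })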