Example #1
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 image_vector, use_one_hot_embeddings, scope):
    """Creates a model."""
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings,
                               scope=scope)

    if FLAGS.ignore_image:
        logit = tf.layers.dense(model.get_pooled_output(),
                                1,
                                activation=tf.tanh,
                                kernel_initializer=modeling.create_initializer(
                                    bert_config.initializer_range))
        logit = tf.squeeze(logit, axis=1)
    else:
        logit = tf.einsum("ij,ij->i",
                          tf.layers.dense(
                              image_vector,
                              bert_config.hidden_size,
                              activation=tf.tanh,
                              kernel_initializer=modeling.create_initializer(
                                  bert_config.initializer_range)),
                          model.get_pooled_output(),
                          name="inner")

    return tf.stack([-logit, logit], axis=1)
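# A minimal NumPy sketch (illustrative, with random stand-in arrays) of the
# scoring above: einsum("ij,ij->i", a, b) is a row-wise dot product between the
# projected image vector and the pooled [CLS] output, and stacking
# [-logit, logit] gives two-class logits whose softmax for the positive class
# equals sigmoid(2 * logit).
import numpy as np

image_proj = np.random.rand(4, 8)   # stand-in for dense(image_vector, hidden_size)
pooled = np.random.rand(4, 8)       # stand-in for model.get_pooled_output()
logit = np.einsum("ij,ij->i", image_proj, pooled)
two_class_logits = np.stack([-logit, logit], axis=1)
probs = np.exp(two_class_logits) / np.exp(two_class_logits).sum(axis=1, keepdims=True)
assert np.allclose(probs[:, 1], 1.0 / (1.0 + np.exp(-2.0 * logit)))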
Example #2
 def __init__(
     self,
 ):
     self.X = tf.placeholder(tf.int32, [None, None])
     
     model = modeling.BertModel(
         config=bert_config,
         is_training=False,
         input_ids=self.X,
         use_one_hot_embeddings=False)
     
     output_layer = model.get_sequence_output()
     embedding = model.get_embedding_table()
     
     with tf.variable_scope('cls/predictions'):
         with tf.variable_scope('transform'):
             input_tensor = tf.layers.dense(
                 output_layer,
                 units = bert_config.hidden_size,
                 activation = modeling.get_activation(bert_config.hidden_act),
                 kernel_initializer = modeling.create_initializer(
                     bert_config.initializer_range
                 ),
             )
             input_tensor = modeling.layer_norm(input_tensor)
         
         output_bias = tf.get_variable(
             'output_bias',
             shape = [bert_config.vocab_size],
             initializer = tf.zeros_initializer(),
         )
         logits = tf.matmul(input_tensor, embedding, transpose_b = True)
         self.logits = tf.nn.bias_add(logits, output_bias)
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions, top_k_indices, truncation_factor):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs_student = tf.nn.log_softmax(logits, axis=-1)
    probs_student = tf.nn.softmax(logits, axis=-1)

    prob_shape = tf.shape(log_probs_student)
    new_shape = [prob_shape[0], truncation_factor] #[batch_size*seq_len,truncation_factor]

    top_k_indices = tf.reshape(top_k_indices, new_shape)
    top_k_log_probs_student = tf.batch_gather(log_probs_student, top_k_indices)
    top_k_probs_student = tf.batch_gather(probs_student, top_k_indices)

    return top_k_log_probs_student, top_k_probs_student
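# A small NumPy sketch (illustrative, not part of the snippet above) of the
# tf.batch_gather step: for each row of the student's log-probabilities, pick
# the entries at that row's top-k vocabulary indices. np.take_along_axis has
# the same per-row gather semantics.
import numpy as np

log_probs = np.log(np.random.dirichlet(np.ones(5), size=3))  # [positions, vocab]
top_k_indices = np.array([[0, 4], [1, 2], [3, 0]])           # [positions, truncation_factor]
top_k_log_probs = np.take_along_axis(log_probs, top_k_indices, axis=1)
print(top_k_log_probs.shape)  # (3, 2)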
Example #4
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

    return logits
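# A tiny NumPy sketch (random stand-in weights) of the weight tying noted in
# the comment above: the output projection reuses the input embedding table,
# so the logits are just hidden_states @ embedding_table.T plus a per-token bias.
import numpy as np

hidden = np.random.rand(6, 16)             # [num_masked_positions, hidden_size]
embedding_table = np.random.rand(100, 16)  # [vocab_size, hidden_size]
output_bias = np.zeros(100)
logits = hidden @ embedding_table.T + output_bias
print(logits.shape)  # (6, 100)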
 def __init__(self, config, is_training, scope_prefix=""):
     config = copy.deepcopy(config)
     if not is_training:
         config.hidden_dropout_prob = 0.0
         config.attention_probs_dropout_prob = 0.0
     self._config = config
     self._initializer = modeling.create_initializer(
         config.initializer_range)
     self.scope_prefix = scope_prefix + "/" if scope_prefix else ""
     with tf.variable_scope(self.scope_prefix + "bert/embeddings"):
         self._embedding_table = tf.get_variable(
             name="word_embeddings",
             shape=[config.vocab_size, config.hidden_size],
             initializer=self._initializer)
         self._segment_table = tf.get_variable(
             name="segment_embeddings",
             shape=[config.max_segments, config.hidden_size],
             initializer=self._initializer)
         self._position_table = tf.get_variable(
             name="position_embeddings",
             shape=[config.max_positions, config.hidden_size],
             initializer=self._initializer)
         self._condition_position_table = tf.get_variable(
             name="condition_position_embeddings",
             shape=[config.max_conditions, config.hidden_size],
             initializer=self._initializer)
         self._image_region_table = tf.get_variable(
             name="image_region_embeddings",
             shape=[config.max_image_regions, config.hidden_size],
             initializer=self._initializer)
         self._image_order_table = tf.get_variable(
             name="image_order_embeddings",
             shape=[config.max_image_regions, config.hidden_size],
             initializer=self._initializer)
Example #6
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, use_one_hot_embeddings):
    """Creates a classification model."""
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)

    output_layer = model.get_pooled_output()

    hidden_size = output_layer.shape[-1].value

    num_labels = 2  # This is hardcoded for binary classification

    with tf.variable_scope("cls/seq_relationship"):
        output_weights = tf.get_variable(
            "output_weights",
            shape=[num_labels, hidden_size],
            initializer=modeling.create_initializer(
                bert_config.initializer_range))
        output_bias = tf.get_variable("output_bias",
                                      shape=[num_labels],
                                      initializer=tf.zeros_initializer())

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        probabilities = tf.nn.softmax(logits, axis=-1)
        labels = tf.reshape(labels, [-1])
        one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, per_example_loss, logits, probabilities)
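# A minimal NumPy sketch (illustrative values) of the loss computed above:
# cross-entropy from log-softmax logits and one-hot labels, averaged over the
# batch.
import numpy as np

logits = np.random.rand(4, 2)
labels = np.array([0, 1, 1, 0])
log_probs = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
one_hot = np.eye(2)[labels]
per_example_loss = -(one_hot * log_probs).sum(axis=1)
loss = per_example_loss.mean()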
Example #7
    def get_masked_lm_output(self):
        self.input_tensor = self.gather_indexes()
        with tf.variable_scope("cls/predictions"):
            # We apply one more non-linear transformation before the output layer.
            # This matrix is not used after pre-training.
            with tf.variable_scope("transform"):
                self.input_tensor = tf.layers.dense(
                    self.input_tensor,
                    units=self.bert_config.hidden_size,
                    activation=modeling.get_activation(
                        self.bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        self.bert_config.initializer_range))
                self.input_tensor = modeling.layer_norm(self.input_tensor)
            # The output weights are the same as the input embeddings, but there is
            # an output-only bias for each token.
            output_bias = tf.get_variable("output_bias",
                                          shape=[self.bert_config.vocab_size],
                                          initializer=tf.zeros_initializer())
            logits = tf.matmul(self.input_tensor,
                               self.output_weights,
                               transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            flat_masked_lm_ids = tf.reshape(self.masked_lm_ids, [-1])
            one_hot_labels = tf.one_hot(flat_masked_lm_ids,
                                        depth=self.bert_config.vocab_size,
                                        dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                              axis=[-1])

            # TODO: dynamic gather from per_example_loss???
            loss = tf.reshape(per_example_loss,
                              [-1, tf.shape(self.masked_lm_positions)[1]])
            return loss
 def _project(t, name):
     return dense_layer_3d(
         input_tensor=t,
         num_attention_heads=num_attention_heads,
         size_per_head=size_per_head,
         initializer=modeling.create_initializer(initializer_range),
         activation=None,
         name=name)
    def compute_image_transformer(
        self,
        input_ids,
        input_image,
        input_image_mask,
        input_positions,
        reuse=None,
    ):
        """Build the image transformer."""
        with tf.variable_scope(self.scope_prefix + "transformer", reuse=reuse):
            with tf.variable_scope("bridge"):
                image_emb = tf.layers.dense(
                    inputs=input_image,
                    units=self.config.hidden_size,
                    activation=tf.nn.relu,
                    kernel_initializer=modeling.create_initializer(
                        self.config.initializer_range),
                    reuse=reuse)

            with tf.variable_scope("embeddings"):
                input_emb = tf.gather(self.embedding_table, input_ids)
                image_emb = tf.concat([input_emb, image_emb], axis=1)
                batch_size = tensor_utils.shape(image_emb, 0)
                sequence_length = tensor_utils.shape(image_emb, 1)
                position_emb = tf.gather(self.image_region_table,
                                         input_positions)
                position_emb = tf.pad(position_emb, [[0, 0], [1, 0], [0, 0]])
                input_order = tf.range(tensor_utils.shape(image_emb, 1))
                input_order = tf.tile(tf.expand_dims(input_order, 0),
                                      [tensor_utils.shape(image_emb, 0), 1])
                order_emb = tf.gather(self.image_order_table, input_order)
                input_segment_id = tf.fill([batch_size, sequence_length],
                                           self.IMG)
                segment_emb = tf.gather(self.segment_table, input_segment_id)
                input_emb = image_emb + position_emb + order_emb + segment_emb
                input_emb = modeling.layer_norm_and_dropout(
                    input_emb, self.config.hidden_dropout_prob)

            with tf.variable_scope("image/encoder"):
                sequence_output, output_cache = compute_transformer(
                    input_tensor=input_emb,
                    attention_mask=tf.expand_dims(input_image_mask, 1),
                    hidden_size=self.config.hidden_size,
                    num_hidden_layers=self.config.num_hidden_layers,
                    num_attention_heads=self.config.num_attention_heads,
                    intermediate_size=self.config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        self.config.hidden_act),
                    hidden_dropout_prob=self.config.hidden_dropout_prob,
                    attention_probs_dropout_prob=(
                        self.config.attention_probs_dropout_prob),
                    initializer_range=self.config.initializer_range,
                    input_cache=None)
            return sequence_output, output_cache
Example #10
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_weights, truncated_masked_lm_probs_teacher,
                         top_k_indices, truncation_factor):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs_student = tf.nn.log_softmax(logits, axis=-1)

        label_weights = tf.reshape(label_weights, [-1])

        prob_shape = tf.shape(log_probs_student)
        new_shape = [prob_shape[0], truncation_factor]  # [batch_size*seq_len, truncation_factor]

        top_k_indices = tf.reshape(top_k_indices, new_shape)
        top_k_log_probs_student = tf.batch_gather(log_probs_student,
                                                  top_k_indices)

        truncated_masked_lm_probs_teacher = tf.reshape(
            truncated_masked_lm_probs_teacher, new_shape)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(
            truncated_masked_lm_probs_teacher * top_k_log_probs_student,
            axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs_student)
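# A NumPy sketch (illustrative numbers) of the distillation loss above:
# cross-entropy between the teacher's truncated top-k probabilities and the
# student's log-probabilities at the same indices, masked and normalized by
# label_weights so padded prediction slots contribute nothing.
import numpy as np

teacher_probs = np.array([[0.7, 0.3], [0.6, 0.4], [0.5, 0.5]])    # [positions, k]
student_log_probs = np.log([[0.6, 0.2], [0.5, 0.3], [0.4, 0.4]])  # [positions, k]
label_weights = np.array([1.0, 1.0, 0.0])                         # last slot is padding
per_example_loss = -(teacher_probs * student_log_probs).sum(axis=-1)
loss = (label_weights * per_example_loss).sum() / (label_weights.sum() + 1e-5)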
Example #11
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights_flat = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])

        numerator = tf.reduce_sum(label_weights_flat * per_example_loss)
        denominator = tf.reduce_sum(label_weights_flat) + 1e-5
        loss = numerator / denominator

        # Scale the loss by the batch size.
        batch_size = tf.cast(tf.shape(label_weights)[0], tf.float32)
        loss = batch_size * loss

    return (loss, per_example_loss, log_probs)
Example #12
        def forward(x, segment, masks, y, reuse=False, config=bert_config):
            with tf.variable_scope('bert', reuse=reuse):
                model = modeling.BertModel(
                    config=config,
                    is_training=training,
                    input_ids=x,
                    input_mask=masks,
                    token_type_ids=segment,
                    use_one_hot_embeddings=False,
                )
                memory = model.get_sequence_output()
            with tf.variable_scope('bert', reuse=True):
                Y_seq_len = tf.count_nonzero(y, 1, dtype=tf.int32)
                y_masks = tf.sequence_mask(Y_seq_len,
                                           tf.reduce_max(Y_seq_len),
                                           dtype=tf.float32)

                model = modeling_decoder.BertModel(
                    config=config,
                    is_training=training,
                    input_ids=y,
                    input_mask=y_masks,
                    memory=memory,
                    memory_mask=masks,
                    use_one_hot_embeddings=False,
                )
                output_layer = model.get_sequence_output()
                embedding = model.get_embedding_table()

            with tf.variable_scope('cls/predictions', reuse=reuse):
                with tf.variable_scope('transform'):
                    input_tensor = tf.layers.dense(
                        output_layer,
                        units=config.hidden_size,
                        activation=modeling.get_activation(
                            bert_config.hidden_act),
                        kernel_initializer=modeling.create_initializer(
                            bert_config.initializer_range),
                    )
                    input_tensor = modeling.layer_norm(input_tensor)

                output_bias = tf.get_variable(
                    'output_bias',
                    shape=[bert_config.vocab_size],
                    initializer=tf.zeros_initializer(),
                )
                logits = tf.matmul(input_tensor, embedding, transpose_b=True)
                return logits
    def bert_module_fn(is_training):
        """Spec function for a token embedding module."""

        input_ids = tf.placeholder(shape=[None, None], dtype=tf.int32, name="input_ids")

        bert_config = modeling.BertConfig.from_json_file(config_path)
        model = modeling.BertModel(config=bert_config, is_training=is_training,
                                   input_ids=input_ids)

        output_layer = model.get_sequence_output()
        embedding = model.get_embedding_table()

        with tf.variable_scope('cls/predictions'):
            with tf.variable_scope('transform'):
                input_tensor = tf.layers.dense(
                    output_layer,
                    units=bert_config.hidden_size,
                    activation=modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        bert_config.initializer_range
                    ),
                )
                input_tensor = modeling.layer_norm(input_tensor)

            output_bias = tf.get_variable(
                'output_bias',
                shape=[bert_config.vocab_size],
                initializer=tf.zeros_initializer(),
            )
            logits = tf.matmul(input_tensor, embedding, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

        config_file = tf.constant(value=config_path, dtype=tf.string, name="config_file")
        vocab_file = tf.constant(value=vocab_path, dtype=tf.string, name="vocab_file")
        lower_case = tf.constant(do_lower_case)

        tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, config_file)
        tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, vocab_file)

        input_map = {"input_ids": input_ids}

        output_map = {"logits": logits}

        output_info_map = {"vocab_file": vocab_file,
                           "do_lower_case": lower_case}

        hub.add_signature(name="tokens", inputs=input_map, outputs=output_map)
        hub.add_signature(name="tokenization_info", inputs={}, outputs=output_info_map)
def get_next_sentence_output(bert_config, input_tensor):
  """Get loss and log probs for the next sentence prediction."""

  # Simple binary classification. Note that 0 is "next sentence" and 1 is
  # "random sentence". This weight matrix is not used after pre-training.
  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights",
        shape=[2, bert_config.hidden_size],
        initializer=modeling.create_initializer(bert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[2], initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    next_sentence_probs = tf.nn.softmax(logits, axis=-1)

    return next_sentence_probs
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids):
    """Get loss and log probs for the masked LM."""
    print("input tensor before gather_indexes:", input_tensor)
    input_tensor = gather_indexes(input_tensor, positions)
    print("input tensor before gather_indexes:", input_tensor)
    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)
        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        print(label_ids)
        label_ids = tf.reshape(label_ids, [-1])
        print(label_ids)
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)
        print(one_hot_labels)
        print(log_probs)
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        print(per_example_loss)

        loss = tf.reshape(per_example_loss, [-1, tf.shape(positions)[1]])
        print('positions: ', positions)
        print('loss', loss)
        # TODO: dynamic gather from per_example_loss???
    return loss
def get_next_sentence_output(bert_config, input_tensor, labels):
  """Get loss and log probs for the next sentence prediction."""

  # Simple binary classification. Note that 0 is "next sentence" and 1 is
  # "random sentence". This weight matrix is not used after pre-training.
  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights",
        shape=[2, bert_config.hidden_size],
        initializer=modeling.create_initializer(bert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[2], initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    labels = tf.reshape(labels, [-1])
    one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, per_example_loss, log_probs)
Example #17
def run_one_hot_embeddings(one_hot_input_ids, config):
    """Extract only the word embeddings of the original BERT model."""
    with tf.variable_scope("bert", reuse=tf.compat.v1.AUTO_REUSE):
        with tf.variable_scope("embeddings"):
            # branched from modeling.embedding_lookup
            embedding_table = tf.get_variable(
                name="word_embeddings",
                shape=[config.vocab_size, config.hidden_size],
                initializer=modeling.create_initializer(
                    config.initializer_range))

            flat_input_ids = tf.reshape(one_hot_input_ids,
                                        [-1, config.vocab_size])
            output = tf.matmul(flat_input_ids, embedding_table)

            input_shape = modeling.get_shape_list(one_hot_input_ids)

            output = tf.reshape(output,
                                input_shape[0:-1] + [config.hidden_size])

            return (output, embedding_table)
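# A NumPy sketch (illustrative) of why the matmul above is an embedding lookup:
# multiplying a one-hot row by the embedding table selects exactly one row of it.
import numpy as np

vocab_size, hidden_size = 10, 4
embedding_table = np.random.rand(vocab_size, hidden_size)
token_id = 7
one_hot = np.eye(vocab_size)[[token_id]]   # [1, vocab_size]
looked_up = one_hot @ embedding_table      # [1, hidden_size]
assert np.allclose(looked_up[0], embedding_table[token_id])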
Example #18
    def __init__(self, config, input_hidden, embedding_table):
        # Keep variable names the same as BERT
        with tf.variable_scope("cls"):
            with tf.variable_scope("predictions"):
                with tf.variable_scope("transform"):
                    self.transformed_output = tf.layers.dense(
                        input_hidden,
                        config.hidden_size,
                        activation=modeling.get_activation(config.hidden_act),
                        kernel_initializer=modeling.create_initializer(
                            config.initializer_range))
                    self.transformed_output = modeling.layer_norm(
                        self.transformed_output)

                output_bias = tf.Variable(tf.zeros([config.vocab_size]),
                                          name="output_bias")
                self.final_output = tf.add(
                    tf.matmul(self.transformed_output,
                              tf.transpose(embedding_table)), output_bias)
                self.probs = tf.nn.softmax(self.final_output,
                                           name='token_probs')
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         truncation_factor):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        masked_lm_probs = tf.nn.softmax(logits, axis=-1)
        trunc_masked_lm_probs, top_indices = tf.math.top_k(masked_lm_probs,
                                                           k=truncation_factor,
                                                           sorted=False)

        max_predictions_per_seq = positions.get_shape().as_list()[1]
        truncation_factor_ = top_indices.get_shape().as_list()[1]

        trunc_masked_lm_probs = tf.reshape(
            trunc_masked_lm_probs,
            [-1, max_predictions_per_seq, truncation_factor_])
        top_indices = tf.reshape(
            top_indices, [-1, max_predictions_per_seq, truncation_factor_])
    return trunc_masked_lm_probs, top_indices
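# A NumPy sketch (illustrative) of the truncation above: keep only the k most
# probable vocabulary entries per masked position, returning the truncated
# probabilities together with their vocabulary indices (unordered, matching
# sorted=False).
import numpy as np

probs = np.random.dirichlet(np.ones(10), size=6)               # [positions, vocab_size]
k = 3
top_indices = np.argpartition(probs, -k, axis=-1)[:, -k:]      # [positions, k]
trunc_probs = np.take_along_axis(probs, top_indices, axis=-1)  # [positions, k]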
Example #20
    def __init__(
        self,
    ):
        BERT_CONFIG = "PATH_TO/multi_cased_L-12_H-768_A-12/bert_config.json"
        bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)
        self.X = tf.placeholder(tf.int32, [None, None])
        model = modeling.BertModel(
            config=bert_config,
            is_training=False,
            input_ids=self.X,
            use_one_hot_embeddings=False,
        )

        output_layer = model.get_sequence_output()
        embedding = model.get_embedding_table()

        output_layer = tf.reshape(output_layer, [-1, bert_config.hidden_size])
        with tf.variable_scope("cls/predictions"):
            with tf.variable_scope("transform"):
                input_tensor = tf.layers.dense(
                    output_layer,
                    units=bert_config.hidden_size,
                    activation=modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        bert_config.initializer_range
                    ),
                )
                input_tensor = modeling.layer_norm(input_tensor)

            output_bias = tf.get_variable(
                "output_bias",
                shape=[bert_config.vocab_size],
                initializer=tf.zeros_initializer(),
            )
            logits = tf.matmul(input_tensor, embedding, transpose_b=True)
            print("---")
            self.logits = tf.nn.bias_add(logits, output_bias)
Example #21
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1,
                            mention_ids=None):
    """Performs various post-processing on a word embedding tensor.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length,
      embedding_size].
    use_token_type: bool. Whether to add embeddings for `token_type_ids`.
    token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      Must be specified if `use_token_type` is True.
    token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
    token_type_embedding_name: string. The name of the embedding table variable
      for token type ids.
    use_position_embeddings: bool. Whether to add position embeddings for the
      position of each token in the sequence.
    position_embedding_name: string. The name of the embedding table variable
      for positional embeddings.
    initializer_range: float. Range of the weight initialization.
    max_position_embeddings: int. Maximum sequence length that might ever be
      used with this model. This can be longer than the sequence length of
      input_tensor, but cannot be shorter.
    dropout_prob: float. Dropout probability applied to the final output tensor.
    mention_ids: (optional) int32 Tensor of shape [batch_size, seq_length] with
      value 1 at tokens that are part of a mention and 0 elsewhere; a learned
      mention-marker embedding is added at the marked positions.

  Returns:
    float tensor with same shape as `input_tensor`.

  Raises:
    ValueError: One of the tensor shapes or input values is invalid.
  """
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor

    #-----------------------------------------------------------
    # reader = pywrap_tensorflow.NewCheckpointReader("gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/bert_model.ckpt")
    # var_to_shape_map = reader.get_variable_to_shape_map()
    # for key in var_to_shape_map:
    #     if key == "bert/embeddings/position_embeddings":
    #         position_embedding_value = reader.get_tensor(key) # Remove this is you want to print only variable names
    # position_embedding_512value = np.array(position_embedding_value[511] * np.ones([512, 1]), dtype=np.float32)
    #-----------------------------------------------------------

    if use_token_type:
        if token_type_ids is None:
            raise ValueError("`token_type_ids` must be specified if "
                             "`use_token_type` is True.")
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # This vocab will be small so we always do one-hot here, since it is always
        # faster for a small vocabulary.
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(flat_token_type_ids,
                                 depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(token_type_embeddings,
                                           [batch_size, seq_length, width])
        output += token_type_embeddings

    if mention_ids is not None:
        token_type_table = tf.get_variable(name='mention_marker',
                                           shape=[1, width],
                                           initializer=tf.zeros_initializer())
        # This vocab will be small so we always do one-hot here, since it is always
        # faster for a small vocabulary.
        flat_token_type_ids = tf.reshape(mention_ids, [-1, 1])
        flat_token_type_ids = tf.cast(flat_token_type_ids, tf.float32)
        token_type_embeddings = tf.matmul(flat_token_type_ids,
                                          token_type_table)
        #one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
        #token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(token_type_embeddings,
                                           [batch_size, seq_length, width])
        output += token_type_embeddings

    # Initialize each 512-position block separately from the BERT-base model.
    if use_position_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            if seq_length <= 512:
                full_position_embeddings = tf.get_variable(
                    name=position_embedding_name,
                    shape=[max_position_embeddings, width],
                    initializer=create_initializer(initializer_range))
                # Since the position embedding table is a learned variable, we create it
                # using a (long) sequence length `max_position_embeddings`. The actual
                # sequence length might be shorter than this, for faster training of
                # tasks that do not have long sequences.
                #
                # So `full_position_embeddings` is effectively an embedding table
                # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
                # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
                # perform a slice.
                position_embeddings = tf.slice(full_position_embeddings,
                                               [0, 0], [seq_length, -1])

            elif seq_length <= 1024:
                full_position_embeddings_former = tf.get_variable(
                    name=position_embedding_name + "_former",
                    shape=[512, width],
                    initializer=create_initializer(initializer_range))
                full_position_embeddings_latter = tf.get_variable(
                    name=position_embedding_name + "_latter",
                    shape=[512, width],
                    initializer=create_initializer(initializer_range))
                #initializer=position_embedding_512value)

                #full_position_embeddings_latter += position_embedding_512value

                full_position_embeddings_latter = tf.slice(
                    full_position_embeddings_latter, [0, 0],
                    [seq_length - 512, -1])

                position_embeddings = tf.concat(
                    [
                        full_position_embeddings_former,
                        full_position_embeddings_latter
                    ],
                    0,
                    name="large_window_size_position_embeddings")

            else:
                full_position_embeddings_first = tf.get_variable(
                    name=position_embedding_name + "_first",
                    shape=[512, width],
                    initializer=create_initializer(initializer_range))
                full_position_embeddings_second = tf.get_variable(
                    name=position_embedding_name + "_second",
                    shape=[512, width],
                    initializer=create_initializer(initializer_range))
                #initializer=position_embedding_512value)
                full_position_embeddings_third = tf.get_variable(
                    name=position_embedding_name + "_third",
                    shape=[512, width],
                    initializer=create_initializer(initializer_range))
                #initializer=position_embedding_512value)

                full_position_embeddings_third = tf.slice(
                    full_position_embeddings_third, [0, 0],
                    [seq_length - 1024, -1])

                position_embeddings = tf.concat(
                    [
                        full_position_embeddings_first,
                        full_position_embeddings_second,
                        full_position_embeddings_third
                    ],
                    0,
                    name="large_window_size_position_embeddings")

            num_dims = len(output.shape.as_list())

            # Only the last two dimensions are relevant (`seq_length` and `width`), so
            # we broadcast among the first dimensions, which is typically just
            # the batch size.
            position_broadcast_shape = []
            for _ in range(num_dims - 2):
                position_broadcast_shape.append(1)
            position_broadcast_shape.extend([seq_length, width])
            position_embeddings = tf.reshape(position_embeddings,
                                             position_broadcast_shape)
            output += position_embeddings

    output = layer_norm_and_dropout(output, dropout_prob)
    return output
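# A NumPy sketch (illustrative shapes) of the long-sequence scheme above:
# positions beyond 512 come from separately created 512-row tables that are
# sliced and concatenated to cover the actual sequence length.
import numpy as np

width, seq_length = 8, 700
table_former = np.random.rand(512, width)
table_latter = np.random.rand(512, width)
position_embeddings = np.concatenate(
    [table_former, table_latter[:seq_length - 512]], axis=0)
assert position_embeddings.shape == (seq_length, width)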
Example #22
    def __init__(self,
                 config,
                 use_one_hot_embeddings=True,
                 num_labels=2,
                 max_seq_length=128):
        """Constructor for BertModel.

        Args:
          config: `BertConfig` instance.
          use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
            embeddings or tf.embedding_lookup() for the word embeddings. On the TPU,
            it is much faster if this is True, on the CPU or GPU, it is faster if
            this is False.
          num_labels: (optional) int. Number of output classes for the
            classification head (defaults to 2).
          max_seq_length: (optional) int. Fixed sequence length of the
            `input_ids` and `input_mask` placeholders.

        Raises:
          ValueError: The config is invalid or one of the input tensor shapes
            is invalid.
        """
        self.input_ids = tf.placeholder(dtype=tf.int32,
                                        shape=(None, max_seq_length))
        self.input_mask = tf.placeholder(dtype=tf.int8,
                                         shape=(None, max_seq_length))

        config = copy.deepcopy(config)

        input_shape = modeling.get_shape_list(self.input_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        token_type_ids = tf.zeros(shape=[batch_size, seq_length],
                                  dtype=tf.int32)

        with tf.variable_scope("bert", reuse=tf.AUTO_REUSE):
            with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE):
                # Perform embedding lookup on the word ids.
                (self.embedding_output,
                 self.embedding_table) = modeling.embedding_lookup(
                     input_ids=self.input_ids,
                     vocab_size=config.vocab_size,
                     embedding_size=config.hidden_size,
                     initializer_range=config.initializer_range,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=use_one_hot_embeddings)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output = modeling.embedding_postprocessor(
                    input_tensor=self.embedding_output,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

            with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = modeling.create_attention_mask_from_input_mask(
                    self.input_ids, self.input_mask)

                # Run the stacked transformer.
                # `sequence_output` shape = [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = modeling.transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.
                    attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)

            self.sequence_output = self.all_encoder_layers[-1]
            # The "pooler" converts the encoded sequence tensor of shape
            # [batch_size, seq_length, hidden_size] to a tensor of shape
            # [batch_size, hidden_size]. This is necessary for segment-level
            # (or segment-pair-level) classification tasks where we need a fixed
            # dimensional representation of the segment.
            with tf.variable_scope("pooler", reuse=tf.AUTO_REUSE):
                # We "pool" the model by simply taking the hidden state corresponding
                # to the first token. We assume that this has been pre-trained
                first_token_tensor = tf.squeeze(self.sequence_output[:,
                                                                     0:1, :],
                                                axis=1)
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=modeling.create_initializer(
                        config.initializer_range))

        # define output_weights and output_bias
        hidden_size = self.pooled_output.shape[-1].value
        with tf.variable_scope("", reuse=tf.AUTO_REUSE):
            self.output_weights = tf.get_variable(
                "output_weights", [num_labels, hidden_size],
                initializer=tf.truncated_normal_initializer(stddev=0.02))
            self.output_bias = tf.get_variable(
                "output_bias", [num_labels],
                initializer=tf.zeros_initializer())
Example #23
    def __init__(self, bert_config, tokenizer, cls, sep):
        _graph = tf.Graph()
        with _graph.as_default():
            self.X = tf.placeholder(tf.int32, [None, None])
            self.top_p = tf.placeholder(tf.float32, None)
            self.top_k = tf.placeholder(tf.int32, None)
            self.k = tf.placeholder(tf.int32, None)
            self.temperature = tf.placeholder(tf.float32, None)
            self.indices = tf.placeholder(tf.int32, [None, None])
            self._tokenizer = tokenizer
            self._cls = cls
            self._sep = sep

            self.model = modeling.BertModel(
                config = bert_config,
                is_training = False,
                input_ids = self.X,
                use_one_hot_embeddings = False,
            )
            self.logits = self.model.get_pooled_output()
            output_layer = self.model.get_sequence_output()
            embedding = self.model.get_embedding_table()

            with tf.variable_scope('cls/predictions'):
                with tf.variable_scope('transform'):
                    input_tensor = tf.layers.dense(
                        output_layer,
                        units = bert_config.hidden_size,
                        activation = modeling.get_activation(
                            bert_config.hidden_act
                        ),
                        kernel_initializer = modeling.create_initializer(
                            bert_config.initializer_range
                        ),
                    )
                    input_tensor = modeling.layer_norm(input_tensor)
                output_bias = tf.get_variable(
                    'output_bias',
                    shape = [bert_config.vocab_size],
                    initializer = tf.zeros_initializer(),
                )
                logits = tf.matmul(input_tensor, embedding, transpose_b = True)
                self._logits = tf.nn.bias_add(logits, output_bias)
                self._log_softmax = tf.nn.log_softmax(self._logits)

            logits = tf.gather_nd(self._logits, self.indices)
            logits = logits / self.temperature

            def nucleus():
                return top_p_logits(logits, self.top_p)

            def select_k():
                return top_k_logits(logits, self.top_k)

            logits = tf.cond(self.top_p > 0, nucleus, select_k)
            self.samples = tf.multinomial(
                logits, num_samples = self.k, output_dtype = tf.int32
            )

            self._sess = tf.InteractiveSession()
            self._sess.run(tf.global_variables_initializer())
            var_lists = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert'
            )
            cls = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'cls'
            )
            self._saver = tf.train.Saver(var_list = var_lists + cls)
            attns = _extract_attention_weights(
                bert_config.num_hidden_layers, tf.get_default_graph()
            )
            self.attns = attns
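# A NumPy sketch of what the top_p branch above presumably does (top_p_logits
# is not shown here, so this is an assumption): nucleus filtering keeps the
# smallest set of tokens whose cumulative probability reaches p and masks out
# the rest before sampling.
import numpy as np

def nucleus_filter(logits, p):
    order = np.argsort(logits)[::-1]               # tokens by descending logit
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    cumulative = np.cumsum(probs[order])
    cutoff = np.searchsorted(cumulative, p) + 1    # always keep at least one token
    keep = order[:cutoff]
    filtered = np.full_like(logits, -1e10)
    filtered[keep] = logits[keep]
    return filtered

print(nucleus_filter(np.array([2.0, 1.0, 0.5, -1.0]), p=0.9))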
Example #24
    def __init__(self,
                 config,
                 is_training,
                 input_ids,
                 image_embeddings,
                 input_mask=None,
                 token_type_ids=None,
                 use_one_hot_embeddings=False,
                 scope=None):
        """Constructor for a visually grounded BertModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model. Controls
        whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      image_embeddings: float32 Tensor of shape [batch_size, seq_length, depth].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings.
      scope: (optional) variable scope. Defaults to "bert".
    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
        config = copy.deepcopy(config)
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        text_input_shape = modeling.get_shape_list(input_ids, expected_rank=2)
        batch_size = text_input_shape[0]
        text_seq_length = text_input_shape[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, text_seq_length],
                                 dtype=tf.int32)

        if token_type_ids is None:
            token_type_ids = tf.zeros(shape=[batch_size, text_seq_length],
                                      dtype=tf.int32)

        with tf.variable_scope(scope, default_name="bert"):
            with tf.variable_scope("embeddings"):
                # Perform embedding lookup on the word ids.
                (self.embedding_output,
                 self.embedding_table) = modeling.embedding_lookup(
                     input_ids=input_ids,
                     vocab_size=config.vocab_size,
                     embedding_size=config.hidden_size,
                     initializer_range=config.initializer_range,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=use_one_hot_embeddings)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output = modeling.embedding_postprocessor(
                    input_tensor=self.embedding_output,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

                # Add image embeddings to the rest of the input embeddings.
                self.embedding_output += tf.layers.dense(
                    image_embeddings,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=modeling.create_initializer(
                        config.initializer_range))

            with tf.variable_scope("encoder"):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = modeling.create_attention_mask_from_input_mask(
                    self.embedding_output, input_mask)

                # Run the stacked transformer.
                # `sequence_output` shape = [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = modeling.transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.
                    attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)

            self.sequence_output = self.all_encoder_layers[-1]
            # The "pooler" converts the encoded sequence tensor of shape
            # [batch_size, seq_length, hidden_size] to a tensor of shape
            # [batch_size, hidden_size]. This is necessary for segment-level
            # (or segment-pair-level) classification tasks where we need a fixed
            # dimensional representation of the segment.
            with tf.variable_scope("pooler"):
                # We "pool" the model by simply taking the hidden state corresponding
                # to the first token. We assume that this has been pre-trained.
                first_token_tensor = tf.squeeze(self.sequence_output[:,
                                                                     0:1, :],
                                                axis=1)
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=modeling.create_initializer(
                        config.initializer_range))
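
Below is a minimal standalone sketch of the image-fusion step used above. The helper name and the assumed [batch_size, 1, image_dim] shape of `image_embeddings` are illustrative; the original snippet does not show how the image tensor is shaped, only that its projection is added to the token embeddings.

import tensorflow as tf
from bert import modeling  # assumed import path for the standard BERT modeling module


def add_image_to_embeddings(token_embeddings, image_embeddings, config):
    """Fuses an image vector into the token embeddings.

    token_embeddings: [batch_size, seq_length, hidden_size]
    image_embeddings: assumed [batch_size, 1, image_dim] so the addition
      broadcasts over seq_length.
    """
    projected = tf.layers.dense(
        image_embeddings,
        config.hidden_size,
        activation=tf.tanh,
        kernel_initializer=modeling.create_initializer(config.initializer_range))
    # [batch_size, 1, hidden_size] + [batch_size, seq_length, hidden_size]
    return token_embeddings + projected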
Example #25
  def build_attn_layers(self,
                        input_tensor,
                        attn_mask_concat,
                        intermediate_size=2048,
                        intermediate_act_fn=modeling.gelu,
                        hidden_dropout_prob=0.1,
                        attention_probs_dropout_prob=0.1,
                        initializer_range=0.02,
                        do_return_all_layers=False):
    """See `attention_layer` defined in `bert/modeling.py`"""
    if not self.is_training:
      hidden_dropout_prob = 0.0
      attention_probs_dropout_prob = 0.0

    # input tensor shape: [batch, arg_length, BERT_hidden_size]
    # for example, using default hparams vals: [64, 128, 768]
    attention_head_size = int(self.hidden_size / self.num_attention_heads)
    input_shape = modeling.get_shape_list(input_tensor, expected_rank=3)
    prev_output = input_tensor

    attention_type_split = self.attention_type.split("_")

    all_layer_outputs = []
    for layer_idx in range(self.num_hidden_layers):
      with tf.variable_scope(f"layer_{layer_idx}"):
        layer_input = prev_output

        if len(attention_type_split) == 3:
          indexer = layer_idx % 2
        else:  # len(attention_type_split) == 2:
          indexer = 0
        layer_attn_type = attention_type_split[indexer]

        tf.logging.info(
          f"{layer_attn_type.capitalize()} Attention at {layer_idx}th Layer")

        attention_heads = []
        with tf.variable_scope(f"{layer_attn_type}_attn"):
          attention_head = self.build_attn_layer(
            input_tensor=input_tensor,
            attn_mask_concat=attn_mask_concat,
            layer_attn_type=layer_attn_type,
            num_attention_heads=self.num_attention_heads,
            size_per_head=attention_head_size,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            initializer_range=initializer_range,
            do_return_2d_tensor=False
          )

          attention_heads.append(attention_head)

          attention_output = None
          if len(attention_heads) == 1:
            attention_output = attention_heads[0]
          else:
            # In the case where we have other sequences, we just concatenate
            # them to the self-attention head before the projection.
            attention_output = tf.concat(attention_heads, axis=-1)

          # Run a linear projection of `hidden_size` then add a residual
          # with `layer_input`.
          with tf.variable_scope("output"):
            attention_output = tf.layers.dense(
              attention_output,
              self.hidden_size,
              kernel_initializer=modeling.create_initializer(initializer_range))
            attention_output = modeling.dropout(attention_output,
                                                hidden_dropout_prob)
            attention_output = modeling.layer_norm(attention_output + layer_input)

        # The activation is only applied to the "intermediate" hidden layer.
        with tf.variable_scope("intermediate"):
          intermediate_output = tf.layers.dense(
            attention_output,
            intermediate_size,
            activation=intermediate_act_fn,
            kernel_initializer=modeling.create_initializer(initializer_range))

        # Down-project back to `hidden_size` then add the residual.
        with tf.variable_scope("output"):
          layer_output = tf.layers.dense(
            intermediate_output,
            self.hidden_size,
            kernel_initializer=modeling.create_initializer(initializer_range))
          layer_output = modeling.dropout(layer_output, hidden_dropout_prob)
          layer_output = modeling.layer_norm(layer_output + attention_output)
          prev_output = layer_output
          all_layer_outputs.append(layer_output)

    if do_return_all_layers:
      final_outputs = []
      for layer_output in all_layer_outputs:
        final_output = modeling.reshape_from_matrix(layer_output, input_shape)
        final_outputs.append(final_output)
      return final_outputs
    else:
      final_output = modeling.reshape_from_matrix(prev_output, input_shape)
      return final_output
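
The `indexer = layer_idx % 2` logic above alternates between the first two parts of `attention_type` when the string has three underscore-separated parts, and otherwise uses the first part at every layer. A small sketch of that selection rule; the example string "self_inter_alt" is made up and only illustrates the expected format.

def layer_attention_types(attention_type, num_hidden_layers):
    """Returns the attention type string used at each layer.

    e.g. layer_attention_types("self_inter_alt", 4) -> ["self", "inter", "self", "inter"]
    """
    parts = attention_type.split("_")
    types = []
    for layer_idx in range(num_hidden_layers):
        # Three parts: alternate the first two by layer parity; otherwise use the first.
        indexer = layer_idx % 2 if len(parts) == 3 else 0
        types.append(parts[indexer])
    return types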
Example #26
    def __init__(self, config, is_training, input_tensor, input_mask,
                 token_type_ids):
        """Constructor for BertFlexEmbeddingModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model. Controls
        whether dropout will be applied.
      input_tensor: float32 Tensor of shape [batch_size, seq_length,
        hidden_size].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
        config = copy.deepcopy(config)
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        with tf.variable_scope("bert", reuse=tf.compat.v1.AUTO_REUSE):
            with tf.variable_scope("embeddings"):
                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output = modeling.embedding_postprocessor(
                    input_tensor=input_tensor,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

            with tf.variable_scope("encoder"):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = modeling.create_attention_mask_from_input_mask(
                    input_tensor, input_mask)

                # Run the stacked transformer.
                # `sequence_output` shape = [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = modeling.transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.
                    attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)

            self.sequence_output = self.all_encoder_layers[-1]
            # The "pooler" converts the encoded sequence tensor of shape
            # [batch_size, seq_length, hidden_size] to a tensor of shape
            # [batch_size, hidden_size]. This is necessary for segment-level
            # (or segment-pair-level) classification tasks where we need a fixed
            # dimensional representation of the segment.
            with tf.variable_scope("pooler"):
                # We "pool" the model by simply taking the hidden state corresponding
                # to the first token. We assume that this has been pre-trained.
                first_token_tensor = tf.squeeze(self.sequence_output[:,
                                                                     0:1, :],
                                                axis=1)
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=modeling.create_initializer(
                        config.initializer_range))
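
A minimal usage sketch for the constructor above, assuming the enclosing class is named BertFlexEmbeddingModel (as its docstring states) and that precomputed embeddings are fed in through TF1 placeholders; the config file path and import are placeholders, not part of the original snippet.

import tensorflow as tf
from bert import modeling  # assumed import path for BertConfig

bert_config = modeling.BertConfig.from_json_file("bert_config.json")

# Precomputed embeddings are fed in directly instead of token ids.
input_tensor = tf.placeholder(
    tf.float32, [None, None, bert_config.hidden_size])
input_mask = tf.placeholder(tf.int32, [None, None])
token_type_ids = tf.placeholder(tf.int32, [None, None])

model = BertFlexEmbeddingModel(
    config=bert_config,
    is_training=False,
    input_tensor=input_tensor,
    input_mask=input_mask,
    token_type_ids=token_type_ids)

sequence_output = model.sequence_output  # [batch_size, seq_length, hidden_size]
pooled_output = model.pooled_output      # [batch_size, hidden_size]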
Example #27
    def __call__(self, features, hidden_feature, mode, problem_name):
        """Get loss and log probs for the masked LM.

        DO NOT CHANGE THE VARIABLE SCOPE.
        """
        seq_hidden_feature = hidden_feature['seq']
        positions = features['masked_lm_positions']
        input_tensor = gather_indexes(seq_hidden_feature, positions)
        output_weights = hidden_feature['embed_table']
        label_ids = features['masked_lm_ids']
        label_weights = features['masked_lm_weights']

        with tf.variable_scope("cls/predictions"):
            # We apply one more non-linear transformation before the output layer.
            # This matrix is not used after pre-training.
            with tf.variable_scope("transform"):
                input_tensor = tf.layers.dense(
                    input_tensor,
                    units=self.params.mask_lm_hidden_size,
                    activation=modeling.get_activation(
                        self.params.mask_lm_hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        self.params.mask_lm_initializer_range))
                input_tensor = modeling.layer_norm(input_tensor)

            # The output weights are the same as the input embeddings, but there is
            # an output-only bias for each token.
            output_bias = tf.get_variable("output_bias",
                                          shape=[self.params.vocab_size],
                                          initializer=tf.zeros_initializer())

            logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            self.logits = logits
            log_probs = tf.nn.log_softmax(logits, axis=-1)

            if mode == tf.estimator.ModeKeys.PREDICT:
                self.prob = log_probs
                return self.prob

            else:

                label_ids = tf.reshape(label_ids, [-1])
                label_weights = tf.reshape(label_weights, [-1])

                one_hot_labels = tf.one_hot(label_ids,
                                            depth=self.params.vocab_size,
                                            dtype=tf.float32)

                # The `positions` tensor might be zero-padded (if the sequence is too
                # short to have the maximum number of predictions). The `label_weights`
                # tensor has a value of 1.0 for every real prediction and 0.0 for the
                # padding predictions.
                per_example_loss = -tf.reduce_sum(
                    log_probs * one_hot_labels, axis=[-1])
                numerator = tf.reduce_sum(label_weights * per_example_loss)
                denominator = tf.reduce_sum(label_weights) + 1e-5
                loss = numerator / denominator

                if mode == tf.estimator.ModeKeys.TRAIN:
                    self.loss = loss
                    return self.loss

                else:

                    def metric_fn(masked_lm_example_loss, masked_lm_log_probs,
                                  masked_lm_ids, masked_lm_weights):
                        """Computes the loss and accuracy of the model."""
                        masked_lm_log_probs = tf.reshape(
                            masked_lm_log_probs,
                            [-1, masked_lm_log_probs.shape[-1]])
                        masked_lm_predictions = tf.argmax(masked_lm_log_probs,
                                                          axis=-1,
                                                          output_type=tf.int32)
                        masked_lm_example_loss = tf.reshape(
                            masked_lm_example_loss, [-1])
                        masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
                        masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
                        masked_lm_accuracy = tf.metrics.accuracy(
                            labels=masked_lm_ids,
                            predictions=masked_lm_predictions,
                            weights=masked_lm_weights)
                        masked_lm_mean_loss = tf.metrics.mean(
                            values=masked_lm_example_loss,
                            weights=masked_lm_weights)

                        return {
                            "masked_lm_accuracy": masked_lm_accuracy,
                            "masked_lm_loss": masked_lm_mean_loss,
                        }

                    eval_metrics = (metric_fn(per_example_loss, log_probs,
                                              label_ids, label_weights), loss)

                    self.eval_metrics = eval_metrics
                    return self.eval_metrics
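
A small worked example of the loss weighting above: prediction slots whose `label_weights` value is 0.0 (the zero-padded slots) contribute nothing to the loss. The numbers are made up for illustration.

import numpy as np

# Four prediction slots; the last one is padding (label weight 0.0).
per_example_loss = np.array([2.3, 0.7, 1.1, 5.0])
label_weights = np.array([1.0, 1.0, 1.0, 0.0])

numerator = np.sum(label_weights * per_example_loss)  # 2.3 + 0.7 + 1.1 = 4.1
denominator = np.sum(label_weights) + 1e-5            # 3.00001
loss = numerator / denominator                        # ~1.3667; the padded slot is ignored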
Example #28
  def build(self, scope=None):
    if not self.is_training:
      self.hidden_dropout_prob = 0.0
      self.attention_probs_dropout_prob = 0.0

    with tf.variable_scope(scope, default_name="attentional_model"):
      self.build_input_pipeline()

      if self.is_bert_embedding:
        input_concat = self.embedding.get_bert_arg()
        mask_concat = self.embedding.get_attn_mask()

      else:
        self.embedding_table = self.init_embedding(self.embedding_placeholder)

        # embedding lookup
        with tf.variable_scope("embedding"):
          arg1 = tf.nn.embedding_lookup(self.embedding_table, self.arg1)
          arg2 = tf.nn.embedding_lookup(self.embedding_table, self.arg2)

        input_concat = tf.concat([arg1, arg2], axis=1)
        mask_concat = tf.concat([self.arg1_attn_mask, self.arg2_attn_mask],
                                axis=1)

      # if word_vector_width and hidden_size do not match, need to project
      if self.word_vector_width != self.hidden_size:
        with tf.variable_scope("bert_projection"):
          input_concat = tf.layers.dense(
            name="dense",
            inputs=input_concat,
            units=self.hidden_size,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
            use_bias=False
          )

      # with tf.variable_scope("embedding_postprocess"):
      #   # # if not self.is_finetunable_bert_embedding:
      #   # additional context encoding with segment_ids and positional encoding
      #   # ONLY when BERT is not being fine-tuned
      #   batch_size = modeling.get_shape_list(input_concat, expected_rank=3)[0]
      #   segment_ids = tf.concat([
      #     tf.zeros([batch_size, self.max_arg_length], dtype=tf.int32),
      #     tf.ones([batch_size, self.max_arg_length], dtype=tf.int32)
      #   ], axis=1)
      #
      #   input_concat = self.encode_concat_context(
      #     input_concat, segment_ids,
      #     hidden_dropout_prob=self.hidden_dropout_prob, use_segment_ids=True,
      #     use_position_embedding=True)

      with tf.variable_scope("encoder"):
        # attention layers, for now keeping all encoder layers
        self.all_encoder_layers = \
          self.build_attn_layers(input_concat,
                                 attn_mask_concat=mask_concat,
                                 hidden_dropout_prob=self.hidden_dropout_prob,
                                 do_return_all_layers=True)

      self.sequence_output = self.all_encoder_layers[-1]

      with tf.variable_scope("pooler"):
        # see `CLS_ACTIONS` defined in `const.py`
        pooled_tensor = self.apply_cls_pooling_fn(self.sequence_output)

        pooled_output = tf.layers.dense(
          pooled_tensor,
          self.hidden_size,
          activation=tf.tanh,
          kernel_initializer=modeling.create_initializer()
        )

    logits = self.build_loss_op(pooled_output)

    self.preds = tf.cast(tf.argmax(logits, axis=-1), tf.int32, name="preds")
    self.correct = tf.cast(tf.equal(self.preds, self.label), "float",
                           name="correct")
    self.acc = tf.reduce_mean(self.correct, name="acc")

    tf.summary.scalar('loss', self.loss)
    tf.summary.scalar('accuracy', self.acc)

    self.train_op = self.build_train_op()
Example #29
  def encode_concat_context(self,
                            input_tensor,
                            segment_ids,
                            segment_vocab_size=16,
                            max_position_embeddings=512,
                            hidden_dropout_prob=0.1,
                            initializer_range=0.02,
                            use_segment_ids=False,
                            use_position_embedding=False):
    """See `embedding_postprocessor` defined in `bert/modeling.py`"""
    input_shape = modeling.get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor

    if use_segment_ids:
      segment_table = tf.get_variable(
        name="segment_embeddings",
        shape=[segment_vocab_size, width],
        initializer=modeling.create_initializer(initializer_range))

      flat_segment_ids = tf.reshape(segment_ids, [-1]) # flatten
      one_hot_ids = tf.one_hot(flat_segment_ids, depth=segment_vocab_size)
      segment_embeddings = tf.matmul(one_hot_ids, segment_table)
      segment_embeddings = tf.reshape(segment_embeddings,
                                      [batch_size, seq_length, width])
      output += segment_embeddings

    if use_position_embedding:
      position_embeddings = tf.get_variable(
        name="position_embeddings",
        shape=[max_position_embeddings, width],
        initializer=modeling.create_initializer(initializer_range))

      # Since the position embedding table is a learned variable, we create it
      # using a (long) sequence length `max_position_embeddings`. The actual
      # sequence length might be shorter than this, for faster training of
      # tasks that do not have long sequences.
      #
      # So `full_position_embeddings` is effectively an embedding table
      # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
      # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
      # perform a slice.
      position_embeddings = tf.slice(position_embeddings, [0, 0],
                                     [seq_length, -1])
      num_dims = len(output.shape.as_list())

      # Only the last two dimensions are relevant (`seq_length` and `width`), so
      # we broadcast among the first dimensions, which is typically just
      # the batch size.
      position_broadcast_shape = []
      for _ in range(num_dims - 2):
        position_broadcast_shape.append(1)
      position_broadcast_shape.extend([seq_length, width])
      position_embeddings = tf.reshape(position_embeddings,
                                       position_broadcast_shape)
      output += position_embeddings

    output = modeling.layer_norm_and_dropout(output, hidden_dropout_prob)
    return output
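
A shape-level sketch of the slice-and-broadcast trick above, using NumPy and made-up dimensions, to show why only the last two dimensions (`seq_length`, `width`) need to match the input.

import numpy as np

batch_size, seq_length, width = 32, 128, 768
max_position_embeddings = 512

output = np.zeros([batch_size, seq_length, width], dtype=np.float32)
full_table = np.random.randn(max_position_embeddings, width).astype(np.float32)

position_embeddings = full_table[:seq_length]          # the tf.slice: [128, 768]
position_embeddings = position_embeddings[np.newaxis]  # broadcast shape: [1, 128, 768]
result = output + position_embeddings                  # broadcasts over batch: [32, 128, 768]
assert result.shape == (batch_size, seq_length, width)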
Example #30
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1,
                            mention_ids=None):
    """Performs various post-processing on a word embedding tensor.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length,
      embedding_size].
    use_token_type: bool. Whether to add embeddings for `token_type_ids`.
    token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      Must be specified if `use_token_type` is True.
    token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
    token_type_embedding_name: string. The name of the embedding table variable
      for token type ids.
    use_position_embeddings: bool. Whether to add position embeddings for the
      position of each token in the sequence.
    position_embedding_name: string. The name of the embedding table variable
      for positional embeddings.
    initializer_range: float. Range of the weight initialization.
    max_position_embeddings: int. Maximum sequence length that might ever be
      used with this model. This can be longer than the sequence length of
      input_tensor, but cannot be shorter.
    dropout_prob: float. Dropout probability applied to the final output tensor.
    mention_ids: (optional) int32 Tensor of shape [batch_size, seq_length] with
      0/1 values marking mention tokens. When provided, a learned marker
      embedding is added at the marked positions.

  Returns:
    float tensor with same shape as `input_tensor`.

  Raises:
    ValueError: One of the tensor shapes or input values is invalid.
  """
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            raise ValueError("`token_type_ids` must be specified if "
                             "`use_token_type` is True.")
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # This vocab will be small so we always do one-hot here, since it is always
        # faster for a small vocabulary.
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(flat_token_type_ids,
                                 depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(token_type_embeddings,
                                           [batch_size, seq_length, width])
        output += token_type_embeddings

    if mention_ids is not None:
        token_type_table = tf.get_variable(name='mention_marker',
                                           shape=[1, width],
                                           initializer=tf.zeros_initializer())
        # This vocab will be small so we always do one-hot here, since it is always
        # faster for a small vocabulary.
        flat_token_type_ids = tf.reshape(mention_ids, [-1, 1])
        flat_token_type_ids = tf.cast(flat_token_type_ids, tf.float32)
        token_type_embeddings = tf.matmul(flat_token_type_ids,
                                          token_type_table)
        #one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
        #token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(token_type_embeddings,
                                           [batch_size, seq_length, width])
        output += token_type_embeddings

    if use_position_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, width],
                initializer=create_initializer(initializer_range))
            # Since the position embedding table is a learned variable, we create it
            # using a (long) sequence length `max_position_embeddings`. The actual
            # sequence length might be shorter than this, for faster training of
            # tasks that do not have long sequences.
            #
            # So `full_position_embeddings` is effectively an embedding table
            # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
            # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
            # perform a slice.
            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                           [seq_length, -1])
            num_dims = len(output.shape.as_list())

            # Only the last two dimensions are relevant (`seq_length` and `width`), so
            # we broadcast among the first dimensions, which is typically just
            # the batch size.
            position_broadcast_shape = []
            for _ in range(num_dims - 2):
                position_broadcast_shape.append(1)
            position_broadcast_shape.extend([seq_length, width])
            position_embeddings = tf.reshape(position_embeddings,
                                             position_broadcast_shape)
            output += position_embeddings

    output = layer_norm_and_dropout(output, dropout_prob)
    return output
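
The `mention_ids` branch above is equivalent to adding one learned marker vector at every position whose mention id is 1. A minimal sketch of that equivalence; the helper name and shapes are illustrative, not part of the original function.

import tensorflow as tf


def add_mention_marker(output, mention_ids, width):
    """output: [batch_size, seq_length, width]; mention_ids: 0/1 ints of
    shape [batch_size, seq_length]."""
    marker = tf.get_variable(
        name="mention_marker",
        shape=[1, width],
        initializer=tf.zeros_initializer())
    # A 0/1 mask times the single marker row adds the marker vector at the
    # marked positions and leaves all other positions unchanged.
    mask = tf.cast(mention_ids, tf.float32)[:, :, tf.newaxis]  # [batch, seq, 1]
    return output + mask * marker                              # [batch, seq, width]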