예제 #1
0
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

    return logits
예제 #2
0
    def __init__(self, config, input_embedding, input_mask=None):
        input_shape = modeling.get_shape_list(input_embedding, expected_rank=3)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, seq_length],
                                 dtype=tf.int32)

        # Keep variable names the same as BERT
        with tf.variable_scope("bert"):
            with tf.variable_scope("encoder"):
                attention_mask = modeling.create_attention_mask_from_input_mask(
                    input_embedding, input_mask)

                all_encoder_layers = modeling.transformer_model(
                    input_tensor=input_embedding,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.
                    attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)

                self.sequence_output = all_encoder_layers[-1]
예제 #3
0
 def __init__(
     self,
 ):
     self.X = tf.placeholder(tf.int32, [None, None])
     
     model = modeling.BertModel(
         config=bert_config,
         is_training=False,
         input_ids=self.X,
         use_one_hot_embeddings=False)
     
     output_layer = model.get_sequence_output()
     embedding = model.get_embedding_table()
     
     with tf.variable_scope('cls/predictions'):
         with tf.variable_scope('transform'):
             input_tensor = tf.layers.dense(
                 output_layer,
                 units = bert_config.hidden_size,
                 activation = modeling.get_activation(bert_config.hidden_act),
                 kernel_initializer = modeling.create_initializer(
                     bert_config.initializer_range
                 ),
             )
             input_tensor = modeling.layer_norm(input_tensor)
         
         output_bias = tf.get_variable(
         'output_bias',
         shape = [bert_config.vocab_size],
         initializer = tf.zeros_initializer(),
         )
         logits = tf.matmul(input_tensor, embedding, transpose_b = True)
         self.logits = tf.nn.bias_add(logits, output_bias)
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions, top_k_indices, truncation_factor):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs_student = tf.nn.log_softmax(logits, axis=-1)
    probs_student = tf.nn.softmax(logits, axis=-1)

    prob_shape = tf.shape(log_probs_student)
    new_shape = [prob_shape[0], truncation_factor] #[batch_size*seq_len,truncation_factor]

    top_k_indices = tf.reshape(top_k_indices, new_shape)
    top_k_log_probs_student = tf.batch_gather(log_probs_student, top_k_indices)
    top_k_probs_student = tf.batch_gather(probs_student, top_k_indices)

    return top_k_log_probs_student, top_k_probs_student
예제 #5
0
    def get_masked_lm_output(self):
        self.input_tensor = self.gather_indexes()
        with tf.variable_scope("cls/predictions"):
            # We apply one more non-linear transformation before the output layer.
            # This matrix is not used after pre-training.
            with tf.variable_scope("transform"):
                self.input_tensor = tf.layers.dense(
                    self.input_tensor,
                    units=self.bert_config.hidden_size,
                    activation=modeling.get_activation(
                        self.bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        self.bert_config.initializer_range))
                self.input_tensor = modeling.layer_norm(self.input_tensor)
            # The output weights are the same as the input embeddings, but there is
            # an output-only bias for each token.
            output_bias = tf.get_variable("output_bias",
                                          shape=[self.bert_config.vocab_size],
                                          initializer=tf.zeros_initializer())
            logits = tf.matmul(self.input_tensor,
                               self.output_weights,
                               transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            flat_masked_lm_ids = tf.reshape(self.masked_lm_ids, [-1])
            one_hot_labels = tf.one_hot(flat_masked_lm_ids,
                                        depth=self.bert_config.vocab_size,
                                        dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                              axis=[-1])

            # TODO: dynamic gather from per_example_loss???
            loss = tf.reshape(per_example_loss,
                              [-1, tf.shape(self.masked_lm_positions)[1]])
            return loss
    def compute_image_transformer(
        self,
        input_ids,
        input_image,
        input_image_mask,
        input_positions,
        reuse=None,
    ):
        """Build the image transformer."""
        with tf.variable_scope(self.scope_prefix + "transformer", reuse=reuse):
            with tf.variable_scope("bridge"):
                image_emb = tf.layers.dense(
                    inputs=input_image,
                    units=self.config.hidden_size,
                    activation=tf.nn.relu,
                    kernel_initializer=modeling.create_initializer(
                        self.config.initializer_range),
                    reuse=reuse)

            with tf.variable_scope("embeddings"):
                input_emb = tf.gather(self.embedding_table, input_ids)
                image_emb = tf.concat([input_emb, image_emb], axis=1)
                batch_size = tensor_utils.shape(image_emb, 0)
                sequence_length = tensor_utils.shape(image_emb, 1)
                position_emb = tf.gather(self.image_region_table,
                                         input_positions)
                position_emb = tf.pad(position_emb, [[0, 0], [1, 0], [0, 0]])
                input_order = tf.range(tensor_utils.shape(image_emb, 1))
                input_order = tf.tile(tf.expand_dims(input_order, 0),
                                      [tensor_utils.shape(image_emb, 0), 1])
                order_emb = tf.gather(self.image_order_table, input_order)
                input_segment_id = tf.fill([batch_size, sequence_length],
                                           self.IMG)
                segment_emb = tf.gather(self.segment_table, input_segment_id)
                input_emb = image_emb + position_emb + order_emb + segment_emb
                input_emb = modeling.layer_norm_and_dropout(
                    input_emb, self.config.hidden_dropout_prob)

            with tf.variable_scope("image/encoder"):
                sequence_output, output_cache = compute_transformer(
                    input_tensor=input_emb,
                    attention_mask=tf.expand_dims(input_image_mask, 1),
                    hidden_size=self.config.hidden_size,
                    num_hidden_layers=self.config.num_hidden_layers,
                    num_attention_heads=self.config.num_attention_heads,
                    intermediate_size=self.config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        self.config.hidden_act),
                    hidden_dropout_prob=self.config.hidden_dropout_prob,
                    attention_probs_dropout_prob=(
                        self.config.attention_probs_dropout_prob),
                    initializer_range=self.config.initializer_range,
                    input_cache=None)
            return sequence_output, output_cache
예제 #7
0
    def build_bert_model(self, input_ids, input_mask, token_type_ids):
        with tf.variable_scope('bert'):
            with tf.variable_scope("embeddings"):
                # Perform embedding lookup on the word ids.
                (embedding_output, _) = modeling.embedding_lookup(
                    input_ids=input_ids,
                    vocab_size=self.bert_config.vocab_size,
                    embedding_size=self.bert_config.hidden_size,
                    initializer_range=self.bert_config.initializer_range,
                    word_embedding_name="word_embeddings",
                    use_one_hot_embeddings=False)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                embedding_output = modeling.embedding_postprocessor(
                    input_tensor=embedding_output,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=self.bert_config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=self.bert_config.initializer_range,
                    max_position_embeddings=self.bert_config.
                    max_position_embeddings,
                    dropout_prob=self.bert_config.hidden_dropout_prob)

            with tf.variable_scope("encoder"):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = modeling.create_attention_mask_from_input_mask(
                    input_ids, input_mask)

                # Run the stacked transformer, only fetching the final lyaer
                # `final_layer` shape = [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = modeling.transformer_model(
                    input_tensor=embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=self.bert_config.hidden_size,
                    num_hidden_layers=self.bert_config.num_hidden_layers,
                    num_attention_heads=self.bert_config.num_attention_heads,
                    intermediate_size=self.bert_config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                      self.bert_config.hidden_act),
                    hidden_dropout_prob=self.bert_config.hidden_dropout_prob,
                    attention_probs_dropout_prob=\
                      self.bert_config.attention_probs_dropout_prob,
                    initializer_range=self.bert_config.initializer_range,
                    do_return_all_layers=True
                )

            self.sequence_output = self.all_encoder_layers[-1]
예제 #8
0
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_weights, truncated_masked_lm_probs_teacher,
                         top_k_indices, truncation_factor):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs_student = tf.nn.log_softmax(logits, axis=-1)

        label_weights = tf.reshape(label_weights, [-1])

        prob_shape = tf.shape(log_probs_student)
        new_shape = [prob_shape[0], truncation_factor
                     ]  #[batch_size*seq_len,truncation_factor]

        top_k_indices = tf.reshape(top_k_indices, new_shape)
        top_k_log_probs_student = tf.batch_gather(log_probs_student,
                                                  top_k_indices)

        truncated_masked_lm_probs_teacher = tf.reshape(
            truncated_masked_lm_probs_teacher, new_shape)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(
            truncated_masked_lm_probs_teacher * top_k_log_probs_student,
            axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs_student)
예제 #9
0
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights_flat = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])

        numerator = tf.reduce_sum(label_weights_flat * per_example_loss)
        denominator = tf.reduce_sum(label_weights_flat) + 1e-5
        loss = numerator / denominator

        batch_size = tf.cast(tf.shape(label_weights)[0], tf.float32)
        print('==============')
        print(label_weights.shape)
        print('==============')
        loss = batch_size * loss

    return (loss, per_example_loss, log_probs)
 def build(self, unused_input_shapes):
   """Implements build() for the layer."""
   self.lm_dense = tf.keras.layers.Dense(
       self.config.hidden_size,
       activation=modeling.get_activation(self.config.hidden_act),
       kernel_initializer=self.initializer)
   self.lm_bias = self.add_weight(
       shape=[self.config.vocab_size],
       name='lm_bias',
       initializer=tf.keras.initializers.Zeros())
   self.lm_layer_norm = tf.keras.layers.LayerNormalization(
       axis=-1, epsilon=1e-12)
   self.next_sentence_dense = tf.keras.layers.Dense(
       self.num_next_sentence_label, kernel_initializer=self.initializer)
   super(BertPretrainLayer, self).build(unused_input_shapes)
    def bert_module_fn(is_training):
        """Spec function for a token embedding module."""

        input_ids = tf.placeholder(shape=[None, None], dtype=tf.int32, name="input_ids")

        bert_config = modeling.BertConfig.from_json_file(config_path)
        model = modeling.BertModel(config=bert_config, is_training=is_training,
                                   input_ids=input_ids)

        output_layer = model.get_sequence_output()
        embedding = model.get_embedding_table()

        with tf.variable_scope('cls/predictions'):
            with tf.variable_scope('transform'):
                input_tensor = tf.layers.dense(
                    output_layer,
                    units=bert_config.hidden_size,
                    activation=modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        bert_config.initializer_range
                    ),
                )
                input_tensor = modeling.layer_norm(input_tensor)

            output_bias = tf.get_variable(
                'output_bias',
                shape=[bert_config.vocab_size],
                initializer=tf.zeros_initializer(),
            )
            logits = tf.matmul(input_tensor, embedding, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

        config_file = tf.constant(value=config_path, dtype=tf.string, name="config_file")
        vocab_file = tf.constant(value=vocab_path, dtype=tf.string, name="vocab_file")
        lower_case = tf.constant(do_lower_case)

        tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, config_file)
        tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, vocab_file)

        input_map = {"input_ids": input_ids}

        output_map = {"logits": logits}

        output_info_map = {"vocab_file": vocab_file,
                           "do_lower_case": lower_case}

        hub.add_signature(name="tokens", inputs=input_map, outputs=output_map)
        hub.add_signature(name="tokenization_info", inputs={}, outputs=output_info_map)
예제 #12
0
        def forward(x, segment, masks, y, reuse=False, config=bert_config):
            with tf.variable_scope('bert', reuse=reuse):
                model = modeling.BertModel(
                    config=config,
                    is_training=training,
                    input_ids=x,
                    input_mask=masks,
                    token_type_ids=segment,
                    use_one_hot_embeddings=False,
                )
                memory = model.get_sequence_output()
            with tf.variable_scope('bert', reuse=True):
                Y_seq_len = tf.count_nonzero(y, 1, dtype=tf.int32)
                y_masks = tf.sequence_mask(Y_seq_len,
                                           tf.reduce_max(Y_seq_len),
                                           dtype=tf.float32)

                model = modeling_decoder.BertModel(
                    config=config,
                    is_training=training,
                    input_ids=y,
                    input_mask=y_masks,
                    memory=memory,
                    memory_mask=masks,
                    use_one_hot_embeddings=False,
                )
                output_layer = model.get_sequence_output()
                embedding = model.get_embedding_table()

            with tf.variable_scope('cls/predictions', reuse=reuse):
                with tf.variable_scope('transform'):
                    input_tensor = tf.layers.dense(
                        output_layer,
                        units=config.hidden_size,
                        activation=modeling.get_activation(
                            bert_config.hidden_act),
                        kernel_initializer=modeling.create_initializer(
                            bert_config.initializer_range),
                    )
                input_tensor = modeling.layer_norm(input_tensor)

                output_bias = tf.get_variable(
                    'output_bias',
                    shape=[bert_config.vocab_size],
                    initializer=tf.zeros_initializer(),
                )
                logits = tf.matmul(input_tensor, embedding, transpose_b=True)
                return logits
 def compute_logits(self, target_emb, reuse=None):
     """Compute logits for word prediction."""
     with tf.variable_scope(self.scope_prefix + "cls/predictions",
                            reuse=reuse):
         with tf.variable_scope("transform"):
             target_emb = tf.layers.dense(
                 target_emb,
                 units=self.config.hidden_size,
                 activation=modeling.get_activation(self.config.hidden_act),
                 kernel_initializer=self.initializer)
             target_emb = modeling.layer_norm(target_emb)
         output_bias = tf.get_variable("output_bias",
                                       shape=[self.config.vocab_size],
                                       initializer=tf.zeros_initializer())
     logits = tf.matmul(target_emb, self.embedding_table, transpose_b=True)
     logits = tf.nn.bias_add(logits, output_bias)
     return logits
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids):
    """Get loss and log probs for the masked LM."""
    print("input tensor before gather_indexes:", input_tensor)
    input_tensor = gather_indexes(input_tensor, positions)
    print("input tensor before gather_indexes:", input_tensor)
    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)
        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        print(label_ids)
        label_ids = tf.reshape(label_ids, [-1])
        print(label_ids)
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)
        print(one_hot_labels)
        print(log_probs)
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        print(per_example_loss)

        loss = tf.reshape(per_example_loss, [-1, tf.shape(positions)[1]])
        print('positions: ', positions)
        print('loss', loss)
        # TODO: dynamic gather from per_example_loss???
    return loss
예제 #15
0
    def __init__(self, config, input_hidden, embedding_table):
        # Keep variable names the same as BERT
        with tf.variable_scope("cls"):
            with tf.variable_scope("predictions"):
                with tf.variable_scope("transform"):
                    self.transformed_output = tf.layers.dense(
                        input_hidden,
                        config.hidden_size,
                        activation=modeling.get_activation(config.hidden_act),
                        kernel_initializer=modeling.create_initializer(
                            config.initializer_range))
                    self.transformed_output = modeling.layer_norm(
                        self.transformed_output)

                output_bias = tf.Variable(tf.zeros([config.vocab_size]),
                                          name="output_bias")
                self.final_output = tf.add(
                    tf.matmul(self.transformed_output,
                              tf.transpose(embedding_table)), output_bias)
                self.probs = tf.nn.softmax(self.final_output,
                                           name='token_probs')
    def compute_transformer(
        self,
        input_ids,
        input_segment_id,
        input_positions,
        attention_mask,
        input_cache=None,
        reuse=None,
        conditional=False,
    ):
        """Build the full text transformer."""
        with tf.variable_scope(self.scope_prefix + "transformer", reuse=reuse):
            with tf.variable_scope("embeddings"):
                token_emb = tf.gather(self.embedding_table, input_ids)
                segment_embed = tf.gather(self.segment_table, input_segment_id)
                if conditional:
                    position_table = self.condition_position_table
                else:
                    position_table = self.position_table
                position_emb = tf.gather(position_table, input_positions)
                input_emb = token_emb + segment_embed + position_emb
                input_emb = modeling.layer_norm_and_dropout(
                    input_emb, self.config.hidden_dropout_prob)

            with tf.variable_scope("encoder"):
                sequence_output, output_cache = compute_transformer(
                    input_tensor=input_emb,
                    attention_mask=attention_mask,
                    hidden_size=self.config.hidden_size,
                    num_hidden_layers=self.config.num_hidden_layers,
                    num_attention_heads=self.config.num_attention_heads,
                    intermediate_size=self.config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        self.config.hidden_act),
                    hidden_dropout_prob=self.config.hidden_dropout_prob,
                    attention_probs_dropout_prob=(
                        self.config.attention_probs_dropout_prob),
                    initializer_range=self.config.initializer_range,
                    input_cache=input_cache)
            return sequence_output, output_cache
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         truncation_factor):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        masked_lm_probs = tf.nn.softmax(logits, axis=-1)
        trunc_masked_lm_probs, top_indices = tf.math.top_k(masked_lm_probs,
                                                           k=truncation_factor,
                                                           sorted=False)

        max_predictions_per_seq = positions.get_shape().as_list()[1]
        truncation_factor_ = top_indices.get_shape().as_list()[1]

        trunc_masked_lm_probs = tf.reshape(
            trunc_masked_lm_probs,
            [-1, max_predictions_per_seq, truncation_factor_])
        top_indices = tf.reshape(
            top_indices, [-1, max_predictions_per_seq, truncation_factor_])
    return trunc_masked_lm_probs, top_indices
예제 #18
0
    def __init__(
        self,
    ):
        BERT_CONFIG = "PATH_TO/multi_cased_L-12_H-768_A-12/bert_config.json"
        bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)
        self.X = tf.placeholder(tf.int32, [None, None])
        model = modeling.BertModel(
            config=bert_config,
            is_training=False,
            input_ids=self.X,
            use_one_hot_embeddings=False,
        )

        output_layer = model.get_sequence_output()
        embedding = model.get_embedding_table()

        output_layer = tf.reshape(output_layer, [-1, bert_config.hidden_size])
        with tf.variable_scope("cls/predictions"):
            with tf.variable_scope("transform"):
                input_tensor = tf.layers.dense(
                    output_layer,
                    units=bert_config.hidden_size,
                    activation=modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        bert_config.initializer_range
                    ),
                )
                input_tensor = modeling.layer_norm(input_tensor)

            output_bias = tf.get_variable(
                "output_bias",
                shape=[bert_config.vocab_size],
                initializer=tf.zeros_initializer(),
            )
            logits = tf.matmul(input_tensor, embedding, transpose_b=True)
            print("---")
            self.logits = tf.nn.bias_add(logits, output_bias)
예제 #19
0
파일: b2t.py 프로젝트: DSA-CoE/malaya
    def __init__(
        self,
        bert_config,
        input_ids,
        input_mask,
        token_type_ids,
        Y,
        is_training=True,
    ):
        self.X = input_ids
        self.segment_ids = token_type_ids
        self.input_masks = input_mask
        self.Y = Y
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        batch_size = tf.shape(self.X)[0]

        model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=self.X,
            input_mask=self.input_masks,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=False,
        )

        print(bert_config.__dict__)

        BASE_PARAMS = defaultdict(
            lambda: None,
            default_batch_size=2048,
            default_batch_size_tpu=32768,
            max_length=bert_config.max_position_embeddings,
            initializer_gain=1.0,
            vocab_size=bert_config.vocab_size,
            hidden_size=bert_config.hidden_size,
            num_hidden_layers=bert_config.num_hidden_layers,
            num_heads=bert_config.num_attention_heads,
            filter_size=bert_config.intermediate_size,
            layer_postprocess_dropout=0.1,
            attention_dropout=0.1,
            relu_dropout=0.1,
            label_smoothing=0.1,
            learning_rate=1.0,
            learning_rate_decay_rate=1.0,
            learning_rate_warmup_steps=16000,
            optimizer_adam_beta1=0.9,
            optimizer_adam_beta2=0.997,
            optimizer_adam_epsilon=1e-09,
            extra_decode_length=50,
            beam_size=4,
            alpha=0.6,
            use_tpu=False,
            static_batch=False,
            allow_ffn_pad=True,
        )

        self.decoder_stack = DecoderStack(BASE_PARAMS, is_training)
        attention_bias = model_utils.get_padding_bias(self.X)

        output_layer = model.get_sequence_output()
        pooled_output = model.get_pooled_output()
        embedding = model.get_embedding_table()

        with tf.name_scope('decode'):
            mask = tf.to_float(tf.not_equal(self.Y, 0))
            decoder_inputs = tf.gather(embedding, self.Y)
            decoder_inputs *= tf.expand_dims(mask, -1)
            with tf.name_scope('shift_targets'):
                decoder_inputs = tf.pad(decoder_inputs,
                                        [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
            with tf.name_scope('add_pos_encoding'):
                length = tf.shape(decoder_inputs)[1]
                decoder_inputs += model_utils.get_position_encoding(
                    length, BASE_PARAMS['hidden_size'])
            if is_training:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs,
                    1 - BASE_PARAMS['layer_postprocess_dropout'])
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)
            outputs = self.decoder_stack(
                decoder_inputs,
                output_layer,
                decoder_self_attention_bias,
                attention_bias,
            )

        with tf.variable_scope('cls/predictions'):
            with tf.variable_scope('transform'):
                input_tensor = tf.layers.dense(
                    outputs,
                    units=bert_config.hidden_size,
                    activation=modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        bert_config.initializer_range),
                )
            input_tensor = modeling.layer_norm(input_tensor)

            output_bias = tf.get_variable(
                'output_bias',
                shape=[bert_config.vocab_size],
                initializer=tf.zeros_initializer(),
            )
            self.training_logits = tf.matmul(input_tensor,
                                             embedding,
                                             transpose_b=True)

        print(self.training_logits)
예제 #20
0
    def __init__(self, bert_config, tokenizer, cls, sep):
        _graph = tf.Graph()
        with _graph.as_default():
            self.X = tf.placeholder(tf.int32, [None, None])
            self.top_p = tf.placeholder(tf.float32, None)
            self.top_k = tf.placeholder(tf.int32, None)
            self.k = tf.placeholder(tf.int32, None)
            self.temperature = tf.placeholder(tf.float32, None)
            self.indices = tf.placeholder(tf.int32, [None, None])
            self._tokenizer = tokenizer
            self._cls = cls
            self._sep = sep

            self.model = modeling.BertModel(
                config = bert_config,
                is_training = False,
                input_ids = self.X,
                use_one_hot_embeddings = False,
            )
            self.logits = self.model.get_pooled_output()
            output_layer = self.model.get_sequence_output()
            embedding = self.model.get_embedding_table()

            with tf.variable_scope('cls/predictions'):
                with tf.variable_scope('transform'):
                    input_tensor = tf.layers.dense(
                        output_layer,
                        units = bert_config.hidden_size,
                        activation = modeling.get_activation(
                            bert_config.hidden_act
                        ),
                        kernel_initializer = modeling.create_initializer(
                            bert_config.initializer_range
                        ),
                    )
                    input_tensor = modeling.layer_norm(input_tensor)
                output_bias = tf.get_variable(
                    'output_bias',
                    shape = [bert_config.vocab_size],
                    initializer = tf.zeros_initializer(),
                )
                logits = tf.matmul(input_tensor, embedding, transpose_b = True)
                self._logits = tf.nn.bias_add(logits, output_bias)
                self._log_softmax = tf.nn.log_softmax(self._logits)

            logits = tf.gather_nd(self._logits, self.indices)
            logits = logits / self.temperature

            def necleus():
                return top_p_logits(logits, self.top_p)

            def select_k():
                return top_k_logits(logits, self.top_k)

            logits = tf.cond(self.top_p > 0, necleus, select_k)
            self.samples = tf.multinomial(
                logits, num_samples = self.k, output_dtype = tf.int32
            )

            self._sess = tf.InteractiveSession()
            self._sess.run(tf.global_variables_initializer())
            var_lists = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert'
            )
            cls = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'cls'
            )
            self._saver = tf.train.Saver(var_list = var_lists + cls)
            attns = _extract_attention_weights(
                bert_config.num_hidden_layers, tf.get_default_graph()
            )
            self.attns = attns
예제 #21
0
    def __init__(self,
                 config,
                 is_training,
                 input_ids,
                 image_embeddings,
                 input_mask=None,
                 token_type_ids=None,
                 use_one_hot_embeddings=False,
                 scope=None):
        """Constructor for a visually grounded BertModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model. Controls
        whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      image_embeddings: float32 Tensor of shape [batch_size, seq_length, depth].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings.
      scope: (optional) variable scope. Defaults to "bert".
    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
        config = copy.deepcopy(config)
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        text_input_shape = modeling.get_shape_list(input_ids, expected_rank=2)
        batch_size = text_input_shape[0]
        text_seq_length = text_input_shape[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, text_seq_length],
                                 dtype=tf.int32)

        if token_type_ids is None:
            token_type_ids = tf.zeros(shape=[batch_size, text_seq_length],
                                      dtype=tf.int32)

        with tf.variable_scope(scope, default_name="bert"):
            with tf.variable_scope("embeddings"):
                # Perform embedding lookup on the word ids.
                (self.embedding_output,
                 self.embedding_table) = modeling.embedding_lookup(
                     input_ids=input_ids,
                     vocab_size=config.vocab_size,
                     embedding_size=config.hidden_size,
                     initializer_range=config.initializer_range,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=use_one_hot_embeddings)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output = modeling.embedding_postprocessor(
                    input_tensor=self.embedding_output,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

                # Add image embeddings the rest of the input embeddings.
                self.embedding_output += tf.layers.dense(
                    image_embeddings,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=modeling.create_initializer(
                        config.initializer_range))

            with tf.variable_scope("encoder"):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = modeling.create_attention_mask_from_input_mask(
                    self.embedding_output, input_mask)

                # Run the stacked transformer.
                # `sequence_output` shape = [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = modeling.transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.
                    attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)

            self.sequence_output = self.all_encoder_layers[-1]
            # The "pooler" converts the encoded sequence tensor of shape
            # [batch_size, seq_length, hidden_size] to a tensor of shape
            # [batch_size, hidden_size]. This is necessary for segment-level
            # (or segment-pair-level) classification tasks where we need a fixed
            # dimensional representation of the segment.
            with tf.variable_scope("pooler"):
                # We "pool" the model by simply taking the hidden state corresponding
                # to the first token. We assume that this has been pre-trained
                first_token_tensor = tf.squeeze(self.sequence_output[:,
                                                                     0:1, :],
                                                axis=1)
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=modeling.create_initializer(
                        config.initializer_range))
예제 #22
0
    def __init__(self, config, is_training, input_tensor, input_mask,
                 token_type_ids):
        """Constructor for BertFlexEmbeddingModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model. Controls
        whether dropout will be applied.
      input_tensor: float32 Tensor of shape [batch_size, seq_length,
        hidden_size].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
        config = copy.deepcopy(config)
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        with tf.variable_scope("bert", reuse=tf.compat.v1.AUTO_REUSE):
            with tf.variable_scope("embeddings"):
                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output = modeling.embedding_postprocessor(
                    input_tensor=input_tensor,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

            with tf.variable_scope("encoder"):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = modeling.create_attention_mask_from_input_mask(
                    input_tensor, input_mask)

                # Run the stacked transformer.
                # `sequence_output` shape = [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = modeling.transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.
                    attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)

            self.sequence_output = self.all_encoder_layers[-1]
            # The "pooler" converts the encoded sequence tensor of shape
            # [batch_size, seq_length, hidden_size] to a tensor of shape
            # [batch_size, hidden_size]. This is necessary for segment-level
            # (or segment-pair-level) classification tasks where we need a fixed
            # dimensional representation of the segment.
            with tf.variable_scope("pooler"):
                # We "pool" the model by simply taking the hidden state corresponding
                # to the first token. We assume that this has been pre-trained
                first_token_tensor = tf.squeeze(self.sequence_output[:,
                                                                     0:1, :],
                                                axis=1)
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=modeling.create_initializer(
                        config.initializer_range))
    def __init__(
        self,
        input_ids,
        input_mask,
        token_type_ids,
        Y,
        learning_rate=2e-5,
        is_training=True,
    ):
        self.X = input_ids
        self.segment_ids = token_type_ids
        self.input_masks = input_mask
        self.Y = Y
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        batch_size = tf.shape(self.X)[0]

        model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=self.X,
            input_mask=self.input_masks,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=False,
        )

        self.decoder_stack = DecoderStack(BASE_PARAMS, is_training)
        attention_bias = model_utils.get_padding_bias(self.X)

        output_layer = model.get_sequence_output()
        pooled_output = model.get_pooled_output()
        embedding = model.get_embedding_table()

        with tf.name_scope('decode'):
            mask = tf.to_float(tf.not_equal(self.Y, 0))
            decoder_inputs = tf.gather(embedding, self.Y)
            decoder_inputs *= tf.expand_dims(mask, -1)
            with tf.name_scope('shift_targets'):
                decoder_inputs = tf.pad(decoder_inputs,
                                        [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
            with tf.name_scope('add_pos_encoding'):
                length = tf.shape(decoder_inputs)[1]
                decoder_inputs += model_utils.get_position_encoding(
                    length, BASE_PARAMS['hidden_size'])
            if is_training:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs,
                    1 - BASE_PARAMS['layer_postprocess_dropout'])
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)
            outputs = self.decoder_stack(
                decoder_inputs,
                output_layer,
                decoder_self_attention_bias,
                attention_bias,
            )

        with tf.variable_scope('cls/predictions'):
            with tf.variable_scope('transform'):
                input_tensor = tf.layers.dense(
                    outputs,
                    units=bert_config.hidden_size,
                    activation=modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        bert_config.initializer_range),
                )
            input_tensor = modeling.layer_norm(input_tensor)

            output_bias = tf.get_variable(
                'output_bias',
                shape=[bert_config.vocab_size],
                initializer=tf.zeros_initializer(),
            )
            self.training_logits = tf.matmul(input_tensor,
                                             embedding,
                                             transpose_b=True)

        print(self.training_logits)
예제 #24
0
def create_mask_model(bert_config, is_training, input_ids, input_mask,
                      segment_ids, mask_positions, use_one_hot_embeddings):
    """Creates a classification model."""

    #print("create mask model ----------------------------------------------")
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)

    # Get the logits for the start and end predictions.
    final_hidden = model.get_sequence_output()

    final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3)
    batch_size = final_hidden_shape[0]
    seq_length = final_hidden_shape[1]
    hidden_size = final_hidden_shape[2]

    output_weights = tf.get_variable(
        "cls/nq/output_weights", [2, hidden_size + 12],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable("cls/nq/output_bias", [2],
                                  initializer=tf.zeros_initializer())

    final_hidden_matrix = tf.reshape(final_hidden,
                                     [batch_size * seq_length, hidden_size])
    mask_positions_matrix = tf.cast(tf.reshape(mask_positions,
                                               [batch_size * seq_length, 1]),
                                    dtype=tf.float32)
    padding = tf.zeros([batch_size * seq_length, 11], dtype=tf.float32)
    mask_positions_matrix = tf.concat([mask_positions_matrix, padding],
                                      axis=-1)
    final_hidden_matrix = tf.concat(
        [final_hidden_matrix, mask_positions_matrix], axis=-1)
    final_hidden_matrix = tf.reshape(
        final_hidden_matrix, [batch_size, seq_length, hidden_size + 12])
    attention_mask = modeling.create_attention_mask_from_input_mask(
        input_ids, input_mask)
    config = bert_config
    all_encoder_layers = modeling.transformer_model(
        input_tensor=final_hidden_matrix,
        attention_mask=attention_mask,
        hidden_size=config.hidden_size + 12,  # input hidden size
        num_hidden_layers=1,  #config.num_hidden_layers,
        num_attention_heads=config.num_attention_heads,
        intermediate_size=config.intermediate_size,
        intermediate_act_fn=modeling.get_activation(config.hidden_act),
        hidden_dropout_prob=config.hidden_dropout_prob,
        attention_probs_dropout_prob=config.attention_probs_dropout_prob,
        initializer_range=config.initializer_range,
        do_return_all_layers=True)
    #print(all_encoder_layers.shape)
    transformer_output_matrix = all_encoder_layers[-1]

    transformer_output_matrix = tf.reshape(
        transformer_output_matrix, [batch_size * seq_length, hidden_size + 12])
    logits = tf.matmul(transformer_output_matrix,
                       output_weights,
                       transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)

    logits = tf.reshape(logits, [batch_size, seq_length, 2])
    logits = tf.transpose(logits, [2, 0, 1])

    unstacked_logits = tf.unstack(logits, axis=0)

    (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1])

    # Get the logits for the answer type prediction.
    answer_type_output_layer = model.get_pooled_output()
    answer_type_hidden_size = answer_type_output_layer.shape[-1].value

    num_answer_types = 5  # YES, NO, UNKNOWN, SHORT, LONG
    answer_type_output_weights = tf.get_variable(
        "answer_type_output_weights",
        [num_answer_types, answer_type_hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    answer_type_output_bias = tf.get_variable(
        "answer_type_output_bias", [num_answer_types],
        initializer=tf.zeros_initializer())

    answer_type_logits = tf.matmul(answer_type_output_layer,
                                   answer_type_output_weights,
                                   transpose_b=True)
    answer_type_logits = tf.nn.bias_add(answer_type_logits,
                                        answer_type_output_bias)

    return (start_logits, end_logits, answer_type_logits)
예제 #25
0
def main(args):
    bert_config = modeling.BertConfig.from_json_file(args.config)
    bert_config.hidden_dropout_prob = 0.0
    bert_config.attention_probs_dropout_prob = 0.0

    batch_size = args.batch_size
    avg_seq_len = args.avg_seq_length
    max_seq_len = args.max_seq_length
    tf_dtype = tf.float16 if args.precision == 'fp16' else tf.float32

    # fake input array length
    input_len = np.random.randint(low=2 * avg_seq_len - max_seq_len,
                                  high=max_seq_len + 1,
                                  size=(batch_size),
                                  dtype=np.int32)
    valid_word_num = sum(input_len)

    # fake input id and mask
    input_ids = np.random.randint(low=0,
                                  high=bert_config.vocab_size,
                                  size=(batch_size, max_seq_len),
                                  dtype=np.int32)
    input_mask = np.zeros((batch_size, max_seq_len), dtype=np.int32)
    for b_idx, s_len in enumerate(input_len):
        input_mask[b_idx][:s_len] = 1

    input_ids_tensor = tf.convert_to_tensor(input_ids, dtype=tf.int32)
    input_mask_tensor = tf.convert_to_tensor(input_mask, dtype=tf.int32)

    # fake embedding output
    embed_output = np.random.randn(batch_size, max_seq_len,
                                   bert_config.hidden_size)
    input_tensor = tf.convert_to_tensor(embed_output, dtype=tf_dtype)

    # keep attention_mask for compatible reason
    att_mask = np.tile(input_mask, max_seq_len)
    att_mask = att_mask.reshape(batch_size, max_seq_len, max_seq_len)
    attention_mask = tf.convert_to_tensor(att_mask, dtype=tf_dtype)

    # input info
    valid_word_num = sum(input_len)
    print("Valid word num : {}/{}, avg sequence length : {:.6} ".format(
        valid_word_num, batch_size * max_seq_len, valid_word_num / batch_size))

    # bert with standard transformer
    std_bert = modeling.transformer_model(
        input_tensor=input_tensor,
        attention_mask=attention_mask,
        hidden_size=bert_config.hidden_size,
        num_hidden_layers=bert_config.num_hidden_layers,
        num_attention_heads=bert_config.num_attention_heads,
        intermediate_size=bert_config.intermediate_size,
        intermediate_act_fn=modeling.get_activation(bert_config.hidden_act),
        hidden_dropout_prob=bert_config.hidden_dropout_prob,
        attention_probs_dropout_prob=bert_config.attention_probs_dropout_prob,
        initializer_range=bert_config.initializer_range,
        do_return_all_layers=False)

    config = tf.ConfigProto()
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    with tf.Session(config=config) as sess:
        # init weights
        sess.run(tf.global_variables_initializer())

        # get transformer weights
        all_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        transformer_vars = [v for v in all_vars if v.name.startswith('layer')]
        weights_value = sess.run(transformer_vars)

        # bert with effective transformer
        et_bert = effective_transformer.get_sequence_output(
            max_batch_size=batch_size,
            max_seq_length=max_seq_len,
            config=bert_config,
            attention_mask=attention_mask,
            input_mask=input_mask_tensor,
            from_tensor=input_tensor,
            weights_value=weights_value,
        )

        # diff
        val1 = sess.run(std_bert).reshape(-1, 768)
        val2 = sess.run(et_bert).reshape(-1, 768)
        diff = []
        for b_idx, s_len in enumerate(input_len):
            for w_idx in range(s_len):
                idx = b_idx * args.max_seq_length + w_idx
                diff.append(np.fabs(val1[idx] - val2[idx]).max())
        print("max diff : {:.6}, avg diff : {:.6}.".format(
            max(diff),
            sum(diff) / len(diff)))

        def time_inference(output_tensor):
            iter_num = 128
            # warm up
            for i in range(10):
                sess.run(output_tensor)

            beg = datetime.now()
            for i in range(iter_num):
                sess.run(output_tensor)
            end = datetime.now()
            return (end - beg).total_seconds() * 1000 / iter_num  # ms

        print("xla cost : {:.6} ms".format(time_inference(std_bert)))
        print("et  cost : {:.6} ms".format(time_inference(et_bert)))
예제 #26
0
    def __call__(self, features, hidden_feature, mode, problem_name):
        """Get loss and log probs for the masked LM.

        DO NOT CHANGE THE VARAIBLE SCOPE.
        """
        seq_hidden_feature = hidden_feature['seq']
        positions = features['masked_lm_positions']
        input_tensor = gather_indexes(seq_hidden_feature, positions)
        output_weights = hidden_feature['embed_table']
        label_ids = features['masked_lm_ids']
        label_weights = features['masked_lm_weights']

        with tf.variable_scope("cls/predictions"):
            # We apply one more non-linear transformation before the output layer.
            # This matrix is not used after pre-training.
            with tf.variable_scope("transform"):
                input_tensor = tf.layers.dense(
                    input_tensor,
                    units=self.params.mask_lm_hidden_size,
                    activation=modeling.get_activation(
                        self.params.mask_lm_hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        self.params.mask_lm_initializer_range))
                input_tensor = modeling.layer_norm(input_tensor)

            # The output weights are the same as the input embeddings, but there is
            # an output-only bias for each token.
            output_bias = tf.get_variable("output_bias",
                                          shape=[self.params.vocab_size],
                                          initializer=tf.zeros_initializer())

            logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            self.logits = logits
            log_probs = tf.nn.log_softmax(logits, axis=-1)

            if mode == tf.estimator.ModeKeys.PREDICT:
                self.prob = log_probs
                return self.prob

            else:

                label_ids = tf.reshape(label_ids, [-1])
                label_weights = tf.reshape(label_weights, [-1])

                one_hot_labels = tf.one_hot(label_ids,
                                            depth=self.params.vocab_size,
                                            dtype=tf.float32)

                # The `positions` tensor might be zero-padded (if the sequence is too
                # short to have the maximum number of predictions). The `label_weights`
                # tensor has a value of 1.0 for every real prediction and 0.0 for the
                # padding predictions.
                per_example_loss = - \
                    tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
                numerator = tf.reduce_sum(label_weights * per_example_loss)
                denominator = tf.reduce_sum(label_weights) + 1e-5
                loss = numerator / denominator

                if mode == tf.estimator.ModeKeys.TRAIN:
                    self.loss = loss
                    return self.loss

                else:

                    def metric_fn(masked_lm_example_loss, masked_lm_log_probs,
                                  masked_lm_ids, masked_lm_weights):
                        """Computes the loss and accuracy of the model."""
                        masked_lm_log_probs = tf.reshape(
                            masked_lm_log_probs,
                            [-1, masked_lm_log_probs.shape[-1]])
                        masked_lm_predictions = tf.argmax(masked_lm_log_probs,
                                                          axis=-1,
                                                          output_type=tf.int32)
                        masked_lm_example_loss = tf.reshape(
                            masked_lm_example_loss, [-1])
                        masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
                        masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
                        masked_lm_accuracy = tf.metrics.accuracy(
                            labels=masked_lm_ids,
                            predictions=masked_lm_predictions,
                            weights=masked_lm_weights)
                        masked_lm_mean_loss = tf.metrics.mean(
                            values=masked_lm_example_loss,
                            weights=masked_lm_weights)

                        return {
                            "masked_lm_accuracy": masked_lm_accuracy,
                            "masked_lm_loss": masked_lm_mean_loss,
                        }

                    eval_metrics = (metric_fn(per_example_loss, log_probs,
                                              label_ids, label_weights), loss)

                    self.eval_metrics = eval_metrics
                    return self.eval_metrics
예제 #27
0
def aggregate_embedding(embeddings,
                        segment_idx,
                        aggregator,
                        config=None,
                        aux=None,
                        name=None):
    # segment_idx denotes different needles, rather than rows.
    if aggregator == 'segment_sqrt_n':
        denom = to_col(
            tf.sqrt(
                tf.to_float(
                    tf.segment_sum(tf.ones_like(segment_idx), segment_idx))))
        output_layer = tf.div_no_nan(tf.segment_sum(embeddings, segment_idx),
                                     denom,
                                     name=name)
    elif aggregator in ['segment_sum', 'segment_mean']:
        output_layer = getattr(tf, aggregator)(embeddings,
                                               segment_idx,
                                               name=name)
    else:
        del embeddings
        assert aggregator.startswith('transformer')
        flags = {}
        if '^' in aggregator:
            flags = [
                kv.split('@')
                for kv in filter(None,
                                 aggregator.split('^')[1].split(','))
            ]
            flags = {k: eval(v) for k, v in flags}
        assert config is not None and aux is not None
        needle_pos = aux['needle_pos']
        embedding_output = aux['sequence_output']
        batch_idx2 = aux['batch_idx2']  # different rows.
        is_training = aux['is_training']
        attention_mask = get_dense_mask(needle_pos, batch_idx2,
                                        tf.shape(embedding_output)[:2])
        with tf.variable_scope('final_transformer'):
            all_encoder_layers = modeling.transformer_model(
                input_tensor=embedding_output,
                attention_mask=attention_mask,
                hidden_size=config.
                hidden_size,  # this must agree with input width.
                num_hidden_layers=flags.get('num_hidden_layers', 1),
                num_attention_heads=flags.get('num_attention_heads',
                                              config.num_attention_heads),
                intermediate_size=flags.get('intermediate_size',
                                            config.intermediate_size),
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=flags.get('hidden_dropout_prob',
                                              config.hidden_dropout_prob) *
                int(is_training),
                attention_probs_dropout_prob=int(is_training) *
                flags.get('attention_probs_dropout_prob',
                          config.attention_probs_dropout_prob),
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            first_token_tensor = all_encoder_layers[-1][:, 0, :]
            output_layer = tf.layers.dense(
                first_token_tensor,
                config.hidden_size,
                activation=tf.tanh,
                kernel_initializer=modeling.create_initializer(
                    config.initializer_range))

    return output_layer
예제 #28
0
    def __init__(self,
                 config,
                 use_one_hot_embeddings=True,
                 num_labels=2,
                 max_seq_length=128):
        """Constructor for BertModel.

        Args:
          config: `BertConfig` instance.
          is_training: bool. true for training model, false for eval model. Controls
            whether dropout will be applied.
          input_ids: int32 Tensor of shape [batch_size, seq_length].
          input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
          token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
          use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
            embeddings or tf.embedding_lookup() for the word embeddings. On the TPU,
            it is much faster if this is True, on the CPU or GPU, it is faster if
            this is False.
          scope: (optional) variable scope. Defaults to "bert".

        Raises:
          ValueError: The config is invalid or one of the input tensor shapes
            is invalid.
        """
        self.input_ids = tf.placeholder(dtype=tf.int32,
                                        shape=(None, max_seq_length))
        self.input_mask = tf.placeholder(dtype=tf.int8,
                                         shape=(None, max_seq_length))

        config = copy.deepcopy(config)

        input_shape = modeling.get_shape_list(self.input_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        token_type_ids = tf.zeros(shape=[batch_size, seq_length],
                                  dtype=tf.int32)

        with tf.variable_scope("bert", reuse=tf.AUTO_REUSE):
            with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE):
                # Perform embedding lookup on the word ids.
                (self.embedding_output,
                 self.embedding_table) = modeling.embedding_lookup(
                     input_ids=self.input_ids,
                     vocab_size=config.vocab_size,
                     embedding_size=config.hidden_size,
                     initializer_range=config.initializer_range,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=use_one_hot_embeddings)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output = modeling.embedding_postprocessor(
                    input_tensor=self.embedding_output,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

            with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = modeling.create_attention_mask_from_input_mask(
                    self.input_ids, self.input_mask)

                # Run the stacked transformer.
                # `sequence_output` shape = [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = modeling.transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.
                    attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)

            self.sequence_output = self.all_encoder_layers[-1]
            # The "pooler" converts the encoded sequence tensor of shape
            # [batch_size, seq_length, hidden_size] to a tensor of shape
            # [batch_size, hidden_size]. This is necessary for segment-level
            # (or segment-pair-level) classification tasks where we need a fixed
            # dimensional representation of the segment.
            with tf.variable_scope("pooler", reuse=tf.AUTO_REUSE):
                # We "pool" the model by simply taking the hidden state corresponding
                # to the first token. We assume that this has been pre-trained
                first_token_tensor = tf.squeeze(self.sequence_output[:,
                                                                     0:1, :],
                                                axis=1)
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=modeling.create_initializer(
                        config.initializer_range))

        # define output_weights and output_bias
        hidden_size = self.pooled_output.shape[-1].value
        with tf.variable_scope("", reuse=tf.AUTO_REUSE):
            self.output_weights = tf.get_variable(
                "output_weights", [num_labels, hidden_size],
                initializer=tf.truncated_normal_initializer(stddev=0.02))
            self.output_bias = tf.get_variable(
                "output_bias", [num_labels],
                initializer=tf.zeros_initializer())
예제 #29
0
def create_bilstm_classification_model(bert_config,
                                       is_training,
                                       response_input_ids,
                                       response_input_mask,
                                       response_segment_ids,
                                       response_text_len,
                                       response_labels,
                                       random_forward_input_ids,
                                       random_forward_input_mask,
                                       random_forward_segment_ids,
                                       random_forward_text_len,
                                       random_backward_input_ids,
                                       random_backward_input_mask,
                                       random_backward_segment_ids,
                                       random_backward_text_len,
                                       random_labels,
                                       swap_forward_input_ids,
                                       swap_forward_input_mask,
                                       swap_forward_segment_ids,
                                       swap_forward_text_len,
                                       swap_backward_input_ids,
                                       swap_backward_input_mask,
                                       swap_backward_segment_ids,
                                       swap_backward_text_len,
                                       swap_labels,
                                       nli_forward_input_ids,
                                       nli_forward_input_mask,
                                       nli_forward_segment_ids,
                                       nli_forward_text_len,
                                       nli_backward_input_ids,
                                       nli_backward_input_mask,
                                       nli_backward_segment_ids,
                                       nli_backward_text_len,
                                       nli_labels,
                                       num_nli_labels,
                                       use_one_hot_embeddings,
                                       l2_reg_lambda=0.1,
                                       dropout_rate=1.0,
                                       lstm_size=None,
                                       num_layers=1):

    config = copy.deepcopy(bert_config)

    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    with tf.variable_scope("bert", reuse=tf.AUTO_REUSE):

        with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE):
            (response_embedding_output,
             response_embedding_table) = modeling.embedding_lookup(
                 input_ids=response_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            response_embedding_output = modeling.embedding_postprocessor(
                input_tensor=response_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=response_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            # random detection
            # Perform embedding lookup on the word ids.
            (random_foward_embedding_output,
             random_forward_embedding_table) = modeling.embedding_lookup(
                 input_ids=random_forward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            # Perform embedding lookup on the word ids.
            (random_backward_embedding_output,
             random_backward_embedding_table) = modeling.embedding_lookup(
                 input_ids=random_backward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            random_foward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=random_foward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=random_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            random_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=random_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=random_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            # swap detection
            (swap_foward_embedding_output,
             swap_forward_embedding_table) = modeling.embedding_lookup(
                 input_ids=swap_forward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            (swap_backward_embedding_output,
             swap_backward_embedding_table) = modeling.embedding_lookup(
                 input_ids=swap_backward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)
            swap_foward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=swap_foward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=swap_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)
            swap_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=swap_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=swap_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            # # generic detection
            # (generic_foward_embedding_output, generic_forward_embedding_table) = modeling.embedding_lookup(
            #     input_ids=generic_forward_input_ids,
            #     vocab_size=config.vocab_size,
            #     embedding_size=config.hidden_size,
            #     initializer_range=config.initializer_range,
            #     word_embedding_name="word_embeddings",
            #     use_one_hot_embeddings=use_one_hot_embeddings)
            # (generic_backward_embedding_output, generic_backward_embedding_table) = modeling.embedding_lookup(
            #     input_ids=generic_backward_input_ids,
            #     vocab_size=config.vocab_size,
            #     embedding_size=config.hidden_size,
            #     initializer_range=config.initializer_range,
            #     word_embedding_name="word_embeddings",
            #     use_one_hot_embeddings=use_one_hot_embeddings)
            # generic_foward_embedding_output = modeling.embedding_postprocessor(
            #     input_tensor=generic_foward_embedding_output,
            #     use_token_type=not config.roberta,
            #     token_type_ids=generic_forward_segment_ids,
            #     token_type_vocab_size=config.type_vocab_size,
            #     token_type_embedding_name="token_type_embeddings",
            #     use_position_embeddings=True,
            #     position_embedding_name="position_embeddings",
            #     initializer_range=config.initializer_range,
            #     max_position_embeddings=config.max_position_embeddings,
            #     dropout_prob=config.hidden_dropout_prob,
            #     roberta=config.roberta)
            # generic_backward_embedding_output = modeling.embedding_postprocessor(
            #     input_tensor=generic_backward_embedding_output,
            #     use_token_type=not config.roberta,
            #     token_type_ids=generic_backward_segment_ids,
            #     token_type_vocab_size=config.type_vocab_size,
            #     token_type_embedding_name="token_type_embeddings",
            #     use_position_embeddings=True,
            #     position_embedding_name="position_embeddings",
            #     initializer_range=config.initializer_range,
            #     max_position_embeddings=config.max_position_embeddings,
            #     dropout_prob=config.hidden_dropout_prob,
            #     roberta=config.roberta)

            # nli detection
            (nli_foward_embedding_output,
             nli_forward_embedding_table) = modeling.embedding_lookup(
                 input_ids=nli_forward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)
            (nli_backward_embedding_output,
             nli_backward_embedding_table) = modeling.embedding_lookup(
                 input_ids=nli_backward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)
            nli_foward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=nli_foward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=nli_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)
            nli_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=nli_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=nli_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            response_attention_mask = modeling.create_attention_mask_from_input_mask(
                response_input_ids, response_input_mask)
            # [batch_size, from_seq_length, to_seq_length]
            # mask future tokens
            diag_vals = tf.ones_like(response_attention_mask[0, :, :])
            tril = tf.linalg.LinearOperatorLowerTriangular(
                diag_vals).to_dense()
            future_masks = tf.tile(tf.expand_dims(
                tril, 0), [tf.shape(response_attention_mask)[0], 1, 1])
            response_attention_mask = tf.math.multiply(response_attention_mask,
                                                       future_masks)
            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            response_all_encoder_layers = modeling.transformer_model(
                input_tensor=response_embedding_output,
                attention_mask=response_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.
                attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # random detection
            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
            # mask of shape [batch_size, seq_length, seq_length] which is used
            # for the attention scores.
            random_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                random_forward_input_ids, random_forward_input_mask)
            random_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                random_backward_input_ids, random_backward_input_mask)
            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            random_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=random_foward_embedding_output,
                attention_mask=random_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.
                attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
            random_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=random_backward_embedding_output,
                attention_mask=random_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.
                attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # swap detection
            swap_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                swap_forward_input_ids, swap_forward_input_mask)
            swap_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                swap_backward_input_ids, swap_backward_input_mask)
            swap_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=swap_foward_embedding_output,
                attention_mask=swap_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.
                attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
            swap_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=swap_backward_embedding_output,
                attention_mask=swap_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.
                attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # # generic detection
            # generic_forward_attention_mask = modeling.create_attention_mask_from_input_mask(generic_forward_input_ids,
            #                                                                                 generic_forward_input_mask)
            # generic_backward_attention_mask = modeling.create_attention_mask_from_input_mask(generic_backward_input_ids,
            #                                                                                  generic_backward_input_mask)
            # generic_forward_all_encoder_layers = modeling.transformer_model(
            #     input_tensor=generic_foward_embedding_output,
            #     attention_mask=generic_forward_attention_mask,
            #     hidden_size=config.hidden_size,
            #     num_hidden_layers=config.num_hidden_layers,
            #     num_attention_heads=config.num_attention_heads,
            #     intermediate_size=config.intermediate_size,
            #     intermediate_act_fn=modeling.get_activation(config.hidden_act),
            #     hidden_dropout_prob=config.hidden_dropout_prob,
            #     attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            #     initializer_range=config.initializer_range,
            #     do_return_all_layers=True)
            # generic_backward_all_encoder_layers = modeling.transformer_model(
            #     input_tensor=generic_backward_embedding_output,
            #     attention_mask=generic_backward_attention_mask,
            #     hidden_size=config.hidden_size,
            #     num_hidden_layers=config.num_hidden_layers,
            #     num_attention_heads=config.num_attention_heads,
            #     intermediate_size=config.intermediate_size,
            #     intermediate_act_fn=modeling.get_activation(config.hidden_act),
            #     hidden_dropout_prob=config.hidden_dropout_prob,
            #     attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            #     initializer_range=config.initializer_range,
            #     do_return_all_layers=True)

            # nli detection
            nli_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                nli_forward_input_ids, nli_forward_input_mask)
            nli_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                nli_backward_input_ids, nli_backward_input_mask)
            nli_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=nli_foward_embedding_output,
                attention_mask=nli_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.
                attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
            nli_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=nli_backward_embedding_output,
                attention_mask=nli_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.
                attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

        random_forward_embedding = random_forward_all_encoder_layers[-2]
        random_backward_embedding = random_backward_all_encoder_layers[-2]
        swap_forward_embedding = swap_forward_all_encoder_layers[-2]
        swap_backward_embedding = swap_backward_all_encoder_layers[-2]
        # generic_forward_embedding = generic_forward_all_encoder_layers[-2]
        # generic_backward_embedding = generic_backward_all_encoder_layers[-2]
        nli_forward_embedding = nli_forward_all_encoder_layers[-2]
        nli_backward_embedding = nli_backward_all_encoder_layers[-2]
        response_embedding = response_all_encoder_layers[-2]

    response_embedding_shape = modeling.get_shape_list(response_embedding,
                                                       expected_rank=3)
    with tf.variable_scope("lm_head", reuse=tf.AUTO_REUSE):

        response_logits = tf.layers.dense(response_embedding,
                                          config.hidden_size,
                                          activation=None)
        response_logits = modeling.gelu(response_logits)
        response_logits = modeling.layer_norm(response_logits)
        response_outputs = tf.layers.dense(
            response_logits,
            config.vocab_size,
            activation=None,
            use_bias=True,
            bias_initializer=tf.zeros_initializer())

        response_one_hot = tf.one_hot(response_labels,
                                      depth=config.vocab_size,
                                      dtype=tf.float32)

        lm_cost = tf.nn.softmax_cross_entropy_with_logits(
            labels=response_one_hot, logits=response_outputs)

        sequence_mask = tf.sequence_mask(response_text_len,
                                         maxlen=response_embedding_shape[1],
                                         dtype=tf.float32)

        masked_lm_cost = tf.math.multiply(lm_cost, sequence_mask)

        final_lm_loss = tf.reduce_mean(
            tf.math.divide(tf.reduce_sum(masked_lm_cost, axis=1),
                           tf.cast(response_text_len, dtype=tf.float32)))

        perplexity = tf.exp(
            tf.math.divide(tf.reduce_sum(masked_lm_cost, axis=1),
                           tf.cast(response_text_len, dtype=tf.float32)))

    random_forward_embedding_shape = modeling.get_shape_list(
        random_forward_embedding, expected_rank=3)
    random_backward_embedding_shape = modeling.get_shape_list(
        random_backward_embedding, expected_rank=3)
    assert random_forward_embedding_shape[
        2] == random_backward_embedding_shape[2]
    random_forward_embedding = tf.transpose(random_forward_embedding,
                                            [1, 0, 2])
    random_backward_embedding = tf.transpose(random_backward_embedding,
                                             [1, 0, 2])
    random_forward_input_mask = tf.cast(
        tf.transpose(random_forward_input_mask, [1, 0]), tf.float32)
    random_backward_input_mask = tf.cast(
        tf.transpose(random_backward_input_mask, [1, 0]), tf.float32)

    swap_forward_embedding_shape = modeling.get_shape_list(
        swap_forward_embedding, expected_rank=3)
    swap_backward_embedding_shape = modeling.get_shape_list(
        swap_backward_embedding, expected_rank=3)
    assert swap_forward_embedding_shape[2] == swap_backward_embedding_shape[2]
    swap_forward_embedding = tf.transpose(swap_forward_embedding, [1, 0, 2])
    swap_backward_embedding = tf.transpose(swap_backward_embedding, [1, 0, 2])
    swap_forward_input_mask = tf.cast(
        tf.transpose(swap_forward_input_mask, [1, 0]), tf.float32)
    swap_backward_input_mask = tf.cast(
        tf.transpose(swap_backward_input_mask, [1, 0]), tf.float32)

    # generic_forward_embedding_shape = modeling.get_shape_list(generic_forward_embedding, expected_rank=3)
    # generic_backward_embedding_shape = modeling.get_shape_list(generic_backward_embedding, expected_rank=3)
    # assert generic_forward_embedding_shape[2] == generic_backward_embedding_shape[2]
    # generic_forward_embedding = tf.transpose(generic_forward_embedding, [1, 0, 2])
    # generic_backward_embedding = tf.transpose(generic_backward_embedding, [1, 0, 2])
    # generic_forward_input_mask = tf.cast(tf.transpose(generic_forward_input_mask, [1, 0]), tf.float32)
    # generic_backward_input_mask = tf.cast(tf.transpose(generic_backward_input_mask, [1, 0]), tf.float32)

    nli_forward_embedding_shape = modeling.get_shape_list(
        nli_forward_embedding, expected_rank=3)
    nli_backward_embedding_shape = modeling.get_shape_list(
        nli_backward_embedding, expected_rank=3)
    assert nli_forward_embedding_shape[2] == nli_backward_embedding_shape[2]
    nli_forward_embedding = tf.transpose(nli_forward_embedding, [1, 0, 2])
    nli_backward_embedding = tf.transpose(nli_backward_embedding, [1, 0, 2])
    nli_forward_input_mask = tf.cast(
        tf.transpose(nli_forward_input_mask, [1, 0]), tf.float32)
    nli_backward_input_mask = tf.cast(
        tf.transpose(nli_backward_input_mask, [1, 0]), tf.float32)

    model = HadeModel(
        x_random_forward=random_forward_embedding,
        x_random_mask_forward=random_forward_input_mask,
        x_random_length_forward=random_forward_text_len,
        x_random_backward=random_backward_embedding,
        x_random_mask_backward=random_backward_input_mask,
        x_random_length_backward=random_backward_text_len,
        y_random=random_labels,
        x_swap_forward=swap_forward_embedding,
        x_swap_mask_forward=swap_forward_input_mask,
        x_swap_length_forward=swap_forward_text_len,
        x_swap_backward=swap_backward_embedding,
        x_swap_mask_backward=swap_backward_input_mask,
        x_swap_length_backward=swap_backward_text_len,
        y_swap=swap_labels,
        # x_generic_forward=generic_forward_embedding,
        # x_generic_mask_forward=generic_forward_input_mask,
        # x_generic_length_forward=generic_forward_text_len,
        # x_generic_backward=generic_backward_embedding,
        # x_generic_mask_backward=generic_backward_input_mask,
        # x_generic_length_backward=generic_backward_text_len, y_generic=generic_labels,
        x_nli_forward=nli_forward_embedding,
        x_nli_mask_forward=nli_forward_input_mask,
        x_nli_length_forward=nli_forward_text_len,
        x_nli_backward=nli_backward_embedding,
        x_nli_mask_backward=nli_backward_input_mask,
        x_nli_length_backward=nli_backward_text_len,
        y_nli=nli_labels,
        embedding_dim=random_forward_embedding_shape[2],
        num_nli_labels=num_nli_labels,
        hidden_size=lstm_size,
        l2_reg_lambda=l2_reg_lambda,
        num_layers=num_layers,
        dropout_rate=dropout_rate,
        is_training=is_training)

    random_prob, swap_prob, nli_prob, total_cost = model.create_model()

    return random_prob, swap_prob, nli_prob, total_cost, final_lm_loss, perplexity