Example #1
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

    return logits
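The `gather_indexes` helper is called but not defined in these snippets; in the reference BERT pretraining code it flattens the encoder output and picks out the hidden vectors at the masked positions. A minimal NumPy sketch of that flat-offset gathering (shapes assumed from the calls above, NumPy used only for illustration):

import numpy as np

def gather_indexes_np(sequence_output, positions):
    """Gather hidden vectors at `positions` from a [batch, seq_len, hidden] array.

    Returns a [batch * num_positions, hidden] array, mirroring the flattening
    the TF helper performs before the masked-LM projection.
    """
    batch_size, seq_len, hidden = sequence_output.shape
    # Offset each example's positions into the flattened [batch * seq_len] axis.
    flat_offsets = (np.arange(batch_size) * seq_len)[:, None]
    flat_positions = (positions + flat_offsets).reshape(-1)
    flat_sequence = sequence_output.reshape(batch_size * seq_len, hidden)
    return flat_sequence[flat_positions]

# Example: batch of 2, sequence length 4, hidden size 3, two masked slots each.
seq = np.arange(2 * 4 * 3, dtype=np.float32).reshape(2, 4, 3)
pos = np.array([[1, 3], [0, 2]])
print(gather_indexes_np(seq, pos).shape)  # (4, 3)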
def encode_block(bert_config, input_ids, input_masks, segment_ids,
                 use_one_hot_embeddings, num_vec, is_training):
    """Encode text and get multi-vector representations."""
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_masks,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
            scope="bert")

    emb_dim = bert_config.hidden_size
    output_layer, mask = get_multi_vectors(model, input_masks, num_vec)
    # [batch_size, num_vec, hidden_size], [batch_size, num_vec]

    output_layer.set_shape([None, None, emb_dim])

    if FLAGS.projection_size > 0:
        with tf.variable_scope("projected_layer", reuse=tf.AUTO_REUSE):
            output_layer = tf.layers.dense(output_layer, FLAGS.projection_size)

        emb_dim = FLAGS.projection_size
        output_layer.set_shape([None, None, emb_dim])

    if FLAGS.layer_norm:
        output_layer = modeling.layer_norm(output_layer)
    else:
        output_layer = tf.math.l2_normalize(output_layer, axis=-1)
    return output_layer, mask
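When `FLAGS.layer_norm` is off, the multi-vector outputs are length-normalized so that downstream dot products act as cosine similarities. A small NumPy sketch of what `tf.math.l2_normalize(..., axis=-1)` computes (epsilon value assumed):

import numpy as np

def l2_normalize(x, axis=-1, epsilon=1e-12):
    """Divide each vector along `axis` by its L2 norm (clamped to avoid 0/0)."""
    norm = np.sqrt(np.maximum(np.sum(np.square(x), axis=axis, keepdims=True), epsilon))
    return x / norm

vecs = np.random.randn(2, 4, 8)        # [batch_size, num_vec, emb_dim]
unit = l2_normalize(vecs)
print(np.linalg.norm(unit, axis=-1))   # ~1.0 everywhere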
Example #3
 def __init__(
     self,
 ):
     self.X = tf.placeholder(tf.int32, [None, None])
     
     model = modeling.BertModel(
         config=bert_config,
         is_training=False,
         input_ids=self.X,
         use_one_hot_embeddings=False)
     
     output_layer = model.get_sequence_output()
     embedding = model.get_embedding_table()
     
     with tf.variable_scope('cls/predictions'):
         with tf.variable_scope('transform'):
             input_tensor = tf.layers.dense(
                 output_layer,
                 units = bert_config.hidden_size,
                 activation = modeling.get_activation(bert_config.hidden_act),
                 kernel_initializer = modeling.create_initializer(
                     bert_config.initializer_range
                 ),
             )
             input_tensor = modeling.layer_norm(input_tensor)
         
         output_bias = tf.get_variable(
             'output_bias',
             shape = [bert_config.vocab_size],
             initializer = tf.zeros_initializer(),
         )
         logits = tf.matmul(input_tensor, embedding, transpose_b = True)
         self.logits = tf.nn.bias_add(logits, output_bias)
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions, top_k_indices, truncation_factor):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs_student = tf.nn.log_softmax(logits, axis=-1)
    probs_student = tf.nn.softmax(logits, axis=-1)

    prob_shape = tf.shape(log_probs_student)
    new_shape = [prob_shape[0], truncation_factor]  # [batch_size*seq_len, truncation_factor]

    top_k_indices = tf.reshape(top_k_indices, new_shape)
    top_k_log_probs_student = tf.batch_gather(log_probs_student, top_k_indices)
    top_k_probs_student = tf.batch_gather(probs_student, top_k_indices)

    return top_k_log_probs_student, top_k_probs_student
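`tf.batch_gather` selects per-row entries by index: for each row of `log_probs_student` it returns the values at that row's `top_k_indices`. In NumPy the same operation is `np.take_along_axis`; a toy check (shapes follow the comment above):

import numpy as np

log_probs = np.log(np.full((3, 5), 0.2))            # [batch*seq_len, vocab] toy values
top_k_indices = np.array([[0, 4], [1, 2], [3, 0]])  # [batch*seq_len, truncation_factor]

# Equivalent of tf.batch_gather(log_probs, top_k_indices):
top_k_log_probs = np.take_along_axis(log_probs, top_k_indices, axis=-1)
print(top_k_log_probs.shape)  # (3, 2)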
Example #5
    def get_masked_lm_output(self):
        self.input_tensor = self.gather_indexes()
        with tf.variable_scope("cls/predictions"):
            # We apply one more non-linear transformation before the output layer.
            # This matrix is not used after pre-training.
            with tf.variable_scope("transform"):
                self.input_tensor = tf.layers.dense(
                    self.input_tensor,
                    units=self.bert_config.hidden_size,
                    activation=modeling.get_activation(
                        self.bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        self.bert_config.initializer_range))
                self.input_tensor = modeling.layer_norm(self.input_tensor)
            # The output weights are the same as the input embeddings, but there is
            # an output-only bias for each token.
            output_bias = tf.get_variable("output_bias",
                                          shape=[self.bert_config.vocab_size],
                                          initializer=tf.zeros_initializer())
            logits = tf.matmul(self.input_tensor,
                               self.output_weights,
                               transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            flat_masked_lm_ids = tf.reshape(self.masked_lm_ids, [-1])
            one_hot_labels = tf.one_hot(flat_masked_lm_ids,
                                        depth=self.bert_config.vocab_size,
                                        dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                              axis=[-1])

            # TODO: dynamic gather from per_example_loss???
            loss = tf.reshape(per_example_loss,
                              [-1, tf.shape(self.masked_lm_positions)[1]])
            return loss
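The per-example loss above is the usual negative log-likelihood written as a one-hot dot product: summing `log_probs * one_hot_labels` over the vocabulary simply selects the log probability of the true token. A NumPy check of that equivalence (toy shapes assumed):

import numpy as np

vocab_size = 6
logits = np.random.randn(4, vocab_size)
log_probs = logits - np.log(np.sum(np.exp(logits), axis=-1, keepdims=True))
label_ids = np.array([2, 0, 5, 1])

one_hot = np.eye(vocab_size)[label_ids]
loss_via_one_hot = -np.sum(log_probs * one_hot, axis=-1)
loss_via_indexing = -log_probs[np.arange(4), label_ids]
print(np.allclose(loss_via_one_hot, loss_via_indexing))  # True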
Example #6
 def mlp(net, weights, biases):
     for i, (w, b) in enumerate(zip(weights, biases)):
         dropout_rate = float(expt_flags.get('mlp_dropout_rate', 0.0))
         if dropout_rate > 0.0 and is_training:
             net = modeling.dropout(net, dropout_rate)
         if eval(expt_flags.get('mlp_layer_norm', 'False')):
             net = modeling.layer_norm(net)
         net = tf.nn.bias_add(tf.matmul(net, w, transpose_b=True), b)
         if i < len(weights) - 1:
             net = modeling.gelu(net)
     return net
Example #7
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_weights, truncated_masked_lm_probs_teacher,
                         top_k_indices, truncation_factor):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs_student = tf.nn.log_softmax(logits, axis=-1)

        label_weights = tf.reshape(label_weights, [-1])

        prob_shape = tf.shape(log_probs_student)
        new_shape = [prob_shape[0], truncation_factor]  # [batch_size*seq_len, truncation_factor]

        top_k_indices = tf.reshape(top_k_indices, new_shape)
        top_k_log_probs_student = tf.batch_gather(log_probs_student,
                                                  top_k_indices)

        truncated_masked_lm_probs_teacher = tf.reshape(
            truncated_masked_lm_probs_teacher, new_shape)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(
            truncated_masked_lm_probs_teacher * top_k_log_probs_student,
            axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs_student)
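This variant is a distillation loss: instead of one-hot labels, the student's top-k log probabilities are weighted by the teacher's truncated probabilities, and the per-position losses are averaged with `label_weights` masking out padded prediction slots. A NumPy sketch of the same weighted cross-entropy reduction (toy values assumed):

import numpy as np

# [num_predictions, truncation_factor] toy teacher probs and student log probs.
teacher_probs = np.array([[0.7, 0.2, 0.1], [0.5, 0.3, 0.2]])
student_log_probs = np.log(np.array([[0.6, 0.3, 0.1], [0.4, 0.4, 0.2]]))
label_weights = np.array([1.0, 0.0])   # second slot is padding

per_example_loss = -np.sum(teacher_probs * student_log_probs, axis=-1)
loss = np.sum(label_weights * per_example_loss) / (np.sum(label_weights) + 1e-5)
print(per_example_loss, loss)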
Example #8
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights_flat = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])

        numerator = tf.reduce_sum(label_weights_flat * per_example_loss)
        denominator = tf.reduce_sum(label_weights_flat) + 1e-5
        loss = numerator / denominator

        batch_size = tf.cast(tf.shape(label_weights)[0], tf.float32)
        print('==============')
        print(label_weights.shape)
        print('==============')
        loss = batch_size * loss

    return (loss, per_example_loss, log_probs)
Example #9
        def forward(x, segment, masks, y, reuse=False, config=bert_config):
            with tf.variable_scope('bert', reuse=reuse):
                model = modeling.BertModel(
                    config=config,
                    is_training=training,
                    input_ids=x,
                    input_mask=masks,
                    token_type_ids=segment,
                    use_one_hot_embeddings=False,
                )
                memory = model.get_sequence_output()
            with tf.variable_scope('bert', reuse=True):
                Y_seq_len = tf.count_nonzero(y, 1, dtype=tf.int32)
                y_masks = tf.sequence_mask(Y_seq_len,
                                           tf.reduce_max(Y_seq_len),
                                           dtype=tf.float32)

                model = modeling_decoder.BertModel(
                    config=config,
                    is_training=training,
                    input_ids=y,
                    input_mask=y_masks,
                    memory=memory,
                    memory_mask=masks,
                    use_one_hot_embeddings=False,
                )
                output_layer = model.get_sequence_output()
                embedding = model.get_embedding_table()

            with tf.variable_scope('cls/predictions', reuse=reuse):
                with tf.variable_scope('transform'):
                    input_tensor = tf.layers.dense(
                        output_layer,
                        units=config.hidden_size,
                        activation=modeling.get_activation(
                            bert_config.hidden_act),
                        kernel_initializer=modeling.create_initializer(
                            bert_config.initializer_range),
                    )
                input_tensor = modeling.layer_norm(input_tensor)

                output_bias = tf.get_variable(
                    'output_bias',
                    shape=[bert_config.vocab_size],
                    initializer=tf.zeros_initializer(),
                )
                logits = tf.matmul(input_tensor, embedding, transpose_b=True)
                return logits
    def bert_module_fn(is_training):
        """Spec function for a token embedding module."""

        input_ids = tf.placeholder(shape=[None, None], dtype=tf.int32, name="input_ids")

        bert_config = modeling.BertConfig.from_json_file(config_path)
        model = modeling.BertModel(config=bert_config, is_training=is_training,
                                   input_ids=input_ids)

        output_layer = model.get_sequence_output()
        embedding = model.get_embedding_table()

        with tf.variable_scope('cls/predictions'):
            with tf.variable_scope('transform'):
                input_tensor = tf.layers.dense(
                    output_layer,
                    units=bert_config.hidden_size,
                    activation=modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        bert_config.initializer_range
                    ),
                )
                input_tensor = modeling.layer_norm(input_tensor)

            output_bias = tf.get_variable(
                'output_bias',
                shape=[bert_config.vocab_size],
                initializer=tf.zeros_initializer(),
            )
            logits = tf.matmul(input_tensor, embedding, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

        config_file = tf.constant(value=config_path, dtype=tf.string, name="config_file")
        vocab_file = tf.constant(value=vocab_path, dtype=tf.string, name="vocab_file")
        lower_case = tf.constant(do_lower_case)

        tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, config_file)
        tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, vocab_file)

        input_map = {"input_ids": input_ids}

        output_map = {"logits": logits}

        output_info_map = {"vocab_file": vocab_file,
                           "do_lower_case": lower_case}

        hub.add_signature(name="tokens", inputs=input_map, outputs=output_map)
        hub.add_signature(name="tokenization_info", inputs={}, outputs=output_info_map)
 def compute_logits(self, target_emb, reuse=None):
     """Compute logits for word prediction."""
     with tf.variable_scope(self.scope_prefix + "cls/predictions",
                            reuse=reuse):
         with tf.variable_scope("transform"):
             target_emb = tf.layers.dense(
                 target_emb,
                 units=self.config.hidden_size,
                 activation=modeling.get_activation(self.config.hidden_act),
                 kernel_initializer=self.initializer)
             target_emb = modeling.layer_norm(target_emb)
         output_bias = tf.get_variable("output_bias",
                                       shape=[self.config.vocab_size],
                                       initializer=tf.zeros_initializer())
     logits = tf.matmul(target_emb, self.embedding_table, transpose_b=True)
     logits = tf.nn.bias_add(logits, output_bias)
     return logits
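In all of these heads the output projection reuses the input embedding table (`transpose_b=True`), so the only new output-layer parameter is the per-token bias. A NumPy sketch of the tied projection (dimensions assumed for illustration):

import numpy as np

hidden_size, vocab_size = 8, 12
hidden_states = np.random.randn(5, hidden_size)        # transformed masked positions
embedding_table = np.random.randn(vocab_size, hidden_size)
output_bias = np.zeros(vocab_size)

# Equivalent of tf.matmul(hidden, embedding, transpose_b=True) + bias.
logits = hidden_states @ embedding_table.T + output_bias
print(logits.shape)  # (5, 12)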
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids):
    """Get loss and log probs for the masked LM."""
    print("input tensor before gather_indexes:", input_tensor)
    input_tensor = gather_indexes(input_tensor, positions)
    print("input tensor before gather_indexes:", input_tensor)
    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)
        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        print(label_ids)
        label_ids = tf.reshape(label_ids, [-1])
        print(label_ids)
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)
        print(one_hot_labels)
        print(log_probs)
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        print(per_example_loss)

        loss = tf.reshape(per_example_loss, [-1, tf.shape(positions)[1]])
        print('positions: ', positions)
        print('loss', loss)
        # TODO: dynamic gather from per_example_loss???
    return loss
Example #13
    def __init__(self, config, input_hidden, embedding_table):
        # Keep variable names the same as BERT
        with tf.variable_scope("cls"):
            with tf.variable_scope("predictions"):
                with tf.variable_scope("transform"):
                    self.transformed_output = tf.layers.dense(
                        input_hidden,
                        config.hidden_size,
                        activation=modeling.get_activation(config.hidden_act),
                        kernel_initializer=modeling.create_initializer(
                            config.initializer_range))
                    self.transformed_output = modeling.layer_norm(
                        self.transformed_output)

                output_bias = tf.Variable(tf.zeros([config.vocab_size]),
                                          name="output_bias")
                self.final_output = tf.add(
                    tf.matmul(self.transformed_output,
                              tf.transpose(embedding_table)), output_bias)
                self.probs = tf.nn.softmax(self.final_output,
                                           name='token_probs')
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         truncation_factor):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        masked_lm_probs = tf.nn.softmax(logits, axis=-1)
        trunc_masked_lm_probs, top_indices = tf.math.top_k(masked_lm_probs,
                                                           k=truncation_factor,
                                                           sorted=False)

        max_predictions_per_seq = positions.get_shape().as_list()[1]
        truncation_factor_ = top_indices.get_shape().as_list()[1]

        trunc_masked_lm_probs = tf.reshape(
            trunc_masked_lm_probs,
            [-1, max_predictions_per_seq, truncation_factor_])
        top_indices = tf.reshape(
            top_indices, [-1, max_predictions_per_seq, truncation_factor_])
    return trunc_masked_lm_probs, top_indices
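Here the teacher side keeps only the `truncation_factor` most probable tokens per masked position, which is what the student-side functions above consume. A NumPy sketch of an unsorted top-k over the vocabulary axis, mirroring `tf.math.top_k(..., sorted=False)` (toy shapes assumed):

import numpy as np

probs = np.random.dirichlet(np.ones(10), size=4)   # [num_predictions, vocab]
k = 3

# Unsorted top-k per row, as with tf.math.top_k(probs, k, sorted=False).
top_indices = np.argpartition(probs, -k, axis=-1)[:, -k:]
top_probs = np.take_along_axis(probs, top_indices, axis=-1)
print(top_probs.shape, top_indices.shape)  # (4, 3) (4, 3)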
Example #15
    def __init__(
        self,
    ):
        BERT_CONFIG = "PATH_TO/multi_cased_L-12_H-768_A-12/bert_config.json"
        bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)
        self.X = tf.placeholder(tf.int32, [None, None])
        model = modeling.BertModel(
            config=bert_config,
            is_training=False,
            input_ids=self.X,
            use_one_hot_embeddings=False,
        )

        output_layer = model.get_sequence_output()
        embedding = model.get_embedding_table()

        output_layer = tf.reshape(output_layer, [-1, bert_config.hidden_size])
        with tf.variable_scope("cls/predictions"):
            with tf.variable_scope("transform"):
                input_tensor = tf.layers.dense(
                    output_layer,
                    units=bert_config.hidden_size,
                    activation=modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        bert_config.initializer_range
                    ),
                )
                input_tensor = modeling.layer_norm(input_tensor)

            output_bias = tf.get_variable(
                "output_bias",
                shape=[bert_config.vocab_size],
                initializer=tf.zeros_initializer(),
            )
            logits = tf.matmul(input_tensor, embedding, transpose_b=True)
            print("---")
            self.logits = tf.nn.bias_add(logits, output_bias)
Example #16
    def __init__(
        self,
        bert_config,
        input_ids,
        input_mask,
        token_type_ids,
        Y,
        is_training=True,
    ):
        self.X = input_ids
        self.segment_ids = token_type_ids
        self.input_masks = input_mask
        self.Y = Y
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        batch_size = tf.shape(self.X)[0]

        model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=self.X,
            input_mask=self.input_masks,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=False,
        )

        print(bert_config.__dict__)

        BASE_PARAMS = defaultdict(
            lambda: None,
            default_batch_size=2048,
            default_batch_size_tpu=32768,
            max_length=bert_config.max_position_embeddings,
            initializer_gain=1.0,
            vocab_size=bert_config.vocab_size,
            hidden_size=bert_config.hidden_size,
            num_hidden_layers=bert_config.num_hidden_layers,
            num_heads=bert_config.num_attention_heads,
            filter_size=bert_config.intermediate_size,
            layer_postprocess_dropout=0.1,
            attention_dropout=0.1,
            relu_dropout=0.1,
            label_smoothing=0.1,
            learning_rate=1.0,
            learning_rate_decay_rate=1.0,
            learning_rate_warmup_steps=16000,
            optimizer_adam_beta1=0.9,
            optimizer_adam_beta2=0.997,
            optimizer_adam_epsilon=1e-09,
            extra_decode_length=50,
            beam_size=4,
            alpha=0.6,
            use_tpu=False,
            static_batch=False,
            allow_ffn_pad=True,
        )

        self.decoder_stack = DecoderStack(BASE_PARAMS, is_training)
        attention_bias = model_utils.get_padding_bias(self.X)

        output_layer = model.get_sequence_output()
        pooled_output = model.get_pooled_output()
        embedding = model.get_embedding_table()

        with tf.name_scope('decode'):
            mask = tf.to_float(tf.not_equal(self.Y, 0))
            decoder_inputs = tf.gather(embedding, self.Y)
            decoder_inputs *= tf.expand_dims(mask, -1)
            with tf.name_scope('shift_targets'):
                decoder_inputs = tf.pad(decoder_inputs,
                                        [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
            with tf.name_scope('add_pos_encoding'):
                length = tf.shape(decoder_inputs)[1]
                decoder_inputs += model_utils.get_position_encoding(
                    length, BASE_PARAMS['hidden_size'])
            if is_training:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs,
                    1 - BASE_PARAMS['layer_postprocess_dropout'])
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)
            outputs = self.decoder_stack(
                decoder_inputs,
                output_layer,
                decoder_self_attention_bias,
                attention_bias,
            )

        with tf.variable_scope('cls/predictions'):
            with tf.variable_scope('transform'):
                input_tensor = tf.layers.dense(
                    outputs,
                    units=bert_config.hidden_size,
                    activation=modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        bert_config.initializer_range),
                )
            input_tensor = modeling.layer_norm(input_tensor)

            output_bias = tf.get_variable(
                'output_bias',
                shape=[bert_config.vocab_size],
                initializer=tf.zeros_initializer(),
            )
            self.training_logits = tf.matmul(input_tensor,
                                             embedding,
                                             transpose_b=True)

        print(self.training_logits)
Example #17
    def __init__(self, bert_config, tokenizer, cls, sep):
        _graph = tf.Graph()
        with _graph.as_default():
            self.X = tf.placeholder(tf.int32, [None, None])
            self.top_p = tf.placeholder(tf.float32, None)
            self.top_k = tf.placeholder(tf.int32, None)
            self.k = tf.placeholder(tf.int32, None)
            self.temperature = tf.placeholder(tf.float32, None)
            self.indices = tf.placeholder(tf.int32, [None, None])
            self._tokenizer = tokenizer
            self._cls = cls
            self._sep = sep

            self.model = modeling.BertModel(
                config = bert_config,
                is_training = False,
                input_ids = self.X,
                use_one_hot_embeddings = False,
            )
            self.logits = self.model.get_pooled_output()
            output_layer = self.model.get_sequence_output()
            embedding = self.model.get_embedding_table()

            with tf.variable_scope('cls/predictions'):
                with tf.variable_scope('transform'):
                    input_tensor = tf.layers.dense(
                        output_layer,
                        units = bert_config.hidden_size,
                        activation = modeling.get_activation(
                            bert_config.hidden_act
                        ),
                        kernel_initializer = modeling.create_initializer(
                            bert_config.initializer_range
                        ),
                    )
                    input_tensor = modeling.layer_norm(input_tensor)
                output_bias = tf.get_variable(
                    'output_bias',
                    shape = [bert_config.vocab_size],
                    initializer = tf.zeros_initializer(),
                )
                logits = tf.matmul(input_tensor, embedding, transpose_b = True)
                self._logits = tf.nn.bias_add(logits, output_bias)
                self._log_softmax = tf.nn.log_softmax(self._logits)

            logits = tf.gather_nd(self._logits, self.indices)
            logits = logits / self.temperature

            def nucleus():
                return top_p_logits(logits, self.top_p)

            def select_k():
                return top_k_logits(logits, self.top_k)

            logits = tf.cond(self.top_p > 0, nucleus, select_k)
            self.samples = tf.multinomial(
                logits, num_samples = self.k, output_dtype = tf.int32
            )

            self._sess = tf.InteractiveSession()
            self._sess.run(tf.global_variables_initializer())
            var_lists = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert'
            )
            cls = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'cls'
            )
            self._saver = tf.train.Saver(var_list = var_lists + cls)
            attns = _extract_attention_weights(
                bert_config.num_hidden_layers, tf.get_default_graph()
            )
            self.attns = attns
Example #18
def create_model(bert_config,
                 is_training,
                 fewshot_num_examples_per_class,
                 input_ids,
                 input_mask,
                 segment_ids,
                 use_one_hot_embeddings,
                 tokenizer=None,
                 class_examples_combiner="max"):
    """Creates a classification model."""
    if not is_training:
        bert_config.hidden_dropout_prob = 0.0
        bert_config.attention_probs_dropout_prob = 0.0

    # unroll fewshot batches to extract BERT representations.
    fewshot_size = input_ids.shape[1].value
    sequence_length = input_ids.shape[2].value

    bert_input_ids = tf.reshape(input_ids, [-1, sequence_length])
    bert_input_mask = tf.reshape(input_mask, [-1, sequence_length])
    bert_segment_ids = tf.reshape(segment_ids, [-1, sequence_length])
    tf.logging.info(
        "shapes %s %s %s" %
        (bert_input_ids.shape, bert_input_mask.shape, bert_segment_ids.shape))

    model = modeling.BertModel(
        config=bert_config,
        is_training=FLAGS.train_bert_model if is_training else False,
        input_ids=bert_input_ids,
        input_mask=bert_input_mask,
        token_type_ids=bert_segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # [batch_size, fewshot_size * seq_len, hidden_size]
    output_layer = model.get_sequence_output()
    tf.logging.info("BERT model output shape %s", output_layer.shape)

    # The "pooler" converts the encoded sequence tensor of shape
    # [batch_size, seq_length, hidden_size] to a tensor of shape
    # [batch_size, 2*hidden_size].
    with tf.variable_scope("cls/entity_relation"):
        # [batch_size, fewshot_size, 2 * hidden_size]
        output_layer = extract_relation_representations(
            output_layer, bert_input_ids, tokenizer)
        output_layer = modeling.layer_norm(output_layer)

    def _combine_multi_example_logits(logits):
        """Combine per-example logits into a per-class logit."""
        logits = tf.reshape(
            logits,
            [-1, fewshot_num_classes, fewshot_num_examples_per_class, 1])
        if class_examples_combiner == "max":
            logits = tf.reduce_max(logits, axis=2)
        if class_examples_combiner == "mean":
            logits = tf.reduce_mean(logits, axis=2)
        if class_examples_combiner == "logsumexp":
            logits = tf.reduce_logsumexp(logits, axis=2)
        if class_examples_combiner == "min":
            logits = tf.reduce_min(logits, axis=2)
        if class_examples_combiner == "sigmoid_mean":
            logits = tf.sigmoid(logits)
            logits = tf.reduce_mean(logits, axis=2)
        return logits

    fewshot_num_classes = int(
        (fewshot_size - 1) / fewshot_num_examples_per_class)
    hidden_size = output_layer.shape[-1].value
    with tf.variable_scope("loss"):
        # [batch_size, fewshot_size, hidden_size]
        output_weights = tf.reshape(output_layer,
                                    [-1, fewshot_size, hidden_size])

        # Extract query representation from output.
        # [batch_size, fewshot_size - 1, hidden_size]
        output_layer = tf.reshape(output_weights[:, 0, :],
                                  [-1, 1, hidden_size])

        # Remove query from targets.
        # [batch_size, 1, hidden_size]
        output_weights = output_weights[:, 1:, :]

        # Dot product based distance metric.
        # [batch_size, fewshot_size - 1, 1]
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)

        if fewshot_num_examples_per_class > 1:
            # [batch_size, fewshot_num_classes, 1]
            logits = _combine_multi_example_logits(logits)

        # [batch_size, fewshot_num_classes]
        logits = tf.reshape(logits, [-1, fewshot_num_classes])

    return logits
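The few-shot head scores the query against every support example with a dot product and then collapses the per-example scores into one logit per class. The sketch below illustrates that collapse for the "max" combiner, using NumPy and toy shapes (all names here are illustrative, not from the original code):

import numpy as np

batch_size, num_classes, examples_per_class, hidden = 2, 3, 4, 8
query = np.random.randn(batch_size, 1, hidden)
support = np.random.randn(batch_size, num_classes * examples_per_class, hidden)

# [batch, 1, num_classes * examples_per_class] dot-product scores.
scores = query @ np.transpose(support, (0, 2, 1))
scores = scores.reshape(batch_size, num_classes, examples_per_class)

class_logits = scores.max(axis=-1)   # "max" combiner over examples per class
print(class_logits.shape)            # (2, 3)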
Example #19
  def build_attn_layers(self,
                        input_tensor,
                        attn_mask_concat,
                        intermediate_size=2048,
                        intermediate_act_fn=modeling.gelu,
                        hidden_dropout_prob=0.1,
                        attention_probs_dropout_prob=0.1,
                        initializer_range=0.02,
                        do_return_all_layers=False):
    """See `attention_layer` defined in `bert/modeling.py`"""
    if not self.is_training:
      hidden_dropout_prob = 0.0
      attention_probs_dropout_prob = 0.0

    # input tensor shape: [batch, arg_length, BERT_hidden_size]
    # for example, using default hparams vals: [64, 128, 768]
    attention_head_size = int(self.hidden_size / self.num_attention_heads)
    input_shape = modeling.get_shape_list(input_tensor, expected_rank=3)
    prev_output = input_tensor

    attention_type_split = self.attention_type.split("_")

    all_layer_outputs = []
    for layer_idx in range(self.num_hidden_layers):
      with tf.variable_scope(f"layer_{layer_idx}"):
        layer_input = prev_output

        if len(attention_type_split) == 3:
          indexer = layer_idx % 2
        else:  # len(attention_type_split) == 2:
          indexer = 0
        layer_attn_type = attention_type_split[indexer]

        tf.logging.info(
          f"{layer_attn_type.capitalize()} Attention at {layer_idx}th Layer")

        attention_heads = []
        with tf.variable_scope(f"{layer_attn_type}_attn"):
          attention_head = self.build_attn_layer(
            input_tensor=input_tensor,
            attn_mask_concat=attn_mask_concat,
            layer_attn_type=layer_attn_type,
            num_attention_heads=self.num_attention_heads,
            size_per_head=attention_head_size,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            initializer_range=initializer_range,
            do_return_2d_tensor=False
          )

          attention_heads.append(attention_head)

          attention_output = None
          if len(attention_heads) == 1:
            attention_output = attention_heads[0]
          else:
            # In the case where we have other sequences, we just concatenate
            # them to the self-attention head before the projection.
            attention_output = tf.concat(attention_heads, axis=-1)

          # Run a linear projection of `hidden_size` then add a residual
          # with `layer_input`.
          with tf.variable_scope("output"):
            attention_output = tf.layers.dense(
              attention_output,
              self.hidden_size,
              kernel_initializer=modeling.create_initializer(initializer_range))
            attention_output = modeling.dropout(attention_output,
                                                hidden_dropout_prob)
            attention_output = modeling.layer_norm(attention_output + layer_input)

        # The activation is only applied to the "intermediate" hidden layer.
        with tf.variable_scope("intermediate"):
          intermediate_output = tf.layers.dense(
            attention_output,
            intermediate_size,
            activation=intermediate_act_fn,
            kernel_initializer=modeling.create_initializer(initializer_range))

        # Down-project back to `hidden_size` then add the residual.
        with tf.variable_scope("output"):
          layer_output = tf.layers.dense(
            intermediate_output,
            self.hidden_size,
            kernel_initializer=modeling.create_initializer(initializer_range))
          layer_output = modeling.dropout(layer_output, hidden_dropout_prob)
          layer_output = modeling.layer_norm(layer_output + attention_output)
          prev_output = layer_output
          all_layer_outputs.append(layer_output)

    if do_return_all_layers:
      final_outputs = []
      for layer_output in all_layer_outputs:
        final_output = modeling.reshape_from_matrix(layer_output, input_shape)
        final_outputs.append(final_output)
      return final_outputs
    else:
      final_output = modeling.reshape_from_matrix(prev_output, input_shape)
      return final_output
Example #20
    def __call__(self, features, hidden_feature, mode, problem_name):
        """Get loss and log probs for the masked LM.

        DO NOT CHANGE THE VARIABLE SCOPE.
        """
        seq_hidden_feature = hidden_feature['seq']
        positions = features['masked_lm_positions']
        input_tensor = gather_indexes(seq_hidden_feature, positions)
        output_weights = hidden_feature['embed_table']
        label_ids = features['masked_lm_ids']
        label_weights = features['masked_lm_weights']

        with tf.variable_scope("cls/predictions"):
            # We apply one more non-linear transformation before the output layer.
            # This matrix is not used after pre-training.
            with tf.variable_scope("transform"):
                input_tensor = tf.layers.dense(
                    input_tensor,
                    units=self.params.mask_lm_hidden_size,
                    activation=modeling.get_activation(
                        self.params.mask_lm_hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        self.params.mask_lm_initializer_range))
                input_tensor = modeling.layer_norm(input_tensor)

            # The output weights are the same as the input embeddings, but there is
            # an output-only bias for each token.
            output_bias = tf.get_variable("output_bias",
                                          shape=[self.params.vocab_size],
                                          initializer=tf.zeros_initializer())

            logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            self.logits = logits
            log_probs = tf.nn.log_softmax(logits, axis=-1)

            if mode == tf.estimator.ModeKeys.PREDICT:
                self.prob = log_probs
                return self.prob

            else:

                label_ids = tf.reshape(label_ids, [-1])
                label_weights = tf.reshape(label_weights, [-1])

                one_hot_labels = tf.one_hot(label_ids,
                                            depth=self.params.vocab_size,
                                            dtype=tf.float32)

                # The `positions` tensor might be zero-padded (if the sequence is too
                # short to have the maximum number of predictions). The `label_weights`
                # tensor has a value of 1.0 for every real prediction and 0.0 for the
                # padding predictions.
                per_example_loss = -tf.reduce_sum(
                    log_probs * one_hot_labels, axis=[-1])
                numerator = tf.reduce_sum(label_weights * per_example_loss)
                denominator = tf.reduce_sum(label_weights) + 1e-5
                loss = numerator / denominator

                if mode == tf.estimator.ModeKeys.TRAIN:
                    self.loss = loss
                    return self.loss

                else:

                    def metric_fn(masked_lm_example_loss, masked_lm_log_probs,
                                  masked_lm_ids, masked_lm_weights):
                        """Computes the loss and accuracy of the model."""
                        masked_lm_log_probs = tf.reshape(
                            masked_lm_log_probs,
                            [-1, masked_lm_log_probs.shape[-1]])
                        masked_lm_predictions = tf.argmax(masked_lm_log_probs,
                                                          axis=-1,
                                                          output_type=tf.int32)
                        masked_lm_example_loss = tf.reshape(
                            masked_lm_example_loss, [-1])
                        masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
                        masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
                        masked_lm_accuracy = tf.metrics.accuracy(
                            labels=masked_lm_ids,
                            predictions=masked_lm_predictions,
                            weights=masked_lm_weights)
                        masked_lm_mean_loss = tf.metrics.mean(
                            values=masked_lm_example_loss,
                            weights=masked_lm_weights)

                        return {
                            "masked_lm_accuracy": masked_lm_accuracy,
                            "masked_lm_loss": masked_lm_mean_loss,
                        }

                    eval_metrics = (metric_fn(per_example_loss, log_probs,
                                              label_ids, label_weights), loss)

                    self.eval_metrics = eval_metrics
                    return self.eval_metrics
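The eval branch reshapes everything to one row per masked prediction and computes a weight-masked accuracy and mean loss. A NumPy sketch of the same weighted accuracy computation (toy values assumed):

import numpy as np

masked_lm_log_probs = np.log(np.array([[0.7, 0.2, 0.1],
                                        [0.1, 0.8, 0.1],
                                        [0.3, 0.3, 0.4]]))
masked_lm_ids = np.array([0, 1, 2])
masked_lm_weights = np.array([1.0, 1.0, 0.0])   # last slot is padding

predictions = np.argmax(masked_lm_log_probs, axis=-1)
correct = (predictions == masked_lm_ids).astype(np.float64)
accuracy = np.sum(correct * masked_lm_weights) / np.sum(masked_lm_weights)
print(accuracy)  # 1.0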
def compute_transformer(
    input_tensor,
    attention_mask,
    hidden_size,
    num_hidden_layers,
    num_attention_heads,
    intermediate_size,
    intermediate_act_fn,
    hidden_dropout_prob,
    attention_probs_dropout_prob,
    initializer_range,
    input_cache,
):
    """Multi-headed, multi-layer Transformer."""
    attention_mask = tf.cast(attention_mask, tf.float32)
    attention_head_size = int(hidden_size / num_attention_heads)
    prev_output = input_tensor
    if input_cache is not None:
        input_cache = TransformerCache(keys=tf.unstack(input_cache.keys,
                                                       axis=2),
                                       values=tf.unstack(input_cache.values,
                                                         axis=2))
    output_cache = []
    for layer_idx in range(num_hidden_layers):
        with tf.variable_scope("layer_%d" % layer_idx):
            layer_input = prev_output
            with tf.variable_scope("attention"):
                with tf.variable_scope("self"):
                    if input_cache is not None:
                        layer_input_cache = TransformerCache(
                            keys=input_cache.keys[layer_idx],
                            values=input_cache.values[layer_idx])
                    else:
                        layer_input_cache = None
                    attention_output, layer_output_cache = attention_layer(
                        from_tensor=layer_input,
                        to_tensor=layer_input,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        attention_probs_dropout_prob=attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        input_cache=layer_input_cache)
                    output_cache.append(layer_output_cache)
                with tf.variable_scope("output"):
                    attention_output = dense_layer_3d_proj(
                        attention_output, hidden_size, num_attention_heads,
                        attention_head_size,
                        modeling.create_initializer(initializer_range), None,
                        "dense")
                    attention_output = modeling.dropout(
                        attention_output, hidden_dropout_prob)
                    attention_output = modeling.layer_norm(attention_output +
                                                           layer_input)
            with tf.variable_scope("intermediate"):
                intermediate_output = dense_layer_2d(
                    attention_output, intermediate_size,
                    modeling.create_initializer(initializer_range),
                    intermediate_act_fn, "dense")
            with tf.variable_scope("output"):
                layer_output = dense_layer_2d(
                    intermediate_output, hidden_size,
                    modeling.create_initializer(initializer_range), None,
                    "dense")
                layer_output = modeling.dropout(layer_output,
                                                hidden_dropout_prob)
                layer_output = modeling.layer_norm(layer_output +
                                                   attention_output)
                prev_output = layer_output

    # [batch_size, seq_len, num_layers, num_heads, head_size]
    output_cache = TransformerCache(
        keys=tf.stack([c.keys for c in output_cache], 2),
        values=tf.stack([c.values for c in output_cache], 2))
    return prev_output, output_cache
    def __init__(
        self,
        input_ids,
        input_mask,
        token_type_ids,
        Y,
        learning_rate=2e-5,
        is_training=True,
    ):
        self.X = input_ids
        self.segment_ids = token_type_ids
        self.input_masks = input_mask
        self.Y = Y
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        batch_size = tf.shape(self.X)[0]

        model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=self.X,
            input_mask=self.input_masks,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=False,
        )

        self.decoder_stack = DecoderStack(BASE_PARAMS, is_training)
        attention_bias = model_utils.get_padding_bias(self.X)

        output_layer = model.get_sequence_output()
        pooled_output = model.get_pooled_output()
        embedding = model.get_embedding_table()

        with tf.name_scope('decode'):
            mask = tf.to_float(tf.not_equal(self.Y, 0))
            decoder_inputs = tf.gather(embedding, self.Y)
            decoder_inputs *= tf.expand_dims(mask, -1)
            with tf.name_scope('shift_targets'):
                decoder_inputs = tf.pad(decoder_inputs,
                                        [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
            with tf.name_scope('add_pos_encoding'):
                length = tf.shape(decoder_inputs)[1]
                decoder_inputs += model_utils.get_position_encoding(
                    length, BASE_PARAMS['hidden_size'])
            if is_training:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs,
                    1 - BASE_PARAMS['layer_postprocess_dropout'])
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)
            outputs = self.decoder_stack(
                decoder_inputs,
                output_layer,
                decoder_self_attention_bias,
                attention_bias,
            )

        with tf.variable_scope('cls/predictions'):
            with tf.variable_scope('transform'):
                input_tensor = tf.layers.dense(
                    outputs,
                    units=bert_config.hidden_size,
                    activation=modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        bert_config.initializer_range),
                )
            input_tensor = modeling.layer_norm(input_tensor)

            output_bias = tf.get_variable(
                'output_bias',
                shape=[bert_config.vocab_size],
                initializer=tf.zeros_initializer(),
            )
            self.training_logits = tf.matmul(input_tensor,
                                             embedding,
                                             transpose_b=True)

        print(self.training_logits)
Example #23
def create_bilstm_classification_model(bert_config,
                                       is_training,
                                       response_input_ids,
                                       response_input_mask,
                                       response_segment_ids,
                                       response_text_len,
                                       response_labels,
                                       random_forward_input_ids,
                                       random_forward_input_mask,
                                       random_forward_segment_ids,
                                       random_forward_text_len,
                                       random_backward_input_ids,
                                       random_backward_input_mask,
                                       random_backward_segment_ids,
                                       random_backward_text_len,
                                       random_labels,
                                       swap_forward_input_ids,
                                       swap_forward_input_mask,
                                       swap_forward_segment_ids,
                                       swap_forward_text_len,
                                       swap_backward_input_ids,
                                       swap_backward_input_mask,
                                       swap_backward_segment_ids,
                                       swap_backward_text_len,
                                       swap_labels,
                                       nli_forward_input_ids,
                                       nli_forward_input_mask,
                                       nli_forward_segment_ids,
                                       nli_forward_text_len,
                                       nli_backward_input_ids,
                                       nli_backward_input_mask,
                                       nli_backward_segment_ids,
                                       nli_backward_text_len,
                                       nli_labels,
                                       num_nli_labels,
                                       use_one_hot_embeddings,
                                       l2_reg_lambda=0.1,
                                       dropout_rate=1.0,
                                       lstm_size=None,
                                       num_layers=1):

    config = copy.deepcopy(bert_config)

    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    with tf.variable_scope("bert", reuse=tf.AUTO_REUSE):

        with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE):
            (response_embedding_output,
             response_embedding_table) = modeling.embedding_lookup(
                 input_ids=response_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            response_embedding_output = modeling.embedding_postprocessor(
                input_tensor=response_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=response_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            # random detection
            # Perform embedding lookup on the word ids.
            (random_foward_embedding_output,
             random_forward_embedding_table) = modeling.embedding_lookup(
                 input_ids=random_forward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            # Perform embedding lookup on the word ids.
            (random_backward_embedding_output,
             random_backward_embedding_table) = modeling.embedding_lookup(
                 input_ids=random_backward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            random_forward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=random_forward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=random_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            random_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=random_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=random_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            # swap detection
            (swap_forward_embedding_output,
             swap_forward_embedding_table) = modeling.embedding_lookup(
                 input_ids=swap_forward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            (swap_backward_embedding_output,
             swap_backward_embedding_table) = modeling.embedding_lookup(
                 input_ids=swap_backward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)
            swap_forward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=swap_forward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=swap_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)
            swap_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=swap_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=swap_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            # # generic detection
            # (generic_forward_embedding_output, generic_forward_embedding_table) = modeling.embedding_lookup(
            #     input_ids=generic_forward_input_ids,
            #     vocab_size=config.vocab_size,
            #     embedding_size=config.hidden_size,
            #     initializer_range=config.initializer_range,
            #     word_embedding_name="word_embeddings",
            #     use_one_hot_embeddings=use_one_hot_embeddings)
            # (generic_backward_embedding_output, generic_backward_embedding_table) = modeling.embedding_lookup(
            #     input_ids=generic_backward_input_ids,
            #     vocab_size=config.vocab_size,
            #     embedding_size=config.hidden_size,
            #     initializer_range=config.initializer_range,
            #     word_embedding_name="word_embeddings",
            #     use_one_hot_embeddings=use_one_hot_embeddings)
            # generic_forward_embedding_output = modeling.embedding_postprocessor(
            #     input_tensor=generic_forward_embedding_output,
            #     use_token_type=not config.roberta,
            #     token_type_ids=generic_forward_segment_ids,
            #     token_type_vocab_size=config.type_vocab_size,
            #     token_type_embedding_name="token_type_embeddings",
            #     use_position_embeddings=True,
            #     position_embedding_name="position_embeddings",
            #     initializer_range=config.initializer_range,
            #     max_position_embeddings=config.max_position_embeddings,
            #     dropout_prob=config.hidden_dropout_prob,
            #     roberta=config.roberta)
            # generic_backward_embedding_output = modeling.embedding_postprocessor(
            #     input_tensor=generic_backward_embedding_output,
            #     use_token_type=not config.roberta,
            #     token_type_ids=generic_backward_segment_ids,
            #     token_type_vocab_size=config.type_vocab_size,
            #     token_type_embedding_name="token_type_embeddings",
            #     use_position_embeddings=True,
            #     position_embedding_name="position_embeddings",
            #     initializer_range=config.initializer_range,
            #     max_position_embeddings=config.max_position_embeddings,
            #     dropout_prob=config.hidden_dropout_prob,
            #     roberta=config.roberta)

            # nli detection
            (nli_forward_embedding_output,
             nli_forward_embedding_table) = modeling.embedding_lookup(
                 input_ids=nli_forward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)
            (nli_backward_embedding_output,
             nli_backward_embedding_table) = modeling.embedding_lookup(
                 input_ids=nli_backward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)
            nli_forward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=nli_forward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=nli_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)
            nli_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=nli_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=nli_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

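        # All branches run through the same transformer weights (shared via
        # tf.AUTO_REUSE); only the response branch additionally gets a
        # left-to-right mask so it can serve as a language model.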
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            response_attention_mask = modeling.create_attention_mask_from_input_mask(
                response_input_ids, response_input_mask)
            # [batch_size, from_seq_length, to_seq_length]
            # mask future tokens
            diag_vals = tf.ones_like(response_attention_mask[0, :, :])
            tril = tf.linalg.LinearOperatorLowerTriangular(
                diag_vals).to_dense()
            future_masks = tf.tile(tf.expand_dims(
                tril, 0), [tf.shape(response_attention_mask)[0], 1, 1])
            response_attention_mask = tf.math.multiply(response_attention_mask,
                                                       future_masks)
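            # After the multiply, position (i, j) is visible only when token j
            # is real (not padding) and j <= i, i.e. a causal LM mask.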
            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            response_all_encoder_layers = modeling.transformer_model(
                input_tensor=response_embedding_output,
                attention_mask=response_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # random detection
            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
            # mask of shape [batch_size, seq_length, seq_length] which is used
            # for the attention scores.
            random_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                random_forward_input_ids, random_forward_input_mask)
            random_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                random_backward_input_ids, random_backward_input_mask)
            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            random_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=random_forward_embedding_output,
                attention_mask=random_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
            random_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=random_backward_embedding_output,
                attention_mask=random_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # swap detection
            swap_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                swap_forward_input_ids, swap_forward_input_mask)
            swap_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                swap_backward_input_ids, swap_backward_input_mask)
            swap_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=swap_forward_embedding_output,
                attention_mask=swap_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
            swap_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=swap_backward_embedding_output,
                attention_mask=swap_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # # generic detection
            # generic_forward_attention_mask = modeling.create_attention_mask_from_input_mask(generic_forward_input_ids,
            #                                                                                 generic_forward_input_mask)
            # generic_backward_attention_mask = modeling.create_attention_mask_from_input_mask(generic_backward_input_ids,
            #                                                                                  generic_backward_input_mask)
            # generic_forward_all_encoder_layers = modeling.transformer_model(
            #     input_tensor=generic_forward_embedding_output,
            #     attention_mask=generic_forward_attention_mask,
            #     hidden_size=config.hidden_size,
            #     num_hidden_layers=config.num_hidden_layers,
            #     num_attention_heads=config.num_attention_heads,
            #     intermediate_size=config.intermediate_size,
            #     intermediate_act_fn=modeling.get_activation(config.hidden_act),
            #     hidden_dropout_prob=config.hidden_dropout_prob,
            #     attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            #     initializer_range=config.initializer_range,
            #     do_return_all_layers=True)
            # generic_backward_all_encoder_layers = modeling.transformer_model(
            #     input_tensor=generic_backward_embedding_output,
            #     attention_mask=generic_backward_attention_mask,
            #     hidden_size=config.hidden_size,
            #     num_hidden_layers=config.num_hidden_layers,
            #     num_attention_heads=config.num_attention_heads,
            #     intermediate_size=config.intermediate_size,
            #     intermediate_act_fn=modeling.get_activation(config.hidden_act),
            #     hidden_dropout_prob=config.hidden_dropout_prob,
            #     attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            #     initializer_range=config.initializer_range,
            #     do_return_all_layers=True)

            # nli detection
            nli_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                nli_forward_input_ids, nli_forward_input_mask)
            nli_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                nli_backward_input_ids, nli_backward_input_mask)
            nli_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=nli_forward_embedding_output,
                attention_mask=nli_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
            nli_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=nli_backward_embedding_output,
                attention_mask=nli_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

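        # With do_return_all_layers=True, transformer_model returns a list of
        # per-layer outputs; the heads below consume the second-to-last layer.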
        random_forward_embedding = random_forward_all_encoder_layers[-2]
        random_backward_embedding = random_backward_all_encoder_layers[-2]
        swap_forward_embedding = swap_forward_all_encoder_layers[-2]
        swap_backward_embedding = swap_backward_all_encoder_layers[-2]
        # generic_forward_embedding = generic_forward_all_encoder_layers[-2]
        # generic_backward_embedding = generic_backward_all_encoder_layers[-2]
        nli_forward_embedding = nli_forward_all_encoder_layers[-2]
        nli_backward_embedding = nli_backward_all_encoder_layers[-2]
        response_embedding = response_all_encoder_layers[-2]

    response_embedding_shape = modeling.get_shape_list(response_embedding,
                                                       expected_rank=3)
    with tf.variable_scope("lm_head", reuse=tf.AUTO_REUSE):

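        # LM head: hidden-size transform with GELU and layer norm, then a
        # projection to vocabulary logits.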
        response_logits = tf.layers.dense(response_embedding,
                                          config.hidden_size,
                                          activation=None)
        response_logits = modeling.gelu(response_logits)
        response_logits = modeling.layer_norm(response_logits)
        response_outputs = tf.layers.dense(
            response_logits,
            config.vocab_size,
            activation=None,
            use_bias=True,
            bias_initializer=tf.zeros_initializer())

        response_one_hot = tf.one_hot(response_labels,
                                      depth=config.vocab_size,
                                      dtype=tf.float32)

        lm_cost = tf.nn.softmax_cross_entropy_with_logits(
            labels=response_one_hot, logits=response_outputs)

        sequence_mask = tf.sequence_mask(response_text_len,
                                         maxlen=response_embedding_shape[1],
                                         dtype=tf.float32)

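        # Zero the cross-entropy at padded positions so only real response
        # tokens contribute to the loss and the perplexity.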
        masked_lm_cost = tf.math.multiply(lm_cost, sequence_mask)

        final_lm_loss = tf.reduce_mean(
            tf.math.divide(tf.reduce_sum(masked_lm_cost, axis=1),
                           tf.cast(response_text_len, dtype=tf.float32)))

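        # Per-example perplexity: exp of the mean per-token negative log-likelihood.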
        perplexity = tf.exp(
            tf.math.divide(tf.reduce_sum(masked_lm_cost, axis=1),
                           tf.cast(response_text_len, dtype=tf.float32)))

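    # Switch the detection-branch outputs to time-major layout
    # ([seq_length, batch_size, hidden_size]) and cast the padding masks to
    # float before handing them to HadeModel.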
    random_forward_embedding_shape = modeling.get_shape_list(
        random_forward_embedding, expected_rank=3)
    random_backward_embedding_shape = modeling.get_shape_list(
        random_backward_embedding, expected_rank=3)
    assert random_forward_embedding_shape[2] == random_backward_embedding_shape[2]
    random_forward_embedding = tf.transpose(random_forward_embedding,
                                            [1, 0, 2])
    random_backward_embedding = tf.transpose(random_backward_embedding,
                                             [1, 0, 2])
    random_forward_input_mask = tf.cast(
        tf.transpose(random_forward_input_mask, [1, 0]), tf.float32)
    random_backward_input_mask = tf.cast(
        tf.transpose(random_backward_input_mask, [1, 0]), tf.float32)

    swap_forward_embedding_shape = modeling.get_shape_list(
        swap_forward_embedding, expected_rank=3)
    swap_backward_embedding_shape = modeling.get_shape_list(
        swap_backward_embedding, expected_rank=3)
    assert swap_forward_embedding_shape[2] == swap_backward_embedding_shape[2]
    swap_forward_embedding = tf.transpose(swap_forward_embedding, [1, 0, 2])
    swap_backward_embedding = tf.transpose(swap_backward_embedding, [1, 0, 2])
    swap_forward_input_mask = tf.cast(
        tf.transpose(swap_forward_input_mask, [1, 0]), tf.float32)
    swap_backward_input_mask = tf.cast(
        tf.transpose(swap_backward_input_mask, [1, 0]), tf.float32)

    # generic_forward_embedding_shape = modeling.get_shape_list(generic_forward_embedding, expected_rank=3)
    # generic_backward_embedding_shape = modeling.get_shape_list(generic_backward_embedding, expected_rank=3)
    # assert generic_forward_embedding_shape[2] == generic_backward_embedding_shape[2]
    # generic_forward_embedding = tf.transpose(generic_forward_embedding, [1, 0, 2])
    # generic_backward_embedding = tf.transpose(generic_backward_embedding, [1, 0, 2])
    # generic_forward_input_mask = tf.cast(tf.transpose(generic_forward_input_mask, [1, 0]), tf.float32)
    # generic_backward_input_mask = tf.cast(tf.transpose(generic_backward_input_mask, [1, 0]), tf.float32)

    nli_forward_embedding_shape = modeling.get_shape_list(
        nli_forward_embedding, expected_rank=3)
    nli_backward_embedding_shape = modeling.get_shape_list(
        nli_backward_embedding, expected_rank=3)
    assert nli_forward_embedding_shape[2] == nli_backward_embedding_shape[2]
    nli_forward_embedding = tf.transpose(nli_forward_embedding, [1, 0, 2])
    nli_backward_embedding = tf.transpose(nli_backward_embedding, [1, 0, 2])
    nli_forward_input_mask = tf.cast(
        tf.transpose(nli_forward_input_mask, [1, 0]), tf.float32)
    nli_backward_input_mask = tf.cast(
        tf.transpose(nli_backward_input_mask, [1, 0]), tf.float32)

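    # Build HadeModel over the random, swap and NLI detection branches; the
    # generic branch remains commented out above and is not wired in.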
    model = HadeModel(
        x_random_forward=random_forward_embedding,
        x_random_mask_forward=random_forward_input_mask,
        x_random_length_forward=random_forward_text_len,
        x_random_backward=random_backward_embedding,
        x_random_mask_backward=random_backward_input_mask,
        x_random_length_backward=random_backward_text_len,
        y_random=random_labels,
        x_swap_forward=swap_forward_embedding,
        x_swap_mask_forward=swap_forward_input_mask,
        x_swap_length_forward=swap_forward_text_len,
        x_swap_backward=swap_backward_embedding,
        x_swap_mask_backward=swap_backward_input_mask,
        x_swap_length_backward=swap_backward_text_len,
        y_swap=swap_labels,
        # x_generic_forward=generic_forward_embedding,
        # x_generic_mask_forward=generic_forward_input_mask,
        # x_generic_length_forward=generic_forward_text_len,
        # x_generic_backward=generic_backward_embedding,
        # x_generic_mask_backward=generic_backward_input_mask,
        # x_generic_length_backward=generic_backward_text_len, y_generic=generic_labels,
        x_nli_forward=nli_forward_embedding,
        x_nli_mask_forward=nli_forward_input_mask,
        x_nli_length_forward=nli_forward_text_len,
        x_nli_backward=nli_backward_embedding,
        x_nli_mask_backward=nli_backward_input_mask,
        x_nli_length_backward=nli_backward_text_len,
        y_nli=nli_labels,
        embedding_dim=random_forward_embedding_shape[2],
        num_nli_labels=num_nli_labels,
        hidden_size=lstm_size,
        l2_reg_lambda=l2_reg_lambda,
        num_layers=num_layers,
        dropout_rate=dropout_rate,
        is_training=is_training)

    random_prob, swap_prob, nli_prob, total_cost = model.create_model()

    return random_prob, swap_prob, nli_prob, total_cost, final_lm_loss, perplexity
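
For reference, the two less obvious pieces of the function above can be reproduced in isolation. The sketches below are not part of the original example; they assume TensorFlow 1.x and the same modeling module (BERT's modeling.py) used throughout, and the helper names are hypothetical.

The first sketch rebuilds the left-to-right mask applied to the response branch: the usual padding mask is multiplied by a lower-triangular matrix so that position i can only attend to positions j <= i.

import tensorflow as tf  # TensorFlow 1.x assumed, as in the examples above
import modeling          # BERT's modeling.py, as used throughout this file


def causal_attention_mask(input_ids, input_mask):
    """Hypothetical helper: padding mask combined with a future-token mask."""
    # [batch_size, from_seq_length, to_seq_length] padding mask.
    attention_mask = modeling.create_attention_mask_from_input_mask(
        input_ids, input_mask)
    # Lower-triangular matrix of ones: row i has ones in columns 0..i.
    diag_vals = tf.ones_like(attention_mask[0, :, :])
    tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()
    # Broadcast the triangular mask over the batch and combine it with the
    # padding mask so both future tokens and padding are hidden.
    future_mask = tf.tile(tf.expand_dims(tril, 0),
                          [tf.shape(attention_mask)[0], 1, 1])
    return tf.math.multiply(attention_mask, future_mask)


The second sketch mirrors the lm_head block: the token-level cross-entropy is masked to the real response length, averaged per example, and exponentiated to give a per-example perplexity.

def masked_lm_loss_and_perplexity(logits, labels, text_len, vocab_size):
    """Hypothetical helper mirroring the lm_head loss/perplexity above."""
    # logits:   [batch_size, seq_length, vocab_size] language-model outputs.
    # labels:   [batch_size, seq_length] integer token ids.
    # text_len: [batch_size] number of real (unpadded) tokens per example.
    one_hot = tf.one_hot(labels, depth=vocab_size, dtype=tf.float32)
    cost = tf.nn.softmax_cross_entropy_with_logits(labels=one_hot,
                                                   logits=logits)
    seq_mask = tf.sequence_mask(text_len,
                                maxlen=tf.shape(logits)[1],
                                dtype=tf.float32)
    masked_cost = tf.math.multiply(cost, seq_mask)
    mean_nll = tf.math.divide(tf.reduce_sum(masked_cost, axis=1),
                              tf.cast(text_len, dtype=tf.float32))
    return tf.reduce_mean(mean_nll), tf.exp(mean_nll)  # (loss, perplexity)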