Example #1
        def build_net(_reshape_list_vec, need_pos=True):
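            # `_rank_size` (the fixed list length, 20 here) and `hidden_size`
            # are assumed to be defined in the enclosing scope.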
            pos_embedding = tf.get_variable(
                shape=[_rank_size, hidden_size],
                dtype=tf.float32,
                initializer=tf.initializers.truncated_normal(mean=0.0,
                                                             stddev=0.01),
                trainable=True,
                name="pos_embedding")

            hidden_reshape_list_vec = tf.layers.dense(_reshape_list_vec,
                                                      hidden_size)

            hidden_list_vec = tf.reshape(hidden_reshape_list_vec,
                                         [-1, 20, hidden_size])
            if need_pos:
                hidden_list_vec = hidden_list_vec + pos_embedding

            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            outputs = modeling.transformer_model(
                input_tensor=hidden_list_vec,
                hidden_size=hidden_size,
                num_hidden_layers=2,
                num_attention_heads=1,
                intermediate_size=hidden_size * 4,
                do_return_all_layers=False)

            outputs = tf.squeeze(tf.layers.dense(outputs, 1, activation=None),
                                 axis=-1)

            return outputs
Example #2
    def __init__(self, config, input_embedding, input_mask=None):
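        # Run the BERT transformer stack directly on precomputed input
        # embeddings; no token embedding lookup is performed here.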
        input_shape = modeling.get_shape_list(input_embedding, expected_rank=3)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, seq_length],
                                 dtype=tf.int32)

        # Keep variable names the same as BERT
        with tf.variable_scope("bert"):
            with tf.variable_scope("encoder"):
                attention_mask = modeling.create_attention_mask_from_input_mask(
                    input_embedding, input_mask)

                all_encoder_layers = modeling.transformer_model(
                    input_tensor=input_embedding,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.
                    attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)

                self.sequence_output = all_encoder_layers[-1]
Example #3
    def build_bert_model(self, input_ids, input_mask, token_type_ids):
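        # Build the BERT encoder from scratch: word, position and token-type
        # embeddings followed by the stacked transformer.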
        with tf.variable_scope('bert'):
            with tf.variable_scope("embeddings"):
                # Perform embedding lookup on the word ids.
                (embedding_output, _) = modeling.embedding_lookup(
                    input_ids=input_ids,
                    vocab_size=self.bert_config.vocab_size,
                    embedding_size=self.bert_config.hidden_size,
                    initializer_range=self.bert_config.initializer_range,
                    word_embedding_name="word_embeddings",
                    use_one_hot_embeddings=False)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                embedding_output = modeling.embedding_postprocessor(
                    input_tensor=embedding_output,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=self.bert_config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=self.bert_config.initializer_range,
                    max_position_embeddings=self.bert_config.
                    max_position_embeddings,
                    dropout_prob=self.bert_config.hidden_dropout_prob)

            with tf.variable_scope("encoder"):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = modeling.create_attention_mask_from_input_mask(
                    input_ids, input_mask)

                # Run the stacked transformer and keep all layers; the final
                # layer has shape [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = modeling.transformer_model(
                    input_tensor=embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=self.bert_config.hidden_size,
                    num_hidden_layers=self.bert_config.num_hidden_layers,
                    num_attention_heads=self.bert_config.num_attention_heads,
                    intermediate_size=self.bert_config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                      self.bert_config.hidden_act),
                    hidden_dropout_prob=self.bert_config.hidden_dropout_prob,
                    attention_probs_dropout_prob=(
                        self.bert_config.attention_probs_dropout_prob),
                    initializer_range=self.bert_config.initializer_range,
                    do_return_all_layers=True
                )

            self.sequence_output = self.all_encoder_layers[-1]
Example #4
    def __init__(self,
                 config,
                 is_training,
                 input_ids,
                 image_embeddings,
                 input_mask=None,
                 token_type_ids=None,
                 use_one_hot_embeddings=False,
                 scope=None):
        """Constructor for a visually grounded BertModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model. Controls
        whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      image_embeddings: float32 Tensor of shape [batch_size, seq_length, depth].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings.
      scope: (optional) variable scope. Defaults to "bert".
    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
        config = copy.deepcopy(config)
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        text_input_shape = modeling.get_shape_list(input_ids, expected_rank=2)
        batch_size = text_input_shape[0]
        text_seq_length = text_input_shape[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, text_seq_length],
                                 dtype=tf.int32)

        if token_type_ids is None:
            token_type_ids = tf.zeros(shape=[batch_size, text_seq_length],
                                      dtype=tf.int32)

        with tf.variable_scope(scope, default_name="bert"):
            with tf.variable_scope("embeddings"):
                # Perform embedding lookup on the word ids.
                (self.embedding_output,
                 self.embedding_table) = modeling.embedding_lookup(
                     input_ids=input_ids,
                     vocab_size=config.vocab_size,
                     embedding_size=config.hidden_size,
                     initializer_range=config.initializer_range,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=use_one_hot_embeddings)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output = modeling.embedding_postprocessor(
                    input_tensor=self.embedding_output,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

                # Add image embeddings to the rest of the input embeddings.
                self.embedding_output += tf.layers.dense(
                    image_embeddings,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=modeling.create_initializer(
                        config.initializer_range))

            with tf.variable_scope("encoder"):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = modeling.create_attention_mask_from_input_mask(
                    self.embedding_output, input_mask)

                # Run the stacked transformer.
                # `sequence_output` shape = [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = modeling.transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.
                    attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)

            self.sequence_output = self.all_encoder_layers[-1]
            # The "pooler" converts the encoded sequence tensor of shape
            # [batch_size, seq_length, hidden_size] to a tensor of shape
            # [batch_size, hidden_size]. This is necessary for segment-level
            # (or segment-pair-level) classification tasks where we need a fixed
            # dimensional representation of the segment.
            with tf.variable_scope("pooler"):
                # We "pool" the model by simply taking the hidden state corresponding
                # to the first token. We assume that this has been pre-trained
                first_token_tensor = tf.squeeze(self.sequence_output[:,
                                                                     0:1, :],
                                                axis=1)
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=modeling.create_initializer(
                        config.initializer_range))
Example #5
    def __init__(self, max_entity_num, max_sentence_num, max_relation_num,
                 max_seq_length, entity_types, class_num, bert_config,
                 hidden_size, hidden_layers, attention_heads,
                 intermediate_size, hidden_dropout_prob,
                 attention_probs_dropout_prob, graph_hidden_dropout_prob,
                 graph_attention_probs_dropout_prob):
        max_node_num = max_sentence_num + max_entity_num

        self.input_ids = tf.placeholder(shape=[None, max_seq_length],
                                        dtype=tf.int32,
                                        name="input_ids")
        self.input_mask = tf.placeholder(shape=[None, max_seq_length],
                                         dtype=tf.int32,
                                         name="input_mask")
        self.segment_ids = tf.placeholder(shape=[None, max_seq_length],
                                          dtype=tf.int32,
                                          name="segment_ids")
        self.entity_types = tf.placeholder(shape=[None, max_node_num],
                                           dtype=tf.int32,
                                           name="entity_types")
        self.entity_mask = tf.placeholder(
            shape=[None, max_entity_num, max_seq_length],
            dtype=tf.float32,
            name="entity_mask")
        self.sentence_mask = tf.placeholder(
            shape=[None, max_sentence_num, max_seq_length],
            dtype=tf.float32,
            name="sentence_mask")
        self.relation_mask = tf.placeholder(shape=[None, max_relation_num],
                                            dtype=tf.float32,
                                            name="relation_mask")
        self.attention_mask = tf.placeholder(
            shape=[None, max_node_num, max_node_num],
            dtype=tf.float32,
            name="graph_mask")
        self.head_mask = tf.placeholder(
            shape=[None, max_relation_num, max_entity_num],
            dtype=tf.float32,
            name="head_mask")
        self.tail_mask = tf.placeholder(
            shape=[None, max_relation_num, max_entity_num],
            dtype=tf.float32,
            name="tail_mask")
        self.multi_labels = tf.placeholder(
            shape=[None, max_relation_num, class_num],
            dtype=tf.int32,
            name="multi_labels")
        self.is_training = tf.placeholder(dtype=tf.bool, name="is_training")

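        # Dropout is switched on or off at graph level via the is_training
        # placeholder.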
        self.hidden_dropout_prob = tf.cond(self.is_training,
                                           lambda: hidden_dropout_prob,
                                           lambda: 0.0)
        self.attention_probs_dropout_prob = tf.cond(
            self.is_training, lambda: attention_probs_dropout_prob,
            lambda: 0.0)

        self.graph_hidden_dropout_prob = tf.cond(
            self.is_training, lambda: graph_hidden_dropout_prob, lambda: 0.0)
        self.graph_attention_probs_dropout_prob = tf.cond(
            self.is_training, lambda: graph_attention_probs_dropout_prob,
            lambda: 0.0)

        self.entity_type_embedding = tf.get_variable(
            shape=[entity_types, 32],
            dtype=tf.float32,
            name="entity_type_embedding")
        self.entity_type_rep = tf.nn.embedding_lookup(
            self.entity_type_embedding, self.entity_types)

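        # Encode the tokens with BERT, pool entity and sentence representations
        # with the mask matmuls, concatenate entity-type embeddings, and run a
        # graph transformer over the resulting nodes.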
        self.seq_rep = self.bert_encoder(bert_config, self.hidden_dropout_prob,
                                         self.attention_probs_dropout_prob,
                                         self.input_ids, self.input_mask,
                                         self.segment_ids)
        self.entity_rep = tf.matmul(self.entity_mask, self.seq_rep)
        self.sentence_rep = tf.matmul(self.sentence_mask, self.seq_rep)
        self.graph_rep = tf.concat([self.entity_rep, self.sentence_rep],
                                   axis=1)
        self.graph_rep = tf.concat([self.graph_rep, self.entity_type_rep],
                                   axis=-1)
        self.graph_rep = tf.layers.dense(self.graph_rep, hidden_size,
                                         tf.nn.relu)
        self.final_rep = modeling.transformer_model(
            input_tensor=self.graph_rep,
            attention_mask=self.attention_mask,
            hidden_size=hidden_size,
            num_hidden_layers=hidden_layers,
            num_attention_heads=attention_heads,
            intermediate_size=intermediate_size,
            hidden_dropout_prob=self.graph_hidden_dropout_prob,
            attention_probs_dropout_prob=self.
            graph_attention_probs_dropout_prob)
        self.entity_rep = self.final_rep[:, :max_entity_num]
        self.head_rep = tf.matmul(self.head_mask, self.entity_rep)
        self.tail_rep = tf.matmul(self.tail_mask, self.entity_rep)
        bi_hidden_size = self.head_rep.get_shape().as_list()[-1]
        self.logits = self.bilinear_function(self.head_rep, self.tail_rep,
                                             bi_hidden_size, class_num)
        self.sigmoid = tf.sigmoid(self.logits, name='sigmoid')
        self.entropy = tf.losses.sigmoid_cross_entropy(
            self.multi_labels, self.logits, reduction=tf.losses.Reduction.NONE)
        self.loss = tf.reduce_sum(
            tf.multiply(self.entropy,
                        tf.expand_dims(self.relation_mask,
                                       axis=-1))) / tf.reduce_sum(
                                           self.relation_mask)
Example #6
    def __init__(self, config, is_training, input_tensor, input_mask,
                 token_type_ids):
        """Constructor for BertFlexEmbeddingModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model. Controls
        whether dropout will be applied.
      input_tensor: float32 Tensor of shape [batch_size, seq_length,
        hidden_size].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
        config = copy.deepcopy(config)
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        with tf.variable_scope("bert", reuse=tf.compat.v1.AUTO_REUSE):
            with tf.variable_scope("embeddings"):
                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output = modeling.embedding_postprocessor(
                    input_tensor=input_tensor,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

            with tf.variable_scope("encoder"):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = modeling.create_attention_mask_from_input_mask(
                    input_tensor, input_mask)

                # Run the stacked transformer.
                # `sequence_output` shape = [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = modeling.transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.
                    attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)

            self.sequence_output = self.all_encoder_layers[-1]
            # The "pooler" converts the encoded sequence tensor of shape
            # [batch_size, seq_length, hidden_size] to a tensor of shape
            # [batch_size, hidden_size]. This is necessary for segment-level
            # (or segment-pair-level) classification tasks where we need a fixed
            # dimensional representation of the segment.
            with tf.variable_scope("pooler"):
                # We "pool" the model by simply taking the hidden state corresponding
                # to the first token. We assume that this has been pre-trained
                first_token_tensor = tf.squeeze(self.sequence_output[:,
                                                                     0:1, :],
                                                axis=1)
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=modeling.create_initializer(
                        config.initializer_range))
Example #7
def create_mask_model(bert_config, is_training, input_ids, input_mask,
                      segment_ids, mask_positions, use_one_hot_embeddings):
    """Creates a classification model."""

    #print("create mask model ----------------------------------------------")
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)

    # Get the logits for the start and end predictions.
    final_hidden = model.get_sequence_output()

    final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3)
    batch_size = final_hidden_shape[0]
    seq_length = final_hidden_shape[1]
    hidden_size = final_hidden_shape[2]

    output_weights = tf.get_variable(
        "cls/nq/output_weights", [2, hidden_size + 12],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable("cls/nq/output_bias", [2],
                                  initializer=tf.zeros_initializer())

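    # Append the mask-position indicator (zero-padded to 12 extra dimensions)
    # to each token's hidden state before the extra transformer layer below.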
    final_hidden_matrix = tf.reshape(final_hidden,
                                     [batch_size * seq_length, hidden_size])
    mask_positions_matrix = tf.cast(tf.reshape(mask_positions,
                                               [batch_size * seq_length, 1]),
                                    dtype=tf.float32)
    padding = tf.zeros([batch_size * seq_length, 11], dtype=tf.float32)
    mask_positions_matrix = tf.concat([mask_positions_matrix, padding],
                                      axis=-1)
    final_hidden_matrix = tf.concat(
        [final_hidden_matrix, mask_positions_matrix], axis=-1)
    final_hidden_matrix = tf.reshape(
        final_hidden_matrix, [batch_size, seq_length, hidden_size + 12])
    attention_mask = modeling.create_attention_mask_from_input_mask(
        input_ids, input_mask)
    config = bert_config
    all_encoder_layers = modeling.transformer_model(
        input_tensor=final_hidden_matrix,
        attention_mask=attention_mask,
        hidden_size=config.hidden_size + 12,  # input hidden size
        num_hidden_layers=1,  #config.num_hidden_layers,
        num_attention_heads=config.num_attention_heads,
        intermediate_size=config.intermediate_size,
        intermediate_act_fn=modeling.get_activation(config.hidden_act),
        hidden_dropout_prob=config.hidden_dropout_prob,
        attention_probs_dropout_prob=config.attention_probs_dropout_prob,
        initializer_range=config.initializer_range,
        do_return_all_layers=True)
    #print(all_encoder_layers.shape)
    transformer_output_matrix = all_encoder_layers[-1]

    transformer_output_matrix = tf.reshape(
        transformer_output_matrix, [batch_size * seq_length, hidden_size + 12])
    logits = tf.matmul(transformer_output_matrix,
                       output_weights,
                       transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)

    logits = tf.reshape(logits, [batch_size, seq_length, 2])
    logits = tf.transpose(logits, [2, 0, 1])

    unstacked_logits = tf.unstack(logits, axis=0)

    (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1])

    # Get the logits for the answer type prediction.
    answer_type_output_layer = model.get_pooled_output()
    answer_type_hidden_size = answer_type_output_layer.shape[-1].value

    num_answer_types = 5  # YES, NO, UNKNOWN, SHORT, LONG
    answer_type_output_weights = tf.get_variable(
        "answer_type_output_weights",
        [num_answer_types, answer_type_hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    answer_type_output_bias = tf.get_variable(
        "answer_type_output_bias", [num_answer_types],
        initializer=tf.zeros_initializer())

    answer_type_logits = tf.matmul(answer_type_output_layer,
                                   answer_type_output_weights,
                                   transpose_b=True)
    answer_type_logits = tf.nn.bias_add(answer_type_logits,
                                        answer_type_output_bias)

    return (start_logits, end_logits, answer_type_logits)
Example #8
    def __init__(self, config, features, _rank_size, trainable=True, scope="train", batch_size=None, training=True):
        rnn_hidden_size = 128
        hidden_size = 128

        go_emb = tf.get_variable(shape=[rnn_hidden_size],
                                 dtype=tf.float32,
                                 initializer=tf.initializers.truncated_normal(mean=0.0,
                                                                              stddev=0.01),
                                 name="go_embedding")

        encoder_gru_fn = tf.keras.layers.GRU(rnn_hidden_size, return_state=True, return_sequences=True)

        cand_vec = features['features']

        cand_len = tf.string_to_number(features['features_mask'], out_type=tf.int32)
        cand_mask = tf.sequence_mask(cand_len,
                                     utils.seq_max_len(config, 'features'))

        encoder_vec, _ = encoder_gru_fn(cand_vec, mask=cand_mask)

        pos_embedding = tf.get_variable(shape=[_rank_size, hidden_size],
                                        dtype=tf.float32,
                                        initializer=tf.initializers.truncated_normal(mean=0.0,
                                                                                     stddev=0.01),
                                        trainable=True,
                                        name="pos_embedding")

        hidden_cand_vec = tf.layers.dense(cand_vec, hidden_size)
        hidden_cand_vec = hidden_cand_vec + pos_embedding

        # Run the stacked transformer.
        # `sequence_output` shape = [batch_size, seq_length, hidden_size].
        encoder_vec = modeling.transformer_model(
            input_tensor=hidden_cand_vec,
            hidden_size=hidden_size,
            num_hidden_layers=2,
            num_attention_heads=1,
            intermediate_size=hidden_size * 4,
            do_return_all_layers=False)

        self._point_vec = encoder_vec

        decoder_gru_fn = tf.keras.layers.GRU(rnn_hidden_size, return_state=True)

        list_vec = features['selected_vec']  # [B, seq_len, dense_dim]

        list_vec = tf.layers.dense(list_vec, hidden_size, activation=tf.nn.relu, name='dense1', reuse=tf.AUTO_REUSE)
        list_vec = tf.layers.dense(list_vec, hidden_size, activation=None, name='dense2', reuse=tf.AUTO_REUSE)

        seq_len = tf.string_to_number(features['selected_vec_mask'], out_type=tf.int32)
        seq_mask = tf.sequence_mask(seq_len,
                                    utils.seq_max_len(config, 'features'))

        selected_len = tf.string_to_number(features['selected_vec_mask'], out_type=tf.float32)
        not_first = tf.minimum(selected_len, 1.0)

        # outputs[B, seq_len, embedding_size]
        _, state = decoder_gru_fn(list_vec, mask=seq_mask)

        go_vec = tf.matmul(tf.expand_dims(1 - not_first, axis=1), tf.expand_dims(go_emb, axis=0))
        query_vec = go_vec + tf.expand_dims(not_first, axis=1) * state

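        # Score each candidate by a dot product between the decoder query and
        # the encoder (pointer) vectors; invalid ranks get a large negative
        # logit before the softmax.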
        def output_fn(_query_vec, _point_vec):
            atten_vec = tf.expand_dims(_query_vec, axis=1) * _point_vec
            logits = tf.reduce_sum(atten_vec, axis=-1)

            rank_mask = features["rank_mask"]
            # mask invalid positions in the logits
            neg_mask = rank_mask - tf.ones(shape=[1, _rank_size], dtype=tf.float32)
            neg_mask = neg_mask * 1000
            action_distribution = tf.nn.softmax(logits + neg_mask)

            return action_distribution

        self._action_distribution = output_fn(query_vec, encoder_vec)

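        # Incremental decoding path: advance the decoder GRU one step with the
        # last selected item and re-score the remaining candidates.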
        if 'point_vec' in features and 'last_state' in features and 'last_vec' in features:
            inc_seq_len = tf.minimum(seq_len, 1)
            inc_seq_mask = tf.sequence_mask(inc_seq_len, 1)
            last_vec = features['last_vec']
            last_vec = tf.layers.dense(last_vec, hidden_size, activation=tf.nn.relu, name='dense1', reuse=tf.AUTO_REUSE)
            last_vec = tf.layers.dense(last_vec, hidden_size, activation=None, name='dense2', reuse=tf.AUTO_REUSE)
            _, next_state = decoder_gru_fn(tf.expand_dims(last_vec, axis=1),
                                           initial_state=features['last_state'], mask=inc_seq_mask)
            inc_query_vec = go_vec + tf.expand_dims(not_first, axis=1) * next_state
            self._inc_action_distribution = output_fn(inc_query_vec, features['point_vec'])
            self._next_state = next_state
Example #9
def main(args):
    bert_config = modeling.BertConfig.from_json_file(args.config)
    bert_config.hidden_dropout_prob = 0.0
    bert_config.attention_probs_dropout_prob = 0.0

    batch_size = args.batch_size
    avg_seq_len = args.avg_seq_length
    max_seq_len = args.max_seq_length
    tf_dtype = tf.float16 if args.precision == 'fp16' else tf.float32

    # fake input array length
    input_len = np.random.randint(low=2 * avg_seq_len - max_seq_len,
                                  high=max_seq_len + 1,
                                  size=(batch_size),
                                  dtype=np.int32)
    valid_word_num = sum(input_len)

    # fake input id and mask
    input_ids = np.random.randint(low=0,
                                  high=bert_config.vocab_size,
                                  size=(batch_size, max_seq_len),
                                  dtype=np.int32)
    input_mask = np.zeros((batch_size, max_seq_len), dtype=np.int32)
    for b_idx, s_len in enumerate(input_len):
        input_mask[b_idx][:s_len] = 1

    input_ids_tensor = tf.convert_to_tensor(input_ids, dtype=tf.int32)
    input_mask_tensor = tf.convert_to_tensor(input_mask, dtype=tf.int32)

    # fake embedding output
    embed_output = np.random.randn(batch_size, max_seq_len,
                                   bert_config.hidden_size)
    input_tensor = tf.convert_to_tensor(embed_output, dtype=tf_dtype)

    # Keep attention_mask for compatibility: tile the 1-D input mask into a
    # [batch_size, max_seq_len, max_seq_len] attention mask.
    att_mask = np.tile(input_mask, max_seq_len)
    att_mask = att_mask.reshape(batch_size, max_seq_len, max_seq_len)
    attention_mask = tf.convert_to_tensor(att_mask, dtype=tf_dtype)

    # input info
    valid_word_num = sum(input_len)
    print("Valid word num : {}/{}, avg sequence length : {:.6} ".format(
        valid_word_num, batch_size * max_seq_len, valid_word_num / batch_size))

    # bert with standard transformer
    std_bert = modeling.transformer_model(
        input_tensor=input_tensor,
        attention_mask=attention_mask,
        hidden_size=bert_config.hidden_size,
        num_hidden_layers=bert_config.num_hidden_layers,
        num_attention_heads=bert_config.num_attention_heads,
        intermediate_size=bert_config.intermediate_size,
        intermediate_act_fn=modeling.get_activation(bert_config.hidden_act),
        hidden_dropout_prob=bert_config.hidden_dropout_prob,
        attention_probs_dropout_prob=bert_config.attention_probs_dropout_prob,
        initializer_range=bert_config.initializer_range,
        do_return_all_layers=False)

    config = tf.ConfigProto()
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    with tf.Session(config=config) as sess:
        # init weights
        sess.run(tf.global_variables_initializer())

        # get transformer weights
        all_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        transformer_vars = [v for v in all_vars if v.name.startswith('layer')]
        weights_value = sess.run(transformer_vars)

        # bert with effective transformer
        et_bert = effective_transformer.get_sequence_output(
            max_batch_size=batch_size,
            max_seq_length=max_seq_len,
            config=bert_config,
            attention_mask=attention_mask,
            input_mask=input_mask_tensor,
            from_tensor=input_tensor,
            weights_value=weights_value,
        )

        # diff
        val1 = sess.run(std_bert).reshape(-1, bert_config.hidden_size)
        val2 = sess.run(et_bert).reshape(-1, bert_config.hidden_size)
        diff = []
        for b_idx, s_len in enumerate(input_len):
            for w_idx in range(s_len):
                idx = b_idx * args.max_seq_length + w_idx
                diff.append(np.fabs(val1[idx] - val2[idx]).max())
        print("max diff : {:.6}, avg diff : {:.6}.".format(
            max(diff),
            sum(diff) / len(diff)))

        def time_inference(output_tensor):
            iter_num = 128
            # warm up
            for i in range(10):
                sess.run(output_tensor)

            beg = datetime.now()
            for i in range(iter_num):
                sess.run(output_tensor)
            end = datetime.now()
            return (end - beg).total_seconds() * 1000 / iter_num  # ms

        print("xla cost : {:.6} ms".format(time_inference(std_bert)))
        print("et  cost : {:.6} ms".format(time_inference(et_bert)))
Example #10
    def __init__(self,
                 config,
                 use_one_hot_embeddings=True,
                 num_labels=2,
                 max_seq_length=128):
        """Constructor for BertModel.

        Args:
          config: `BertConfig` instance.
          is_training: bool. true for training model, false for eval model. Controls
            whether dropout will be applied.
          input_ids: int32 Tensor of shape [batch_size, seq_length].
          input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
          token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
          use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
            embeddings or tf.embedding_lookup() for the word embeddings. On the TPU,
            it is much faster if this is True, on the CPU or GPU, it is faster if
            this is False.
          scope: (optional) variable scope. Defaults to "bert".

        Raises:
          ValueError: The config is invalid or one of the input tensor shapes
            is invalid.
        """
        self.input_ids = tf.placeholder(dtype=tf.int32,
                                        shape=(None, max_seq_length))
        self.input_mask = tf.placeholder(dtype=tf.int8,
                                         shape=(None, max_seq_length))

        config = copy.deepcopy(config)

        input_shape = modeling.get_shape_list(self.input_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        token_type_ids = tf.zeros(shape=[batch_size, seq_length],
                                  dtype=tf.int32)

        with tf.variable_scope("bert", reuse=tf.AUTO_REUSE):
            with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE):
                # Perform embedding lookup on the word ids.
                (self.embedding_output,
                 self.embedding_table) = modeling.embedding_lookup(
                     input_ids=self.input_ids,
                     vocab_size=config.vocab_size,
                     embedding_size=config.hidden_size,
                     initializer_range=config.initializer_range,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=use_one_hot_embeddings)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output = modeling.embedding_postprocessor(
                    input_tensor=self.embedding_output,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

            with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = modeling.create_attention_mask_from_input_mask(
                    self.input_ids, self.input_mask)

                # Run the stacked transformer.
                # `sequence_output` shape = [batch_size, seq_length, hidden_size].
                self.all_encoder_layers = modeling.transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.
                    attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)

            self.sequence_output = self.all_encoder_layers[-1]
            # The "pooler" converts the encoded sequence tensor of shape
            # [batch_size, seq_length, hidden_size] to a tensor of shape
            # [batch_size, hidden_size]. This is necessary for segment-level
            # (or segment-pair-level) classification tasks where we need a fixed
            # dimensional representation of the segment.
            with tf.variable_scope("pooler", reuse=tf.AUTO_REUSE):
                # We "pool" the model by simply taking the hidden state corresponding
                # to the first token. We assume that this has been pre-trained
                first_token_tensor = tf.squeeze(self.sequence_output[:,
                                                                     0:1, :],
                                                axis=1)
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=modeling.create_initializer(
                        config.initializer_range))

        # define output_weights and output_bias
        hidden_size = self.pooled_output.shape[-1].value
        with tf.variable_scope("", reuse=tf.AUTO_REUSE):
            self.output_weights = tf.get_variable(
                "output_weights", [num_labels, hidden_size],
                initializer=tf.truncated_normal_initializer(stddev=0.02))
            self.output_bias = tf.get_variable(
                "output_bias", [num_labels],
                initializer=tf.zeros_initializer())
Example #11
def create_bilstm_classification_model(bert_config,
                                       is_training,
                                       response_input_ids,
                                       response_input_mask,
                                       response_segment_ids,
                                       response_text_len,
                                       response_labels,
                                       random_forward_input_ids,
                                       random_forward_input_mask,
                                       random_forward_segment_ids,
                                       random_forward_text_len,
                                       random_backward_input_ids,
                                       random_backward_input_mask,
                                       random_backward_segment_ids,
                                       random_backward_text_len,
                                       random_labels,
                                       swap_forward_input_ids,
                                       swap_forward_input_mask,
                                       swap_forward_segment_ids,
                                       swap_forward_text_len,
                                       swap_backward_input_ids,
                                       swap_backward_input_mask,
                                       swap_backward_segment_ids,
                                       swap_backward_text_len,
                                       swap_labels,
                                       nli_forward_input_ids,
                                       nli_forward_input_mask,
                                       nli_forward_segment_ids,
                                       nli_forward_text_len,
                                       nli_backward_input_ids,
                                       nli_backward_input_mask,
                                       nli_backward_segment_ids,
                                       nli_backward_text_len,
                                       nli_labels,
                                       num_nli_labels,
                                       use_one_hot_embeddings,
                                       l2_reg_lambda=0.1,
                                       dropout_rate=1.0,
                                       lstm_size=None,
                                       num_layers=1):

    config = copy.deepcopy(bert_config)

    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

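    # A single BERT encoder (shared via AUTO_REUSE) embeds and encodes the
    # response (with a causal mask over future tokens) as well as the forward
    # and backward inputs of the random, swap and NLI auxiliary tasks.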
    with tf.variable_scope("bert", reuse=tf.AUTO_REUSE):

        with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE):
            (response_embedding_output,
             response_embedding_table) = modeling.embedding_lookup(
                 input_ids=response_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            response_embedding_output = modeling.embedding_postprocessor(
                input_tensor=response_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=response_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            # random detection
            # Perform embedding lookup on the word ids.
            (random_foward_embedding_output,
             random_forward_embedding_table) = modeling.embedding_lookup(
                 input_ids=random_forward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            # Perform embedding lookup on the word ids.
            (random_backward_embedding_output,
             random_backward_embedding_table) = modeling.embedding_lookup(
                 input_ids=random_backward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            random_foward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=random_foward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=random_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            random_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=random_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=random_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            # swap detection
            (swap_foward_embedding_output,
             swap_forward_embedding_table) = modeling.embedding_lookup(
                 input_ids=swap_forward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            (swap_backward_embedding_output,
             swap_backward_embedding_table) = modeling.embedding_lookup(
                 input_ids=swap_backward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)
            swap_foward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=swap_foward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=swap_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)
            swap_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=swap_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=swap_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            # # generic detection
            # (generic_foward_embedding_output, generic_forward_embedding_table) = modeling.embedding_lookup(
            #     input_ids=generic_forward_input_ids,
            #     vocab_size=config.vocab_size,
            #     embedding_size=config.hidden_size,
            #     initializer_range=config.initializer_range,
            #     word_embedding_name="word_embeddings",
            #     use_one_hot_embeddings=use_one_hot_embeddings)
            # (generic_backward_embedding_output, generic_backward_embedding_table) = modeling.embedding_lookup(
            #     input_ids=generic_backward_input_ids,
            #     vocab_size=config.vocab_size,
            #     embedding_size=config.hidden_size,
            #     initializer_range=config.initializer_range,
            #     word_embedding_name="word_embeddings",
            #     use_one_hot_embeddings=use_one_hot_embeddings)
            # generic_foward_embedding_output = modeling.embedding_postprocessor(
            #     input_tensor=generic_foward_embedding_output,
            #     use_token_type=not config.roberta,
            #     token_type_ids=generic_forward_segment_ids,
            #     token_type_vocab_size=config.type_vocab_size,
            #     token_type_embedding_name="token_type_embeddings",
            #     use_position_embeddings=True,
            #     position_embedding_name="position_embeddings",
            #     initializer_range=config.initializer_range,
            #     max_position_embeddings=config.max_position_embeddings,
            #     dropout_prob=config.hidden_dropout_prob,
            #     roberta=config.roberta)
            # generic_backward_embedding_output = modeling.embedding_postprocessor(
            #     input_tensor=generic_backward_embedding_output,
            #     use_token_type=not config.roberta,
            #     token_type_ids=generic_backward_segment_ids,
            #     token_type_vocab_size=config.type_vocab_size,
            #     token_type_embedding_name="token_type_embeddings",
            #     use_position_embeddings=True,
            #     position_embedding_name="position_embeddings",
            #     initializer_range=config.initializer_range,
            #     max_position_embeddings=config.max_position_embeddings,
            #     dropout_prob=config.hidden_dropout_prob,
            #     roberta=config.roberta)

            # nli detection
            (nli_foward_embedding_output,
             nli_forward_embedding_table) = modeling.embedding_lookup(
                 input_ids=nli_forward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)
            (nli_backward_embedding_output,
             nli_backward_embedding_table) = modeling.embedding_lookup(
                 input_ids=nli_backward_input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)
            nli_foward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=nli_foward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=nli_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)
            nli_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=nli_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=nli_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            response_attention_mask = modeling.create_attention_mask_from_input_mask(
                response_input_ids, response_input_mask)
            # [batch_size, from_seq_length, to_seq_length]
            # mask future tokens
            diag_vals = tf.ones_like(response_attention_mask[0, :, :])
            tril = tf.linalg.LinearOperatorLowerTriangular(
                diag_vals).to_dense()
            future_masks = tf.tile(tf.expand_dims(
                tril, 0), [tf.shape(response_attention_mask)[0], 1, 1])
            response_attention_mask = tf.math.multiply(response_attention_mask,
                                                       future_masks)
            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            response_all_encoder_layers = modeling.transformer_model(
                input_tensor=response_embedding_output,
                attention_mask=response_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.
                attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # random detection
            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
            # mask of shape [batch_size, seq_length, seq_length] which is used
            # for the attention scores.
            random_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                random_forward_input_ids, random_forward_input_mask)
            random_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                random_backward_input_ids, random_backward_input_mask)
            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            random_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=random_foward_embedding_output,
                attention_mask=random_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
            random_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=random_backward_embedding_output,
                attention_mask=random_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # swap detection
            swap_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                swap_forward_input_ids, swap_forward_input_mask)
            swap_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                swap_backward_input_ids, swap_backward_input_mask)
            swap_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=swap_foward_embedding_output,
                attention_mask=swap_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
            swap_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=swap_backward_embedding_output,
                attention_mask=swap_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # # generic detection
            # generic_forward_attention_mask = modeling.create_attention_mask_from_input_mask(generic_forward_input_ids,
            #                                                                                 generic_forward_input_mask)
            # generic_backward_attention_mask = modeling.create_attention_mask_from_input_mask(generic_backward_input_ids,
            #                                                                                  generic_backward_input_mask)
            # generic_forward_all_encoder_layers = modeling.transformer_model(
            #     input_tensor=generic_foward_embedding_output,
            #     attention_mask=generic_forward_attention_mask,
            #     hidden_size=config.hidden_size,
            #     num_hidden_layers=config.num_hidden_layers,
            #     num_attention_heads=config.num_attention_heads,
            #     intermediate_size=config.intermediate_size,
            #     intermediate_act_fn=modeling.get_activation(config.hidden_act),
            #     hidden_dropout_prob=config.hidden_dropout_prob,
            #     attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            #     initializer_range=config.initializer_range,
            #     do_return_all_layers=True)
            # generic_backward_all_encoder_layers = modeling.transformer_model(
            #     input_tensor=generic_backward_embedding_output,
            #     attention_mask=generic_backward_attention_mask,
            #     hidden_size=config.hidden_size,
            #     num_hidden_layers=config.num_hidden_layers,
            #     num_attention_heads=config.num_attention_heads,
            #     intermediate_size=config.intermediate_size,
            #     intermediate_act_fn=modeling.get_activation(config.hidden_act),
            #     hidden_dropout_prob=config.hidden_dropout_prob,
            #     attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            #     initializer_range=config.initializer_range,
            #     do_return_all_layers=True)

            # nli detection
            nli_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                nli_forward_input_ids, nli_forward_input_mask)
            nli_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                nli_backward_input_ids, nli_backward_input_mask)
            nli_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=nli_foward_embedding_output,
                attention_mask=nli_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
            nli_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=nli_backward_embedding_output,
                attention_mask=nli_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

        random_forward_embedding = random_forward_all_encoder_layers[-2]
        random_backward_embedding = random_backward_all_encoder_layers[-2]
        swap_forward_embedding = swap_forward_all_encoder_layers[-2]
        swap_backward_embedding = swap_backward_all_encoder_layers[-2]
        # generic_forward_embedding = generic_forward_all_encoder_layers[-2]
        # generic_backward_embedding = generic_backward_all_encoder_layers[-2]
        nli_forward_embedding = nli_forward_all_encoder_layers[-2]
        nli_backward_embedding = nli_backward_all_encoder_layers[-2]
        response_embedding = response_all_encoder_layers[-2]
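        # All task heads below consume the second-to-last ([-2]) encoder layer
        # as the feature representation, rather than the final layer.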

    response_embedding_shape = modeling.get_shape_list(response_embedding,
                                                       expected_rank=3)
    with tf.variable_scope("lm_head", reuse=tf.AUTO_REUSE):

        response_logits = tf.layers.dense(response_embedding,
                                          config.hidden_size,
                                          activation=None)
        response_logits = modeling.gelu(response_logits)
        response_logits = modeling.layer_norm(response_logits)
        response_outputs = tf.layers.dense(
            response_logits,
            config.vocab_size,
            activation=None,
            use_bias=True,
            bias_initializer=tf.zeros_initializer())

        response_one_hot = tf.one_hot(response_labels,
                                      depth=config.vocab_size,
                                      dtype=tf.float32)

        lm_cost = tf.nn.softmax_cross_entropy_with_logits(
            labels=response_one_hot, logits=response_outputs)

        sequence_mask = tf.sequence_mask(response_text_len,
                                         maxlen=response_embedding_shape[1],
                                         dtype=tf.float32)

        masked_lm_cost = tf.math.multiply(lm_cost, sequence_mask)

        final_lm_loss = tf.reduce_mean(
            tf.math.divide(tf.reduce_sum(masked_lm_cost, axis=1),
                           tf.cast(response_text_len, dtype=tf.float32)))

        perplexity = tf.exp(
            tf.math.divide(tf.reduce_sum(masked_lm_cost, axis=1),
                           tf.cast(response_text_len, dtype=tf.float32)))

    random_forward_embedding_shape = modeling.get_shape_list(
        random_forward_embedding, expected_rank=3)
    random_backward_embedding_shape = modeling.get_shape_list(
        random_backward_embedding, expected_rank=3)
    assert random_forward_embedding_shape[2] == random_backward_embedding_shape[2]
    random_forward_embedding = tf.transpose(random_forward_embedding,
                                            [1, 0, 2])
    random_backward_embedding = tf.transpose(random_backward_embedding,
                                             [1, 0, 2])
    random_forward_input_mask = tf.cast(
        tf.transpose(random_forward_input_mask, [1, 0]), tf.float32)
    random_backward_input_mask = tf.cast(
        tf.transpose(random_backward_input_mask, [1, 0]), tf.float32)

    swap_forward_embedding_shape = modeling.get_shape_list(
        swap_forward_embedding, expected_rank=3)
    swap_backward_embedding_shape = modeling.get_shape_list(
        swap_backward_embedding, expected_rank=3)
    assert swap_forward_embedding_shape[2] == swap_backward_embedding_shape[2]
    swap_forward_embedding = tf.transpose(swap_forward_embedding, [1, 0, 2])
    swap_backward_embedding = tf.transpose(swap_backward_embedding, [1, 0, 2])
    swap_forward_input_mask = tf.cast(
        tf.transpose(swap_forward_input_mask, [1, 0]), tf.float32)
    swap_backward_input_mask = tf.cast(
        tf.transpose(swap_backward_input_mask, [1, 0]), tf.float32)

    # generic_forward_embedding_shape = modeling.get_shape_list(generic_forward_embedding, expected_rank=3)
    # generic_backward_embedding_shape = modeling.get_shape_list(generic_backward_embedding, expected_rank=3)
    # assert generic_forward_embedding_shape[2] == generic_backward_embedding_shape[2]
    # generic_forward_embedding = tf.transpose(generic_forward_embedding, [1, 0, 2])
    # generic_backward_embedding = tf.transpose(generic_backward_embedding, [1, 0, 2])
    # generic_forward_input_mask = tf.cast(tf.transpose(generic_forward_input_mask, [1, 0]), tf.float32)
    # generic_backward_input_mask = tf.cast(tf.transpose(generic_backward_input_mask, [1, 0]), tf.float32)

    nli_forward_embedding_shape = modeling.get_shape_list(
        nli_forward_embedding, expected_rank=3)
    nli_backward_embedding_shape = modeling.get_shape_list(
        nli_backward_embedding, expected_rank=3)
    assert nli_forward_embedding_shape[2] == nli_backward_embedding_shape[2]
    nli_forward_embedding = tf.transpose(nli_forward_embedding, [1, 0, 2])
    nli_backward_embedding = tf.transpose(nli_backward_embedding, [1, 0, 2])
    nli_forward_input_mask = tf.cast(
        tf.transpose(nli_forward_input_mask, [1, 0]), tf.float32)
    nli_backward_input_mask = tf.cast(
        tf.transpose(nli_backward_input_mask, [1, 0]), tf.float32)

    model = HadeModel(
        x_random_forward=random_forward_embedding,
        x_random_mask_forward=random_forward_input_mask,
        x_random_length_forward=random_forward_text_len,
        x_random_backward=random_backward_embedding,
        x_random_mask_backward=random_backward_input_mask,
        x_random_length_backward=random_backward_text_len,
        y_random=random_labels,
        x_swap_forward=swap_forward_embedding,
        x_swap_mask_forward=swap_forward_input_mask,
        x_swap_length_forward=swap_forward_text_len,
        x_swap_backward=swap_backward_embedding,
        x_swap_mask_backward=swap_backward_input_mask,
        x_swap_length_backward=swap_backward_text_len,
        y_swap=swap_labels,
        # x_generic_forward=generic_forward_embedding,
        # x_generic_mask_forward=generic_forward_input_mask,
        # x_generic_length_forward=generic_forward_text_len,
        # x_generic_backward=generic_backward_embedding,
        # x_generic_mask_backward=generic_backward_input_mask,
        # x_generic_length_backward=generic_backward_text_len, y_generic=generic_labels,
        x_nli_forward=nli_forward_embedding,
        x_nli_mask_forward=nli_forward_input_mask,
        x_nli_length_forward=nli_forward_text_len,
        x_nli_backward=nli_backward_embedding,
        x_nli_mask_backward=nli_backward_input_mask,
        x_nli_length_backward=nli_backward_text_len,
        y_nli=nli_labels,
        embedding_dim=random_forward_embedding_shape[2],
        num_nli_labels=num_nli_labels,
        hidden_size=lstm_size,
        l2_reg_lambda=l2_reg_lambda,
        num_layers=num_layers,
        dropout_rate=dropout_rate,
        is_training=is_training)

    random_prob, swap_prob, nli_prob, total_cost = model.create_model()

    return random_prob, swap_prob, nli_prob, total_cost, final_lm_loss, perplexity
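
A minimal NumPy sketch (illustrative toy shapes and hypothetical names, not part of the original code) of what the `lm_head` scope above computes: token-level softmax cross entropy, masked by each response's true length, averaged per example for the loss and exponentiated for the perplexity.

import numpy as np

def masked_lm_loss_and_perplexity(logits, labels, text_len):
    """logits: [batch, seq, vocab]; labels: [batch, seq]; text_len: [batch]."""
    _, seq, _ = logits.shape
    # Per-token softmax cross entropy (what
    # tf.nn.softmax_cross_entropy_with_logits yields for one-hot labels).
    shifted = logits - logits.max(axis=-1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))
    token_ce = -np.take_along_axis(log_probs, labels[..., None], axis=-1)[..., 0]
    # Zero out padded positions, as tf.sequence_mask does above.
    mask = np.arange(seq)[None, :] < text_len[:, None]
    masked_ce = token_ce * mask
    # Mean cross entropy per example; batch mean -> loss, exp -> perplexity.
    per_example = masked_ce.sum(axis=1) / text_len
    return per_example.mean(), np.exp(per_example)

loss, ppl = masked_lm_loss_and_perplexity(
    np.random.randn(2, 5, 11), np.random.randint(0, 11, (2, 5)), np.array([5, 3]))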
Example No. 12
0
def aggregate_embedding(embeddings,
                        segment_idx,
                        aggregator,
                        config=None,
                        aux=None,
                        name=None):
    # segment_idx denotes different needles, rather than rows.
    if aggregator == 'segment_sqrt_n':
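        # Sum the embeddings within each segment and scale by 1 / sqrt(segment
        # size); div_no_nan guards against empty segments.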
        denom = to_col(
            tf.sqrt(
                tf.to_float(
                    tf.segment_sum(tf.ones_like(segment_idx), segment_idx))))
        output_layer = tf.div_no_nan(tf.segment_sum(embeddings, segment_idx),
                                     denom,
                                     name=name)
    elif aggregator in ['segment_sum', 'segment_mean']:
        output_layer = getattr(tf, aggregator)(embeddings,
                                               segment_idx,
                                               name=name)
    else:
        del embeddings
        assert aggregator.startswith('transformer')
        flags = {}
        if '^' in aggregator:
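            # Optional overrides are packed into the aggregator name itself and
            # parsed with eval, e.g. (illustrative)
            # 'transformer^num_hidden_layers@2,hidden_dropout_prob@0.1'
            # yields {'num_hidden_layers': 2, 'hidden_dropout_prob': 0.1}.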
            flags = [
                kv.split('@')
                for kv in filter(None,
                                 aggregator.split('^')[1].split(','))
            ]
            flags = {k: eval(v) for k, v in flags}
        assert config is not None and aux is not None
        needle_pos = aux['needle_pos']
        embedding_output = aux['sequence_output']
        batch_idx2 = aux['batch_idx2']  # different rows.
        is_training = aux['is_training']
        attention_mask = get_dense_mask(needle_pos, batch_idx2,
                                        tf.shape(embedding_output)[:2])
        with tf.variable_scope('final_transformer'):
            all_encoder_layers = modeling.transformer_model(
                input_tensor=embedding_output,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,  # this must agree with input width.
                num_hidden_layers=flags.get('num_hidden_layers', 1),
                num_attention_heads=flags.get('num_attention_heads',
                                              config.num_attention_heads),
                intermediate_size=flags.get('intermediate_size',
                                            config.intermediate_size),
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=flags.get('hidden_dropout_prob',
                                              config.hidden_dropout_prob) *
                int(is_training),
                attention_probs_dropout_prob=int(is_training) *
                flags.get('attention_probs_dropout_prob',
                          config.attention_probs_dropout_prob),
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            first_token_tensor = all_encoder_layers[-1][:, 0, :]
            output_layer = tf.layers.dense(
                first_token_tensor,
                config.hidden_size,
                activation=tf.tanh,
                kernel_initializer=modeling.create_initializer(
                    config.initializer_range))
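            # (BERT-style pooling: the final hidden state of the first token,
            # projected through a tanh dense layer.)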

    return output_layer
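
For reference, a small NumPy sketch (toy data, hypothetical helper name) of the 'segment_sqrt_n' branch above: embeddings sharing a segment id are summed and scaled by 1/sqrt(segment size), mirroring tf.segment_sum followed by the sqrt-count division (tf.div_no_nan additionally guards against empty segments, which this sketch omits).

import numpy as np

def segment_sqrt_n(embeddings, segment_idx):
    """embeddings: [n, d]; segment_idx: [n] of contiguous ids starting at 0."""
    num_segments = int(segment_idx.max()) + 1
    summed = np.zeros((num_segments, embeddings.shape[1]))
    np.add.at(summed, segment_idx, embeddings)        # like tf.segment_sum
    counts = np.bincount(segment_idx, minlength=num_segments).astype(float)
    return summed / np.sqrt(counts)[:, None]          # divide by sqrt(n)

emb = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
idx = np.array([0, 0, 1])  # rows 0-1 form one segment, row 2 another
print(segment_sqrt_n(emb, idx))  # [[2.83, 4.24], [5.0, 6.0]] (approximately)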