Example no. 1
    def build_model(self):
        temp = self.all_sequence[-1]

        with tf.variable_scope("lstm"):
            temp = dropout(temp, 0.1)
            seq_len = tf.reduce_sum(self.sent_mask, axis=1)
            gru_fw = GRUCell(num_units=768, activation=tf.tanh)
            gru_bw = GRUCell(num_units=768, activation=tf.tanh)
            outputs, output_states = bidirectional_dynamic_rnn(
                gru_fw, gru_bw, temp,
                sequence_length=seq_len, dtype=tf.float32)

            gru_output = tf.concat(outputs, axis=2)
            # gru_output = dropout(gru_output, 0.1)
            gru_output = tf.layers.dense(gru_output, units=768,
                                         kernel_initializer=create_initializer(0.02))
            gru_output = dropout(gru_output, 0.1)
            outputs = layer_norm(gru_output + temp)

            in_outputs = tf.layers.dense(outputs, units=768, activation=tf.tanh,
                                         kernel_initializer=create_initializer(0.02))

            layer_output = tf.layers.dense(in_outputs, 768,
                                           kernel_initializer=create_initializer(0.02))
            layer_output = dropout(layer_output, 0.1)
            layer_output = layer_norm(layer_output + outputs)

        return layer_output
Example no. 2
def transformer_model(input_tensor,
                      neg_attention_mask,
                      num_hidden_layers=12,
                      intermediate_act_fn=gelu):
    hidden_size = num_attention_heads * size_per_head

    neg_attention_mask = tf.reshape(neg_attention_mask, [batch_size, 1, 1, seq_length])
    neg_attention_mask *= tf.ones(shape=[batch_size, num_attention_heads, seq_length, seq_length],
                                  dtype=tf.float32)

    prev_output = input_tensor

    for layer_idx in range(num_hidden_layers):
        with tf.variable_scope("layer_%d" % layer_idx):
            layer_input = prev_output

            with tf.variable_scope("attention"):
                with tf.variable_scope("self"):
                    attention_head = attention_layer(
                        layer_input, neg_attention_mask,
                        query_kernel=query_kernel, query_bias=query_bias,
                        key_kernel=key_kernel, key_bias=key_bias,
                        value_kernel=value_kernel, value_bias=value_bias,
                        num_attention_heads=num_attention_heads,
                        size_per_head=size_per_head,
                        batch_size=batch_size,
                        seq_length=seq_length)

                with tf.variable_scope("output"):
                    attention_output = tf.layers.Dense(
                        hidden_size,
                        weights=[attention_output_kernel, attention_output_bias]
                    ).apply(attention_head)
                    attention_output = layer_norm(attention_output + layer_input,
                                                  beta=attention_norm_beta,
                                                  gamma=attention_norm_gamma)

            with tf.variable_scope("intermediate"):
                intermediate_output = tf.layers.Dense(
                    intermediate_size,
                    activation=intermediate_act_fn,
                    weights=[intermediate_kernel, intermediate_bias]
                ).apply(attention_output)

            with tf.variable_scope("output"):
                layer_output = tf.layers.Dense(
                    hidden_size,
                    weights=[output_kernel, output_bias]
                ).apply(intermediate_output)
                layer_output = layer_norm(layer_output + attention_output,
                                          beta=output_norm_beta,
                                          gamma=output_norm_gamma)
                prev_output = layer_output

    return prev_output
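The transformer block above already receives `neg_attention_mask` broadcast to [batch_size, num_attention_heads, seq_length, seq_length]; how that additive mask is built is not part of the snippet. A minimal sketch, assuming a BERT-style additive mask (the helper name `make_additive_mask` and the -10000.0 constant are assumptions, not taken from the example):

import tensorflow as tf


def make_additive_mask(input_mask):
    """input_mask: [batch_size, seq_length], 1 for real tokens, 0 for padding.

    Returns an additive mask of shape [batch_size, 1, 1, seq_length]: 0.0 where
    attention is allowed and -10000.0 where it should be suppressed, so the
    softmax assigns those keys ~0 probability.
    """
    mask = tf.cast(input_mask, tf.float32)        # [B, L]
    mask = mask[:, tf.newaxis, tf.newaxis, :]     # [B, 1, 1, L]
    return (1.0 - mask) * -10000.0

If the snippet's `attention_layer` adds this tensor to the raw attention scores before the softmax, padded key positions are effectively ignored.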
Example no. 3
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        # sampled_logits = tf.multinomial(logits, 1)

    return log_probs
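Every masked-LM head in these examples starts with `gather_indexes(input_tensor, positions)`, which the snippets never define; the inlined flattening inside `get_masked_lm_loss` further below shows the same idea. A minimal reconstruction (assuming `positions` is an int32 [batch_size, num_predictions] tensor), not the original helper:

import tensorflow as tf


def gather_indexes(sequence_tensor, positions):
    """Gather the hidden vectors at `positions` from a [batch, seq_len, width] tensor.

    Returns a [batch_size * num_positions, width] tensor, matching the shape
    comments used throughout these examples.
    """
    shape = tf.shape(sequence_tensor)
    batch_size, seq_length, width = shape[0], shape[1], shape[2]

    # Offset each example's positions into the flattened [batch*seq_len] axis.
    flat_offsets = tf.reshape(
        tf.range(batch_size, dtype=tf.int32) * seq_length, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence = tf.reshape(sequence_tensor,
                               [batch_size * seq_length, width])
    return tf.gather(flat_sequence, flat_positions)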
Example no. 4
def get_candi_output(bert_config, input_tensor, output_weights):
    """Get loss and log probs for the masked LM."""
    # input_tensor = gather_indexes(input_tensor, positions)
    sequence_shape = modeling.get_shape_list(input_tensor, expected_rank=3)
    batch_size = sequence_shape[0]
    seq_length = sequence_shape[1]
    width = sequence_shape[2]
    input_tensor = tf.reshape(input_tensor, [batch_size * seq_length, width])

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        # [batch*seq, vocab_size]
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        log_probs = tf.reshape(log_probs, [batch_size, seq_length, -1])
        _, top_k_idx = tf.nn.top_k(log_probs, FLAGS.top_k)

    return top_k_idx
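`get_candi_output` returns `top_k_idx` of shape [batch_size, seq_length, FLAGS.top_k]. A tiny sanity check of that shape with dummy sizes (the sizes and the session-based TF 1.x style are assumptions):

import tensorflow as tf

# Dummy log-probabilities: batch of 2, sequence of 3, vocabulary of 5.
log_probs = tf.nn.log_softmax(tf.random_normal([2, 3, 5]), axis=-1)
_, top_k_idx = tf.nn.top_k(log_probs, k=2)      # top-2 candidate ids per position

with tf.Session() as sess:
    print(sess.run(tf.shape(top_k_idx)))        # -> [2 3 2], i.e. [batch, seq, k]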
Example no. 5
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM.
    input_tensor --> [batch_size, seq_length, hidden_size]
    output_weights --> [vocab_size, embedding_size]
    positions --> [batch_size, max_predictions_per_seq]
    label_ids --> [batch_size, max_predictions_per_seq]
    label_weights --> [batch_size, max_predictions_per_seq]
  """
    tf.logging.info(f'get_masked_lm_output--positions:{positions}')
    input_tensor = gather_indexes(
        input_tensor,
        positions)  # [batch_size*max_predictions_per_seq, hidden_size]

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(
                input_tensor
            )  # [batch_size*max_predictions_per_seq, hidden_size]

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(
            input_tensor, output_weights, transpose_b=True
        )  # [batch_size*max_predictions_per_seq, vocab_size]
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(
            logits,
            axis=-1)  # [batch_size*max_predictions_per_seq, vocab_size]

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(
            label_weights, [-1])  # [batch_size*max_predictions_per_seq]

        one_hot_labels = tf.one_hot(
            label_ids, depth=bert_config.vocab_size, dtype=tf.float32
        )  # [batch_size*max_predictions_per_seq, vocab_size]

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])  # cross-entropy, shape [flat_positions]
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
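The final numerator/denominator step is just a weighted mean of the per-prediction cross-entropies, where padded prediction slots carry weight 0.0. A small NumPy check of that formula with made-up numbers:

import numpy as np

# Per-prediction cross-entropy for 4 masked slots of one example; the last
# slot is padding, so its label weight is 0.0 and must not affect the loss.
per_example_loss = np.array([2.0, 0.5, 1.5, 9.9], dtype=np.float32)
label_weights = np.array([1.0, 1.0, 1.0, 0.0], dtype=np.float32)

numerator = np.sum(label_weights * per_example_loss)
denominator = np.sum(label_weights) + 1e-5
print(numerator / denominator)   # ~1.333: the padded 9.9 is ignored, the rest are averaged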
Example no. 6
def get_mlm_logits(input_tensor, albert_config, mlm_positions, output_weights):
    """From run_pretraining.py."""
    input_tensor = gather_indexes(input_tensor, mlm_positions)
    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=albert_config.embedding_size,
                activation=modeling.get_activation(albert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    albert_config.initializer_range),
            )
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[albert_config.vocab_size],
            initializer=tf.zeros_initializer(),
        )
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
    return logits
Example no. 7
    def add_embeddings(self):
        with tf.name_scope("embedding"):
            if self.is_Embedding_Needed:
                W = tf.Variable(np.array(self.embeddings),
                                name="word_embed",
                                dtype="float32",
                                trainable=self.trainable)
            else:
                W = tf.get_variable(
                    name='word_embed',
                    shape=[self.vocab_size, self.embedding_size],
                    initializer=modeling.create_initializer(0.02),
                    trainable=True)

            self.embedding_W = W

            self.embedded_chars_q_pos = self.get_timing_signal_1d(
                self.max_input_left, self.embedding_size)

            if 'adding_problem' not in self.dataset:
                self.embedded_chars_q = tf.nn.embedding_lookup(
                    self.embedding_W, self.question)
            else:
                # mapping the 2-dim input into a higher-dim space
                if self.embedding_size == 2:
                    self.embedded_chars_q = self.question
                else:
                    self.embedded_chars_q = tf.layers.dense(
                        self.question, self.embedding_size)
                    print('embedded_chars_q:', self.embedded_chars_q)

            if 'adding_problem' not in self.dataset:
                self.embedded_chars_q = modeling.layer_norm(
                    tf.nn.dropout(self.embedded_chars_q,
                                  keep_prob=1.0 - self.input_dropout_prob))

            # adding the position embedding may lead to poorer performance...
            self.embedded_chars_q = self.embedded_chars_q + self.embedded_chars_q_pos

            print('embedded_chars_q:', self.embedded_chars_q)
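`get_timing_signal_1d` is not shown in this class; the name matches the tensor2tensor-style sinusoidal position encoding, which returns a [1, length, channels] signal that broadcasts over the batch when added to the embeddings. A NumPy sketch of such a signal, written as an assumption about what the method computes rather than a copy of it:

import math
import numpy as np


def sinusoidal_timing_signal(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    """Return a [1, length, channels] sinusoidal position signal."""
    position = np.arange(length, dtype=np.float32)
    num_timescales = channels // 2
    log_timescale_increment = (math.log(max_timescale / min_timescale) /
                               max(num_timescales - 1, 1))
    inv_timescales = min_timescale * np.exp(
        np.arange(num_timescales, dtype=np.float32) * -log_timescale_increment)
    scaled_time = position[:, None] * inv_timescales[None, :]        # [length, channels/2]
    signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
    if channels % 2:
        signal = np.pad(signal, [[0, 0], [0, 1]], mode='constant')   # pad odd channel counts
    return signal[None, :, :]                                        # [1, length, channels]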
Example no. 8
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])

        one_hot_labels = tf.one_hot(
            label_ids, depth=bert_config.vocab_size, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        loss = tf.reshape(per_example_loss, [-1, tf.shape(positions)[1]])
        # TODO: dynamic gather from per_example_loss
    return loss
Example no. 9
    def get_masked_lm_output(self, bert_config, input_tensor, output_weights,
                             positions, label_ids):
        input_tensor, size, max_len = self.gather_indexes(
            input_tensor, positions)
        with tf.variable_scope("cls/predictions"):
            with tf.variable_scope("transform"):
                input_tensor = tf.layers.dense(
                    input_tensor,
                    units=bert_config.hidden_size,
                    activation=modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        bert_config.initializer_range))
                input_tensor = modeling.layer_norm(input_tensor)
            output_bias = tf.get_variable("output_bias",
                                          shape=[bert_config.vocab_size],
                                          initializer=tf.zeros_initializer())
            logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            label_ids = tf.reshape(label_ids, [-1])

            one_hot_labels = tf.one_hot(label_ids,
                                        depth=bert_config.vocab_size,
                                        dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                              axis=[-1])
            loss = tf.reshape(per_example_loss, [-1, tf.shape(positions)[1]])
        return loss
Example no. 10
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)
    with tf.variable_scope('cls/predictions'):
        with tf.variable_scope('transform'):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)
        output_bias = tf.get_variable('output_bias',
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-05
        loss = numerator / denominator
    return (loss, per_example_loss, log_probs)
Example no. 11
    def add_embeddings(self):
        with tf.name_scope("embedding"):
            if self.is_Embedding_Needed:
                W = tf.Variable(np.array(self.embeddings),
                                name="word_embed",
                                dtype="float32",
                                trainable=self.trainable)
            else:
                W = tf.get_variable(
                    name='word_embed',
                    shape=[self.vocab_size, self.embedding_size],
                    initializer=modeling.create_initializer(0.02),
                    trainable=True)

            if 'adding_problem' not in self.dataset:
                self.embedding_W = W
                self.embedded_chars_q = tf.nn.embedding_lookup(
                    self.embedding_W, self.question)
            else:
                if self.embedding_size == 2:
                    self.embedded_chars_q = self.question
                else:
                    self.embedded_chars_q = tf.layers.dense(
                        self.question, self.embedding_size)
            print('embedded_chars_q:', self.embedded_chars_q)

            if 'adding_problem' not in self.dataset:
                self.embedded_chars_q = modeling.layer_norm(
                    tf.nn.dropout(self.embedded_chars_q,
                                  keep_prob=1.0 - self.input_dropout_prob))
Example no. 12
    def build_model(self):
        with tf.variable_scope("inferring_module"), tf.device("/device:GPU:0"):
            rdim = 768
            update_num = 3
            batch_size = tf.shape(self.sent1)[0]
            dim = self.sent1.get_shape().as_list()[-1]

            gru_layer = BiGRU(num_layers=1,
                              num_units=rdim,
                              batch_size=batch_size,
                              input_size=dim,
                              keep_prob=0.9,
                              is_train=self.is_training,
                              activation=tf.nn.tanh)
            seq_len = tf.reduce_sum(self.input_mask, axis=1)
            gru_output = gru_layer(self.all_sent, seq_len=seq_len)

            with tf.variable_scope("att"):
                all_seq_len = self.all_sent.get_shape().as_list()[1]
                cls = tf.tile(tf.expand_dims(self.mark0, axis=1),
                              [1, all_seq_len, 1])
                cat_att = tf.concat([cls, gru_output], axis=2)

                res = tf.layers.dense(cat_att,
                                      units=512,
                                      activation=tf.nn.relu)
                res = tf.layers.dense(res, units=1, use_bias=False)
                res_mask = tf.expand_dims(tf.cast(self.input_mask, tf.float32),
                                          axis=2)
                res = res - (1 - res_mask) * 10000.0

                alpha = tf.nn.softmax(res, 1)
                gru_vec = tf.reduce_sum(alpha * gru_output, axis=1)

            # gru_vec = dropout(gru_vec, self.dropout_rate)
            gru_vec = tf.layers.dense(
                gru_vec,
                768,
                activation=gelu,
                kernel_initializer=create_initializer(0.02))
            gru_vec = dropout(gru_vec, self.dropout_rate)
            gru_vec = layer_norm(gru_vec + self.mark0)
            gru_vec = tf.layers.dense(
                gru_vec,
                768,
                activation=tf.tanh,
                kernel_initializer=create_initializer(0.02))
            # gate = tf.layers.dense(tf.concat([gru_vec, self.mark0], axis=1),
            #                        rdim, activation=tf.sigmoid,
            #                        kernel_initializer=create_initializer(0.02))

            # with tf.variable_scope("merge"):
            #     # refer_output = self.mark0 * gate + (1 - gate) * gru_vec
            #     vec_cat = tf.concat([self.mark0, gru_vec], axis=1)
            #     vec_cat = dropout(vec_cat, self.dropout_rate)
            #     pooled_output = tf.layers.dense(vec_cat, 768,
            #                                     activation=tf.tanh,
            #                                     kernel_initializer=create_initializer(0.02))

        return gru_vec
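The attention pooling above relies on the usual masking trick: subtracting 10000 from the logits of padded positions before the softmax so they end up with ~0 weight. A tiny NumPy illustration of why that works:

import numpy as np


def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()


scores = np.array([1.2, 0.3, -0.5, 0.8], dtype=np.float32)   # raw attention logits
mask = np.array([1.0, 1.0, 0.0, 0.0], dtype=np.float32)      # last two positions are padding

alpha = softmax(scores - (1.0 - mask) * 10000.0)
print(alpha)   # padded positions get ~0 weight; the real ones share the probability mass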
Example no. 13
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """
  下游任务,遮蔽语言模型网络结构,获取遮蔽语言模型的loss与对数概率值
  :param bert_config: bert配置,描述bert网络结构
  :param input_tensor: 输入向量   [batch_size, seq_len, embedding_dim]
  :param output_weights: 输出权重,bert的词向量权重    [voc_size, embedding_dim]
  :param positions: 遮蔽的位置    [batch_size, masked_len]
  :param label_ids: 遮蔽的标签    [batch_size, masked_len]
  :param label_weights:  遮蔽的权重    [batch_size, masked_len]
  :return: 整体loss,每个样本的loss [batch_size*masked_len],预测的对数概率值 [batch_size*masked_len, voc_size]
  """
    # Gather the values at the masked positions, [batch_size * masked_len, embedding_dim]
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # One extra non-linear transformation before the output layer; it is no longer used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(
                input_tensor)  # layer normalization, [batch_size * masked_len, embedding_dim]

        # The output weights are tied to the input embeddings; the output just adds an extra bias.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(
            input_tensor, output_weights,
            transpose_b=True)  # [batch_size * masked_len, voc_size]
        logits = tf.nn.bias_add(
            logits, output_bias)  # [batch_size * masked_len, voc_size]
        log_probs = tf.nn.log_softmax(
            logits, axis=-1)  # [batch_size * masked_len, voc_size]

        label_ids = tf.reshape(label_ids, [-1])  # [batch_size * masked_len]
        label_weights = tf.reshape(label_weights,
                                   [-1])  # [batch_size * masked_len]

        # [batch_size * masked_len, voc_size]
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)

        # If the number of predictions is below the maximum, they are padded up to it;
        # padded positions get weight 0, the rest get weight 1.
        # Per-example loss, [batch_size * masked_len]
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        # Total loss weighted by label_weights
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator  # average loss

    return (loss, per_example_loss, log_probs)
Example no. 14
    def add_embeddings(self):
        with tf.name_scope("embedding"):
            if self.is_Embedding_Needed:
                W = tf.get_variable(name="embeddings",
                                    dtype="float32",
                                    initializer=np.array(
                                        self.embeddings, np.float32),
                                    trainable=self.trainable)
            else:
                #I think we need to utilize more fine-grained word embedding~
                W = tf.get_variable(
                    name='word_embed',
                    shape=[self.vocab_size, self.embedding_size],
                    initializer=modeling.create_initializer(0.02),
                    trainable=True)

            if 'adding_problem' not in self.dataset:
                self.embedding_W = W
                self.embedded_chars_q = tf.nn.embedding_lookup(
                    self.embedding_W, self.question)

            else:
                #mapping 2 dim into high dim
                if self.embedding_size == 2:
                    self.embedded_chars_q = self.question
                else:
                    self.embedded_chars_q = tf.layers.dense(
                        self.question, self.embedding_size)
            print('embedded_chars_q:', self.embedded_chars_q)

            if 'adding_problem' not in self.dataset:
                self.embedded_chars_q = modeling.layer_norm(
                    tf.nn.dropout(self.embedded_chars_q,
                                  keep_prob=1.0 - self.input_dropout_prob))

            # sigmoid squashes the bucket matrix values into (0, 1)
            self.soft_t5_rd_bucket_mat = tf.sigmoid(
                tf.get_variable(
                    't5_rd_bucket_mat',
                    [2 * self.max_input_left, self.config.num_attention_heads],
                    initializer=modeling.create_initializer(0.1),
                    trainable=True))

            self.single_t5_att_bias = compute_bias(
                self.config.num_attention_heads,
                self.max_input_left,
                self.max_input_left,
                self.soft_t5_rd_bucket_mat,
                l1_width=self.config.l1_width,
                l2_width=self.config.l2_width,
                stddev=self.config.stddev,
                bidirectional=True)

            self.t5_att_bias = tf.tile(self.single_t5_att_bias,
                                       [tf.shape(self.question)[0], 1, 1, 1],
                                       name='t5_att_bias')
            print('[!!!--t5_bias:]', self.t5_att_bias)
Example no. 15
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    # input_tensor=model.get_sequence_output()
    # output_weights=model.get_embedding_table()
    # positions=masked_lm_positions
    # label_ids=masked_lm_ids
    # label_weights=masked_lm_weights
    # input_tensor here is the model's final-layer output, [batch_size, seq_length, hidden_size].
    # output_weights is the word embedding table, [vocab_size, embedding_size].
    """Get loss and log probs for the masked LM."""  # gather the encoder outputs at the positions to be predicted
    input_tensor = gather_indexes(
        input_tensor, positions)  # [batch_size*max_pred_per_seq, hidden_size]

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(  # fully connected layer; output shape [batch_size*max_pred_per_seq, hidden_size]
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(
            input_tensor, output_weights,
            transpose_b=True)  # output_weights is the embedding table, used transposed here
        logits = tf.nn.bias_add(
            logits, output_bias
        )  # [batch_size*max_pred_per_seq, vocab_size]: each masked token's score over the vocabulary
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
Example no. 16
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    '''
  (masked_lm_loss,
     masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output(
         bert_config, model.get_sequence_output(), model.get_embedding_table(),
         masked_lm_positions, masked_lm_ids, masked_lm_weights)
  '''
    input_tensor = gather_indexes(input_tensor, positions)  # the positions BERT masked out
    # output shape: [bs*L, hidden_size]

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(
                input_tensor)  # a sequence is masked, so layer norm is applied

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(
            input_tensor, output_weights,
            transpose_b=True)  # the word embeddings act as label embeddings here
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])  # true ids of the masked-out tokens
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        # the weights account for mask positions that exist only as padding
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])  # shape = [bs * L]
        numerator = tf.reduce_sum(label_weights *
                                  per_example_loss)  # scalar
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
Example no. 17
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        # input_tensor.shape=(160, 768), output_weights.shape=(21128 (vocab_size), 768)
        logits = tf.matmul(input_tensor, output_weights,
                           transpose_b=True)  # logits.shape=(160, 21128)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        # label_ids.shape = (8, 20)
        label_ids = tf.reshape(label_ids, [-1])
        # label_ids.shape = (160,)
        # label_weights.shape = (8, 20)
        label_weights = tf.reshape(label_weights,
                                   [-1])  # label_weights are the mask weights;
        # in this program they are all 1
        # label_weights.shape = (160,)

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)
        # one_hot_labels.shape = (160, 21128): 160 tokens in total, each represented as a
        # vocab_size one-hot vector, ready for the loss computation below.

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
Example no. 18
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    # [batch_size*label_size, dim]
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range)
            )  # tf.layers.dense adds a single fully connected layer; the `units` argument is the number of neurons in the layer.
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token. (equation 7)
        output_bias = tf.get_variable("output_bias",
                                      shape=[output_weights.shape[0]],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights,
                           transpose_b=True)  #values are not probs (sum>1)
        logits = tf.nn.bias_add(logits, output_bias)
        # logits, (bs*label_size, vocab_size)
        log_probs = tf.nn.log_softmax(
            logits, -1
        )  # log to compute the log likelihood (Eq. 8); -1 indicates the last dimension.

        label_ids = tf.reshape(label_ids,
                               [-1])  #shape of [-1] flattens into 1-D
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=output_weights.shape[0],
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(
            log_probs * one_hot_labels,
            axis=[-1])  #loss per each masked position in the seq
        numerator = tf.reduce_sum(
            label_weights *
            per_example_loss)  #loss over all serialized sequence
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
Example no. 19
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
  """Get loss and log probs for the masked LM."""
  # Compute the loss and log probabilities for the masked LM
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.

    # One more non-linear transformation before the output; these parameters are
    # only used during pre-training and are dropped for fine-tuning.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.

    # output_weights reuses the input word embeddings, which is why it is passed in;
    # only an extra bias is added here.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # label_ids has length 20, the maximum number of masked tokens;
    # it holds the ids of the tokens that were masked.
    label_ids = tf.reshape(label_ids, [-1])
    label_weights = tf.reshape(label_weights, [-1])

    one_hot_labels = tf.one_hot(
        label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.

    # Since fewer than 20 tokens may actually be masked (say only 18), label_ids then
    # ends with two 0s (padding), and label_weights = [1, 1, ..., 0, 0] marks those last
    # two label_ids as padding to be excluded from the loss.
    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
    numerator = tf.reduce_sum(label_weights * per_example_loss)
    denominator = tf.reduce_sum(label_weights) + 1e-5
    loss = numerator / denominator

  return (loss, per_example_loss, log_probs)
Example no. 20
def get_masked_lm_loss(model_config, seq_output, embedding_table, positions,
                       label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    sequence_shape = modeling.get_shape_list(seq_output, expected_rank=[3])
    batch_size = sequence_shape[0]
    seq_length = sequence_shape[1]
    width = sequence_shape[2]

    flat_offsets = tf.reshape(
        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence_tensor = tf.reshape(seq_output,
                                      [batch_size * seq_length, width])
    seq_output = tf.gather(flat_sequence_tensor, flat_positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                seq_output,
                units=model_config.embedding_size,
                activation=modeling.get_activation(model_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    model_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[model_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, embedding_table, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=model_config.vocab_size,
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return loss, per_example_loss, log_probs
Example no. 21
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    '''Get loss and log probs for the masked LM'''
    input_tensor = gather_indexes(input_tensor, positions)

    # with tf.variable_scope('cls/predictions'):
    with tf.name_scope('cls/predictions'):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        # with tf.variable_scope('transform'):
        with tf.name_scope('transform'):
            # input_tensor = tf.layers.dense(
            input_tensor = tf.keras.layers.Dense(
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))(input_tensor)
            input_tensor = modeling.layer_norm(input_tensor)
        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        # output_bias = tf.get_variable(
        #     'output_bias',
        #     shape=[bert_config.vocab_size],
        #     initializer=tf.zeros_initializer()
        # )
        output_bias = tf.Variable(
            initial_value=tf.zeros([bert_config.vocab_size]),
            name='output_bias',
            shape=[bert_config.vocab_size],
            # initializer=tf.zeros_initializer()
        )
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
Example no. 22
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(
        input_tensor, positions)  # [batch*mask_length, hidden_size]
    # returns the transformer output vectors at all masked positions

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(  # [batch*mask_length, hidden_size]
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(
                    bert_config.hidden_act),  # gelu in the Chinese config
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())

        # [batch*mask_length, hidden_size] matmul with [vocab_size, hidden_size]
        # -> [batch*mask_length, vocab_size]
        logits = tf.matmul(input_tensor, output_weights,
                           transpose_b=True)  # [batch*mask_length, vocab_size]
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(
            logits, axis=-1)  # [batch*mask_length, vocab_size]

        label_ids = tf.reshape(label_ids, [-1])  # [batch*mask_length]
        label_weights = tf.reshape(label_weights, [-1])  # [batch*mask_length]

        one_hot_labels = tf.one_hot(
            label_ids, depth=bert_config.vocab_size,
            dtype=tf.float32)  # [batch*mask_length, vocab_size]

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])  # [batch*mask_length]
        numerator = tf.reduce_sum(label_weights *
                                  per_example_loss)  # scalar loss over the masked positions
        denominator = tf.reduce_sum(label_weights) + 1e-5  # small epsilon to avoid division by zero
        loss = numerator / denominator  # average loss

    return (loss, per_example_loss, log_probs)
Example no. 23
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)
    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])
        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)

        if bert_config.loss == "original":
            log_probs = tf.nn.log_softmax(logits, axis=-1)

            # The `positions` tensor might be zero-padded (if the sequence is too
            # short to have the maximum number of predictions). The `label_weights`
            # tensor has a value of 1.0 for every real prediction and 0.0 for the
            # padding predictions.
            per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                              axis=[-1])
            numerator = tf.reduce_sum(label_weights * per_example_loss)
            denominator = tf.reduce_sum(label_weights) + 1e-5
            loss = tf.identity(numerator / denominator, name="loss")
        elif bert_config.loss == "focal":
            # Note: despite its name, `log_probs` holds softmax probabilities in this branch.
            log_probs = tf.nn.softmax(logits, axis=-1)
            per_example_loss = -tf.reduce_sum(one_hot_labels * (
                (1 - log_probs)**2) * tf.log(log_probs),
                                              axis=[-1])
            numerator = tf.reduce_sum(label_weights * per_example_loss)
            denominator = tf.reduce_sum(label_weights) + 1e-5
            loss = tf.identity(numerator / denominator, name="loss")

    return (loss, per_example_loss, log_probs)
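The `focal` branch replaces plain cross-entropy with a focal term, -(1 - p)^2 * log(p), summed against the one-hot labels. A small standalone check of how that term down-weights confident predictions (the probabilities below are made up; the exponent 2 matches the code):

import numpy as np

probs = np.array([0.9, 0.6, 0.1], dtype=np.float32)   # predicted prob of the true class

cross_entropy = -np.log(probs)
focal = -((1.0 - probs) ** 2) * np.log(probs)

print(cross_entropy)   # [0.105 0.511 2.303]
print(focal)           # [0.001 0.082 1.865] -> confident (easy) examples are down-weighted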
Example no. 24
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[bert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        # Note that output_weights is the embedding table from modeling, with shape
        # [vocab_size, hidden_size]; the matmul below simply transposes it, so the
        # second dimension of the resulting logits is vocab_size.
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        # Note that the softmax and the log are fused into a single op here.
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)

        # The English comment below is the key to understanding label_weights.
        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        # per_example_loss here is -log P, where P is the predicted probability of the label.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        # label_weights zeroes out the loss from mask positions that are only padding.
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
Example no. 25
    def add_embeddings(self):
        with tf.name_scope("embedding"):
            if self.is_Embedding_Needed:
                W = tf.Variable(np.array(self.embeddings),
                                name="word_embed",
                                dtype="float32",
                                trainable=self.trainable)
            else:
                W = tf.get_variable(
                    name='word_embed',
                    shape=[self.vocab_size, self.embedding_size],
                    initializer=modeling.create_initializer(0.02),
                    trainable=True)

            if 'adding_problem' not in self.dataset:
                self.embedding_W = W
                self.embedded_chars_q = tf.nn.embedding_lookup(
                    self.embedding_W, self.question)
            else:
                #mapping 2 dim into high dim
                if self.embedding_size == 2:
                    self.embedded_chars_q = self.question
                else:
                    self.embedded_chars_q = tf.layers.dense(
                        self.question, self.embedding_size)
            print('embedded_chars_q:', self.embedded_chars_q)

            if 'adding_problem' not in self.dataset:
                self.embedded_chars_q = modeling.layer_norm(
                    tf.nn.dropout(self.embedded_chars_q,
                                  keep_prob=1.0 - self.input_dropout_prob))

        context_position = tf.range(self.max_input_left, dtype=tf.int32)[:, None]
        memory_postion = tf.range(self.max_input_left, dtype=tf.int32)[None, :]
        relative_position = memory_postion - context_position + self.max_input_left

        # why is this embedding so sensitive...
        self.t5_pos_embedding = tf.get_variable(
            't5_pos_mat',
            [2 * self.max_input_left, self.config.num_attention_heads],
            initializer=modeling.create_initializer(0.02),
            trainable=True)

        self.single_t5_att_bias = compute_bias(relative_position,
                                               self.t5_pos_embedding)
        ## [batch, num_heads, query_length, memory_length]
        self.t5_att_bias = tf.tile(self.single_t5_att_bias,
                                   [tf.shape(self.question)[0], 1, 1, 1])
        print('t5_bias:', self.t5_att_bias)
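`compute_bias` is not defined in this snippet. Given that `relative_position` is a [query_len, memory_len] matrix of offsets shifted into [0, 2*max_input_left) and `t5_pos_embedding` is a [2*max_input_left, num_heads] table, a plausible reconstruction is a T5-style lookup followed by a transpose into the [1, num_heads, query_len, memory_len] layout that the subsequent tf.tile expects. Treat the function below as an assumption about its behavior, not the original:

import tensorflow as tf


def compute_bias(relative_position, pos_embedding_table):
    """relative_position: [query_len, memory_len] int32 offsets in [0, 2*max_len).
    pos_embedding_table: [2*max_len, num_heads] learned per-offset values.

    Returns an attention bias of shape [1, num_heads, query_len, memory_len].
    """
    values = tf.gather(pos_embedding_table, relative_position)   # [q, m, num_heads]
    values = tf.transpose(values, [2, 0, 1])                     # [num_heads, q, m]
    return tf.expand_dims(values, axis=0)                        # [1, num_heads, q, m]

Note that the earlier add_embeddings variant calls a compute_bias with a different signature (bucket matrix plus l1_width/l2_width arguments), so this sketch only matches the two-argument call used here.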
Example no. 26
def get_hydrophobicity_output(bert_config,
                              input_tensor,
                              positions,
                              label_hydrophobicities,
                              label_weights,
                              k=3,
                              log=False):
    """Get loss and log probs for the hydrophobicity prediction."""
    input_tensor = gather_indexes(input_tensor, positions)
    hydrophobicity_range = 155 * k + 1

    with tf.variable_scope("cls/hydrophobicity"):
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

            output_weights = tf.get_variable(
                "output_weights",
                shape=[hydrophobicity_range, bert_config.hidden_size],
                initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            output_bias = tf.get_variable("output_bias",
                                          shape=[hydrophobicity_range],
                                          initializer=tf.zeros_initializer())

        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)

        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_hydrophobicities = tf.reshape(label_hydrophobicities, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(label_hydrophobicities,
                                    depth=hydrophobicity_range,
                                    dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
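gather_indexes is used by this and several other examples in this collection but is not shown. The sketch below follows the reference BERT pre-training implementation (flatten the batch, then gather the hidden vectors at the masked positions) and relies on the same modeling module as the snippets above; treat it as an assumption about what these repositories actually use.

import tensorflow as tf

def gather_indexes(sequence_tensor, positions):
    """Gathers the vectors at `positions` out of a [batch, seq, width] tensor."""
    sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3)
    batch_size, seq_length, width = sequence_shape

    # Offset each example's positions into the flattened [batch * seq, width] tensor.
    flat_offsets = tf.reshape(
        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence_tensor = tf.reshape(sequence_tensor,
                                      [batch_size * seq_length, width])
    return tf.gather(flat_sequence_tensor, flat_positions)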
Esempio n. 27
0
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):

    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # The weight matrix of this transform is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_dims,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_std))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias vector for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        # log_softmax applies the logarithm after the softmax
        # (a small numerical check follows this example):
        #   softmax:     exp(x_i) / exp(x).sum()
        #   log_softmax: log( exp(x_i) / exp(x).sum() )
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=bert_config.vocab_size,
                                    dtype=tf.float32)

        # The `label_weights` tensor has a value of 1.0 for
        # real predictions and 0.0 for the padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        # Compute the loss only from the real predictions.
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
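As the comment in the example above notes, log_softmax is simply the logarithm applied after the softmax, but computed in a more numerically stable way. A tiny TF 1.x-style check, with illustrative values only:

import numpy as np
import tensorflow as tf

logits = tf.constant([[2.0, 1.0, 0.1]])
log_softmax = tf.nn.log_softmax(logits, axis=-1)
log_of_softmax = tf.log(tf.nn.softmax(logits, axis=-1))

with tf.Session() as sess:
    a, b = sess.run([log_softmax, log_of_softmax])
    print(np.allclose(a, b))  # True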
Esempio n. 28
0
def get_lm_output(config, input_tensor, output_weights, label_ids, label_mask):
    """Get loss and log probs for the LM."""
    input_shape = modeling.get_shape_list(input_tensor, expected_rank=3)
    input_tensor = tf.reshape(
        input_tensor, [input_shape[0] * input_shape[1], input_shape[2]])

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=config.hidden_size,
                activation=modeling.get_activation(config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable("output_bias",
                                      shape=[config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])

        one_hot_labels = tf.one_hot(label_ids,
                                    depth=config.vocab_size,
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])

        label_mask = tf.reshape(label_mask, [input_shape[0] * input_shape[1]])
        loss_mask = tf.dtypes.cast(label_mask, tf.float32)
        per_example_loss = tf.math.multiply(per_example_loss, loss_mask)
        loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, log_probs)
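Note that get_lm_output normalizes differently from the masked-LM functions in this collection: it averages the masked per-position losses over all positions (tf.reduce_mean), whereas the others divide by the number of real predictions (the sum of the mask). A small NumPy illustration of the difference, using made-up values:

import numpy as np

per_example_loss = np.array([2.0, 3.0, 4.0, 0.0])   # last position is padding
mask = np.array([1.0, 1.0, 1.0, 0.0])

mean_over_all_positions = np.sum(per_example_loss * mask) / len(mask)               # 2.25
mean_over_real_positions = np.sum(per_example_loss * mask) / (np.sum(mask) + 1e-5)  # ~3.0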
Esempio n. 29
0
def bert_crf(bert_config, is_training, input_ids, segment_ids, input_mask,
             label_ids, sequence_length, num_labels, use_one_hot_embeddings):

    batch_size = tf.shape(input_ids)[0]
    bert_out = bert(bert_config, is_training, input_ids, input_mask,
                    segment_ids, use_one_hot_embeddings)
    #    hidden_size = tf.shape(bert_out)[-1]
    hidden_size = 768
    if is_training:
        bert_out = layer_norm_and_dropout(bert_out, 0.5)
    else:
        bert_out = layer_norm(bert_out)
    bert_out = tf.reshape(bert_out, [-1, hidden_size])
    linear_out = linear_layer(bert_out, hidden_size, num_labels, "linear")
    crf_out = crf_layer(linear_out, label_ids, batch_size, sequence_length,
                        num_labels, max_seq_length, "crf")
    return crf_out
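linear_layer and crf_layer are not shown in this snippet. Below is a minimal sketch of what crf_layer might look like with the TF 1.x contrib CRF ops, following the argument order of the call above; the variable names, scope contents, and return values are assumptions, not the repository's actual code.

import tensorflow as tf

def crf_layer(logits, label_ids, batch_size, sequence_length,
              num_labels, max_seq_length, scope):
    with tf.variable_scope(scope):
        # Restore the [batch, seq, num_labels] shape that was flattened away above.
        logits = tf.reshape(logits, [batch_size, max_seq_length, num_labels])
        transitions = tf.get_variable(
            "transitions",
            shape=[num_labels, num_labels],
            initializer=tf.zeros_initializer())
        # Negative mean log-likelihood as the training loss.
        log_likelihood, transitions = tf.contrib.crf.crf_log_likelihood(
            inputs=logits,
            tag_indices=label_ids,
            sequence_lengths=sequence_length,
            transition_params=transitions)
        loss = -tf.reduce_mean(log_likelihood)
        # Viterbi decoding for predictions.
        pred_ids, _ = tf.contrib.crf.crf_decode(logits, transitions, sequence_length)
    return loss, pred_ids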
Esempio n. 30
0
def get_mlm_logits(input_tensor, albert_config, mlm_positions, output_weights):
    input_tensor = gather_indexes(input_tensor, mlm_positions)
    with tf.variable_scope("cls/predictions"):
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=albert_config.embedding_size,
                activation=modeling.get_activation(albert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    albert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        output_bias = tf.get_variable("output_bias",
                                      shape=[albert_config.vocab_size],
                                      initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
    return logits
Esempio n. 31
0
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    label_ids = tf.reshape(label_ids, [-1])
    label_weights = tf.reshape(label_weights, [-1])

    one_hot_labels = tf.one_hot(
        label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.
    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
    numerator = tf.reduce_sum(label_weights * per_example_loss)
    denominator = tf.reduce_sum(label_weights) + 1e-5
    loss = numerator / denominator

  return (loss, per_example_loss, log_probs)