def cqa_model(final_hidden): final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] output_weights = tf.get_variable( "cls/cqa/output_weights", [2, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable( "cls/cqa/output_bias", [2], initializer=tf.zeros_initializer()) final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size]) logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [batch_size, seq_length, 2]) logits = tf.transpose(logits, [2, 0, 1]) unstacked_logits = tf.unstack(logits, axis=0) (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) return (start_logits, end_logits)
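# Illustrative NumPy sketch (added here, not part of the original code) of how the
# [2, hidden_size] projection in cqa_model above becomes separate start/end logits.
# batch_size, seq_length and hidden_size are toy values chosen only for the example.
import numpy as np

batch_size, seq_length, hidden_size = 2, 5, 4
final_hidden = np.random.randn(batch_size, seq_length, hidden_size)
output_weights = np.random.randn(2, hidden_size)
output_bias = np.zeros(2)

final_hidden_matrix = final_hidden.reshape(batch_size * seq_length, hidden_size)
logits = final_hidden_matrix @ output_weights.T + output_bias           # [B*L, 2]
logits = logits.reshape(batch_size, seq_length, 2).transpose(2, 0, 1)   # [2, B, L]
start_logits, end_logits = logits[0], logits[1]                          # each [B, L]
assert start_logits.shape == (batch_size, seq_length)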
def gather_indexes(sequence_tensor, positions): """Gathers the vectors at the specific positions over a minibatch.""" sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) batch_size = sequence_shape[0] seq_length = sequence_shape[1] width = sequence_shape[2] flat_offsets = tf.reshape( tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) flat_positions = tf.reshape(positions + flat_offsets, [-1]) flat_sequence_tensor = tf.reshape(sequence_tensor, [batch_size * seq_length, width]) output_tensor = tf.gather(flat_sequence_tensor, flat_positions) return output_tensor
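# A minimal NumPy sketch (added for illustration, not from the original code) of the
# flat-offset gather used by gather_indexes above, with toy shapes
# (batch_size=2, seq_length=4, width=3) and made-up positions.
import numpy as np

batch_size, seq_length, width = 2, 4, 3
sequence_tensor = np.arange(batch_size * seq_length * width, dtype=np.float32).reshape(
    batch_size, seq_length, width)
positions = np.array([[1, 3], [0, 2]])   # positions to gather per example

# Offset each example's positions by its start index in the flattened tensor.
flat_offsets = (np.arange(batch_size) * seq_length).reshape(-1, 1)
flat_positions = (positions + flat_offsets).reshape(-1)
flat_sequence_tensor = sequence_tensor.reshape(batch_size * seq_length, width)
output = flat_sequence_tensor[flat_positions]   # [batch_size * num_positions, width]

# Sanity check against direct indexing.
assert np.array_equal(output[0], sequence_tensor[0, 1])
assert np.array_equal(output[3], sequence_tensor[1, 2])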
def gather_indexes(sequence_tensor, positions): """Gathers the vectors at the specific positions over a minibatch.""" sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank = 3) batch_size = sequence_shape[0] seq_length = sequence_shape[1] width = sequence_shape[2] flat_offsets = tf.reshape( tf.range(0, batch_size, dtype = tf.int32) * seq_length, [-1, 1] ) flat_positions = tf.reshape(positions + flat_offsets, [-1]) flat_sequence_tensor = tf.reshape( sequence_tensor, [batch_size * seq_length, width] ) output_tensor = tf.gather(flat_sequence_tensor, flat_positions) return output_tensor
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings, scope): """Creates a classification model.""" with tf.variable_scope('bert', reuse=tf.AUTO_REUSE) as real_scope: model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings, scope=scope) final_hidden = model.get_sequence_output() final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) return final_hidden[:, 0, :]
def gather_indexes(sequence_tensor, positions): """Gathers the vectors at the specific positions over a minibatch.""" # sequence_tensor, [B, seq_len, hidden_dim], output of BERT's last layer # positions, [B, masked_token_num] sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) batch_size = sequence_shape[0] seq_length = sequence_shape[1] width = sequence_shape[2] flat_offsets = tf.reshape( tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) flat_positions = tf.reshape(positions + flat_offsets, [-1]) # offsets so the flattened tensor below can be looked up with a single gather flat_sequence_tensor = tf.reshape(sequence_tensor, [batch_size * seq_length, width]) output_tensor = tf.gather(flat_sequence_tensor, flat_positions) # [B*mask_token_num, hidden_dim] return output_tensor
def get_lm_output(config, input_tensor, output_weights, label_ids, label_mask): """Get loss and log probs for the LM.""" input_shape = modeling.get_shape_list(input_tensor, expected_rank=3) input_tensor = tf.reshape( input_tensor, [input_shape[0] * input_shape[1], input_shape[2]]) with tf.variable_scope("cls/predictions"): # We apply one more non-linear transformation before the output layer. # This matrix is not used after pre-training. with tf.variable_scope("transform"): input_tensor = tf.layers.dense( input_tensor, units=config.hidden_size, activation=modeling.get_activation(config.hidden_act), kernel_initializer=modeling.create_initializer( config.initializer_range)) input_tensor = modeling.layer_norm(input_tensor) # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. output_bias = tf.get_variable("output_bias", shape=[config.vocab_size], initializer=tf.zeros_initializer()) logits = tf.matmul(input_tensor, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) log_probs = tf.nn.log_softmax(logits, axis=-1) label_ids = tf.reshape(label_ids, [-1]) one_hot_labels = tf.one_hot(label_ids, depth=config.vocab_size, dtype=tf.float32) # The `positions` tensor might be zero-padded (if the sequence is too # short to have the maximum number of predictions). The `label_weights` # tensor has a value of 1.0 for every real prediction and 0.0 for the # padding predictions. per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) label_mask = tf.reshape(label_mask, [input_shape[0] * input_shape[1]]) loss_mask = tf.dtypes.cast(label_mask, tf.float32) per_example_loss = tf.math.multiply(per_example_loss, loss_mask) loss = tf.reduce_mean(per_example_loss) return (loss, per_example_loss, log_probs)
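# A hedged NumPy sketch (not from the original code) of the masked cross-entropy used in
# get_lm_output above: log-softmax over the vocabulary, one-hot labels, and a 0/1 mask that
# zeroes out padded positions. vocab_size and the toy logits below are assumptions.
import numpy as np

vocab_size = 6
logits = np.random.randn(4, vocab_size)                  # 4 flattened token positions
label_ids = np.array([2, 0, 5, 1])
label_mask = np.array([1, 1, 0, 0], dtype=np.float32)    # last two positions are padding

log_probs = logits - np.log(np.exp(logits).sum(axis=-1, keepdims=True))
one_hot = np.eye(vocab_size)[label_ids]
per_example_loss = -(log_probs * one_hot).sum(axis=-1) * label_mask
loss = per_example_loss.mean()   # mean over all positions, matching the reduce_mean above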
def create_model_start(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings): """Creates a classification model.""" model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) final_hidden = model.get_sequence_output() final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] output_weights1 = tf.get_variable( "cls/squad/output_weights1", [384, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias1 = tf.get_variable( "cls/squad/output_bias1", [384], initializer=tf.zeros_initializer()) final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size]) keep_prob = 1.0 if is_training: keep_prob = 0.9 else: keep_prob = 1.0 logits = tf.matmul(final_hidden_matrix, output_weights1, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias1) logits = tf.nn.relu(logits) logits = tf.nn.dropout(logits, keep_prob) logits = tf.reshape(logits, [batch_size, seq_length, 384]) logits = tf.transpose(logits, [2, 0, 1]) unstacked_logits = tf.unstack(logits, axis=0) s = tf.reduce_sum(unstacked_logits[0:384], 0) return s
def metric_fn(per_example_loss, label_ids, logits, is_real_example): # batch_size * sequence_len shape_list = modeling.get_shape_list(label_ids, expected_rank=2) label_ids = tf.reshape(label_ids, [-1]) is_real_example = tf.tile(is_real_example[:, tf.newaxis], [1, shape_list[1]]) is_real_example = tf.reshape(is_real_example, [-1]) per_example_loss = tf.reshape(per_example_loss, [-1]) logits = tf.reshape(logits, [-1]) predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) accuracy = tf.metrics.accuracy( labels=label_ids, predictions=predictions, weights=is_real_example) loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) return { "eval_accuracy": accuracy, "eval_loss": loss, }
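# A small NumPy sketch (illustrative only) of the weighted token accuracy computed in
# metric_fn above: padding tokens get weight 0 via is_real_example, so they do not affect
# the metric. All values below are toy examples.
import numpy as np

label_ids = np.array([1, 0, 2, 2])
predictions = np.array([1, 0, 1, 2])
weights = np.array([1.0, 1.0, 1.0, 0.0])   # last token is padding

accuracy = (weights * (label_ids == predictions)).sum() / weights.sum()   # 2/3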
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings): """Creates a classification model.""" model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) #out1 model. #pdb.set_trace() final_hidden = model.get_sequence_output() final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] #out2 output_weights output_weights = tf.get_variable( "cls/squad/output_weights", [2, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) #out3 output_bias output_bias = tf.get_variable( "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size]) #out4 logits * 2 logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [batch_size, seq_length, 2]) logits = tf.transpose(logits, [2, 0, 1]) unstacked_logits = tf.unstack(logits, axis=0) #out5 start_logits end_logits (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) return (start_logits, end_logits)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, labels, num_labels, use_one_hot_embeddings): """Creates a classification model.""" model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) # In the demo, we are doing a simple classification task on the entire # segment. # # If you want to use the token-level output, use model.get_sequence_output() # instead. output_layer = model.get_sequence_output() # sequence output, the layer right before the pooler final_hidden_shape = modeling.get_shape_list(output_layer, expected_rank=3) hidden_size = final_hidden_shape[-1] output_weight = tf.get_variable( "output_weights", [num_labels, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02) ) output_bias = tf.get_variable( "output_bias", [num_labels], initializer=tf.zeros_initializer() ) with tf.variable_scope("loss"): if is_training: output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) output_layer_matrix = tf.reshape(output_layer, [-1, hidden_size]) # [batch_size*seq_length, hidden_size] logits = tf.matmul(output_layer_matrix, output_weight, transpose_b=True) # [batch_size*seq_length, num_labels] logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [-1, FLAGS.max_seq_length, num_labels]) # final output: [batch_size, seq_length, num_labels] # mask = tf.cast(input_mask,tf.float32) # loss = tf.contrib.seq2seq.sequence_loss(logits,labels,mask) # return (loss, logits, predict) ########################################################################## log_probs = tf.nn.log_softmax(logits, axis=-1) # log-softmax normalization of the outputs one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) # per-example loss loss = tf.reduce_mean(per_example_loss) # overall (mean) loss predict = tf.argmax(log_probs, axis=-1) # predictions return (loss, per_example_loss, logits, predict)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings): """Creates a classification model.""" #初始化bert模型参数 model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, #输入格式转换后的队列 “” input_mask=input_mask, #屏蔽 token_type_ids=segment_ids, #输入的句子分段ID use_one_hot_embeddings=use_one_hot_embeddings) #从bert模型读取序列输出 final_hidden = model.get_sequence_output() final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] #批次数量大小 seq_length = final_hidden_shape[1] #序列长度 hidden_size = final_hidden_shape[2] #隐藏单元大小 #初始化权重矩阵 output_weights = tf.get_variable( "cls/squad/output_weights", [2, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) #初始化偏置项 output_bias = tf.get_variable("cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size]) #回归训练,求解y=ax+b ,这种模型结构,a=output_weights,b=output_bias,y=logits logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) #矩阵相乘 logits = tf.nn.bias_add(logits, output_bias) #加上偏置项 logits = tf.reshape( logits, [batch_size, seq_length, 2]) #把输出转换成[每批数量,序列长度,2] 这种格式[2维数组数,行数,列数] logits = tf.transpose(logits, [2, 0, 1]) #对上一步的结果转置,[2,0,1]代表[列数,2维数组数,行数] unstacked_logits = tf.unstack(logits, axis=0) #对矩阵在行上拆分 (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) return (start_logits, end_logits) #返回起始位置,结束位置
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings): """Creates a classification model.""" model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) #self.sequence_output = self.all_encoder_layers[-1] #取最后一层(batch_size,seq_length,hidden_size) final_hidden = model.get_sequence_output() final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] output_weights = tf.get_variable( "cls/squad/output_weights", [2, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable( "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size]) logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [batch_size, seq_length, 2]) logits = tf.transpose(logits, [2, 0, 1]) unstacked_logits = tf.unstack(logits, axis=0) (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) ###如果这里只想做一个填空形式,例如只判断某个词是否为开始位置 ###可以加个全连接层[hidden_size,1],相乘后得到[batch_size, seq_length, 1],sequeeze去掉维度为1的那个维度; ###最终得到[batch_size, seq_length]就可以用多分类判断seq_length的每个位置是否为答案的位置 return (start_logits, end_logits)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, labels, num_labels, use_one_hot_embeddings): """Creates a seq labelling model.""" model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) with tf.variable_scope("finetune/seq"): #get sequnece output final_hidden = model.get_sequence_output() final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] final_hidden = tf.reshape(final_hidden, [batch_size*seq_length, hidden_size]) output_weights = tf.get_variable( "output_weights", [num_labels, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable( "output_bias", [num_labels], initializer=tf.zeros_initializer()) logits = tf.matmul(final_hidden, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits_out = tf.reshape(logits, [batch_size, seq_length, num_labels]) probabilities = tf.nn.softmax(logits_out) log_probs = tf.nn.log_softmax(logits) labels = tf.reshape(labels, [-1]) label_weights = tf.cast(tf.reshape(input_mask, [-1]), dtype=tf.float32) one_hot_labels = tf.one_hot( labels, depth=num_labels, dtype=tf.float32) per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) * label_weights numerator = tf.reduce_sum(per_example_loss) denominator = tf.reduce_sum(label_weights) + 1e-5 loss = numerator/denominator per_example_loss = tf.reshape(per_example_loss, [batch_size, seq_length]) return (loss, per_example_loss, logits_out, probabilities)
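# A minimal NumPy sketch (illustrative only) of the masked-mean reduction used above:
# per-token losses are weighted by the input mask and normalized by the number of real
# tokens, plus a small epsilon to avoid division by zero. Toy values only.
import numpy as np

per_token_loss = np.array([0.5, 1.0, 2.0, 4.0])
label_weights = np.array([1.0, 1.0, 1.0, 0.0])   # last token is padding

numerator = (per_token_loss * label_weights).sum()
denominator = label_weights.sum() + 1e-5
loss = numerator / denominator                   # ~1.1667, padding excluded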
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings): """Creates a classification model.""" model = modeling.BertModel( config=bert_config,#たぶんこの設定にしたがってbertを呼び出すということ is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) final_hidden = model.get_sequence_output()#Bertの最終層 final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] output_weights = tf.get_variable( #/はスコープの区切りを表す。だからcls/squad/output_weightsはclsスコープのsquadスコープのoutput_weightsという変数を表す #変数がないときは定義し、ある時はそれを呼び出す "cls/squad/output_weights", [2, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable( #/はスコープの区切りを表す。だからcls/squad/output_weightsはclsスコープのsquadスコープのoutput_weightsという変数を表す #変数がないときは定義し、ある時はそれを呼び出す "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size]) logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [batch_size, seq_length, 2]) logits = tf.transpose(logits, [2, 0, 1]) unstacked_logits = tf.unstack(logits, axis=0) (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) return (start_logits, end_logits)
def get_new_input_ids(input_ids, masked_lm_probs,masked_lm_positions,masked_lm_weights): #Reshaping the masked_lm_probs sequence_shape = modeling.get_shape_list(masked_lm_positions, expected_rank=2) #for getting the shape info batch_size = sequence_shape[0] seq_length = sequence_shape[1] # Gathering the first masks only. shape = (batch_size,vocab_size) mask_lab_pred = tf.gather(masked_lm_probs, tf.range(0,batch_size*seq_length,delta=seq_length),axis=0) #Getting the indexes for the predictions. shape = (batch_size,1) #mask_lab_pred = tf.random.multinomial(mask_lab_pred,1,output_dtype=tf.int32) mask_lab_pred = tf.reshape(tf.cast(tf.argmax(mask_lab_pred, axis=-1),dtype=tf.int32),(batch_size,1)) # Gathering positions of the first mask. shape=(batch_size,) mask_positions = tf.gather(masked_lm_positions, tf.constant(0,dtype=tf.int32),axis=-1) #Assigning the first masks in input_ids #--First converting mask_positions to one hot, shape=(batch_size,seq_length) mask_positions = tf.one_hot(mask_positions,depth=seq_length,axis=1,dtype=tf.int32) #--Removing those positions there is no mask left mask_positions = tf.cast(tf.reduce_max(masked_lm_positions,axis=-1,keepdims=True)>0 ,dtype=tf.int32)*mask_positions #--Next multiplying mask_positions_one_hot to mask_lab_pred and forming new_ids input_ids = mask_positions*mask_lab_pred+(1-mask_positions)*input_ids #Setting the first mask lm_weights to be zero masked_lm_weights = tf.slice(masked_lm_weights,(0,1),(-1,-1)) masked_lm_weights = tf.concat([masked_lm_weights,tf.zeros(shape=(batch_size,1), dtype=tf.float32)],axis=-1) #Setting the masked_lm_positions first masks to be zero masked_lm_positions = tf.slice(masked_lm_positions,(0,1),(-1,-1)) masked_lm_positions = tf.concat([masked_lm_positions,tf.zeros(shape=(batch_size,1), dtype=tf.int32)],axis=-1) return input_ids,masked_lm_positions,masked_lm_weights
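# A hedged NumPy sketch (not from the original code) of the one-hot trick above for writing
# a predicted token id into the first [MASK] position of each example. Shapes, ids and the
# choice of 103 as the mask id are illustrative assumptions.
import numpy as np

batch_size, seq_length = 2, 6
input_ids = np.array([[101, 103, 7, 8, 102, 0],
                      [101, 5, 103, 9, 102, 0]])      # 103 plays the role of [MASK]
first_mask_position = np.array([1, 2])                 # first masked index per example
predicted_id = np.array([[42], [77]])                  # model's prediction for that slot

one_hot = np.eye(seq_length, dtype=np.int64)[first_mask_position]   # [batch, seq]
new_input_ids = one_hot * predicted_id + (1 - one_hot) * input_ids
# row 0: position 1 becomes 42; row 1: position 2 becomes 77; everything else is unchanged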
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, labels, num_labels, use_one_hot_embeddings): """Creates a classification model.""" model = modeling.BertModel(config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) # In the demo, we are doing a simple classification task on the entire # segment. # # If you want to use the token-level output, use model.get_sequence_output() # instead. final_hidden = model.get_sequence_output() final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] output_weights = tf.get_variable( "output_weights", [num_labels, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable("output_bias", [num_labels], initializer=tf.zeros_initializer()) with tf.variable_scope("loss"): final_hidden_matrix = tf.reshape( final_hidden, [batch_size * seq_length, hidden_size]) logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [batch_size, seq_length, num_labels]) #logits = tf.transpose(logits, [2,0,1]) one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) log_probs = tf.nn.log_softmax(logits, axis=-1) per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) loss = tf.reduce_mean(per_example_loss) return (loss, per_example_loss, logits)
def gather_indexes(sequence_tensor, positions): """Gathers the vectors at the specific positions over a minibatch.""" sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) #list batch_size = sequence_shape[ 0] #'tensorflow.python.framework.ops.Tensor', Tensor("strided_slice_1:0", shape=(), dtype=int32) seq_length = sequence_shape[1] #256 width = sequence_shape[2] #64 flat_offsets = tf.reshape( tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) #(?, 1) flat_positions = tf.reshape(positions + flat_offsets, [-1]) #(?,) flat_sequence_tensor = tf.reshape(sequence_tensor, [batch_size * seq_length, width]) output_tensor = tf.gather( flat_sequence_tensor, flat_positions ) #Gather slices from params axis axis according to indices. ##!for seq in tf.range(0, batch_size, dtype=tf.int32): ## output_tensor = tf.gather(sequence_tensor[seq], positions) return output_tensor
def gather_indexes( sequence_tensor, positions ): ### sequence_tensor is the Transformer output, [batch_size, length, hidden_size] ### positions holds the indices of the masked tokens in each example, [batch, max_mask_length]; max_mask_length defaults to 20, e.g. per example: [7,10,15,20,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] """Gathers the vectors at the specific positions over a minibatch.""" sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) batch_size = sequence_shape[0] seq_length = sequence_shape[1] width = sequence_shape[2] flat_offsets = tf.reshape( tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) ### [batch_size, 1] flat_positions = tf.reshape(positions + flat_offsets, [-1]) # collect all masked positions and flatten to 1-D flat_sequence_tensor = tf.reshape(sequence_tensor, [batch_size * seq_length, width]) output_tensor = tf.gather(flat_sequence_tensor, flat_positions) ### gather the Transformer outputs at the masked positions return output_tensor
def get_matrix_mask_indices(matrix, num_rows=None): if num_rows is None: num_rows = modeling.get_shape_list(matrix)[0] indices = tf.where(matrix) num_indices = tf.shape(indices)[0] elem_per_row = tf.bincount(tf.cast(indices[:, 0], tf.int32), minlength=num_rows) max_elem_per_row = tf.reduce_max(elem_per_row) row_start = tf.concat([[0], tf.cumsum(elem_per_row[:-1])], axis=0) r = tf.range(max_elem_per_row) idx = tf.expand_dims(row_start, 1) + r idx = tf.minimum(idx, num_indices - 1) result = tf.gather(indices[:, 1], idx) # replace invalid elements with -1 result = tf.where( tf.expand_dims(elem_per_row, 1) > r, result, -tf.ones_like(result)) max_index_per_row = tf.reduce_max(result, axis=1, keepdims=True) max_index_per_row = tf.tile(max_index_per_row, [1, max_elem_per_row]) result = tf.where(result >= 0, result, max_index_per_row) return result
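# An illustrative NumPy sketch (not part of the original code) of what
# get_matrix_mask_indices computes: for every row of a 0/1 matrix, the column indices of its
# nonzero entries, right-padded to the longest row. The function above additionally replaces
# the -1 padding with each row's largest valid index; the toy matrix below is made up.
import numpy as np

matrix = np.array([[0, 1, 1, 0],
                   [1, 0, 0, 0],
                   [0, 0, 0, 0]])

rows = [np.flatnonzero(row) for row in matrix]
max_len = max(len(r) for r in rows)
padded = np.full((matrix.shape[0], max_len), -1, dtype=np.int64)
for i, r in enumerate(rows):
    padded[i, :len(r)] = r
# padded == [[1, 2], [0, -1], [-1, -1]]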
def gather_indexes(sequence_tensor, positions): """Gathers the vectors at the specific positions over a minibatch.""" # This function takes the positions of the masked tokens and, from the input tensor of shape batch_size * # seq_length * embedding_size, extracts the vectors at those masked positions. Suppose batch_size = 8 # and seq_length = 128, and 20 of the 128 positions are masked; the function then collects # 8 * 20 = 160 masked tokens, each represented by a 768-dim vector, so the final output is 160 * 768. sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) batch_size = sequence_shape[0] seq_length = sequence_shape[1] width = sequence_shape[2] # Because sequence_tensor is flattened below (batch_size * sequence_length), each row of # positions needs an offset of seq_length times its example index. flat_offsets = tf.reshape( tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) flat_positions = tf.reshape(positions + flat_offsets, [-1]) flat_sequence_tensor = tf.reshape(sequence_tensor, [batch_size * seq_length, width]) output_tensor = tf.gather(flat_sequence_tensor, flat_positions) return output_tensor
def knn(labels, embeddings, k, embed_normed=True): # make sure embedding should be l2-normalized if not embed_normed: embeddings = tf.nn.l2_normalize(embeddings, axis=1) embed_shape = modeling.get_shape_list(embeddings) batch_size = embed_shape[0] sim_mat = tf.matmul(embeddings, embeddings, transpose_b=True) sim_mat = sim_mat - tf.eye(batch_size) * 2.0 _, top_k_idx = tf.nn.top_k(sim_mat, k) top_k_labels = tf.squeeze(tf.gather(labels, top_k_idx)) def knn_vote(v): nearest_k_y, idx, votes = tf.unique_with_counts(v) majority_idx = tf.argmax(votes) predict_res = tf.gather(nearest_k_y, majority_idx) return predict_res majority = tf.map_fn(knn_vote, top_k_labels) return majority
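# A hedged NumPy sketch (not from the original code) of the batch kNN vote above: cosine
# similarities on L2-normalized embeddings, the diagonal pushed below -1 so an example never
# votes for itself, then a majority vote over the k nearest labels. Toy data only.
import numpy as np
from collections import Counter

labels = np.array([0, 0, 1, 1])
embeddings = np.random.randn(4, 8)
embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)

k = 2
sim = embeddings @ embeddings.T - np.eye(len(labels)) * 2.0   # mask the diagonal
top_k_idx = np.argsort(-sim, axis=1)[:, :k]
top_k_labels = labels[top_k_idx]                              # [batch, k]
majority = np.array([Counter(row).most_common(1)[0][0] for row in top_k_labels])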
def replace_elements_by_indices(old, new, indices): old_shape = modeling.get_shape_list(old) print(old_shape) batch_size = old_shape[0] seq_length = old_shape[1] flat_offsets = tf.reshape( tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) print(flat_offsets) flat_positions = tf.reshape(indices + flat_offsets, [-1]) print(flat_positions) zeros = tf.zeros(tf.shape(input=flat_positions)[0], dtype=tf.int32) print(zeros) flat_old = tf.reshape(old, [-1]) print(flat_old) masked_lm_mask = tf.compat.v1.sparse_to_dense(flat_positions, tf.shape(input=flat_old), zeros, default_value=1, validate_indices=True, name="masked_lm_mask") print(masked_lm_mask) flat_old_temp = tf.multiply(flat_old, masked_lm_mask) print(flat_old_temp) new_temp = tf.compat.v1.sparse_to_dense(flat_positions, tf.shape(input=flat_old), new, default_value=0, validate_indices=True, name=None) print(new_temp) updated_old = tf.reshape(flat_old_temp + new_temp, old_shape) print(updated_old) return updated_old
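# A minimal NumPy sketch (illustrative only) of the sparse_to_dense replacement above: zero
# out the chosen flat positions in the old id matrix, build a dense tensor holding the new
# values at those same positions, and add the two. Shapes and ids below are made up.
import numpy as np

old = np.array([[11, 12, 13],
                [21, 22, 23]])
indices = np.array([[1], [2]])          # one position to replace per row
new = np.array([99, 88])                # replacement values, in flattened order

batch_size, seq_length = old.shape
flat_positions = (indices + (np.arange(batch_size) * seq_length).reshape(-1, 1)).reshape(-1)

keep_mask = np.ones(old.size, dtype=old.dtype)
keep_mask[flat_positions] = 0           # 0 where we overwrite, 1 elsewhere
new_dense = np.zeros(old.size, dtype=old.dtype)
new_dense[flat_positions] = new

updated = (old.reshape(-1) * keep_mask + new_dense).reshape(old.shape)
# updated == [[11, 99, 13], [21, 22, 88]]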
def create_model(bert_config, is_training, input_ids, input_len, input_mask, segment_ids, labels, num_labels, use_one_hot_embeddings): """Creates a sequence model.""" model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) final_hidden = model.get_sequence_output() final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] output_size = num_labels with tf.variable_scope("bert_finetuning"): output_weights = tf.get_variable( "token_output_weights", [output_size, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable( "token_output_bias", [output_size], initializer=tf.zeros_initializer()) final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size]) logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [batch_size, seq_length, output_size]) one_hot_labels = tf.one_hot(labels, depth=num_labels, axis=-1, dtype=tf.float32) entropy_loss = tf.nn.softmax_cross_entropy_with_logits(labels=one_hot_labels, logits=logits, dim=-1, name="loss") per_example_loss = tf.reduce_sum(tf.slice(entropy_loss,begin=[0,1],size=[-1,input_len[0]]), axis=-1) loss = tf.reduce_mean(per_example_loss) probs = tf.nn.softmax(logits, axis=-1) return (loss, per_example_loss, probs, logits)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings): """Creates a classification model.""" model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) final_hidden = model.get_sequence_output() final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] output_weights = tf.get_variable( "cls/squad/output_weights", [2, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable( "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size]) logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [batch_size, seq_length, 2]) logits = tf.transpose(logits, [2, 0, 1]) unstacked_logits = tf.unstack(logits, axis=0) (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) return (start_logits, end_logits)
def get_shuffle_loss(model_config, seq_output, label_ids, label_weights): sequence_shape = modeling.get_shape_list(seq_output, expected_rank=[3]) seq_length = sequence_shape[1] width = sequence_shape[2] seq_output = tf.reshape(seq_output, [-1, width]) with tf.variable_scope("cls/shuffle"): with tf.variable_scope("transform"): seq_output = tf.layers.dense( seq_output, units=seq_length, activation=modeling.get_activation(model_config.hidden_act), kernel_initializer=modeling.create_initializer( model_config.initializer_range)) seq_output = modeling.layer_norm(seq_output) output_bias = tf.get_variable("output_bias", shape=[seq_length], initializer=tf.zeros_initializer()) logits = tf.nn.bias_add(seq_output, output_bias) log_probs = tf.nn.log_softmax(logits, axis=-1) label_ids = tf.reshape(label_ids, [-1]) label_weights = tf.reshape(tf.cast(label_weights, tf.float32), [-1]) one_hot_labels = tf.one_hot(label_ids, depth=seq_length, dtype=tf.float32) per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) numerator = tf.reduce_sum(label_weights * per_example_loss) denominator = tf.reduce_sum(label_weights) + 1e-5 loss = numerator / denominator return loss, per_example_loss, log_probs
def create_model(self, bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings): """Creates a classification model.""" model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings, compute_type=tf.float32) final_hidden = model.get_sequence_output() final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] output_weights = tf.get_variable( "cls/squad/output_weights", [2, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable("cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) final_hidden_matrix = tf.reshape( final_hidden, [batch_size * seq_length, hidden_size]) logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [batch_size, seq_length, 2]) return logits
def get_masked_lm_output(self, bert_config, input_tensor, output_weights, positions, label_ids, label_weights, trainable): """Get loss and log probs for the masked LM.""" input_tensor = self.gather_indexes(input_tensor, positions) if self.is_negsample: logits_2D = input_tensor label_flat = tf.reshape( label_ids, [-1, 1]) # 1 is the number of positive example num_sampled = int( 0.2 * self.model_para['item_size']) # sample 20% as negatives loss = tf.nn.sampled_softmax_loss(self.softmax_w, self.softmax_b, label_flat, logits_2D, num_sampled, self.model_para['item_size']) else: sequence_shape = modeling.get_shape_list(positions) batch_size = sequence_shape[0] seq_length = sequence_shape[1] residual_channels = input_tensor.get_shape().as_list()[-1] input_tensor = tf.reshape(input_tensor, [-1, seq_length, residual_channels]) logits = ops.conv1d(tf.nn.relu(input_tensor), self.model_para['item_size'], name='logits') logits_2D = tf.reshape(logits, [-1, self.model_para['item_size']]) label_flat = tf.reshape(label_ids, [-1]) loss = tf.nn.sparse_softmax_cross_entropy_with_logits( labels=label_flat, logits=logits_2D) loss = tf.reduce_mean(loss) #not sure the impact, 0.001 is empirical value # regularization = 0.001 * tf.reduce_mean([tf.nn.l2_loss(v) for v in tf.trainable_variables()]) # loss=loss+regularization return loss
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" tf.logging.info("*** Features ***") for name in sorted(features.keys()): tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) unique_ids = features["unique_ids"] input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] is_training = (mode == tf.estimator.ModeKeys.TRAIN) (start_logits, end_logits) = create_model( bert_config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) tvars = tf.trainable_variables() initialized_variable_names = {} scaffold_fn = None if init_checkpoint: (assignment_map, initialized_variable_names ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(init_checkpoint, assignment_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(init_checkpoint, assignment_map) tf.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: seq_length = modeling.get_shape_list(input_ids)[1] def compute_loss(logits, positions): # cross-entropy loss # convert the position index into a one-hot label one_hot_positions = tf.one_hot( positions, depth=seq_length, dtype=tf.float32) # take the natural log of the softmax output, i.e. the per-position log-probabilities of the logits log_probs = tf.nn.log_softmax(logits, axis=-1) # sum (one-hot label times per-position log-probability), then take the mean over the batch loss = -tf.reduce_mean( tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) # the cross-entropy: the accumulated product of the true labels and the log of the predicted probabilities return loss start_positions = features["start_positions"] end_positions = features["end_positions"] # loss for the answer start position start_loss = compute_loss(start_logits, start_positions) # loss for the answer end position end_loss = compute_loss(end_logits, end_positions) # total loss total_loss = (start_loss + end_loss) / 2.0 # build the optimizer / training op train_op = optimization.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, scaffold_fn=scaffold_fn) elif mode == tf.estimator.ModeKeys.PREDICT: predictions = { "unique_ids": unique_ids, "start_logits": start_logits, "end_logits": end_logits, } output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) else: raise ValueError( "Only TRAIN and PREDICT modes are supported: %s" % (mode)) return output_spec
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" tf.logging.info("*** Features ***") for name in sorted(features.keys()): tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) unique_ids = features["unique_ids"] input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] is_training = (mode == tfes.estimator.ModeKeys.TRAIN) (start_logits, end_logits) = create_model( bert_config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) tvars = tf.trainable_variables() initialized_variable_names = {} scaffold_fn = None if init_checkpoint: (assignment_map, initialized_variable_names ) = modeling.get_assignment_map_from_checkpoint( tvars, init_checkpoint) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(init_checkpoint, assignment_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(init_checkpoint, assignment_map) # tf.logging.info("**** Trainable Variables ****") # for var in tvars: # init_string = "" # if var.name in initialized_variable_names: # init_string = ", *INIT_FROM_CKPT*" # tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, # init_string) output_spec = None if mode == tfes.estimator.ModeKeys.TRAIN: seq_length = modeling.get_shape_list(input_ids)[1] def compute_loss(logits, positions): one_hot_positions = tf.one_hot(positions, depth=seq_length, dtype=tf.float32) log_probs = tf.nn.log_softmax(logits, axis=-1) loss = -tf.reduce_mean( tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) return loss start_positions = features["start_positions"] end_positions = features["end_positions"] start_loss = compute_loss(start_logits, start_positions) end_loss = compute_loss(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2.0 train_op = optimization.create_optimizer(total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) output_spec = tpu.TPUEstimatorSpec(mode=mode, loss=total_loss, train_op=train_op, scaffold_fn=scaffold_fn) elif mode == tfes.estimator.ModeKeys.PREDICT: # outer = tf.matmul(tf.expand_dims(tf.nn.softmax(start_logits), axis=2), # tf.expand_dims(tf.nn.softmax(end_logits), axis=1)) # outer = tf.matrix_band_part(outer, -1, 15) # keep the upper-triangular band of 15 diagonals, i.e. the answer span is limited to at most 15+1 tokens # yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) # index of the maximum along axis 1 # yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) predictions = { "unique_ids": unique_ids, "start_logits": start_logits, "end_logits": end_logits, # "yp1": yp1, # "yp2": yp2, } output_spec = tpu.TPUEstimatorSpec(mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) else: raise ValueError("Only TRAIN and PREDICT modes are supported: %s" % (mode)) return output_spec
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for Estimator.""" if FLAGS.verbose_logging: tf.compat.v1.logging.info("*** Features ***") for name in sorted(features.keys()): tf.compat.v1.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) unique_ids = features["unique_ids"] input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] is_training = (mode == tf.estimator.ModeKeys.TRAIN) if not is_training and FLAGS.use_trt: trt_graph = get_frozen_tftrt_model(bert_config, input_ids.shape, use_one_hot_embeddings, init_checkpoint) (start_logits, end_logits) = tf.import_graph_def(trt_graph, input_map={'input_ids':input_ids, 'input_mask':input_mask, 'segment_ids':segment_ids}, return_elements=['unstack:0', 'unstack:1'], name='') predictions = { "unique_ids": unique_ids, "start_logits": start_logits, "end_logits": end_logits, } output_spec = tf.estimator.EstimatorSpec( mode=mode, predictions=predictions) return output_spec (start_logits, end_logits) = create_model( bert_config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) tvars = tf.trainable_variables() initialized_variable_names = {} if init_checkpoint and (hvd is None or hvd.rank() == 0): (assignment_map, initialized_variable_names ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) tf.train.init_from_checkpoint(init_checkpoint, assignment_map) if FLAGS.verbose_logging: tf.compat.v1.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" tf.compat.v1.logging.info(" %d name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape, init_string) output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: seq_length = modeling.get_shape_list(input_ids)[1] def compute_loss(logits, positions): one_hot_positions = tf.one_hot( positions, depth=seq_length, dtype=tf.float32) log_probs = tf.nn.log_softmax(logits, axis=-1) loss = -tf.reduce_mean( tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) return loss start_positions = features["start_positions"] end_positions = features["end_positions"] start_loss = compute_loss(start_logits, start_positions) end_loss = compute_loss(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2.0 train_op = optimization.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, hvd, False, amp, FLAGS.num_accumulation_steps) output_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, train_op=train_op) elif mode == tf.estimator.ModeKeys.PREDICT: dummy_op = tf.no_op() # Need to call mixed precision graph rewrite if fp16 to enable graph rewrite if amp: loss_scaler = tf.train.experimental.FixedLossScale(1) dummy_op = tf.train.experimental.enable_mixed_precision_graph_rewrite( optimization.LAMBOptimizer(learning_rate=0.0), loss_scaler) predictions = { "unique_ids": unique_ids, "start_logits": start_logits, "end_logits": end_logits, } output_spec = tf.estimator.EstimatorSpec( mode=mode, predictions=predictions) else: raise ValueError( "Only TRAIN and PREDICT modes are supported: %s" % (mode)) return output_spec
def create_model(bert_config, is_training, slot_list, features, num_class_labels, use_one_hot_embeddings): """Creates a classification model.""" input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] model = modeling.BertModel(config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) # In the demo, we are doing a simple classification task on the entire # segment. # # If you want to use the token-level output, use model.get_sequence_output() # instead. class_output_layer = model.get_pooled_output() token_output_layer = model.get_sequence_output() token_output_shape = modeling.get_shape_list(token_output_layer, expected_rank=3) batch_size = token_output_shape[0] seq_length = token_output_shape[1] hidden_size = token_output_shape[2] # Define prediction variables class_proj_layer_dim = [hidden_size] for idx in range(FLAGS.num_class_hidden_layer): class_proj_layer_dim.append(64) class_proj_layer_dim.append(num_class_labels) token_proj_layer_dim = [hidden_size] for idx in range(FLAGS.num_token_hidden_layer): token_proj_layer_dim.append(64) token_proj_layer_dim.append(2) if is_training: # I.e., 0.1 dropout class_output_layer = tf.nn.dropout(class_output_layer, keep_prob=(1 - FLAGS.dropout_rate)) token_output_layer = tf.nn.dropout(token_output_layer, keep_prob=(1 - FLAGS.dropout_rate)) total_loss = 0 per_slot_per_example_loss = {} per_slot_class_logits = {} per_slot_start_logits = {} per_slot_end_logits = {} for slot in slot_list: start_pos = features["start_pos_%s" % slot] end_pos = features["end_pos_%s" % slot] class_label_id = features["class_label_id_%s" % slot] slot_scope_name = "slot_%s" % slot if slot == 'price range': slot_scope_name = "slot_price" with tf.variable_scope(slot_scope_name): class_list_output_weights = [] class_list_output_bias = [] for l_idx in range(len(class_proj_layer_dim) - 1): dim_in = class_proj_layer_dim[l_idx] dim_out = class_proj_layer_dim[l_idx + 1] class_list_output_weights.append( tf.get_variable( "class/output_weights_%d" % l_idx, [dim_in, dim_out], initializer=tf.truncated_normal_initializer( stddev=0.02))) class_list_output_bias.append( tf.get_variable("class/output_bias_%d" % l_idx, [dim_out], initializer=tf.zeros_initializer())) token_list_output_weights = [] token_list_output_bias = [] for l_idx in range(len(token_proj_layer_dim) - 1): dim_in = token_proj_layer_dim[l_idx] dim_out = token_proj_layer_dim[l_idx + 1] token_list_output_weights.append( tf.get_variable( "token/output_weights_%d" % l_idx, [dim_in, dim_out], initializer=tf.truncated_normal_initializer( stddev=0.02))) token_list_output_bias.append( tf.get_variable("token/output_bias_%d" % l_idx, [dim_out], initializer=tf.zeros_initializer())) with tf.variable_scope("loss"): class_logits = util.fully_connect_layers( class_output_layer, class_list_output_weights, class_list_output_bias) one_hot_class_labels = tf.one_hot(class_label_id, depth=num_class_labels, dtype=tf.float32) class_loss = tf.losses.softmax_cross_entropy( one_hot_class_labels, class_logits, reduction=tf.losses.Reduction.NONE) token_is_pointable = tf.cast(tf.equal(class_label_id, 2), dtype=tf.float32) token_output_layer = tf.reshape( token_output_layer, [batch_size * seq_length, hidden_size]) token_logits = util.fully_connect_layers( token_output_layer, token_list_output_weights, token_list_output_bias) token_logits = tf.reshape(token_logits, [batch_size, seq_length, 2]) 
token_logits = tf.transpose(token_logits, [2, 0, 1]) unstacked_token_logits = tf.unstack(token_logits, axis=0) (start_logits, end_logits) = (unstacked_token_logits[0], unstacked_token_logits[1]) def compute_loss(logits, positions): one_hot_positions = tf.one_hot(positions, depth=seq_length, dtype=tf.float32) log_probs = tf.nn.log_softmax(logits, axis=1) loss = -tf.reduce_sum(one_hot_positions * log_probs, axis=1) return loss token_loss = ( compute_loss(start_logits, start_pos) + compute_loss(end_logits, end_pos)) / 2.0 # per example if not FLAGS.location_loss_for_nonpointable: token_loss *= token_is_pointable per_example_loss = FLAGS.class_loss_ratio * class_loss + ( 1 - FLAGS.class_loss_ratio) * token_loss total_loss += tf.reduce_sum(per_example_loss) per_slot_per_example_loss[slot] = per_example_loss per_slot_class_logits[slot] = class_logits per_slot_start_logits[slot] = start_logits per_slot_end_logits[slot] = end_logits return (total_loss, per_slot_per_example_loss, per_slot_class_logits, per_slot_start_logits, per_slot_end_logits)
use_one_hot_embeddings=False ) (start_logits, end_logits) = cqa_model(bert_representation) tvars = tf.trainable_variables() initialized_variable_names = {} if FLAGS.init_checkpoint: (assignment_map, initialized_variable_names) = modeling.get_assigment_map_from_checkpoint(tvars, FLAGS.init_checkpoint) tf.train.init_from_checkpoint(FLAGS.init_checkpoint, assignment_map) # compute loss seq_length = modeling.get_shape_list(input_ids)[1] def compute_loss(logits, positions): one_hot_positions = tf.one_hot( positions, depth=seq_length, dtype=tf.float32) log_probs = tf.nn.log_softmax(logits, axis=-1) loss = -tf.reduce_mean(tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) return loss # get the max prob for the predicted start/end position start_probs = tf.nn.softmax(start_logits, axis=-1) start_prob = tf.reduce_max(start_probs, axis=-1) end_probs = tf.nn.softmax(end_logits, axis=-1) end_prob = tf.reduce_max(end_probs, axis=-1) start_loss = compute_loss(start_logits, start_positions) end_loss = compute_loss(end_logits, end_positions)
def gec_create_model(bert_config, is_training, input_sequence, input_mask, segment_ids, edit_sequence, use_one_hot_embeddings, mode, copy_weight, use_bert_more, insert_ids, multitoken_insert_ids, subtract_replaced_from_replacement): """Creates a classification model.""" # insert_ids: word ids of unigram inserts (list) # multitoken_insert_ids: word_ids of bigram inserts (list of tuples of length 2) # Defining the space of all possible edits: # unk, sos and eos are dummy edits mapped to 0, 1 and 2 respectively # copy is mapped to 3 # del is mapped to 4 num_appends = len(insert_ids) + len(multitoken_insert_ids) num_replaces = num_appends # appends and replacements come from the same set (inserts and multitoken_inserts) append_begin = 5 # First append edit (mapped to 5) append_end = append_begin + num_appends - 1 #Last append edit rep_begin = append_end + 1 # First replace edit rep_end = rep_begin + num_replaces - 1 #Last replace edit num_suffix_transforms = 58 #num of transformation edits num_labels = 5 + num_appends + num_replaces + num_suffix_transforms # total number of edits print("************ num of labels : {} ***************".format(num_labels)) config = bert_config input_sequence_shape = modeling.get_shape_list(input_sequence,2) batch_size = input_sequence_shape[0] seq_len = input_sequence_shape[1] if not use_bert_more: #default use of bert (without logit factorisation) model = modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_sequence, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) output_layer = model.get_sequence_output() else: # LOGIT FACTORISATION is On! model = modified_modeling.BertModel( config=bert_config, is_training=is_training, input_ids=input_sequence, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) output_layer = model.get_sequence_output() replace_layer = output_layer[:,seq_len:2*seq_len,:] #representation of replacement slots as described in paper append_layer = output_layer[:,2*seq_len:3*seq_len,:] #representation of append slots as described in paper output_layer = output_layer[:,0:seq_len,:] output_layer_shape = modeling.get_shape_list(output_layer,3) hidden_size = output_layer_shape[-1] flattened_output_layer = tf.reshape(output_layer,[-1, hidden_size]) h_edit = flattened_output_layer if use_bert_more: h_word = flattened_output_layer flattened_replace_layer = tf.reshape(replace_layer,[-1, hidden_size]) flattened_append_layer = tf.reshape(append_layer, [-1, hidden_size]) m_replace = flattened_replace_layer m_append = flattened_append_layer with tf.variable_scope("cls/predictions"): with tf.variable_scope("transform"): h_word = tf.layers.dense( h_word, units=bert_config.hidden_size, activation=modeling.get_activation(bert_config.hidden_act), kernel_initializer=modeling.create_initializer( bert_config.initializer_range)) h_word = modeling.layer_norm(h_word) with tf.variable_scope("cls/predictions",reuse=True): with tf.variable_scope("transform",reuse=True): m_replace = tf.layers.dense( m_replace, units=bert_config.hidden_size, activation=modeling.get_activation(bert_config.hidden_act), kernel_initializer=modeling.create_initializer( bert_config.initializer_range)) m_replace = modeling.layer_norm(m_replace) with tf.variable_scope("cls/predictions",reuse=True): with tf.variable_scope("transform",reuse=True): m_append = tf.layers.dense( m_append, units=bert_config.hidden_size, activation=modeling.get_activation(bert_config.hidden_act), 
kernel_initializer=modeling.create_initializer( bert_config.initializer_range)) m_append = modeling.layer_norm(m_append) word_embedded_input = model.word_embedded_input flattened_word_embedded_input = tf.reshape(word_embedded_input, [-1, hidden_size]) labels = edit_sequence edit_weights = tf.get_variable( "edit_weights", [num_labels, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) if is_training: h_edit = tf.nn.dropout(h_edit, keep_prob=0.9) if use_bert_more: # append/replace weight vector for a given append or replace operation # correspond to word embedding for its token argument # for multitoken append/replace (e.g. has been) # weight vector is sum of word embeddings of token arguments append_weights = edit_word_embedding_lookup(model.embedding_table, insert_ids, use_one_hot_embeddings, config.vocab_size, config.hidden_size) replace_weights = append_weights #tokens in replace and append vocab are same #(i.e. inserts and multitoken_inserts) multitoken_append_weights = wem_utils.edit_embedding_loopkup(model.embedding_table, multitoken_insert_ids, use_one_hot_embeddings, config.vocab_size, config.hidden_size) multitoken_replace_weights = multitoken_append_weights #tokens in replace and append vocab are same #(i.e. inserts and multitoken_inserts) append_weights = tf.concat([append_weights, multitoken_append_weights],0) replace_weights = tf.concat([replace_weights, multitoken_replace_weights],0) with tf.variable_scope("loss"): edit_logits = tf.matmul(h_edit, edit_weights, transpose_b=True) #first term in eq3 in paper logits = edit_logits if use_bert_more: #=============== inplace_word_logits==============# #2nd term in eq3 in paper inplace_logit = tf.reduce_sum(h_word * flattened_word_embedded_input, axis=1, keepdims=True) #copy #inplace_logit = tf.reduce_sum(m_replace * flattened_word_embedded_input, axis=1, keepdims=True) #copy inplace_logit_appends = tf.tile(inplace_logit,[1,num_appends]) inplace_logit_transforms = tf.tile(inplace_logit,[1,num_suffix_transforms]) zero_3_logits = tf.zeros([batch_size*seq_len,3]) #unk sos eos zero_1_logits = tf.zeros([batch_size*seq_len,1]) # del zero_replace_logits = tf.zeros([batch_size*seq_len,num_replaces]) concat_list = [zero_3_logits, inplace_logit, zero_1_logits]\ + [inplace_logit_appends]\ + [zero_replace_logits]\ + [inplace_logit_transforms] inplace_word_logits = tf.concat(concat_list,1) #======additional (insert,replace) logits ====# #3rd term in eqn3 in paper zero_5_logits = tf.zeros([batch_size*seq_len,5]) append_logits = tf.matmul(m_append, append_weights, transpose_b=True) if subtract_replaced_from_replacement: replace_logits = replacement_minus_replaced_logits(m_replace, flattened_word_embedded_input, replace_weights) else: replace_logits = tf.matmul(m_replace, replace_weights, transpose_b=True) suffix_logits = tf.zeros([batch_size*seq_len,num_suffix_transforms]) concat_list = [zero_5_logits, append_logits, replace_logits, suffix_logits] additional_logits = tf.concat(concat_list,1) #====================================================# logits = edit_logits + inplace_word_logits + additional_logits logits_bias = tf.get_variable("output_bias", shape=[num_labels], initializer=tf.zeros_initializer()) logits += logits_bias logits = tf.reshape(logits, [output_layer_shape[0], output_layer_shape[1], num_labels]) log_probs = tf.nn.log_softmax(logits, axis=-1) probs = tf.nn.softmax(logits,axis=-1) one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs, 
axis=-1) per_token_loss = per_token_loss * tf.to_float(input_mask) mask = copy_weight*tf.to_float(tf.equal(labels,3)) + tf.to_float(tf.not_equal(labels,3)) masked_per_token_loss = per_token_loss * mask per_example_loss = tf.reduce_sum(masked_per_token_loss, axis=-1) loss = tf.reduce_mean(per_example_loss) return (loss, per_example_loss, logits, probs)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings): """Creates a classification model.""" model = modeling.BertModel( config=bert_config,#たぶんこの設定にしたがってbertを呼び出すということ is_training=is_training, input_ids=input_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) final_hidden = model.get_sequence_output()#Bertの最終層 final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size]) #ここをTransformerにする """ output_weights = tf.get_variable( #/はスコープの区切りを表す。だからcls/squad/output_weightsはclsスコープのsquadスコープのoutput_weightsという変数を表す #変数がないときは定義し、ある時はそれを呼び出す "cls/squad/output_weights", [2, hidden_size], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable( #/はスコープの区切りを表す。だからcls/squad/output_weightsはclsスコープのsquadスコープのoutput_weightsという変数を表す #変数がないときは定義し、ある時はそれを呼び出す "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [batch_size, seq_length, 2]) logits = tf.transpose(logits, [2, 0, 1]) unstacked_logits = tf.unstack(logits, axis=0) (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) return (start_logits, end_logits) """ #Transformer層 #bertの中のtransformerよりずっとスペック低くしている transformer_outputs = modeling.transformer_model(input_tensor=final_hidden_matrix, attention_mask=None, hidden_size=5, num_hidden_layers=2, num_attention_heads=2, intermediate_size=20, intermediate_act_fn=modeling.gelu, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, initializer_range=0.02, do_return_all_layers=False)#現状Falseのみ #線型層 output_weights = tf.get_variable( #/はスコープの区切りを表す。だからcls/squad/output_weightsはclsスコープのsquadスコープのoutput_weightsという変数を表す #変数がないときは定義し、ある時はそれを呼び出す "cls/squad/output_weights", [30000, 5], initializer=tf.truncated_normal_initializer(stddev=0.02)) output_bias = tf.get_variable( #/はスコープの区切りを表す。だからcls/squad/output_weightsはclsスコープのsquadスコープのoutput_weightsという変数を表す #変数がないときは定義し、ある時はそれを呼び出す "cls/squad/output_bias", [30000], initializer=tf.zeros_initializer()) logits = tf.matmul(transformer_outputs, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) #max ids = tf.reduce_max(logits,axis=0) #Transformerのテンソルとidを出力。損失を測るのに両方使うため return (ids,transformer_outputs)
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" tf.logging.info("*** Features ***") for name in sorted(features.keys()): tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) unique_ids = features["unique_ids"] input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] is_training = (mode == tf.estimator.ModeKeys.TRAIN) (start_logits, end_logits) = create_model( bert_config=bert_config, is_training=is_training, input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings) tvars = tf.trainable_variables() initialized_variable_names = {} scaffold_fn = None if init_checkpoint: (assignment_map, initialized_variable_names ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(init_checkpoint, assignment_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(init_checkpoint, assignment_map) tf.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: seq_length = modeling.get_shape_list(input_ids)[1] def compute_loss(logits, positions): one_hot_positions = tf.one_hot( positions, depth=seq_length, dtype=tf.float32) log_probs = tf.nn.log_softmax(logits, axis=-1) loss = -tf.reduce_mean( tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) return loss start_positions = features["start_positions"] end_positions = features["end_positions"] start_loss = compute_loss(start_logits, start_positions) end_loss = compute_loss(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2.0 train_op = optimization.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, scaffold_fn=scaffold_fn) elif mode == tf.estimator.ModeKeys.PREDICT: predictions = { "unique_ids": unique_ids, "start_logits": start_logits, "end_logits": end_logits, } output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) else: raise ValueError( "Only TRAIN and PREDICT modes are supported: %s" % (mode)) return output_spec
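# A small NumPy sketch (added for illustration, not from the original code) of the
# compute_loss helper above: cross-entropy between the softmax over sequence positions and
# the one-hot true start (or end) position. The logits and positions below are toy values.
import numpy as np

seq_length = 4
logits = np.array([[2.0, 0.5, 0.1, -1.0],
                   [0.0, 0.0, 3.0, 0.0]])    # [batch, seq_length]
positions = np.array([0, 2])                  # true start (or end) indices

log_probs = logits - np.log(np.exp(logits).sum(axis=-1, keepdims=True))
one_hot_positions = np.eye(seq_length)[positions]
loss = -(one_hot_positions * log_probs).sum(axis=-1).mean()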