Beispiel #1
0
 def __bert_embedding(self, token_ids, token_masks, segment_ids, masks, keep_prob=0.8):
     """Run BERT over the inputs and return its sequence output after dropout.

     Args:
         token_ids: passed to ``BertModel`` as ``input_ids``.
         token_masks: passed to ``BertModel`` as ``input_mask``.
         segment_ids: passed to ``BertModel`` as ``token_type_ids``.
         masks: not used by this method.
         keep_prob: keep probability handed to ``tf.nn.dropout``.

     Returns:
         ``bert_model.get_sequence_output()`` with dropout applied
         (the original comment gives its shape as
         (batch_size, bert_max_seq_length, bert_embedding_size)).

     NOTE(review): dropout is applied whether or not ``self.is_training``
     is set, so callers presumably pass ``keep_prob=1.0`` at inference --
     confirm against the call sites.
     """
     from bert import modeling

     encoder = modeling.BertModel(
         config=self.bert_config,
         is_training=self.is_training,
         input_ids=token_ids,
         input_mask=token_masks,
         token_type_ids=segment_ids,
         use_one_hot_embeddings=False)
     sequence_output = encoder.get_sequence_output()

     # Warm-start from the pre-trained checkpoint only when training and a
     # checkpoint path has been configured.
     if self.is_training and self.bert_init_checkpoint:
         trainable = tf.trainable_variables()
         assignment_map, restored_names = modeling.get_assignment_map_from_checkpoint(
             trainable, self.bert_init_checkpoint)
         tf.train.init_from_checkpoint(self.bert_init_checkpoint, assignment_map)

         # Log every trainable variable and flag the ones found in the checkpoint.
         tf.logging.info("**** Trainable Variables ****")
         for variable in trainable:
             suffix = ", *INIT_FROM_CKPT*" if variable.name in restored_names else ""
             tf.logging.info("  name = %s, shape = %s%s", variable.name, variable.shape, suffix)

     return tf.nn.dropout(sequence_output, keep_prob)
Beispiel #2
0
def create_model(bert_config, is_training, input_ids, input_mask,
                 segment_ids, labels, num_labels, use_one_hot_embeddings):
    """Build a BERT encoder and hand its token outputs to the ``CRF`` wrapper.

    :param bert_config: BERT configuration forwarded to ``modeling.BertModel``.
    :param is_training: forwarded to both the BERT model and the CRF wrapper.
    :param input_ids: index (id) representation of the input data.
    :param input_mask: forwarded to BERT as ``input_mask``.
    :param segment_ids: forwarded to BERT as ``token_type_ids``.
    :param labels: index (id) representation of the labels.
    :param num_labels: number of label classes.
    :param use_one_hot_embeddings: forwarded to BERT unchanged.
    :return: whatever ``CRF.add_crf_layer()`` returns.
    """
    # Feed the data through BertModel to obtain the per-token embeddings.
    bert = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings,
    )
    # Per the original comment: [batch_size, seq_length, embedding_size].
    token_embeddings = bert.get_sequence_output()
    static_seq_length = token_embeddings.shape[1].value

    # sign(|id|) is 1 for every non-zero id and 0 otherwise, so summing along
    # axis 1 counts the non-zero ids per row; the original comment describes
    # the result as a [batch_size] vector of sequence lengths.
    nonzero_flags = tf.sign(tf.abs(input_ids))
    sequence_lengths = tf.reduce_sum(nonzero_flags, reduction_indices=1)

    crf_head = CRF(embedded_chars=token_embeddings,
                   droupout_rate=FLAGS.droupout_rate,
                   initializers=initializers,
                   num_labels=num_labels,
                   seq_length=static_seq_length,
                   labels=labels,
                   lengths=sequence_lengths,
                   is_training=is_training)
    return crf_head.add_crf_layer()
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
    """Build a BERT token classifier with a per-token softmax head.

    Args:
        bert_config: configuration forwarded to ``modeling.BertModel``.
        is_training: Python bool; forwarded to BERT and used to enable the
            0.1 dropout on the BERT output.
        input_ids: forwarded to BERT as ``input_ids``.
        input_mask: forwarded to BERT as ``input_mask``.
        segment_ids: forwarded to BERT as ``token_type_ids``.
        labels: label ids, one-hot encoded with depth ``num_labels``
            (assumed [batch_size, FLAGS.max_seq_length] -- TODO confirm).
        num_labels: number of label classes; size of the output projection.
        use_one_hot_embeddings: forwarded to BERT unchanged.

    Returns:
        Tuple ``(loss, per_example_loss, logits, predict)`` where ``loss`` is
        the sum of the per-token cross-entropies, ``logits`` has shape
        [-1, FLAGS.max_seq_length, num_labels] and ``predict`` is the argmax
        over the last axis.
    """
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)

    output_layer = model.get_sequence_output()

    hidden_size = output_layer.shape[-1].value

    output_weight = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable("output_bias", [num_labels],
                                  initializer=tf.zeros_initializer())
    with tf.variable_scope("loss"):
        if is_training:
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
        # Flatten tokens so one matmul projects every position at once.
        output_layer = tf.reshape(output_layer, [-1, hidden_size])
        logits = tf.matmul(output_layer, output_weight, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        # Restore the sequence axis. The last dimension must be num_labels
        # (it was hard-coded to 21 before, which only worked for a 21-label
        # task and mismatched the one-hot labels for anything else).
        logits = tf.reshape(logits, [-1, FLAGS.max_seq_length, num_labels])
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        # Per-token negative log-likelihood, summed over the whole batch.
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_sum(per_example_loss)

        probabilities = tf.nn.softmax(logits, axis=-1)
        predict = tf.argmax(probabilities, axis=-1)

        return (loss, per_example_loss, logits, predict)
Beispiel #4
0
def create_model(bert_config, is_training, input_ids, input_mask,
                 segment_ids, labels, num_labels, use_one_hot_embeddings):
    """Build a BERT -> 2x fused BiLSTM -> projection -> (CRF | softmax) tagger.

    Args:
        bert_config: configuration forwarded to ``modeling.BertModel``.
        is_training: Python bool controlling dropout; BERT itself is run with
            ``is_training=False`` when ``FLAGS.use_feature_based`` is set.
        input_ids: forwarded to BERT; non-zero ids are also counted per row
            to obtain the sequence lengths.
        input_mask: forwarded to BERT; also masks the softmax loss and the
            predicted ids.
        segment_ids: forwarded to BERT as ``token_type_ids``.
        labels: gold tag ids (assumed [batch_size, seq_length] -- TODO confirm).
        num_labels: number of tag classes.
        use_one_hot_embeddings: forwarded to BERT unchanged.

    Returns:
        Tuple ``(loss, per_example_loss, logits, trans, pred_ids)``. ``trans``
        is the transition matrix (returned by ``crf_log_likelihood`` when
        ``FLAGS.use_crf`` is set, otherwise the raw "transitions" variable).
    """
    # False for feature-based, is_training for fine-tuning.
    is_training_for_bert = False if FLAGS.use_feature_based else is_training
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training_for_bert,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings
    )
    embedding = model.get_sequence_output()  # (batch_size, seq_length, embedding_size)
    if is_training:
        # Dropout on the BERT output.
        embedding = tf.layers.dropout(embedding, rate=FLAGS.bert_dropout_rate, training=is_training)
    embedding_size = embedding.shape[-1].value
    seq_length = embedding.shape[1].value

    # sign(|id|) flags non-zero ids; the row sum is the per-example length.
    used = tf.sign(tf.abs(input_ids))
    lengths = tf.reduce_sum(used, axis=1)  # (batch_size)
    print('seq_length', seq_length)
    print('lengths', lengths)

    def bi_lstm_fused(inputs, lengths, rnn_size, is_training, dropout_rate=0.5, scope='bi-lstm-fused'):
        """One bidirectional layer of fused LSTM cells followed by dropout."""
        with tf.variable_scope(scope):
            t = tf.transpose(inputs, perm=[1, 0, 2])  # fused cells need time-major input
            lstm_cell_fw = tf.contrib.rnn.LSTMBlockFusedCell(rnn_size)
            lstm_cell_bw = tf.contrib.rnn.LSTMBlockFusedCell(rnn_size)
            lstm_cell_bw = tf.contrib.rnn.TimeReversedFusedRNN(lstm_cell_bw)
            output_fw, _ = lstm_cell_fw(t, dtype=tf.float32, sequence_length=lengths)
            output_bw, _ = lstm_cell_bw(t, dtype=tf.float32, sequence_length=lengths)
            outputs = tf.concat([output_fw, output_bw], axis=-1)
            outputs = tf.transpose(outputs, perm=[1, 0, 2])  # back to batch-major
            return tf.layers.dropout(outputs, rate=dropout_rate, training=is_training)

    def lstm_layer(inputs, lengths, is_training):
        """Stack two ``bi_lstm_fused`` layers, each in its own variable scope."""
        rnn_output = tf.identity(inputs)
        for i in range(2):
            scope = 'bi-lstm-fused-%s' % i
            rnn_output = bi_lstm_fused(rnn_output,
                                       lengths,
                                       rnn_size=FLAGS.lstm_size,
                                       is_training=is_training,
                                       dropout_rate=FLAGS.bilstm_dropout_rate,
                                       scope=scope)  # (batch_size, seq_length, 2*rnn_size)
        return rnn_output

    def project_layer(inputs, out_dim, seq_length, scope='project'):
        """Position-wise affine projection of the last axis to ``out_dim``."""
        with tf.variable_scope(scope):
            in_dim = inputs.get_shape().as_list()[-1]
            weight = tf.get_variable('W', shape=[in_dim, out_dim],
                                     dtype=tf.float32, initializer=initializers.xavier_initializer())
            bias = tf.get_variable('b', shape=[out_dim], dtype=tf.float32,
                                   initializer=tf.zeros_initializer())
            t_output = tf.reshape(inputs, [-1, in_dim])            # (batch_size*seq_length, in_dim)
            output = tf.matmul(t_output, weight) + bias            # (batch_size*seq_length, out_dim)
            output = tf.reshape(output, [-1, seq_length, out_dim]) # (batch_size, seq_length, out_dim)
            return output

    def loss_layer(logits, labels, num_labels, lengths, input_mask):
        """Return ``(loss, per_example_loss, trans)`` for the CRF or softmax head."""
        # Created in both modes because it is always part of the return value.
        trans = tf.get_variable(
            "transitions",
            shape=[num_labels, num_labels],
            initializer=initializers.xavier_initializer())
        if FLAGS.use_crf:
            with tf.variable_scope("crf-loss"):
                log_likelihood, trans = tf.contrib.crf.crf_log_likelihood(
                    inputs=logits,
                    tag_indices=labels,
                    transition_params=trans,
                    sequence_lengths=lengths)
                per_example_loss = -log_likelihood
                loss = tf.reduce_mean(per_example_loss)
                return loss, per_example_loss, trans
        else:
            labels_one_hot = tf.one_hot(labels, num_labels)
            # log_softmax instead of log(softmax(.)): the latter underflows to
            # log(0) = -inf and 0 * -inf turns the whole loss into NaN.
            cross_entropy = labels_one_hot * tf.nn.log_softmax(logits)
            cross_entropy = -tf.reduce_sum(cross_entropy, axis=2)
            cross_entropy *= tf.to_float(input_mask)  # ignore masked positions
            cross_entropy = tf.reduce_sum(cross_entropy, axis=1)
            cross_entropy /= tf.cast(lengths, tf.float32)  # mean over the sequence
            per_example_loss = cross_entropy
            loss = tf.reduce_mean(per_example_loss)
            return loss, per_example_loss, trans

    # Alternative heads that were tried previously (kept for reference):
    #   1. project_layer(embedding, num_labels) directly on the BERT output.
    #   2. lstm_layer -> project_layer(FLAGS.lstm_size) -> project_layer(num_labels).
    # Active configuration (3): lstm_layer -> a single projection to num_labels.
    lstm_outputs = lstm_layer(embedding, lengths, is_training)
    logits = project_layer(lstm_outputs, num_labels, seq_length, scope='project')
    loss, per_example_loss, trans = loss_layer(logits, labels, num_labels, lengths, input_mask)
    if FLAGS.use_crf:
        pred_ids, _ = crf.crf_decode(potentials=logits, transition_params=trans, sequence_length=lengths)
    else:
        probabilities = tf.nn.softmax(logits, axis=-1)
        pred_ids = tf.argmax(probabilities, axis=-1)

    # Zero the predictions at masked positions. tf.argmax yields int64 by
    # default, so cast the mask to whatever dtype pred_ids has instead of
    # multiplying tensors of different integer types.
    pred_ids *= tf.cast(input_mask, pred_ids.dtype)

    print('#' * 20)
    print('shape of output_layer:', embedding.shape)
    print('embedding_size:%d' % embedding_size)
    print('seq_length:%d' % seq_length)
    print('shape of logit', logits.shape)
    print('shape of loss', loss.shape)
    print('shape of per_example_loss', per_example_loss.shape)
    print('num labels:%d' % num_labels)
    print('#' * 20)
    return (loss, per_example_loss, logits, trans, pred_ids)
Beispiel #5
0
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, multilabel, sent_rels, sentiment,
                 entailment_rels, entailment, corr_rels, correlation):
  """Creates a classification model with label-relation regularization.

  Args:
    bert_config: configuration forwarded to `modeling.BertModel`.
    is_training: Python bool; forwarded to BERT and enables 0.1 dropout on
      the pooled output.
    input_ids: forwarded to BERT as `input_ids`.
    input_mask: forwarded to BERT as `input_mask`.
    segment_ids: forwarded to BERT as `token_type_ids`.
    labels: targets; cast to float32 and fed to the cross-entropy op.
    num_labels: number of classes.
    multilabel: if truthy use sigmoid outputs/loss, otherwise softmax.
    sent_rels, entailment_rels, corr_rels: relation priors, converted to
      float32 constants and multiplied element-wise with the pairwise
      distances (presumably [num_labels, num_labels] -- TODO confirm).
    sentiment, entailment, correlation: scalar weights of the three
      regularization terms.

  Returns:
    Tuple `(loss, per_example_loss, output_layer, logits, probabilities)`;
    `output_layer` is the pooled BERT output (after dropout when training).
  """
  encoder = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids)

  # Classification is done on the entire segment, hence the pooled output.
  # For token-level output, use encoder.get_sequence_output() instead.
  pooled = encoder.get_pooled_output()
  hidden_size = pooled.shape[-1].value

  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # keep_prob=0.9, i.e. 0.1 dropout.
      pooled = tf.nn.dropout(pooled, keep_prob=0.9)

    projected = tf.matmul(pooled, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(projected, output_bias)

    # The same float labels serve single-label and multilabel classification.
    labels = tf.cast(labels, tf.float32)

    if multilabel:
      probabilities = tf.nn.sigmoid(logits)
      tf.logging.info("num_labels:{};logits:{};labels:{}".format(
          num_labels, logits, labels))
      per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(
          labels=labels, logits=logits)
    else:
      probabilities = tf.nn.softmax(logits, axis=-1)
      per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
          labels=labels, logits=logits)
    loss = tf.reduce_mean(per_example_loss)

    # Pairwise squared differences between the predicted probabilities.
    # The probabilities are tiled into num_labels identical rows and a column
    # copy of the same vector is subtracted, so each entry compares the
    # probabilities of two labels.
    # Example (with batch size=1), probabilities [0.1, 0.2, 0.3]:
    #     differences:  [0.0] [-.1] [-.2] --> one row compares emotion 1
    #                   [0.1] [0.0] [-.1]     with all other emotions
    #                   [0.2] [0.1] [0.0]
    probs_row = tf.expand_dims(probabilities, 1)
    tiled = tf.tile(probs_row, [1, num_labels, 1])
    probs_col = tf.transpose(probs_row, perm=[0, 2, 1])
    dists = tf.square(tf.subtract(tiled, probs_col))
    dists = tf.transpose(dists, perm=[0, 2, 1])

    def _relation_penalty(strength, relations, summary_tag):
      """Weighted mean of `dists` masked by one relation prior."""
      scale = tf.constant(strength)
      prior = tf.constant(relations, dtype=tf.float32)
      penalty = tf.multiply(scale, tf.reduce_mean(tf.multiply(dists, prior)))
      tf.summary.scalar(summary_tag, penalty)
      return penalty

    # Sentiment-, entailment- and correlation-based regularization, in the
    # same order as before.
    loss = loss + _relation_penalty(sentiment, sent_rels,
                                    "sentiment_regularization")
    loss = loss + _relation_penalty(entailment, entailment_rels,
                                    "entailment_regularization")
    loss = loss + _relation_penalty(correlation, corr_rels,
                                    "correlation_regularization")

    tf.summary.scalar("loss", loss)

    return (loss, per_example_loss, pooled, logits, probabilities)
Beispiel #6
0
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, pos_embedding, dp_embedding, num_labels,
                 use_one_hot_embeddings):
    """Build a BERT + extra-feature token tagger with a CRF loss.

    Args:
        bert_config: configuration forwarded to ``modeling.BertModel``.
        is_training: Python bool; forwarded to BERT and enables 0.1 dropout.
        input_ids: forwarded to BERT as ``input_ids``.
        input_mask: forwarded to BERT as ``input_mask``.
        segment_ids: forwarded to BERT as ``token_type_ids``.
        labels: gold tag ids, passed to the CRF as ``tag_indices``
            ([batch_size, max_seq_len] per the original comment).
        pos_embedding: extra per-token features concatenated to the BERT
            output on the last axis (assumed [batch, seq, dim] -- TODO confirm).
        dp_embedding: second set of extra per-token features, concatenated
            the same way.
        num_labels: number of tag classes.
        use_one_hot_embeddings: forwarded to BERT unchanged.

    Returns:
        Tuple ``(loss, logits, predict)``: mean negative CRF log-likelihood,
        logits of shape [-1, FLAGS.max_seq_length, num_labels] and the
        decoded tag ids.
    """
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)

    # Final hidden layer of the Transformer:
    # float tensor of shape [batch_size, seq_length, hidden_size].
    output_layer = model.get_sequence_output()
    output_layer = tf.concat([output_layer, pos_embedding, dp_embedding], -1)
    hidden_size = output_layer.shape[-1].value

    output_weight = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable("output_bias", [num_labels],
                                  initializer=tf.zeros_initializer())
    with tf.variable_scope("loss"):
        if is_training:
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
        # Flatten tokens, project to the label space, then restore the
        # sequence axis (e.g. (1024, 788) x (7, 788)^T -> (1024, 7) -> (8, 128, 7)).
        output_layer = tf.reshape(output_layer, [-1, hidden_size])
        logits = tf.matmul(output_layer, output_weight, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        logits = tf.reshape(logits, [-1, FLAGS.max_seq_length, num_labels])

        # Every sequence is treated as full length, exactly as before, but the
        # vector is sized from the runtime batch dimension instead of
        # FLAGS.train_batch_size / FLAGS.eval_batch_size. A short final batch
        # (or a predict batch of another size) therefore no longer breaks the
        # CRF ops.
        # NOTE(review): real lengths (e.g. the row sums of input_mask) would
        # keep padding out of the CRF; that changes the training objective,
        # so it is left as a follow-up.
        length = tf.fill(
            tf.shape(logits)[:1],
            tf.constant(FLAGS.max_seq_length, dtype=tf.int32))

        # No transition_params is passed, so crf_log_likelihood supplies the
        # transition matrix itself and returns it alongside the log-likelihood.
        log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
            inputs=logits,           # [batch_size, max_seq_len, num_tags]
            tag_indices=labels,      # [batch_size, max_seq_len]
            sequence_lengths=length) # [batch_size]
        predict, viterbi_score = tf.contrib.crf.crf_decode(
            logits, transition_params, length)
        loss = tf.reduce_mean(-log_likelihood)

        return (loss, logits, predict)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 token_label_ids, predicate_label_id, num_token_labels,
                 num_predicate_labels, use_one_hot_embeddings):
    """Creates a joint model: sequence-level predicate head + token-label head.

    Both heads share one BERT encoder. The predicate head reads the pooled
    output, the token-label head reads the per-token sequence output.

    Args:
        bert_config: configuration forwarded to ``modeling.BertModel``.
        is_training: Python bool; forwarded to BERT and enables 0.1 dropout
            in front of each head.
        input_ids: forwarded to BERT as ``input_ids``.
        input_mask: forwarded to BERT as ``input_mask``.
        segment_ids: forwarded to BERT as ``token_type_ids``.
        token_label_ids: gold token label ids, one-hot encoded with depth
            ``num_token_labels``.
        predicate_label_id: gold predicate id(s), one-hot encoded with depth
            ``num_predicate_labels``.
        num_token_labels: number of token label classes.
        num_predicate_labels: number of predicate classes.
        use_one_hot_embeddings: forwarded to BERT unchanged.

    Returns:
        Tuple ``(loss, predicate_loss, predicate_per_example_loss,
        predicate_probabilities, predicate_prediction, token_label_loss,
        token_label_per_example_loss, token_label_logits,
        token_label_predictions)`` with
        ``loss = 0.5 * predicate_loss + token_label_loss``.
    """
    encoder = modeling.BertModel(config=bert_config,
                                 is_training=is_training,
                                 input_ids=input_ids,
                                 input_mask=input_mask,
                                 token_type_ids=segment_ids,
                                 use_one_hot_embeddings=use_one_hot_embeddings)

    # ---- Predicate head ---------------------------------------------------
    # Per the original comment the pooled output is the hidden state of the
    # first token, a float tensor of shape [batch_size, hidden_size].
    pooled = encoder.get_pooled_output()
    pooled_dim = pooled.shape[-1].value

    predicate_output_weights = tf.get_variable(
        "predicate_output_weights", [num_predicate_labels, pooled_dim],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    predicate_output_bias = tf.get_variable(
        "predicate_output_bias", [num_predicate_labels],
        initializer=tf.zeros_initializer())

    with tf.variable_scope("predicate_loss"):
        if is_training:
            # keep_prob=0.9, i.e. 0.1 dropout.
            pooled = tf.nn.dropout(pooled, keep_prob=0.9)

        predicate_logits = tf.nn.bias_add(
            tf.matmul(pooled, predicate_output_weights, transpose_b=True),
            predicate_output_bias)
        predicate_probabilities = tf.nn.softmax(predicate_logits, axis=-1)
        predicate_prediction = tf.argmax(predicate_probabilities,
                                         axis=-1,
                                         output_type=tf.int32)
        predicate_targets = tf.one_hot(predicate_label_id,
                                       depth=num_predicate_labels,
                                       dtype=tf.float32)
        # NOTE(review): the loss is an element-wise sigmoid cross-entropy
        # while the probabilities above come from a softmax -- confirm that
        # this mix is intended.
        predicate_elementwise = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=predicate_logits, labels=predicate_targets)
        predicate_per_example_loss = tf.reduce_sum(predicate_elementwise, -1)
        predicate_loss = tf.reduce_mean(predicate_per_example_loss)

    # ---- Token-label head -------------------------------------------------
    # Final hidden layer of the encoder, documented in the original as a
    # float tensor of shape [batch_size, seq_length, hidden_size].
    token_states = encoder.get_sequence_output()
    token_dim = token_states.shape[-1].value

    token_label_output_weight = tf.get_variable(
        "token_label_output_weights",
        [num_token_labels, token_dim],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    token_label_output_bias = tf.get_variable(
        "token_label_output_bias", [num_token_labels],
        initializer=tf.zeros_initializer())

    with tf.variable_scope("token_label_loss"):
        if is_training:
            token_states = tf.nn.dropout(token_states, keep_prob=0.9)

        # Flatten the tokens, project, then restore the sequence axis.
        flat_states = tf.reshape(token_states, [-1, token_dim])
        flat_logits = tf.matmul(flat_states,
                                token_label_output_weight,
                                transpose_b=True)
        flat_logits = tf.nn.bias_add(flat_logits, token_label_output_bias)
        token_label_logits = tf.reshape(
            flat_logits, [-1, FLAGS.max_seq_length, num_token_labels])

        token_log_probs = tf.nn.log_softmax(token_label_logits, axis=-1)
        token_targets = tf.one_hot(token_label_ids,
                                   depth=num_token_labels,
                                   dtype=tf.float32)
        # Per-token negative log-likelihood, summed (not averaged) over the
        # whole batch.
        token_label_per_example_loss = -tf.reduce_sum(
            token_targets * token_log_probs, axis=-1)
        token_label_loss = tf.reduce_sum(token_label_per_example_loss)

        token_label_probabilities = tf.nn.softmax(token_label_logits, axis=-1)
        token_label_predictions = tf.argmax(token_label_probabilities, axis=-1)

    loss = 0.5 * predicate_loss + token_label_loss
    return (loss, predicate_loss, predicate_per_example_loss,
            predicate_probabilities, predicate_prediction, token_label_loss,
            token_label_per_example_loss, token_label_logits,
            token_label_predictions)
def create_original_varmisuse_model(
    bert_config,
    is_training,
    enable_sequence_masking,
    input_ids,
    input_mask,
    segment_ids,
    candidate_mask,
    target_mask,
    error_location_mask,
    use_one_hot_embeddings,
    multi_head_count=2,
):
  """Creates a two-headed pointer model on top of BERT.

  Pointer probabilities are an attention distribution over the program
  tokens:
    (1) M = tanh(Y * Wy + h * Wh)
    (2) multi_headed_alpha = softmax(M * w)
  with
    Y:  [batch_size, sequence_length, hidden_size]  BERT sequence output
    h:  [batch_size, hidden_size]                   BERT pooled output
    Wy: [hidden_size, hidden_size]
    Wh: [hidden_size, hidden_size]
    w:  [hidden_size, multi_head_count]
    M:  [batch_size, sequence_length, hidden_size]
    multi_headed_alpha: [batch_size, sequence_length, multi_head_count]
  Wy, Wh and w are tiled along a leading batch axis and h along the sequence
  axis so that the products can be written as batched matmuls.

  Args:
    bert_config: configuration forwarded to `modeling.BertModel`.
    is_training: forwarded to BERT.
    enable_sequence_masking: if truthy, the sequence output is multiplied by
      `candidate_mask` before it enters equation (1).
    input_ids: forwarded to BERT as `input_ids`.
    input_mask: forwarded to BERT as `input_mask`.
    segment_ids: forwarded to BERT as `token_type_ids`.
    candidate_mask: mask over sequence positions (assumed
      [batch_size, sequence_length] -- TODO confirm); applied to the logits
      and optionally to the sequence output.
    target_mask: labels for the repair head.
    error_location_mask: labels for the localization head.
    use_one_hot_embeddings: forwarded to BERT unchanged.
    multi_head_count: number of pointer heads. The probabilities are
      unstacked into exactly two tensors, so only 2 works here.

  Returns:
    Tuple `(loss, per_example_loss, logits_masked, probabilities)`.
  """
  encoder = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  token_states = encoder.get_sequence_output()
  batch_size, sequence_length, hidden_size = modeling.get_shape_list(
      token_states, expected_rank=3)
  pooled_state = encoder.get_pooled_output()

  def _xavier_variable(name, shape):
    """Creates a float32 variable with Xavier initialization."""
    return tf.get_variable(
        name,
        shape=shape,
        dtype=tf.float32,
        initializer=contrib.layers.xavier_initializer())

  def _tile_over_batch(matrix):
    """Repeats a 2-D matrix along a new leading batch axis."""
    return tf.tile(tf.expand_dims(matrix, 0), [batch_size, 1, 1])

  wy = _xavier_variable("Wy", [hidden_size, hidden_size])
  wh = _xavier_variable("Wh", [hidden_size, hidden_size])
  w = _xavier_variable("w", [hidden_size, multi_head_count])

  # Both: [batch_size, hidden_size, hidden_size]
  wy_batched = _tile_over_batch(wy)
  wh_batched = _tile_over_batch(wh)
  # [batch_size, sequence_length, hidden_size]
  pooled_tiled = tf.tile(
      tf.expand_dims(pooled_state, 1), [1, sequence_length, 1])

  candidate_mask_3d = tf.expand_dims(candidate_mask, 2)
  if enable_sequence_masking:
    # Zero the hidden states of the non-candidate positions.
    hidden_mask = tf.tile(candidate_mask_3d, [1, 1, hidden_size])
    attended_states = tf.multiply(token_states, tf.to_float(hidden_mask))
  else:
    attended_states = token_states

  # Equation (1).
  m = tf.tanh(
      tf.matmul(attended_states, wy_batched) +
      tf.matmul(pooled_tiled, wh_batched))

  # [batch_size, hidden_size, multi_head_count]
  w_batched = _tile_over_batch(w)

  # [batch_size, sequence_length, multi_head_count]
  logits = tf.matmul(m, w_batched)
  head_mask = tf.tile(candidate_mask_3d, [1, 1, multi_head_count])

  # Mask the logits with `candidate_mask`, then normalize over the sequence
  # axis (equation (2)).
  # NOTE(review): masking multiplies by 0/1, so masked positions get logit 0
  # rather than -inf and still receive probability mass -- confirm that this
  # is intended.
  logits_masked = tf.multiply(logits, tf.to_float(head_mask))
  probabilities = tf.nn.softmax(logits_masked, axis=1)

  location_probabilities, repair_probabilities = tf.unstack(
      probabilities, axis=2)

  def _pointer_nll(labels, head_probabilities):
    """Negative log-probability mass on the labelled positions."""
    clipped = tf.clip_by_value(head_probabilities, 1e-10, 1.0)
    return -tf.reduce_sum(
        tf.multiply(tf.to_float(labels), tf.log(clipped)), axis=1)

  localization_loss = _pointer_nll(error_location_mask,
                                   location_probabilities)
  repair_loss = _pointer_nll(target_mask, repair_probabilities)

  per_example_loss = localization_loss + repair_loss
  loss = tf.reduce_mean(per_example_loss)

  return loss, per_example_loss, logits_masked, probabilities
    def __init__(self, bert_config, num_labels, seq_length, init_checkpoint):
        self.bert_config = bert_config
        self.num_labels = num_labels
        self.seq_length = seq_length
        self.tower_grads = []
        self.losses = []

        self.input_ids = tf.placeholder(tf.int32, [None, self.seq_length],
                                        name='input_ids')
        self.input_mask = tf.placeholder(tf.int32, [None, self.seq_length],
                                         name='input_mask')
        self.segment_ids = tf.placeholder(tf.int32, [None, self.seq_length],
                                          name='segment_ids')
        self.labels = tf.placeholder(tf.int32, [None], name='labels')
        self.batch_size = tf.placeholder(tf.int32, shape=[], name='batch_size')
        self.is_training = tf.placeholder(tf.bool,
                                          shape=[],
                                          name='is_training')
        print(self.batch_size)
        self.gpu_step = self.batch_size // gpu_nums

        global_step = tf.train.get_or_create_global_step()

        learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

        # Implements linear decay of the learning rate.
        learning_rate = tf.train.polynomial_decay(learning_rate,
                                                  global_step,
                                                  num_train_steps,
                                                  end_learning_rate=0.0,
                                                  power=1.0,
                                                  cycle=False)

        if num_warmup_steps:
            global_steps_int = tf.cast(global_step, tf.int32)
            warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

            global_steps_float = tf.cast(global_steps_int, tf.float32)
            warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

            warmup_percent_done = global_steps_float / warmup_steps_float
            warmup_learning_rate = init_lr * warmup_percent_done

            is_warmup = tf.cast(global_steps_int < warmup_steps_int,
                                tf.float32)
            learning_rate = ((1.0 - is_warmup) * learning_rate +
                             is_warmup * warmup_learning_rate)

        optimizer = optimization.AdamWeightDecayOptimizer(
            learning_rate=learning_rate,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

        with tf.variable_scope(tf.get_variable_scope()) as outer_scope:
            pred = []
            label = []
            for d in range(gpu_nums):
                with tf.device("/gpu:%s" % d), tf.name_scope("%s_%s" %
                                                             ("tower", d)):
                    self.model = modeling.BertModel(
                        config=self.bert_config,
                        is_training=self.is_training,
                        input_ids=self.input_ids[d * self.gpu_step:(d + 1) *
                                                 self.gpu_step],
                        input_mask=self.input_mask[d * self.gpu_step:(d + 1) *
                                                   self.gpu_step],
                        token_type_ids=self.segment_ids[d *
                                                        self.gpu_step:(d + 1) *
                                                        self.gpu_step])
                    print("GPU:", d)

                    tvars = tf.trainable_variables()
                    initialized_variable_names = {}
                    if init_checkpoint:
                        (assignment_map, initialized_variable_names
                         ) = modeling.get_assignment_map_from_checkpoint(
                             tvars, init_checkpoint)
                        tf.train.init_from_checkpoint(init_checkpoint,
                                                      assignment_map)

                    logging.info("**** Trainable Variables ****")
                    for var in tvars:
                        init_string = ""
                        if var.name in initialized_variable_names:
                            init_string = ", *INIT_FROM_CKPT*"
                        logging.info("  name = %s, shape = %s%s", var.name,
                                     var.shape, init_string)

                    output_layer = self.model.get_pooled_output()
                    logging.info(output_layer)

                    if self.is_training == True:
                        output_layer = tf.nn.dropout(output_layer,
                                                     keep_prob=0.9)

                    match_1 = tf.strided_slice(output_layer, [0],
                                               [self.gpu_step], [2])
                    match_2 = tf.strided_slice(output_layer, [1],
                                               [self.gpu_step], [2])

                    match = tf.concat([match_1, match_2], 1)

                    self.logits = tf.layers.dense(match,
                                                  self.num_labels,
                                                  name='fc',
                                                  reuse=tf.AUTO_REUSE)

                    #预测标签
                    self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits),
                                                1,
                                                name="pred")
                    logging.info(self.y_pred_cls)

                    #真实标签
                    self.r_labels = tf.strided_slice(
                        self.labels[d * self.gpu_step:(d + 1) * self.gpu_step],
                        [0], [self.gpu_step], [2])
                    logging.info(self.r_labels)

                    one_hot_labels = tf.one_hot(self.r_labels,
                                                depth=self.num_labels,
                                                dtype=tf.float32)

                    log_probs = tf.nn.log_softmax(self.logits, axis=-1)
                    per_example_loss =  - (30*one_hot_labels[:,0] * log_probs[:,0]) \
                                        - (9*one_hot_labels[:,1] * log_probs[:,1]) \
                                        - (2*one_hot_labels[:,2] * log_probs[:,2]) \
                                        - (2*one_hot_labels[:,3] * log_probs[:,3]) \
                                        - (9*one_hot_labels[:,4] * log_probs[:,4]) \
                                        + 1e-10

                    self.loss = tf.reduce_mean(per_example_loss)

                    #self.optim = optimization.create_optimizer(self.loss, learning_rate, num_train_steps, num_warmup_steps, False)

                    tvars = tf.trainable_variables()
                    grads = tf.gradients(self.loss, tvars)

                    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

                    self.tower_grads.append(list(zip(grads, tvars)))
                    self.losses.append(self.loss)
                    label.append(self.r_labels)
                    pred.append(self.y_pred_cls)
                outer_scope.reuse_variables()

        with tf.name_scope("apply_gradients"), tf.device("/cpu:0"):
            gradients = self.average_gradients(self.tower_grads)
            train_op = optimizer.apply_gradients(gradients,
                                                 global_step=global_step)
            new_global_step = global_step + 1
            self.train_op = tf.group(train_op,
                                     [global_step.assign(new_global_step)])
            self.losses = tf.reduce_mean(self.losses)
            self.pred = tf.concat(pred, 0)
            self.label = tf.concat(label, 0)
            logging.info(self.pred)
            logging.info(self.label)
Beispiel #10
0
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """Build the masked-LM-only `model_fn` for TPUEstimator.

        Only the masked language-model head is built, trained and evaluated.
        The next-sentence-prediction head is intentionally left out, so the
        total loss is exactly the masked-LM loss.

        `bert_config`, `use_one_hot_embeddings`, `init_checkpoint`, `use_tpu`,
        `learning_rate`, `num_train_steps` and `num_warmup_steps` are read
        from the enclosing scope.

        Args:
            features: mapping of feature name to tensor. The keys
                "input_ids", "input_mask", "segment_ids",
                "masked_lm_positions", "masked_lm_ids" and
                "masked_lm_weights" are read.
            labels: unused.
            mode: a `tf.estimator.ModeKeys` value; only TRAIN and EVAL are
                supported.
            params: unused.

        Returns:
            A `tf.contrib.tpu.TPUEstimatorSpec` for the requested mode.

        Raises:
            ValueError: if `mode` is neither TRAIN nor EVAL.
        """

        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            # Lazy %-args, consistent with the variable logging further down.
            tf.logging.info("  name = %s, shape = %s", name,
                            features[name].shape)

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        masked_lm_positions = features["masked_lm_positions"]
        masked_lm_ids = features["masked_lm_ids"]
        masked_lm_weights = features["masked_lm_weights"]

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
            scope="bert")

        (masked_lm_loss, masked_lm_example_loss,
         masked_lm_log_probs) = get_masked_lm_output(
             bert_config, model.get_sequence_output(),
             model.get_embedding_table(), masked_lm_positions, masked_lm_ids,
             masked_lm_weights)

        # The next-sentence head is disabled on purpose: no NSP output is
        # built and its loss is not added to the total.
        total_loss = masked_lm_loss

        tvars = tf.trainable_variables()

        initialized_variable_names = {}
        scaffold_fn = None
        if init_checkpoint:
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(
                 tvars, init_checkpoint)
            if use_tpu:
                # On TPU the checkpoint restore is deferred into the scaffold
                # function that is handed to the TPUEstimatorSpec.
                def tpu_scaffold():
                    tf.train.init_from_checkpoint(init_checkpoint,
                                                  assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        tf.logging.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                            init_string)

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = optimization.create_optimizer(total_loss, learning_rate,
                                                     num_train_steps,
                                                     num_warmup_steps, use_tpu)

            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                scaffold_fn=scaffold_fn)
        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(masked_lm_example_loss, masked_lm_log_probs,
                          masked_lm_ids, masked_lm_weights):
                """Computes the masked-LM loss and accuracy of the model."""
                # Flatten everything so each masked position is one example.
                masked_lm_log_probs = tf.reshape(
                    masked_lm_log_probs, [-1, masked_lm_log_probs.shape[-1]])
                masked_lm_predictions = tf.argmax(masked_lm_log_probs,
                                                  axis=-1,
                                                  output_type=tf.int32)
                masked_lm_example_loss = tf.reshape(masked_lm_example_loss,
                                                    [-1])
                masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
                masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
                masked_lm_accuracy = tf.metrics.accuracy(
                    labels=masked_lm_ids,
                    predictions=masked_lm_predictions,
                    weights=masked_lm_weights)
                masked_lm_mean_loss = tf.metrics.mean(
                    values=masked_lm_example_loss, weights=masked_lm_weights)

                return {
                    "masked_lm_accuracy": masked_lm_accuracy,
                    "masked_lm_loss": masked_lm_mean_loss
                }

            eval_metrics = (metric_fn, [
                masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
                masked_lm_weights
            ])
            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metrics=eval_metrics,
                scaffold_fn=scaffold_fn)
        else:
            raise ValueError("Only TRAIN and EVAL modes are supported: %s" %
                             (mode))

        return output_spec
Beispiel #11
0
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 start_labels, end_labels, num_labels, use_one_hot_embeddings):
    """Build a BERT + dilated-convolution model with start/end token heads.

    The BERT sequence output is passed through two rounds of 1-D
    convolutions with dilation rates 1, 2, 5; a large constant is subtracted
    at masked-out positions before and after every convolution. The result is
    average-pooled, fed through a ReLU convolution and projected by two
    independent dense layers into start and end logits.

    Args:
        bert_config: configuration forwarded to `modeling.BertModel`.
        is_training: forwarded to `modeling.BertModel`.
        input_ids: token ids forwarded to `modeling.BertModel`.
        input_mask: mask forwarded to `modeling.BertModel`; also used to mask
            the features and to weight the per-token losses.
        segment_ids: forwarded to `modeling.BertModel` as `token_type_ids`.
        start_labels: sparse labels for the start head.
        end_labels: sparse labels for the end head.
        num_labels: number of units of each output head.
        use_one_hot_embeddings: forwarded to `modeling.BertModel`.

    Returns:
        Tuple `(loss, start_logits, end_logits)` where `loss` is the equally
        weighted sum of the mask-averaged start and end cross-entropies.
    """

    def _suppress_padding(features):
        # Subtract 1e10 wherever input_mask is 0; positions with mask 1 are
        # left untouched.
        return features - (
            1.0 - tf.cast(tf.expand_dims(input_mask, 2), tf.float32)) * 1e10

    def _mask_weighted_mean(token_losses):
        # Sum of the masked per-token losses divided by the mask total.
        return tf.reduce_sum(
            token_losses * tf.to_float(input_mask)) / tf.reduce_sum(
                tf.to_float(input_mask))

    bert = modeling.BertModel(config=bert_config,
                              is_training=is_training,
                              input_ids=input_ids,
                              input_mask=input_mask,
                              token_type_ids=segment_ids,
                              use_one_hot_embeddings=use_one_hot_embeddings)

    # Embedding produced by BERT; the original comment gives its shape as
    # [batch_size, max_seq_len, embedding_size].
    features = _suppress_padding(bert.get_sequence_output())
    channels = features.get_shape().as_list()[-1]

    # Two passes of dilated convolutions (rates 1, 2, 5), each followed by
    # the same padding suppression. The layers are unnamed, so creation order
    # determines their auto-generated variable names.
    for rate in (1, 2, 5, 1, 2, 5):
        features = tf.layers.conv1d(features,
                                    filters=channels,
                                    kernel_size=3,
                                    padding="same",
                                    dilation_rate=rate)
        features = _suppress_padding(features)

    # NOTE(review): pool_size is the channel count, not a sequence window
    # size — confirm this is intended.
    pooled = tf.layers.average_pooling1d(features,
                                         pool_size=channels,
                                         padding="same",
                                         strides=1)

    # 1-D convolution with ReLU activation; the original comment gives the
    # output shape as [batch_size, max_seq_length, 128].
    hidden = tf.layers.conv1d(pooled,
                              filters=128,
                              kernel_size=3,
                              activation=tf.nn.relu,
                              padding="same")

    # Independent projections for the start and end heads; per the original
    # comments each is [batch_size, max_seq_length, num_labels].
    start_logits = tf.layers.dense(hidden, units=num_labels)
    end_logits = tf.layers.dense(hidden, units=num_labels)

    start_token_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=start_labels, logits=start_logits)
    end_token_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=end_labels, logits=end_logits)

    start_loss = _mask_weighted_mean(start_token_loss)
    end_loss = _mask_weighted_mean(end_token_loss)

    loss = 0.5 * start_loss + 0.5 * end_loss
    return loss, start_logits, end_logits
def optimize_graph(logger=None,
                   verbose=False,
                   pooling_strategy=PoolingStrategy.REDUCE_MEAN,
                   max_seq_len=40):
    """Build, freeze and optimize a BERT encoding graph and write it to disk.

    The model configuration path, checkpoint path, XLA switch and encoder
    layer indexes are read from the module-level `args` object
    (`config_name`, `ckpt_name`, `xla`, `layer_indexes`).

    Args:
        logger: logger to report progress on; when falsy one is created with
            `set_logger`.
        verbose: forwarded to `set_logger` and `import_tf`.
        pooling_strategy: `PoolingStrategy` member selecting how the encoder
            output is reduced.
        max_seq_len: second dimension of the three int32 input placeholders.

    Returns:
        Path of the file the serialized graph was written to, or None when
        any exception was raised (the exception is logged, not re-raised).
    """
    if not logger:
        logger = set_logger(colored('BERT_VEC', 'yellow'), verbose)
    try:
        # we don't need GPU for optimizing the graph
        # import_tf returns the tensorflow module and sets the log level
        tf = import_tf(device_id=0, verbose=verbose)
        from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference

        # allow_soft_placement lets TensorFlow pick the device automatically;
        # ConfigProto carries the Session configuration.
        session_config = tf.ConfigProto(allow_soft_placement=True)
        config_path = args.config_name
        checkpoint_path = args.ckpt_name
        logger.info('model config: %s' % config_path)

        # Load the BERT configuration file.
        with tf.gfile.GFile(config_path, 'r') as config_file:
            bert_config = modeling.BertConfig.from_dict(
                json.load(config_file))

        logger.info('build graph...')
        # input placeholders, not sure if they are friendly to XLA
        input_ids = tf.placeholder(tf.int32, (None, max_seq_len), 'input_ids')
        input_mask = tf.placeholder(tf.int32, (None, max_seq_len),
                                    'input_mask')
        input_type_ids = tf.placeholder(tf.int32, (None, max_seq_len),
                                        'input_type_ids')

        # XLA acceleration when requested, otherwise a do-nothing context.
        if args.xla:
            jit_scope = tf.contrib.compiler.jit.experimental_jit_scope
        else:
            jit_scope = contextlib.suppress

        def _minus_mask(x, m):
            # Push positions where m == 0 far down so a max ignores them.
            return x - tf.expand_dims(1.0 - m, axis=-1) * 1e30

        def _mul_mask(x, m):
            # Zero out positions where m == 0.
            return x * tf.expand_dims(m, axis=-1)

        def _masked_reduce_max(x, m):
            return tf.reduce_max(_minus_mask(x, m), axis=1)

        def _masked_reduce_mean(x, m):
            # The epsilon guards against division by zero for an all-zero mask.
            return tf.reduce_sum(_mul_mask(x, m), axis=1) / (
                tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)

        with jit_scope():
            input_tensors = [input_ids, input_mask, input_type_ids]

            model = modeling.BertModel(config=bert_config,
                                       is_training=False,
                                       input_ids=input_ids,
                                       input_mask=input_mask,
                                       token_type_ids=input_type_ids,
                                       use_one_hot_embeddings=False)

            # Collect every trainable variable and map it onto the checkpoint.
            trainable_vars = tf.trainable_variables()
            assignment_map, _ = modeling.get_assignment_map_from_checkpoint(
                trainable_vars, checkpoint_path)
            tf.train.init_from_checkpoint(checkpoint_path, assignment_map)

            with tf.variable_scope("pooling"):
                layer_ids = args.layer_indexes
                if len(layer_ids) == 1:
                    # A single layer was requested: use it directly.
                    encoder_layer = model.all_encoder_layers[layer_ids[0]]
                else:
                    # Several layers: concatenate them along the last axis.
                    selected_layers = [
                        model.all_encoder_layers[idx] for idx in layer_ids
                    ]
                    encoder_layer = tf.concat(selected_layers, -1)

                input_mask = tf.cast(input_mask, tf.float32)

                # Reduce the token-level encoder output to the final
                # encoding according to the requested pooling strategy.
                if pooling_strategy == PoolingStrategy.REDUCE_MEAN:
                    pooled = _masked_reduce_mean(encoder_layer, input_mask)
                elif pooling_strategy == PoolingStrategy.REDUCE_MAX:
                    pooled = _masked_reduce_max(encoder_layer, input_mask)
                elif pooling_strategy == PoolingStrategy.REDUCE_MEAN_MAX:
                    mean_part = _masked_reduce_mean(encoder_layer, input_mask)
                    max_part = _masked_reduce_max(encoder_layer, input_mask)
                    pooled = tf.concat([mean_part, max_part], axis=1)
                elif pooling_strategy in (PoolingStrategy.FIRST_TOKEN,
                                          PoolingStrategy.CLS_TOKEN):
                    pooled = tf.squeeze(encoder_layer[:, 0:1, :], axis=1)
                elif pooling_strategy in (PoolingStrategy.LAST_TOKEN,
                                          PoolingStrategy.SEP_TOKEN):
                    # Index of the last unmasked token of every row.
                    seq_len = tf.cast(tf.reduce_sum(input_mask, axis=1),
                                      tf.int32)
                    batch_range = tf.range(0, tf.shape(seq_len)[0])
                    last_positions = tf.stack([batch_range, seq_len - 1], 1)
                    pooled = tf.gather_nd(encoder_layer, last_positions)
                elif pooling_strategy == PoolingStrategy.NONE:
                    pooled = _mul_mask(encoder_layer, input_mask)
                else:
                    raise NotImplementedError()

            pooled = tf.identity(pooled, 'final_encodes')

            output_tensors = [pooled]
            graph_def = tf.get_default_graph().as_graph_def()

        # Tensor names carry a ":0" suffix; node names do not.
        input_names = [tensor.name[:-2] for tensor in input_tensors]
        output_names = [tensor.name[:-2] for tensor in output_tensors]
        input_dtype_enums = [
            tensor.dtype.as_datatype_enum for tensor in input_tensors
        ]

        # Freeze the variables into constants and optimize for inference.
        with tf.Session(config=session_config) as sess:
            logger.info('load parameters from checkpoint...')
            sess.run(tf.global_variables_initializer())
            logger.info('freeze...')
            graph_def = tf.graph_util.convert_variables_to_constants(
                sess, graph_def, output_names)
            logger.info('optimize...')
            graph_def = optimize_for_inference(graph_def, input_names,
                                               output_names,
                                               input_dtype_enums, False)

        # NOTE(review): the output path is hard-coded, so repeated or
        # concurrent calls overwrite the same file.
        tmp_file = "./tmp_graph11"
        logger.info('write graph to a tmp file: %s' % tmp_file)
        with tf.gfile.GFile(tmp_file, 'wb') as graph_file:
            graph_file.write(graph_def.SerializeToString())
        return tmp_file
    except Exception as e:
        logger.error('fail to optimize the graph!')
        logger.error(e)
        return None
Beispiel #13
0
def create_classification_model(bert_config, is_training, input_ids, input_mask, segment_ids, labels, num_labels):
    """Build a softmax classifier on top of BERT's pooled output.

    :param bert_config: configuration forwarded to ``modeling.BertModel``.
    :param is_training: forwarded to ``modeling.BertModel``; when true,
        dropout with ``keep_prob=0.9`` is applied to the pooled output.
    :param input_ids: token ids forwarded to ``modeling.BertModel``.
    :param input_mask: mask forwarded to ``modeling.BertModel``.
    :param segment_ids: forwarded to ``modeling.BertModel`` as
        ``token_type_ids``.
    :param labels: class indices used to compute the loss, or ``None`` to
        skip the loss.
    :param num_labels: number of output classes.
    :return: tuple ``(loss, per_example_loss, logits, probabilities)``;
        ``loss`` and ``per_example_loss`` are ``None`` when ``labels`` is
        ``None``.
    """
    import tensorflow as tf
    from bert_base.bert import modeling

    # Encode the inputs with BERT.
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
    )

    # Only the pooled output feeds the classifier head.
    output_layer = model.get_pooled_output()
    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        # output_weights is stored as [num_labels, hidden_size], hence the
        # transposed matmul.
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        probabilities = tf.nn.softmax(logits, axis=-1)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        if labels is not None:
            one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

            # Cross-entropy: negative log-probability of the labelled class.
            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
            loss = tf.reduce_mean(per_example_loss)
        else:
            loss, per_example_loss = None, None
    return (loss, per_example_loss, logits, probabilities)
    def _create_model(self, mode, input_ids, input_mask, segment_ids, labels,
                      slot_labels, labels_mask, drop_keep_prob,
                      entity_type_ids, sequence_lengths):
        """Creates a joint intent / slot tagging model on top of BERT.

        The BERT sequence output is enriched with a weighted entity-type
        embedding, run through a bidirectional LSTM, and projected into
        intent logits (from the concatenated final LSTM states) and per-token
        slot logits (from the layer-normalised LSTM outputs).

        Args:
            mode: a `tf.estimator.ModeKeys` value. Dropout is applied only in
                TRAIN mode; losses are built for every mode except PREDICT.
            input_ids: token ids forwarded to `modeling.BertModel`; its shape
                supplies `batch_size` and `seq_length`.
            input_mask: mask forwarded to `modeling.BertModel`.
            segment_ids: forwarded to `modeling.BertModel` as
                `token_type_ids`.
            labels: sparse intent labels.
            slot_labels: sparse per-token slot labels.
            labels_mask: mask whose per-example sum normalises the slot loss.
            drop_keep_prob: keep probability of both dropout layers.
            entity_type_ids: per-token entity-type indicators; reshaped to
                `[batch_size * seq_length, self.entity_type_num]`.
            sequence_lengths: sequence lengths passed to the bidirectional
                RNN.

        Returns:
            Tuple `(loss, per_example_slot_loss, pred_intent, pred_slot,
            batch_size, entity_type_emb, impact_weight_matrix,
            entity_type_ids_matrix, final_layer, slot_logits)`. `loss` and
            `per_example_slot_loss` are None in PREDICT mode.
        """
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        model = modeling.BertModel(
            config=self._config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=self._use_one_hot_embeddings)

        final_layer = model.get_sequence_output()
        # final_hidden = model.get_pooled_output()

        if is_training:
            # I.e., 0.1 dropout
            # final_hidden = tf.nn.dropout(final_hidden, keep_prob=drop_keep_prob)
            final_layer = tf.nn.dropout(final_layer, keep_prob=drop_keep_prob)

        # Incorporate entity-type information
        batch_size, seq_length = modeling.get_shape_list(input_ids)

        # One embedding row per entity type, initialised uniformly within
        # +/- 100 * initializer_range with a fixed seed.
        self.entity_type_embedding = tf.get_variable(
            name="entity_type_embedding",
            shape=(self.entity_type_num, self._config.hidden_size),
            dtype=tf.float32,
            trainable=True,
            initializer=tf.random_uniform_initializer(
                -self._config.initializer_range * 100,
                self._config.initializer_range * 100,
                seed=20))

        # Every entity type starts with the same weight 1 / entity_type_num.
        with tf.init_scope():
            impact_weight_init = tf.constant(1.0 / self.entity_type_num,
                                             dtype=tf.float32,
                                             shape=(1, self.entity_type_num))
        self.impact_weight = tf.Variable(impact_weight_init,
                                         dtype=tf.float32,
                                         name="impact_weight")  # impact weight of each entity type
        # Repeat the single weight row once per token in the batch.
        impact_weight_matrix = tf.tile(self.impact_weight,
                                       multiples=[batch_size * seq_length, 1])

        # Flatten the entity-type indicators to one row per token.
        entity_type_ids_matrix1 = tf.cast(tf.reshape(
            entity_type_ids, [batch_size * seq_length, self.entity_type_num]),
                                          dtype=tf.float32)
        # Scale each indicator by its type's impact weight, then mix the
        # entity-type embeddings accordingly.
        entity_type_ids_matrix = tf.multiply(entity_type_ids_matrix1,
                                             impact_weight_matrix)
        entity_type_emb = tf.matmul(entity_type_ids_matrix,
                                    self.entity_type_embedding)
        # Add the per-token entity embedding to the BERT output.
        final_layer = final_layer + tf.reshape(entity_type_emb, [
            batch_size, seq_length, self._config.hidden_size
        ])  # TODO TODO    # 0.7071067811865476 is sqrt(2) / 2
        # final_layer = tf.concat([final_layer, tf.reshape(entity_type_emb, [batch_size, seq_length,self._config.hidden_size])], axis=-1)

        # Second dropout, applied after the entity embedding has been added.
        if is_training:
            final_layer = tf.nn.dropout(final_layer, keep_prob=drop_keep_prob)

        # Bidirectional LSTM over the enriched token representations.
        (output_fw_seq,
         output_bw_seq), ((c_fw, h_fw),
                          (c_bw, h_bw)) = tf.nn.bidirectional_dynamic_rnn(
                              cell_fw=LSTMCell(self.lstm_hidden_size),
                              cell_bw=LSTMCell(self.lstm_hidden_size),
                              inputs=final_layer,
                              sequence_length=sequence_lengths,
                              dtype=tf.float32)
        # Per-token features: forward and backward outputs side by side.
        layer_matrix = tf.concat([output_fw_seq, output_bw_seq], axis=-1)
        # Sequence-level features: first element of each direction's final
        # state tuple (presumably the LSTM cell state — confirm against the
        # LSTMCell in use).
        final_hidden = tf.concat([c_fw, c_bw], axis=-1)

        layer_matrix = tf.contrib.layers.layer_norm(inputs=layer_matrix,
                                                    begin_norm_axis=-1,
                                                    begin_params_axis=-1)

        # Intent head on the sequence-level features.
        intent_logits = tf.layers.dense(
            final_hidden,
            self._num_tags,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
            name="output_projection")
        # Slot head on the per-token features.
        slot_logits = tf.layers.dense(
            layer_matrix,
            self.num_slot_tags,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
            name="slot_projection")

        with tf.variable_scope("loss"):
            loss = None
            per_example_intent_loss = None
            per_example_slot_loss = None
            if mode != tf.estimator.ModeKeys.PREDICT:
                per_example_intent_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels, logits=intent_logits)
                slot_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=slot_labels, logits=slot_logits)
                # NOTE(review): slot_loss is summed over every position
                # without being multiplied by labels_mask, while the
                # denominator counts only masked-in positions — confirm that
                # padded positions are meant to contribute.
                per_example_slot_loss = tf.truediv(
                    tf.reduce_sum(slot_loss, axis=1),
                    tf.cast(tf.reduce_sum(labels_mask, axis=1), tf.float32))

                # from tensorflow.contrib.crf import crf_log_likelihood
                # from tensorflow.contrib.crf import viterbi_decode
                # batch_size = tf.shape(slot_logits)[0]
                # print(curLine(), batch_size, tf.constant([self._max_seq_length]))
                # length_batch = tf.tile(tf.constant([self._max_seq_length]), [batch_size])
                # print(curLine(), batch_size, "length_batch:", length_batch)
                # per_example_slot_loss, self.transition_params = crf_log_likelihood(inputs=slot_logits,
                #                 tag_indices=slot_labels,sequence_lengths=length_batch)
                # print(curLine(), "per_example_slot_loss:", per_example_slot_loss) # shape=(batch_size,)
                # print(curLine(), "self.transition_params:", self.transition_params) # shape=(9, 9)

                # Weighted combination of the intent and slot losses,
                # averaged over the batch.
                loss = tf.reduce_mean(self.intent_ratio *
                                      per_example_intent_loss +
                                      self.slot_ratio * per_example_slot_loss)
            # Predictions are the arg-max class of each head.
            pred_intent = tf.cast(tf.argmax(intent_logits, axis=-1), tf.int32)
            pred_slot = tf.cast(tf.argmax(slot_logits, axis=-1), tf.int32)
            return (loss, per_example_slot_loss, pred_intent, pred_slot,
                    batch_size, entity_type_emb, impact_weight_matrix,
                    entity_type_ids_matrix, final_layer, slot_logits)
Beispiel #15
0
    def get_mention_proposal_and_loss(self,
                                      instance,
                                      is_training,
                                      use_tpu=False):
        """
        Desc:
            forward function for training mention proposal module. 
        Args:
            instance: a tuple of train/dev/test data instance. 
                e.g., (flat_input_ids, flat_doc_overlap_input_mask, flat_sentence_map, text_len, speaker_ids, gold_starts, gold_ends, cluster_ids)
            is_training: True/False is in the training process. 
        """
        self.use_tpu = use_tpu
        self.dropout = self.get_dropout(self.config.dropout_rate, is_training)

        flat_input_ids, flat_doc_overlap_input_mask, flat_sentence_map, text_len, speaker_ids, gold_starts, gold_ends, cluster_ids = instance
        # flat_input_ids: (num_window, window_size)
        # flat_doc_overlap_input_mask: (num_window, window_size)
        # flat_sentence_map: (num_window, window_size)
        # text_len: dynamic length and is padded to fix length
        # gold_start: (max_num_mention), mention start index in the original (NON-OVERLAP) document. Pad with -1 to the fix length max_num_mention.
        # gold_end: (max_num_mention), mention end index in the original (NON-OVERLAP) document. Pad with -1 to the fix length max_num_mention.
        # cluster_ids/speaker_ids is not used in the mention proposal model.

        flat_input_ids = tf.math.maximum(
            flat_input_ids,
            tf.zeros_like(flat_input_ids,
                          tf.int32))  # (num_window * window_size)

        flat_doc_overlap_input_mask = tf.where(
            tf.math.greater_equal(flat_doc_overlap_input_mask, 0),
            x=tf.ones_like(flat_doc_overlap_input_mask, tf.int32),
            y=tf.zeros_like(flat_doc_overlap_input_mask,
                            tf.int32))  # (num_window * window_size)
        # flat_doc_overlap_input_mask = tf.math.maximum(flat_doc_overlap_input_mask, tf.zeros_like(flat_doc_overlap_input_mask, tf.int32))
        flat_sentence_map = tf.math.maximum(
            flat_sentence_map,
            tf.zeros_like(flat_sentence_map,
                          tf.int32))  # (num_window * window_size)

        gold_start_end_mask = tf.cast(
            tf.math.greater_equal(gold_starts,
                                  tf.zeros_like(gold_starts, tf.int32)),
            tf.bool)  # (max_num_mention)
        gold_start_index_labels = self.boolean_mask_1d(
            gold_starts,
            gold_start_end_mask,
            name_scope="gold_starts",
            use_tpu=self.use_tpu)  # (num_of_mention)
        gold_end_index_labels = self.boolean_mask_1d(
            gold_ends,
            gold_start_end_mask,
            name_scope="gold_ends",
            use_tpu=self.use_tpu)  # (num_of_mention)

        text_len = tf.math.maximum(text_len, tf.zeros_like(
            text_len, tf.int32))  # (num_of_non_empty_window)
        num_subtoken_in_doc = tf.math.reduce_sum(
            text_len)  # the value should be num_subtoken_in_doc

        input_ids = tf.reshape(
            flat_input_ids,
            [-1, self.config.window_size])  # (num_window, window_size)
        input_mask = tf.ones_like(input_ids,
                                  tf.int32)  # (num_window, window_size)

        model = modeling.BertModel(config=self.bert_config,
                                   is_training=is_training,
                                   input_ids=input_ids,
                                   input_mask=input_mask,
                                   use_one_hot_embeddings=False,
                                   scope='bert')

        doc_overlap_window_embs = model.get_sequence_output(
        )  # (num_window, window_size, hidden_size)
        doc_overlap_input_mask = tf.reshape(
            flat_doc_overlap_input_mask,
            [self.config.num_window, self.config.window_size
             ])  # (num_window, window_size)

        doc_flat_embs = self.transform_overlap_windows_to_original_doc(
            doc_overlap_window_embs, doc_overlap_input_mask)
        doc_flat_embs = tf.reshape(doc_flat_embs,
                                   [-1, self.config.hidden_size
                                    ])  # (num_subtoken_in_doc, hidden_size)

        expand_start_embs = tf.tile(
            tf.expand_dims(doc_flat_embs, 1),
            [1, num_subtoken_in_doc, 1
             ])  # (num_subtoken_in_doc, num_subtoken_in_doc, hidden_size)
        expand_end_embs = tf.tile(
            tf.expand_dims(doc_flat_embs, 0),
            [num_subtoken_in_doc, 1, 1
             ])  # (num_subtoken_in_doc, num_subtoken_in_doc, hidden_size)
        expand_mention_span_embs = tf.concat(
            [expand_start_embs, expand_end_embs], axis=-1
        )  # (num_subtoken_in_doc, num_subtoken_in_doc, 2*hidden_size)
        expand_mention_span_embs = tf.reshape(
            expand_mention_span_embs, [-1, self.config.hidden_size * 2])
        span_sequence_logits = self.ffnn(
            expand_mention_span_embs,
            self.config.hidden_size * 2,
            1,
            dropout=self.dropout,
            name_scope="mention_span"
        )  # (num_subtoken_in_doc * num_subtoken_in_doc)

        if self.config.start_end_share:
            start_end_sequence_logits = self.ffnn(
                doc_flat_embs,
                self.config.hidden_size,
                2,
                dropout=self.dropout,
                name_scope="mention_start_end")  # (num_subtoken_in_doc, 2)
            start_sequence_logits, end_sequence_logits = tf.split(
                start_end_sequence_logits, axis=1)
            # start_sequence_logits -> (num_subtoken_in_doc, 1)
            # end_sequence_logits -> (num_subtoken_in_doc, 1)
        else:
            start_sequence_logits = self.ffnn(
                doc_flat_embs,
                self.config.hidden_size,
                1,
                dropout=self.dropout,
                name_scope="mention_start")  # (num_subtoken_in_doc)
            end_sequence_logits = self.ffnn(
                doc_flat_embs,
                self.config.hidden_size,
                1,
                dropout=self.dropout,
                name_scope="mention_end")  # (num_subtoken_in_doc)

        gold_start_sequence_labels = self.scatter_gold_index_to_label_sequence(
            gold_start_index_labels,
            num_subtoken_in_doc)  # (num_subtoken_in_doc)
        gold_end_sequence_labels = self.scatter_gold_index_to_label_sequence(
            gold_end_index_labels,
            num_subtoken_in_doc)  # (num_subtoken_in_doc)

        start_loss, start_sequence_probabilities = self.compute_score_and_loss(
            start_sequence_logits, gold_start_sequence_labels)
        end_loss, end_sequence_probabilities = self.compute_score_and_loss(
            end_sequence_logits, gold_end_sequence_labels)
        # *_loss -> a scalar
        # *_sequence_scores -> (num_subtoken_in_doc)

        gold_span_sequence_labels = self.scatter_span_sequence_labels(
            gold_start_index_labels, gold_end_index_labels,
            num_subtoken_in_doc)  # (num_subtoken_in_doc * num_subtoken_in_doc)
        span_loss, span_sequence_probabilities = self.compute_score_and_loss(
            span_sequence_logits, gold_span_sequence_labels)
        # span_loss -> a scalar
        # span_sequence_probabilities -> (num_subtoken_in_doc * num_subtoken_in_doc)

        total_loss = self.config.loss_start_ratio * start_loss + self.config.loss_end_ratio * end_loss + self.config.loss_span_ratio * span_loss
        return total_loss, start_sequence_probabilities, end_sequence_probabilities, span_sequence_probabilities
Beispiel #16
0
# Script fragment: declares the BERT input placeholders, builds the BERT
# encoder and reads its outputs.
# NOTE(review): `input_ids`, `batch_size`, `max_seq_length` and `bert_config`
# are not defined in this fragment -- presumably they are set up earlier.
input_mask = tf.placeholder(shape=[batch_size, max_seq_length],
                            dtype=tf.int32,
                            name="input_mask")
segment_ids = tf.placeholder(shape=[batch_size, max_seq_length],
                             dtype=tf.int32,
                             name="segment_ids")
###
# NOTE(review): the label placeholder is registered under the graph name
# "input_ids", not "input_labels" -- looks like a copy-paste slip; confirm
# before feeding or fetching this tensor by name.
input_labels = tf.placeholder(shape=batch_size,
                              dtype=tf.int32,
                              name="input_ids")
# Create the BERT model.
model = modeling.BertModel(
    config=bert_config,
    is_training=True,
    input_ids=input_ids,
    input_mask=input_mask,
    token_type_ids=segment_ids,
    use_one_hot_embeddings=
    False  # Set to True on TPU for speed; False is faster on CPU or GPU.
)

# NOTE(review): `output_layer` is assigned twice in a row; only the pooled
# output assigned second is visible to the code that follows.
output_layer = model.get_sequence_output(
)  # Per-token output [batch_size, seq_length, embedding_size]; use this for seq2seq or NER.
output_layer = model.get_pooled_output()  # Sentence-level (pooled) output.
hidden_size = output_layer.shape[-1].value  # Size of the output's last dimension.

# Add a fully connected layer on top.
with tf.variable_scope('Last_Full'):
    logits = tf.layers.dense(output_layer, 2)
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                          labels=input_labels,
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 span_encoding, max_answer_length, use_one_hot_embeddings):
    """Build a BERT model with a span head and an answer-type head.

    Args:
        bert_config: Configuration forwarded to `modeling.BertModel`.
        is_training: Forwarded to `modeling.BertModel`; in "concat-mlp" mode
            it also enables a static shape compatibility check.
        input_ids: Token ids forwarded to BERT.
        input_mask: Input mask forwarded to BERT; also used to mask the joint
            span logits in "concat-mlp" mode.
        segment_ids: Forwarded to BERT as `token_type_ids`.
        span_encoding: Either "independent" (separate start/end logits from a
            single 2-way projection) or "concat-mlp" (joint span logits from
            `compute_joint_mlp_logits`).
        max_answer_length: Passed to `compute_joint_mlp_logits`; only used in
            "concat-mlp" mode.
        use_one_hot_embeddings: Forwarded to `modeling.BertModel`.

    Returns:
        A tuple `(start_logits, end_logits, answer_type_logits)`. In
        "independent" mode the start/end logits are each
        [batch_size, seq_length]. In "concat-mlp" mode `start_logits` holds the
        masked joint logits and `end_logits` is a zero-filled [batch_size]
        placeholder.

    Raises:
        ValueError: If `span_encoding` is not one of the supported values.
    """
    bert = modeling.BertModel(config=bert_config,
                              is_training=is_training,
                              input_ids=input_ids,
                              input_mask=input_mask,
                              token_type_ids=segment_ids,
                              use_one_hot_embeddings=use_one_hot_embeddings)

    # Token-level representations feed the span head.
    sequence_output = bert.get_sequence_output()
    batch_size, seq_length, hidden_size = modeling.get_shape_list(
        sequence_output, expected_rank=3)

    if span_encoding == "independent":
        span_weights = tf.get_variable(
            "cls/coqa/output_weights", [2, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))
        span_bias = tf.get_variable("cls/coqa/output_bias", [2],
                                    initializer=tf.zeros_initializer())

        # Project every token to two scores, then split them into start / end.
        flat_hidden = tf.reshape(sequence_output,
                                 [batch_size * seq_length, hidden_size])
        flat_logits = tf.nn.bias_add(
            tf.matmul(flat_hidden, span_weights, transpose_b=True), span_bias)
        start_logits, end_logits = tf.unstack(
            tf.reshape(flat_logits, [batch_size, seq_length, 2]), axis=2)
    elif span_encoding == "concat-mlp":
        with tf.variable_scope("coqa"):
            if is_training:
                # The batch size can be variable during inference.
                sequence_output.shape.assert_is_compatible_with(
                    (batch_size, seq_length, hidden_size))
            joint_logits = compute_joint_mlp_logits(sequence_output,
                                                    max_answer_length)
            start_logits = mask_joint_logits(input_mask, joint_logits)
            # Joint mode has no separate end logits; return a dummy tensor.
            end_logits = tf.zeros([batch_size], dtype=tf.float32)
    else:
        raise ValueError("Unknown span_encoding: %s" % span_encoding)

    # Answer-type head on top of the pooled output.
    # TODO(epitler): Try variants here.
    pooled_output = bert.get_pooled_output()
    pooled_size = pooled_output.shape[-1].value

    num_answer_types = 5  # YES, NO, UNKNOWN, EXTRACTIVE, ABSTRACTIVE
    type_weights = tf.get_variable(
        "answer_type_output_weights", [num_answer_types, pooled_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    type_bias = tf.get_variable("answer_type_output_bias", [num_answer_types],
                                initializer=tf.zeros_initializer())

    answer_type_logits = tf.nn.bias_add(
        tf.matmul(pooled_output, type_weights, transpose_b=True), type_bias)
    return (start_logits, end_logits, answer_type_logits)
Beispiel #18
0
def create_model(bert_config, is_training, input_ids, input_mask,
                 segment_ids, labels, event_type_mask, trigger_mask, role_mask, num_labels, use_one_hot_embeddings):
    """Build a BERT token tagger conditioned on event-type and role spans.

    The per-token BERT features are concatenated with the mean-pooled features
    of the event-type span and of the role span (each broadcast over
    `FLAGS.max_seq_length` positions) and fed to either a CRF head
    (`FLAGS.add_crf`) or an MLP + softmax head.

    Args:
        bert_config: Configuration forwarded to `modeling.BertModel`.
        is_training: Forwarded to `modeling.BertModel`.
        input_ids: Token ids forwarded to BERT.
        input_mask: 0/1 mask of real tokens; used for CRF sequence lengths and
            for masking the token-level loss.
        segment_ids: Forwarded to BERT as `token_type_ids`.
        labels: Gold label ids per token.
        event_type_mask: 0/1 mask selecting the event-type tokens.
        trigger_mask: Currently unused (trigger features are disabled).
        role_mask: 0/1 mask selecting the role tokens.
        num_labels: Number of output labels.
        use_one_hot_embeddings: Forwarded to `modeling.BertModel`.

    Returns:
        A tuple `(loss, pred_ids)`: a scalar loss and the predicted label ids.

    Raises:
        NotImplementedError: If `FLAGS.add_lstm` is set without
            `FLAGS.add_crf`; that head was never implemented and the function
            previously fell through and returned `None`.
    """
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings
    )

    # batch * seq_len * hidden_size
    sent_features = model.get_sequence_output()

    # Event-type features: mean pooling over the tokens of the event-type word.
    # The length is clamped to 1 so an all-zero mask yields zeros, not NaN.
    event_type_mask = tf.cast(event_type_mask, tf.float32)
    event_type_len = tf.maximum(
        tf.reduce_sum(event_type_mask, axis=-1, keepdims=True), 1.0)
    event_type_features = tf.einsum("blh,bl->bh", sent_features, event_type_mask) / event_type_len
    event_type_features = tf.tile(event_type_features[:, None], [1, FLAGS.max_seq_length, 1])

    # Role features: mean pooling over the tokens of the role word.
    role_mask = tf.cast(role_mask, tf.float32)
    role_len = tf.maximum(
        tf.reduce_sum(role_mask, axis=-1, keepdims=True), 1.0)
    role_features = tf.einsum("blh,bl->bh", sent_features, role_mask) / role_len
    role_features = tf.tile(role_features[:, None], [1, FLAGS.max_seq_length, 1])

    # Trigger features are intentionally left out of the concatenation, which
    # is why `trigger_mask` is not consumed here.
    final_input = tf.concat([sent_features, event_type_features, role_features], axis=-1)

    if FLAGS.add_crf:
        logits = tf.layers.dense(final_input, num_labels,
                                 kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
                                 name="dense_layer")
        trans = tf.get_variable(
            "transitions",
            [num_labels, num_labels],
            initializer=initializers.xavier_initializer()
        )
        # Number of real tokens per sequence, derived from the input mask.
        sequence_lengths = tf.cast(tf.reduce_sum(input_mask, axis=-1), tf.int32)
        log_likelihood, trans = tf.contrib.crf.crf_log_likelihood(
            inputs=logits,
            tag_indices=labels,
            transition_params=trans,
            sequence_lengths=sequence_lengths
        )
        loss = tf.reduce_mean(-log_likelihood)

        pred_ids, _ = tf.contrib.crf.crf_decode(potentials=logits, transition_params=trans,
                                                sequence_length=sequence_lengths)
        return loss, pred_ids
    elif FLAGS.add_lstm:
        # Fail loudly instead of returning None, which callers cannot unpack.
        raise NotImplementedError("The add_lstm head is not implemented.")
    else:
        # Clamp to 1 so a batch without any real token does not divide by zero.
        valid_label_num = tf.maximum(
            tf.cast(tf.reduce_sum(input_mask), tf.float32), 1.0)
        input_mask = tf.cast(input_mask, dtype=tf.float32)

        # dense layer
        # NOTE(review): 768//2 is hard-coded; presumably half of the BERT-base
        # hidden size -- confirm before using another model size.
        mlp = tf.layers.dense(final_input, 768//2, kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
                              name='mlp', activation=tf.nn.tanh)
        # shape of logits, batch * seq_len * num_labels
        logits = tf.layers.dense(mlp, num_labels, kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
                                 name="dense_layer")

        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        # Per-token cross entropy (batch * seq_len). The previous
        # tf.losses.softmax_cross_entropy call returned an already-reduced
        # scalar, so multiplying by the mask afterwards had no effect and
        # padding positions contributed to the loss.
        per_token_loss = tf.nn.softmax_cross_entropy_with_logits(
            labels=one_hot_labels, logits=logits)

        # Average over real tokens only.
        loss = tf.reduce_sum(per_token_loss * input_mask) / valid_label_num

        probabilities = tf.math.softmax(logits, axis=-1)
        pred_ids = tf.math.argmax(probabilities, axis=-1)
        return loss, pred_ids
Beispiel #19
0
  def get_predictions_and_loss(self, input_ids, input_mask, text_len, speaker_ids, genre, is_training, gold_starts, gold_ends, cluster_ids, sentence_map):
    """Build the coreference graph: span proposal, antecedent scoring and loss.

    Steps, in order:
      1. Encode `input_ids` with BERT and flatten the output to one row per
         unmasked token via `self.flatten_emb_by_sentence`.
      2. Enumerate candidate spans of up to `self.max_span_width` tokens that
         stay inside one sentence according to `sentence_map`.
      3. Score the candidates and keep the top `k` with
         `coref_ops.extract_spans`.
      4. Select up to `c` antecedents per kept span with
         `self.coarse_to_fine_pruning`, optionally refining span embeddings
         for `config["coref_depth"]` rounds when `config["fine_grained"]`.
      5. Compute the antecedent loss with `self.softmax_loss`, summed over
         the kept spans.

    `text_len` is accepted but not read by this method.

    Returns:
      A pair `(predictions, loss)` where `predictions` is the list
      `[candidate_starts, candidate_ends, candidate_mention_scores,
      top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores]`
      and `loss` is a scalar.
    """
    model = modeling.BertModel(
      config=self.bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      use_one_hot_embeddings=False,
      scope='bert')
    # NOTE(review): all_encoder_layers is not used below.
    all_encoder_layers = model.get_all_encoder_layers()
    mention_doc = model.get_sequence_output() # [batch_size, seq_length, hidden_size]

    self.dropout = self.get_dropout(self.config["dropout_rate"], is_training)

    # NOTE(review): num_sentences and max_sentence_length are not used below.
    num_sentences = tf.shape(mention_doc)[0]
    max_sentence_length = tf.shape(mention_doc)[1]
    mention_doc = self.flatten_emb_by_sentence(mention_doc, input_mask) # [num_words, hidden_size]
    num_words = util.shape(mention_doc, 0)
    # NOTE(review): antecedent_doc is not used below.
    antecedent_doc = mention_doc


    # Enumerate every (start, start + width) pair, then drop spans that run
    # past the document end or cross a sentence boundary.
    flattened_sentence_indices = sentence_map
    candidate_starts = tf.tile(tf.expand_dims(tf.range(num_words), 1), [1, self.max_span_width]) # [num_words, max_span_width]
    candidate_ends = candidate_starts + tf.expand_dims(tf.range(self.max_span_width), 0) # [num_words, max_span_width]
    candidate_start_sentence_indices = tf.gather(flattened_sentence_indices, candidate_starts) # [num_words, max_span_width]
    # Ends are clamped to the last word so the gather stays in range; the
    # out-of-range spans are removed by candidate_mask below.
    candidate_end_sentence_indices = tf.gather(flattened_sentence_indices, tf.minimum(candidate_ends, num_words - 1)) # [num_words, max_span_width]
    candidate_mask = tf.logical_and(candidate_ends < num_words, tf.equal(candidate_start_sentence_indices, candidate_end_sentence_indices)) # [num_words, max_span_width]
    flattened_candidate_mask = tf.reshape(candidate_mask, [-1]) # [num_words * max_span_width]
    candidate_starts = tf.boolean_mask(tf.reshape(candidate_starts, [-1]), flattened_candidate_mask) # [num_candidates]
    candidate_ends = tf.boolean_mask(tf.reshape(candidate_ends, [-1]), flattened_candidate_mask) # [num_candidates]
    # NOTE(review): candidate_sentence_indices is not used below.
    candidate_sentence_indices = tf.boolean_mask(tf.reshape(candidate_start_sentence_indices, [-1]), flattened_candidate_mask) # [num_candidates]

    candidate_cluster_ids = self.get_candidate_labels(candidate_starts, candidate_ends, gold_starts, gold_ends, cluster_ids) # [num_candidates]

    candidate_span_emb = self.get_span_emb(mention_doc, mention_doc, candidate_starts, candidate_ends) # [num_candidates, emb]
    candidate_mention_scores =  self.get_mention_scores(candidate_span_emb, candidate_starts, candidate_ends)
    candidate_mention_scores = tf.squeeze(candidate_mention_scores, 1) # [num_candidates]

    # Beam size: k spans are kept (capped at 3900), each with up to c antecedents.
    k = tf.minimum(3900, tf.to_int32(tf.floor(tf.to_float(num_words) * self.config["top_span_ratio"])))
    c = tf.minimum(self.config["max_top_antecedents"], k)
    # Pull the top-k spans from the beam; inputs get a leading batch axis of 1.
    top_span_indices = coref_ops.extract_spans(tf.expand_dims(candidate_mention_scores, 0),
                                               tf.expand_dims(candidate_starts, 0),
                                               tf.expand_dims(candidate_ends, 0),
                                               tf.expand_dims(k, 0),
                                               num_words,
                                               True) # [1, k]
    top_span_indices.set_shape([1, None])
    top_span_indices = tf.squeeze(top_span_indices, 0) # [k]

    top_span_starts = tf.gather(candidate_starts, top_span_indices) # [k]
    top_span_ends = tf.gather(candidate_ends, top_span_indices) # [k]
    top_span_emb = tf.gather(candidate_span_emb, top_span_indices) # [k, emb]
    top_span_cluster_ids = tf.gather(candidate_cluster_ids, top_span_indices) # [k]
    top_span_mention_scores = tf.gather(candidate_mention_scores, top_span_indices) # [k]
    genre_emb = tf.gather(tf.get_variable("genre_embeddings", [len(self.genres), self.config["feature_size"]], initializer=tf.truncated_normal_initializer(stddev=0.02)),
                          genre) # [emb]
    if self.config['use_metadata']:
      speaker_ids = self.flatten_emb_by_sentence(speaker_ids, input_mask)
      top_span_speaker_ids = tf.gather(speaker_ids, top_span_starts) # [k]
    else:
        top_span_speaker_ids = None


    # Score column for the "no antecedent" (dummy) choice, fixed at zero.
    dummy_scores = tf.zeros([k, 1]) # [k, 1]
    top_antecedents, top_antecedents_mask, top_fast_antecedent_scores, top_antecedent_offsets = self.coarse_to_fine_pruning(top_span_emb, top_span_mention_scores, c)
    # Map every unmasked token to the index of the input segment it came from.
    num_segs, seg_len = util.shape(input_ids, 0), util.shape(input_ids, 1)
    word_segments = tf.tile(tf.expand_dims(tf.range(0, num_segs), 1), [1, seg_len])
    flat_word_segments = tf.boolean_mask(tf.reshape(word_segments, [-1]), tf.reshape(input_mask, [-1]))
    mention_segments = tf.expand_dims(tf.gather(flat_word_segments, top_span_starts), 1) # [k, 1]
    antecedent_segments = tf.gather(flat_word_segments, tf.gather(top_span_starts, top_antecedents)) #[k, c]
    segment_distance = tf.clip_by_value(mention_segments - antecedent_segments, 0, self.config['max_training_sentences'] - 1) if self.config['use_segment_distance'] else None #[k, c]
    # NOTE(review): when 'fine_grained' is set but "coref_depth" is 0 the loop
    # body never runs and top_antecedent_scores is unbound at the concat below.
    if self.config['fine_grained']:
      for i in range(self.config["coref_depth"]):
        # Variables are created on the first round and reused afterwards.
        with tf.variable_scope("coref_layer", reuse=(i > 0)):
          top_antecedent_emb = tf.gather(top_span_emb, top_antecedents) # [k, c, emb]
          top_antecedent_scores = top_fast_antecedent_scores + self.get_slow_antecedent_scores(top_span_emb, top_antecedents, top_antecedent_emb, top_antecedent_offsets, top_span_speaker_ids, genre_emb, segment_distance) # [k, c]
          top_antecedent_weights = tf.nn.softmax(tf.concat([dummy_scores, top_antecedent_scores], 1)) # [k, c + 1]
          top_antecedent_emb = tf.concat([tf.expand_dims(top_span_emb, 1), top_antecedent_emb], 1) # [k, c + 1, emb]
          attended_span_emb = tf.reduce_sum(tf.expand_dims(top_antecedent_weights, 2) * top_antecedent_emb, 1) # [k, emb]
          with tf.variable_scope("f"):
            # Sigmoid gate f interpolates between the attended and the current span embedding.
            f = tf.sigmoid(util.projection(tf.concat([top_span_emb, attended_span_emb], 1), util.shape(top_span_emb, -1))) # [k, emb]
            top_span_emb = f * attended_span_emb + (1 - f) * top_span_emb # [k, emb]
    else:
        top_antecedent_scores = top_fast_antecedent_scores

    top_antecedent_scores = tf.concat([dummy_scores, top_antecedent_scores], 1) # [k, c + 1]

    top_antecedent_cluster_ids = tf.gather(top_span_cluster_ids, top_antecedents) # [k, c]
    # Adds log(mask) cast to int32: zero where the mask is true; presumably a
    # large negative value where it is false, so masked antecedents never
    # match a cluster id -- TODO confirm the cast behaviour on -inf.
    top_antecedent_cluster_ids += tf.to_int32(tf.log(tf.to_float(top_antecedents_mask))) # [k, c]
    same_cluster_indicator = tf.equal(top_antecedent_cluster_ids, tf.expand_dims(top_span_cluster_ids, 1)) # [k, c]
    non_dummy_indicator = tf.expand_dims(top_span_cluster_ids > 0, 1) # [k, 1]
    pairwise_labels = tf.logical_and(same_cluster_indicator, non_dummy_indicator) # [k, c]
    # The dummy label is true exactly when no real antecedent is labelled true.
    dummy_labels = tf.logical_not(tf.reduce_any(pairwise_labels, 1, keepdims=True)) # [k, 1]
    top_antecedent_labels = tf.concat([dummy_labels, pairwise_labels], 1) # [k, c + 1]
    loss = self.softmax_loss(top_antecedent_scores, top_antecedent_labels) # [k]
    loss = tf.reduce_sum(loss) # []

    return [candidate_starts, candidate_ends, candidate_mention_scores, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores], loss
def create_model(is_training=True, ):
    """Build a placeholder-driven BERT sentence classifier graph.

    Creates the input placeholders, a BERT encoder configured by the
    module-level `BERT_CONFIG`, a 2-class linear head on the pooled output,
    the cross-entropy loss, the optimizer op, an accuracy metric and the
    merged summaries.

    Args:
        is_training: Forwarded to `modeling.BertModel`.

    Returns:
        A tuple `(inputParams, outputParams, summaryParams)` of dicts:
        `inputParams` maps placeholder names to the placeholders,
        `outputParams` holds 'loss', 'y_pred_cls', 'accuracy' and 'train_op',
        and `summaryParams` holds 'merged_summary'.
    """
    # Model hyper-parameters.
    max_seq_len = 256
    num_classes = 2
    learning_rate = 0.01

    def _token_placeholder(name):
        # int32 placeholder of shape [None, max_seq_len].
        return tf.placeholder(shape=[None, max_seq_len],
                              dtype=tf.int32,
                              name=name)

    # BERT inputs.
    input_ids = _token_placeholder("input_ids")
    input_mask = _token_placeholder("input_mask")
    segment_ids = _token_placeholder("segment_ids")
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    num_train_steps = tf.placeholder(tf.float32, name='num_train_steps')
    input_labels = tf.placeholder(shape=[None],
                                  dtype=tf.int32,
                                  name="input_labels")

    # BERT encoder. use_one_hot_embeddings: True is faster on TPU, False is
    # faster on CPU or GPU.
    bert = modeling.BertModel(config=BERT_CONFIG,
                              is_training=is_training,
                              input_ids=input_ids,
                              input_mask=input_mask,
                              token_type_ids=segment_ids,
                              use_one_hot_embeddings=False)

    pooled = bert.get_pooled_output()  # sentence-level output
    hidden_size = pooled.shape[-1].value  # size of the last dimension

    # Fully connected classification layer.
    with tf.variable_scope("fc1"):
        dropped = tf.nn.dropout(pooled, keep_prob=keep_prob)

        weights = tf.get_variable(shape=[num_classes, hidden_size],
                                  dtype=tf.float32,
                                  initializer=tf.initializers.he_normal(),
                                  name="fc1")
        bias = tf.Variable(tf.zeros(shape=[num_classes]), name='bias1')
        logits = tf.matmul(dropped, weights, transpose_b=True) + bias

        # Predicted class per example.
        y_pred_cls = tf.argmax(tf.nn.softmax(logits), 1, name='y_pred')

    with tf.variable_scope("optimize"):
        # One-hot encode the labels and take the mean cross entropy.
        one_hot_labels = tf.one_hot(input_labels,
                                    depth=num_classes,
                                    dtype=tf.float32)
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                    labels=one_hot_labels))

        train_op = optimization.create_optimizer(
            loss,
            init_lr=learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=None,
            use_tpu=False)

    with tf.variable_scope("accuracy"):
        hits = tf.equal(input_labels, tf.cast(y_pred_cls, dtype=tf.int32))
        accuracy = tf.reduce_mean(tf.cast(hits, dtype=tf.float32),
                                  name="accuracy")

    with tf.name_scope("summary"):
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("accuracy", accuracy)
        merged_summary = tf.summary.merge_all()

    inputParams = {
        'input_ids': input_ids,
        'input_mask': input_mask,
        'segment_ids': segment_ids,
        'input_labels': input_labels,
        'keep_prob': keep_prob,
        'num_train_steps': num_train_steps,
    }
    outputParams = {
        'loss': loss,
        'y_pred_cls': y_pred_cls,
        'accuracy': accuracy,
        'train_op': train_op,
    }
    summaryParams = {'merged_summary': merged_summary}

    return inputParams, outputParams, summaryParams
Beispiel #21
0
def create_model(bert_config, input_ids, input_masks, segment_ids,
                 token_label_ids, sent_label_ids, token_label_list,
                 sent_label_list, mode, use_tpu):
    """Creates a NLU model with a token-level and a sentence-level head.

    Both heads are a dense projection of a BERT output (sequence output for
    tokens, pooled output for sentences), followed by dropout in TRAIN mode
    and masking with `MIN_FLOAT` where the input mask is zero.

    Args:
        bert_config: Configuration forwarded to `modeling.BertModel`.
        input_ids: Token ids forwarded to BERT.
        input_masks: 0/1 mask of real tokens; assumed [batch, seq_len] --
            TODO confirm against the input pipeline.
        segment_ids: Forwarded to BERT as `token_type_ids`.
        token_label_ids: Gold token label ids, or None to skip the token loss.
        sent_label_ids: Gold sentence label ids, or None to skip the sentence
            loss.
        token_label_list: Token label vocabulary; its length sets the width of
            the token head.
        sent_label_list: Sentence label vocabulary; its length sets the width
            of the sentence head.
        mode: A `tf.estimator.ModeKeys` value. Loss is only built for TRAIN
            and EVAL.
        use_tpu: Forwarded to BERT as `use_one_hot_embeddings`.

    Returns:
        A tuple `(loss, token_predict_ids, sent_predict_ids)`. `loss` is the
        sum of the enabled head losses, each averaged over the sequences that
        contain at least one unmasked token; it is 0.0 outside TRAIN/EVAL.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_masks,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_tpu)

    # If you want to use sentence-level output, use model.get_pooled_output()
    # If you want to use token-level output, use model.get_sequence_output()
    with tf.variable_scope("token", reuse=tf.AUTO_REUSE):
        token_result = model.get_sequence_output()
        token_result_mask = tf.cast(tf.expand_dims(input_masks, axis=-1),
                                    dtype=tf.float32)

        token_kernel_initializer = tf.glorot_uniform_initializer(
            seed=np.random.randint(10000), dtype=tf.float32)
        token_bias_initializer = tf.zeros_initializer
        token_dense_layer = tf.keras.layers.Dense(
            units=len(token_label_list),
            activation=None,
            use_bias=True,
            kernel_initializer=token_kernel_initializer,
            bias_initializer=token_bias_initializer,
            kernel_regularizer=None,
            bias_regularizer=None,
            trainable=True)

        token_dropout_layer = tf.keras.layers.Dropout(
            rate=0.1, seed=np.random.randint(10000))

        token_result = token_dense_layer(token_result)
        if mode == tf.estimator.ModeKeys.TRAIN:
            token_result = token_dropout_layer(token_result)

        # Replace the logits of masked positions with MIN_FLOAT.
        masked_token_predict = token_result * token_result_mask + MIN_FLOAT * (
            1 - token_result_mask)
        token_predict_ids = tf.cast(tf.argmax(tf.nn.softmax(
            masked_token_predict, axis=-1),
                                              axis=-1),
                                    dtype=tf.int32)

    with tf.variable_scope("sent", reuse=tf.AUTO_REUSE):
        sent_result = model.get_pooled_output()
        # A sequence counts as present when any of its positions is unmasked.
        sent_result_mask = tf.cast(tf.reduce_max(input_masks,
                                                 axis=-1,
                                                 keepdims=True),
                                   dtype=tf.float32)

        sent_kernel_initializer = tf.glorot_uniform_initializer(
            seed=np.random.randint(10000), dtype=tf.float32)
        sent_bias_initializer = tf.zeros_initializer
        sent_dense_layer = tf.keras.layers.Dense(
            units=len(sent_label_list),
            activation=None,
            use_bias=True,
            kernel_initializer=sent_kernel_initializer,
            bias_initializer=sent_bias_initializer,
            kernel_regularizer=None,
            bias_regularizer=None,
            trainable=True)

        sent_dropout_layer = tf.keras.layers.Dropout(
            rate=0.1, seed=np.random.randint(10000))

        sent_result = sent_dense_layer(sent_result)
        if mode == tf.estimator.ModeKeys.TRAIN:
            sent_result = sent_dropout_layer(sent_result)

        masked_sent_predict = sent_result * sent_result_mask + MIN_FLOAT * (
            1 - sent_result_mask)
        sent_predict_ids = tf.cast(tf.argmax(tf.nn.softmax(masked_sent_predict,
                                                           axis=-1),
                                             axis=-1),
                                   dtype=tf.int32)

    loss = tf.constant(0.0, dtype=tf.float32)
    if mode not in [tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL]:
        return loss, token_predict_ids, sent_predict_ids

    if token_label_ids is not None:
        with tf.variable_scope("token_loss", reuse=tf.AUTO_REUSE):
            token_label = tf.cast(token_label_ids, dtype=tf.float32)
            token_label_mask = tf.cast(input_masks, dtype=tf.float32)
            # Labels at masked positions are forced to 0; their loss is
            # zeroed out by the mask below.
            masked_token_label = tf.cast(token_label * token_label_mask,
                                         dtype=tf.int32)
            token_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=masked_token_label, logits=masked_token_predict)
            token_loss_sum = tf.reduce_sum(
                token_cross_entropy * token_label_mask)
            # Number of sequences with at least one unmasked token, clamped so
            # an all-padding batch gives a zero loss instead of NaN.
            num_token_seqs = tf.reduce_sum(
                tf.reduce_max(token_label_mask, axis=-1))
            token_loss = token_loss_sum / tf.maximum(num_token_seqs, 1.0)
            loss = loss + token_loss

    if sent_label_ids is not None:
        with tf.variable_scope("sent_loss", reuse=tf.AUTO_REUSE):
            sent_label = tf.cast(sent_label_ids, dtype=tf.float32)
            sent_label_mask = tf.cast(tf.reduce_max(input_masks, axis=-1),
                                      dtype=tf.float32)
            masked_sent_label = tf.cast(sent_label * sent_label_mask,
                                        dtype=tf.int32)
            sent_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=masked_sent_label, logits=masked_sent_predict)
            sent_loss_sum = tf.reduce_sum(
                sent_cross_entropy * sent_label_mask)
            # sent_label_mask has one entry per sequence, so the count of
            # valid sequences is its plain sum. The previous
            # reduce_max(axis=-1) collapsed the batch axis to a single 0/1
            # value, making this a batch sum rather than a mean.
            num_sent_seqs = tf.reduce_sum(sent_label_mask)
            sent_loss = sent_loss_sum / tf.maximum(num_sent_seqs, 1.0)
            loss = loss + sent_loss

    return loss, token_predict_ids, sent_predict_ids
def create_graph(graph_file,
                 bert_config_file,
                 init_checkpoint,
                 max_seq_len,
                 select_layers,
                 output_dir='../bert/tmp'):
    """Freeze a BERT encoder into an inference-optimized GraphDef file.

    A non-training ``modeling.BertModel`` is built in the current default
    graph on three int32 placeholders (``input_ids``, ``input_mask``,
    ``input_type_ids``), its variables are initialized from
    ``init_checkpoint``, the selected encoder layer(s) are exposed under the
    node name ``final_encodes``, variables are folded into constants, the
    graph is passed through ``optimize_for_inference`` and finally
    serialized to ``graph_file``.

    Args:
        graph_file: Path the serialized GraphDef is written to.
        bert_config_file: Path of the BERT JSON configuration file.
        init_checkpoint: Checkpoint the BERT variables are initialized from.
        max_seq_len: Fixed second dimension of the three placeholders.
        select_layers: Non-empty sequence of indices into
            ``model.all_encoder_layers``. A single index selects that layer;
            several indices are concatenated along the last axis.
        output_dir: Directory created before the graph is built. The frozen
            graph itself is written to ``graph_file``, not into this
            directory.

    Returns:
        ``graph_file``, the path that was written.

    Raises:
        ValueError: If ``select_layers`` is empty.
    """
    # Imported here because the function needs this name and the original
    # import line was commented out; a local import keeps the block
    # self-contained whether or not the module imports it elsewhere.
    from tensorflow.python.tools.optimize_for_inference_lib import (
        optimize_for_inference)

    if len(select_layers) == 0:
        raise ValueError('select_layers must contain at least one layer index')

    tf.gfile.MakeDirs(output_dir)

    bert_config = modeling.BertConfig.from_json_file(bert_config_file)

    input_ids = tf.placeholder(tf.int32, (None, max_seq_len), 'input_ids')
    input_mask = tf.placeholder(tf.int32, (None, max_seq_len), 'input_mask')
    input_type_ids = tf.placeholder(tf.int32, (None, max_seq_len),
                                    'input_type_ids')
    input_tensors = [input_ids, input_mask, input_type_ids]

    model = modeling.BertModel(config=bert_config,
                               is_training=False,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=input_type_ids,
                               use_one_hot_embeddings=False)

    # Map graph variables onto checkpoint variables so that the later
    # global_variables_initializer() loads the pre-trained weights.
    tvars = tf.trainable_variables()
    assignment_map, _ = modeling.get_assignment_map_from_checkpoint(
        tvars, init_checkpoint)
    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    selected = [model.all_encoder_layers[layer] for layer in select_layers]
    if len(selected) == 1:
        encoder_layer = selected[0]
    else:
        encoder_layer = tf.concat(selected, -1)

    # Give the output a stable, well-known node name for consumers.
    pooled = tf.identity(encoder_layer, 'final_encodes')
    output_tensors = [pooled]

    # Graph tooling works on op (node) names; take them from the op itself
    # rather than slicing the ":0" suffix off the tensor name.
    input_node_names = [t.op.name for t in input_tensors]
    output_node_names = [t.op.name for t in output_tensors]
    placeholder_type_enums = [t.dtype.as_datatype_enum for t in input_tensors]

    graph_def = tf.get_default_graph().as_graph_def()

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        graph_def = tf.graph_util.convert_variables_to_constants(
            sess, graph_def, output_node_names)
        graph_def = optimize_for_inference(graph_def, input_node_names,
                                           output_node_names,
                                           placeholder_type_enums, False)

    with tf.gfile.GFile(graph_file, 'wb') as f:
        f.write(graph_def.SerializeToString())
    return graph_file
Beispiel #23
0
  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """Builds the `TPUEstimatorSpec` for the CPC model with optional masked LM.

    This is a closure: `bert_config`, `num_choices`, `add_masking`,
    `init_checkpoint`, `use_tpu`, `use_one_hot_embeddings`, `learning_rate`,
    `num_train_steps` and `num_warmup_steps` come from the enclosing scope.

    Args:
      features: dict of input tensors. Reads "input_ids", "input_mask",
        "segment_ids", "label_types", "label_ids" and, when masking is
        enabled, "mask_indices", "target_token_ids", "target_token_weights".
      labels: unused.
      mode: a `tf.estimator.ModeKeys` value.
      params: unused.

    Returns:
      A `contrib_tpu.TPUEstimatorSpec` for training, evaluation, or (for any
      other mode) prediction of the class probabilities.
    """
    tf.logging.info("*** Features ***")
    for feature_name in sorted(features.keys()):
      tf.logging.info("  name = %s, shape = %s" %
                      (feature_name, features[feature_name].shape))

    # Flatten every sequence feature to [-1, max_seq_length].
    flat_shape = [-1, FLAGS.max_seq_length]
    input_ids = tf.reshape(features["input_ids"], flat_shape)
    input_mask = tf.reshape(features["input_mask"], flat_shape)
    segment_ids = tf.reshape(features["segment_ids"], flat_shape)

    label_types = features["label_types"]
    label_ids = features["label_ids"]

    # One-hot encode the label types and sum over axis 1; the result is used
    # below as the per-slot weight of the accuracy metrics.
    is_real_example = tf.reduce_sum(
        tf.one_hot(label_types, FLAGS.k_size * 2), axis=1)

    model = modeling.BertModel(
        config=bert_config,
        is_training=(mode == tf.estimator.ModeKeys.TRAIN),
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    cpc_loss, _, logits, probabilities = bilin_model_builder.create_model(
        model, label_ids, label_types, num_choices, k_size=FLAGS.k_size)

    if not add_masking:
      total_loss = cpc_loss
      masked_lm_loss = tf.constant([0])
    else:
      # Number of masked positions per sequence implied by the mask rate.
      max_predictions_per_seq = int(
          math.ceil(FLAGS.max_seq_length * FLAGS.mask_rate))
      prediction_shape = [-1, max_predictions_per_seq]
      masked_lm_positions = tf.reshape(features["mask_indices"],
                                       prediction_shape)
      masked_lm_ids = tf.reshape(features["target_token_ids"],
                                 prediction_shape)
      masked_lm_weights = tf.reshape(features["target_token_weights"],
                                     prediction_shape)
      masked_lm_loss, _, _ = bilin_model_builder.get_masked_lm_output(
          bert_config, model.get_sequence_output(), model.get_embedding_table(),
          masked_lm_positions, masked_lm_ids, masked_lm_weights)
      total_loss = cpc_loss + masked_lm_loss

    trainable_vars = tf.trainable_variables()
    restored_names = {}
    scaffold_fn = None
    if init_checkpoint:
      assignment_map, restored_names = (
          modeling.get_assignment_map_from_checkpoint(trainable_vars,
                                                      init_checkpoint))
      if not use_tpu:
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
      else:
        # On TPU the checkpoint initialization is deferred to the scaffold.
        def tpu_scaffold():
          tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
          return tf.train.Scaffold()

        scaffold_fn = tpu_scaffold

    tf.logging.info("**** Trainable Variables ****")
    for variable in trainable_vars:
      init_string = (", *INIT_FROM_CKPT*"
                     if variable.name in restored_names else "")
      tf.logging.info("  name = %s, shape = %s%s", variable.name,
                      variable.shape, init_string)

    if mode == tf.estimator.ModeKeys.TRAIN:
      train_op = optimization.create_optimizer(total_loss, learning_rate,
                                               num_train_steps,
                                               num_warmup_steps, use_tpu)
      return contrib_tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          train_op=train_op,
          scaffold_fn=scaffold_fn)

    if mode == tf.estimator.ModeKeys.EVAL:

      def metric_fn(cpc_loss, mlm_loss, label_ids, logits, is_real_example):
        """Returns overall accuracy, mean losses and per-slot accuracies."""
        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
        overall_accuracy = tf.metrics.accuracy(
            labels=label_ids, predictions=predictions, weights=is_real_example)
        metrics = {
            "eval_accuracy": overall_accuracy,
            "eval_cpc_loss": tf.metrics.mean(values=cpc_loss),
            "eval_mlm_loss": tf.metrics.mean(values=mlm_loss),
        }
        for slot in range(FLAGS.k_size * 2):
          metrics["acc" + str(slot)] = tf.metrics.accuracy(
              labels=label_ids[:, slot],
              predictions=predictions[:, slot],
              weights=is_real_example[:, slot])
        return metrics

      metric_inputs = [
          cpc_loss, masked_lm_loss, label_ids, logits, is_real_example
      ]
      return contrib_tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metrics=(metric_fn, metric_inputs),
          scaffold_fn=scaffold_fn)

    # Any remaining mode is served as prediction.
    return contrib_tpu.TPUEstimatorSpec(
        mode=mode,
        predictions={"probabilities": probabilities},
        scaffold_fn=scaffold_fn)
def create_model(bert_config, is_training, input_ids, input_mask, P_mask,
                 A_mask, B_mask, segment_ids, labels, num_labels,
                 use_one_hot_embeddings):
    """Creates a three-way classification model over masked BERT spans.

    The BERT sequence output is summed over the positions selected by
    ``P_mask``, ``A_mask`` and ``B_mask`` to obtain one vector per span.
    Element-wise products of those vectors are projected to three scalar
    scores (A, B, neither), which form the logits.

    Args:
        bert_config: ``modeling.BertConfig`` for the encoder.
        is_training: Python bool; enables BERT dropout and the dropout applied
            to the projection weights.
        input_ids: Token ids fed to BERT.
        input_mask: Attention mask fed to BERT.
        P_mask: Mask selecting the positions summed into the ``P`` vector.
            Presumably shaped (batch, seq_length) -- TODO confirm with caller.
        A_mask: Mask selecting the positions summed into the ``A`` vector.
        B_mask: Mask selecting the positions summed into the ``B`` vector.
        segment_ids: Token type ids fed to BERT.
        labels: Integer class labels, one-hot encoded with ``num_labels``.
        num_labels: Size of the output bias and the one-hot depth. The logits
            always have three columns (A, B, N), so this must be 3.
        use_one_hot_embeddings: Forwarded to ``modeling.BertModel``.

    Returns:
        Tuple ``(loss, per_example_loss, logits, probabilities)``.
    """
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)

    all_out = model.get_sequence_output()
    hidden_size = all_out.shape[-1].value

    def _masked_sum(mask):
        # Per the original author's note, tf.boolean_mask() works on GPU but
        # not on TPU because it produces dynamically sized tensors. Instead,
        # zero out the unselected positions and sum over the sequence axis.
        # A trailing unit axis lets the mask broadcast across the hidden
        # dimension, which matches the former broadcast_to + transpose code.
        weights = tf.expand_dims(tf.cast(mask, tf.float32), axis=-1)
        return tf.reduce_sum(tf.multiply(all_out, weights), axis=1)

    P = _masked_sum(P_mask)
    A = _masked_sum(A_mask)
    B = _masked_sum(B_mask)

    # Element-wise interaction features between the span vectors.
    PA = tf.multiply(P, A)
    PB = tf.multiply(P, B)
    PP = tf.multiply(P, P)
    AB = tf.multiply(A, B)
    N = tf.subtract(PP, AB)

    # A and B share one projection vector; "neither" has its own.
    AB_weights = tf.get_variable(
        "AB_weights", [1, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    N_weights = tf.get_variable(
        "N_weights", [1, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    if is_training:
        # Original author's note: dropout on weights rather than on data may
        # not be accepted practice, but it seemed to work.
        AB_weights = tf.nn.dropout(AB_weights, keep_prob=0.9)
        N_weights = tf.nn.dropout(N_weights, keep_prob=0.9)

    # Each projection yields one score column per example.
    A_out = tf.matmul(PA, AB_weights, transpose_b=True)
    B_out = tf.matmul(PB, AB_weights, transpose_b=True)
    N_out = tf.matmul(N, N_weights, transpose_b=True)

    output_bias = tf.get_variable("output_bias", [num_labels],
                                  initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        logits = tf.concat([A_out, B_out, N_out], axis=1)
        logits = tf.nn.bias_add(logits, output_bias)
        probabilities = tf.nn.softmax(logits, axis=-1)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
        # Cross entropy against the one-hot labels.
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, per_example_loss, logits, probabilities)
Beispiel #25
0
    def __init__(
            self,
            bert_config,
            char_config,
            is_training,  # is_evaluation,
            input_token_ids,
            input_char_ids,
            labels,
            num_labels,
            use_char_representation=True,
            input_mask=None,
            segment_ids=None,
            use_one_hot_embeddings=False,  # set to True for TPU acceleration
            scope=None):
        """Build a BERT (+ optional char-CNN) encoder followed by a CRF layer.

        :param bert_config: configuration passed to ``modeling.BertModel``
        :param char_config: dict with the keys 'char_embed_dim', 'filters',
            'alphabet_size', 'activations', 'n_highway', 'projection_dim' and
            'char_dropout_rate'; only read when ``use_char_representation``
            is true
        :param is_training: whether the estimator is in train mode
        :param input_token_ids: token ids fed to BERT
        :param input_char_ids: character ids fed to the char-CNN
        :param labels: gold labels
        :param num_labels: number of labels, used for the CRF transition
            matrix
        :param use_char_representation: when true, the char-CNN highway
            output is concatenated to the BERT output along the last axis
        :param input_mask: mask over the input tokens; its sum along the last
            axis gives the sequence lengths passed to the CRF. When None,
            every position is counted as a real token
        :param segment_ids: used by BERT; of little use here because this is
            a single-sequence NER task, so BERT's default of all zeros applies
        :param use_one_hot_embeddings: whether a TPU is used
        :param scope: unused in this constructor
        """
        self.bert_model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_token_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)
        self.token_output = self.bert_model.get_sequence_output()

        if use_char_representation:
            char_embed_dim = char_config['char_embed_dim']
            filters = char_config['filters']
            alphabet_size = char_config['alphabet_size']
            activations = char_config['activations']
            n_highway = char_config['n_highway']
            projection_dim = char_config['projection_dim']
            # NOTE(review): 1.0 outside training only disables dropout if
            # CharRepresentation treats this value as a keep probability --
            # confirm against its implementation.
            char_dropout_rate = char_config[
                'char_dropout_rate'] if is_training else 1.0

            self.charcnn_model = CharRepresentation(
                char_input=input_char_ids,
                alphabet_size=alphabet_size,
                filters=filters,
                projection_dim=projection_dim,
                char_embed_dim=char_embed_dim,
                activations=activations,
                n_highway=n_highway,
                dropout_rate=char_dropout_rate)
            self.char_output = self.charcnn_model.get_highway_output()

            token_shape = modeling.get_shape_list(self.token_output,
                                                  expected_rank=3)
            char_shape = modeling.get_shape_list(self.char_output,
                                                 expected_rank=3)

            # Both representations must agree on the time axis before they
            # can be concatenated feature-wise.
            if token_shape[1] != char_shape[1]:
                raise ValueError(
                    "The time steps of token representation (%d) is not the same as char representation (%d) "
                    % (token_shape[1], char_shape[1]))

            self.final_output = tf.concat(
                [self.token_output, self.char_output], axis=-1)
        else:
            tf.logging.info(
                "****************BERT representation only***************")
            self.final_output = self.token_output

        # input_mask defaults to None, which tf.reduce_sum cannot handle.
        # Fall back to an all-ones mask so that every position counts
        # (assumed to mirror how BertModel treats a missing mask).
        if input_mask is None:
            length_mask = tf.ones_like(input_token_ids, dtype=tf.int32)
        else:
            length_mask = input_mask
        sequece_lengths = tf.reduce_sum(length_mask, axis=-1)
        self.crf = CRF(
            input=self.final_output,
            labels=labels,
            num_labels=num_labels,
            lengths=sequece_lengths,
            is_training=is_training,
        )
Beispiel #26
0
# -*- coding: utf-8 -*-
"""
@Time    : 2021/6/1 14:43
@Author  : huangkai21
@file    : bert_web.py
"""
import tensorflow as tf
from bert import modeling
import os
import collections
import six
from gevent import monkey
monkey.patch_all()
from flask import Flask, request
from gevent import pywsgi
import numpy as np
import json
# Shorthand handles for TensorFlow's command-line flag module and its parsed
# flag values.
flags = tf.flags
FLAGS = flags.FLAGS
# Directory holding the pre-trained BERT files (config, vocabulary and
# checkpoint). Hard-coded to a local drive-E path; it only provides the flag
# defaults below.
bert_path = r'E:\code\chinese_L-12_H-768_A-12/'
flags.DEFINE_string(
    "bert_config_file", os.path.join(bert_path, 'bert_config.json'),
    "The config json file corresponding to the pre-trained BERT model.")
flags.DEFINE_string("bert_vocab_file", os.path.join(bert_path, 'vocab.txt'),
                    "The config vocab file")
flags.DEFINE_string(
    "init_checkpoint", os.path.join(bert_path, 'bert_model.ckpt'),
    "Initial checkpoint (usually from a pre-trained BERT model).")
# Flask application object for this module.
app = Flask(__name__)
Beispiel #27
0
    def cnn(self):
        """Build the BERT + multi-kernel CNN classification graph.

        The token-level output of BERT (``get_sequence_output``) is used as
        the input embedding of a CNN with one convolution branch per entry of
        ``self.config.filter_sizes``. The method reads ``self.bert_config``,
        ``self.config``, ``self.input_ids``, ``self.input_mask``,
        ``self.segment_ids``, ``self.labels``, ``self.keep_prob`` and
        ``self.global_step``, and sets ``self.logits``, ``self.prob``,
        ``self.y_pred_cls``, ``self.loss``, ``self.optim`` and ``self.acc``.
        """
        with tf.name_scope('bert'):
            bert_model = modeling.BertModel(
                config=self.bert_config,
                is_training=self.config.is_training,
                input_ids=self.input_ids,
                input_mask=self.input_mask,
                token_type_ids=self.segment_ids,
                use_one_hot_embeddings=self.config.use_one_hot_embeddings)
            embedding_inputs = bert_model.get_sequence_output()

        # One convolution + global max-pool branch per kernel size; the
        # pooled branch outputs are concatenated feature-wise.
        with tf.name_scope('conv'):
            pooled_outputs = []
            for filter_size in self.config.filter_sizes:
                with tf.compat.v1.variable_scope("conv-maxpool-%s" %
                                                 filter_size,
                                                 reuse=False):
                    conv = tf.layers.conv1d(embedding_inputs,
                                            self.config.num_filters,
                                            filter_size,
                                            name='conv1d')
                    # Max over the time axis. `axis` replaces the deprecated
                    # `reduction_indices` keyword.
                    pooled = tf.reduce_max(conv, axis=[1], name='gmp')
                    pooled_outputs.append(pooled)

            num_filters_total = self.config.num_filters * len(
                self.config.filter_sizes)
            h_pool = tf.concat(pooled_outputs, 1)
            outputs = tf.reshape(h_pool, [-1, num_filters_total])

        # Fully connected layer followed by dropout and ReLU.
        with tf.name_scope('fc'):
            fc = tf.layers.dense(outputs, self.config.hidden_dim, name='fc1')
            fc = tf.nn.dropout(fc, self.keep_prob)
            fc = tf.nn.relu(fc)

        # Class scores, probabilities and predicted class index.
        with tf.name_scope('logits'):
            self.logits = tf.layers.dense(fc,
                                          self.config.num_labels,
                                          name='logits')
            self.prob = tf.nn.softmax(self.logits)
            # Reuse the probabilities instead of building a second softmax.
            self.y_pred_cls = tf.argmax(self.prob, 1)

        # Cross entropy between the one-hot encoded gold labels and the
        # log-probabilities.
        with tf.name_scope('loss'):
            log_probs = tf.nn.log_softmax(self.logits, axis=-1)
            one_hot_labels = tf.one_hot(self.labels,
                                        depth=self.config.num_labels,
                                        dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                              axis=-1)
            self.loss = tf.reduce_mean(per_example_loss)

        # Adam with global-norm gradient clipping.
        with tf.name_scope('optimizer'):
            optimizer = tf.compat.v1.train.AdamOptimizer(self.config.lr)
            gradients, variables = zip(*optimizer.compute_gradients(self.loss))
            gradients, _ = tf.clip_by_global_norm(gradients, self.config.clip)
            self.optim = optimizer.apply_gradients(
                zip(gradients, variables), global_step=self.global_step)

        # Fraction of examples whose predicted class equals the label.
        # NOTE(review): tf.argmax yields int64 by default, so self.labels is
        # presumably int64 as well -- confirm where the placeholder is made.
        with tf.name_scope('accuracy'):
            correct_pred = tf.equal(self.labels, self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
Beispiel #28
0
def model_fn(features, labels, mode, params):
    """Estimator ``model_fn`` for a siamese BERT sentence-pair classifier.

    Both inputs are encoded by the same BERT weights (the second tower reuses
    the ``bert`` variable scope). The two pooled outputs and their absolute
    difference are concatenated and fed to a two-class dense layer.

    Args:
        features: dict with 'X' / 'mask' (first input), 'X_b' / 'mask_b'
            (second input) and 'label', whose first column is the class id.
        labels: unused; the label is read from ``features``.
        mode: a ``tf.estimator.ModeKeys`` value; only TRAIN and EVAL are
            supported.
        params: unused.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for training or evaluation.

    Raises:
        ValueError: If ``mode`` is neither TRAIN nor EVAL.
    """
    bert_config = modeling.BertConfig.from_json_file(
        'bert-base-2020-03-19/bert_config.json')

    # Enable dropout only while training. It used to be hard-coded to True,
    # which kept dropout active during evaluation.
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    X = features['X']
    input_masks = features['mask']

    X_b = features['X_b']
    input_masks_b = features['mask_b']

    Y = features['label'][:, 0]

    with tf.compat.v1.variable_scope('bert', reuse=False):
        model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=X,
            input_mask=input_masks,
            use_one_hot_embeddings=False,
        )

        summary = model.get_pooled_output()

    # Second tower: same scope with reuse=True so the weights are shared.
    with tf.compat.v1.variable_scope('bert', reuse=True):
        model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=X_b,
            input_mask=input_masks_b,
            use_one_hot_embeddings=False,
        )

        summary_b = model.get_pooled_output()

    vectors_concat = tf.concat(
        [summary, summary_b, tf.abs(summary - summary_b)], axis=1)
    logits = tf.layers.dense(vectors_concat, 2)

    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                       labels=Y))
    # Named copies of loss and accuracy so they can be fetched by name.
    tf.identity(loss, 'train_loss')

    accuracy = tf.metrics.accuracy(labels=Y,
                                   predictions=tf.argmax(logits, axis=1))
    tf.identity(accuracy[1], name='train_accuracy')

    tvars = tf.trainable_variables()
    init_checkpoint = 'bert-base-2020-03-19/model.ckpt-2000002'
    assignment_map, _ = get_assignment_map_from_checkpoint(
        tvars, init_checkpoint)
    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = optimization.create_optimizer(loss, learning_rate,
                                                 num_train_steps,
                                                 num_warmup_steps, False)
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op)

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode=tf.estimator.ModeKeys.EVAL,
            loss=loss,
            eval_metric_ops={'accuracy': accuracy},
        )

    # Any other mode previously failed with an UnboundLocalError on the
    # return statement; fail with an explicit message instead.
    raise ValueError(
        'model_fn only supports TRAIN and EVAL modes, got: %s' % mode)
Beispiel #29
0
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """Builds the `TPUEstimatorSpec` for the multiple-choice BERT model.

        This is a closure: `bert_config`, `init_checkpoint`, `use_tpu`,
        `use_one_hot_embeddings`, `learning_rate`, `num_train_steps` and
        `num_warmup_steps` come from the enclosing scope.

        Args:
            features: dict of input tensors. Reads "input_ids<i>",
                "input_mask<i>" and "segment_ids<i>" for i in 0..2, plus
                "labels", of which column 4 is used as the target.
            labels: unused.
            mode: a `tf_estimator.ModeKeys` value.
            params: unused.

        Returns:
            A `contrib_tpu.TPUEstimatorSpec` for training, evaluation, or
            (for any other mode) prediction of the class probabilities.
        """
        tf.logging.info("*** Features ***")
        for feature_name in sorted(features.keys()):
            tf.logging.info("  name = %s, shape = %s" %
                            (feature_name, features[feature_name].shape))

        num_choices = 2
        # One more input slot than there are choices is read.
        slots = range(num_choices + 1)

        ids_per_slot = [features["input_ids" + str(i)] for i in slots]
        mask_per_slot = [features["input_mask" + str(i)] for i in slots]
        segments_per_slot = [features["segment_ids" + str(i)] for i in slots]
        label_ids = features["labels"][:, 4]

        # Stack the slots along a new axis 1, then fold that axis into the
        # batch dimension so BERT receives rows of length seq_length.
        seq_length = ids_per_slot[0].shape[-1]
        flat_shape = [-1, seq_length]
        input_ids = tf.reshape(tf.stack(ids_per_slot, axis=1), flat_shape)
        input_mask = tf.reshape(tf.stack(mask_per_slot, axis=1), flat_shape)
        segment_ids = tf.reshape(tf.stack(segments_per_slot, axis=1),
                                 flat_shape)

        model = modeling.BertModel(
            config=bert_config,
            is_training=(mode == tf_estimator.ModeKeys.TRAIN),
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)

        # Select the model head according to the preprocessing flag.
        if FLAGS.bilin_preproc:
            build_head = model_builder.create_model_bilin
        else:
            build_head = model_builder.create_model
        total_loss, per_example_loss, logits, probabilities = build_head(
            model, label_ids, num_choices)

        trainable_vars = tf.trainable_variables()
        restored_names = {}
        scaffold_fn = None
        if init_checkpoint:
            assignment_map, restored_names = (
                modeling.get_assignment_map_from_checkpoint(
                    trainable_vars, init_checkpoint))
            if not use_tpu:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
            else:
                # On TPU the checkpoint initialization is deferred to the
                # scaffold.
                def tpu_scaffold():
                    tf.train.init_from_checkpoint(init_checkpoint,
                                                  assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold

        tf.logging.info("**** Trainable Variables ****")
        for variable in trainable_vars:
            init_string = (", *INIT_FROM_CKPT*"
                           if variable.name in restored_names else "")
            tf.logging.info("  name = %s, shape = %s%s", variable.name,
                            variable.shape, init_string)

        if mode == tf_estimator.ModeKeys.TRAIN:
            train_op = optimization.create_optimizer(total_loss, learning_rate,
                                                     num_train_steps,
                                                     num_warmup_steps, use_tpu)
            return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                                loss=total_loss,
                                                train_op=train_op,
                                                scaffold_fn=scaffold_fn)

        if mode == tf_estimator.ModeKeys.EVAL:

            def metric_fn(per_example_loss, label_ids, logits):
                """Returns the accuracy and mean loss metrics."""
                predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
                accuracy = tf.metrics.accuracy(labels=label_ids,
                                               predictions=predictions)
                mean_loss = tf.metrics.mean(values=per_example_loss)
                return {"eval_accuracy": accuracy, "eval_loss": mean_loss}

            metric_inputs = [per_example_loss, label_ids, logits]
            return contrib_tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metrics=(metric_fn, metric_inputs),
                scaffold_fn=scaffold_fn)

        # Any remaining mode is served as prediction.
        return contrib_tpu.TPUEstimatorSpec(
            mode=mode,
            predictions={"probabilities": probabilities},
            scaffold_fn=scaffold_fn)
Beispiel #30
0
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 token_label_ids, predicate_matrix_ids, num_token_labels,
                 num_predicate_labels, use_one_hot_embeddings):
    """Creates a joint token-labelling and predicate head-selection model.

    Two heads share the BERT sequence output: a head-selection head scored
    with sigmoid cross entropy against the one-hot predicate matrix, and a
    per-token softmax classifier. The returned loss is the sum of both
    (summed, not averaged) losses.

    The sequence length is read from the static shape of the BERT output, so
    the inputs must have a statically known sequence dimension.

    Args:
        bert_config: ``modeling.BertConfig`` for the encoder.
        is_training: Python bool; enables BERT dropout and a dropout with
            keep probability 0.9 on the sequence output.
        input_ids: Token ids fed to BERT.
        input_mask: Attention mask fed to BERT.
        segment_ids: Token type ids fed to BERT.
        token_label_ids: Integer token labels. Non-zero entries are counted
            as real tokens when determining the longest sequence in the batch.
        predicate_matrix_ids: Integer predicate ids, reshaped to
            [-1, seq_length, seq_length].
        num_token_labels: Number of token label classes.
        num_predicate_labels: Number of predicate classes.
        use_one_hot_embeddings: Forwarded to ``modeling.BertModel``.

    Returns:
        Tuple ``(loss, predicate_head_select_loss,
        predicate_head_probabilities, predicate_head_predictions,
        token_label_loss, token_label_per_example_loss, token_label_logits,
        token_label_predictions)``.
    """
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)

    # Final hidden layer of the encoder (get_sequence_output), not the
    # pooled first-token vector.
    sequence_bert_encode_output = model.get_sequence_output()
    if is_training:
        sequence_bert_encode_output = tf.nn.dropout(
            sequence_bert_encode_output, keep_prob=0.9)

    # Static sequence length, shared by both heads below.
    bert_sequenc_length = sequence_bert_encode_output.shape[-2].value

    with tf.variable_scope("predicate_head_select_loss"):
        # Per the original comments, the scores have shape
        # [batch_size, sequence_length, sequence_length, num_predicate_labels].
        predicate_score_matrix = getHeadSelectionScores(
            encode_input=sequence_bert_encode_output,
            hidden_size_n1=100,
            label_number=num_predicate_labels)
        predicate_head_probabilities = tf.nn.sigmoid(predicate_score_matrix)
        # Independent decision per label: round each probability to 0 or 1.
        predicate_head_predictions = tf.cast(
            tf.round(predicate_head_probabilities), tf.int32)
        predicate_matrix = tf.reshape(
            predicate_matrix_ids,
            [-1, bert_sequenc_length, bert_sequenc_length])
        gold_predicate_matrix_one_hot = tf.one_hot(predicate_matrix,
                                                   depth=num_predicate_labels,
                                                   dtype=tf.float32)
        predicate_sigmoid_cross_entropy_with_logits = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=predicate_score_matrix,
            labels=gold_predicate_matrix_one_hot)

        def batch_sequence_matrix_max_sequence_length(batch_sequence_matrix):
            """Get the longest effective length of the input sequence (excluding padding)"""
            # Entries equal to 0 are treated as padding.
            mask = tf.math.logical_not(tf.math.equal(batch_sequence_matrix, 0))
            mask = tf.cast(mask, tf.float32)
            mask_length = tf.reduce_sum(mask, axis=1)
            mask_length = tf.cast(mask_length, tf.int32)
            return tf.reduce_max(mask_length)

        mask_max_length = batch_sequence_matrix_max_sequence_length(
            token_label_ids)

        # Restrict the loss to the square covering the longest real sequence
        # in the batch, so all-padding rows and columns do not contribute.
        predicate_sigmoid_cross_entropy_with_logits = (
            predicate_sigmoid_cross_entropy_with_logits[
                :, :mask_max_length, :mask_max_length, :])
        # Scalar.
        predicate_head_select_loss = tf.reduce_sum(
            predicate_sigmoid_cross_entropy_with_logits)

    with tf.variable_scope("token_label_loss"):
        bert_encode_hidden_size = sequence_bert_encode_output.shape[-1].value
        token_label_output_weight = tf.get_variable(
            "token_label_output_weights",
            [num_token_labels, bert_encode_hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))
        token_label_output_bias = tf.get_variable(
            "token_label_output_bias", [num_token_labels],
            initializer=tf.zeros_initializer())
        # Flatten to 2-D for the projection, then restore the sequence axis.
        sequence_bert_encode_output = tf.reshape(sequence_bert_encode_output,
                                                 [-1, bert_encode_hidden_size])
        token_label_logits = tf.matmul(sequence_bert_encode_output,
                                       token_label_output_weight,
                                       transpose_b=True)
        token_label_logits = tf.nn.bias_add(token_label_logits,
                                            token_label_output_bias)

        # Use the static sequence length of the encoder output rather than
        # the global FLAGS.max_seq_length, consistent with the head above.
        token_label_logits = tf.reshape(
            token_label_logits, [-1, bert_sequenc_length, num_token_labels])
        token_label_log_probs = tf.nn.log_softmax(token_label_logits, axis=-1)

        token_label_one_hot_labels = tf.one_hot(token_label_ids,
                                                depth=num_token_labels,
                                                dtype=tf.float32)
        token_label_per_example_loss = -tf.reduce_sum(
            token_label_one_hot_labels * token_label_log_probs, axis=-1)
        token_label_loss = tf.reduce_sum(token_label_per_example_loss)
        token_label_probabilities = tf.nn.softmax(token_label_logits, axis=-1)
        token_label_predictions = tf.argmax(token_label_probabilities, axis=-1)

    loss = predicate_head_select_loss + token_label_loss
    return (loss, predicate_head_select_loss, predicate_head_probabilities,
            predicate_head_predictions, token_label_loss,
            token_label_per_example_loss, token_label_logits,
            token_label_predictions)