Example #1
def create_model(bert_config,
                 is_training,
                 input_ids,
                 input_mask,
                 segment_ids,
                 labels,
                 num_labels,
                 use_one_hot_embeddings,
                 dropout_rate=1.0,
                 lstm_size=1,
                 cell='lstm',
                 num_layers=1):
    """
    创建X模型
    :param bert_config: bert 配置
    :param is_training:
    :param input_ids: 数据的idx 表示
    :param input_mask:
    :param segment_ids:
    :param labels: 标签的idx 表示
    :param num_labels: 类别数量
    :param use_one_hot_embeddings:
    :return:
    """
    # Load BertModel with the input data to obtain the per-token embeddings.
    # BLSTM_CRF and initializers are assumed to be imported at module level
    # (e.g. initializers from tf.contrib.layers, as in the original project).
    import tensorflow as tf
    from bert_base.bert import modeling
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)
    # Sequence output embeddings, shape [batch_size, seq_length, embedding_size]
    # e.g. embedding: Tensor("bert/encoder/Reshape_13:0", shape=(56, 202, 768), dtype=float32)
    embedding = model.get_sequence_output()
    # e.g. max_seq_length = 202
    max_seq_length = embedding.shape[1].value
    # Compute the true length of each sequence (padding ids are 0)
    # input_ids: Tensor("IteratorGetNext:0", shape=(56, 202), dtype=int32)
    # tf.abs:    Tensor("Abs_1:0", shape=(56, 202), dtype=int32)
    # used:      Tensor("Sign:0", shape=(56, 202), dtype=int32)
    # lengths:   Tensor("Sum:0", shape=(56,), dtype=int32) -- true length of each sequence in the batch
    used = tf.sign(tf.abs(input_ids))
    lengths = tf.reduce_sum(used, axis=1)  # [batch_size] vector with the sequence lengths of the current batch
    # Add the CRF output layer
    blstm_crf = BLSTM_CRF(embedded_chars=embedding,
                          hidden_unit=lstm_size,
                          cell_type=cell,
                          num_layers=num_layers,
                          dropout_rate=dropout_rate,
                          initializers=initializers,
                          num_labels=num_labels,
                          seq_length=max_seq_length,
                          labels=labels,
                          lengths=lengths,
                          is_training=is_training)
    rst = blstm_crf.add_blstm_crf_layer(crf_only=False)
    return rst
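
The tf.sign(tf.abs(input_ids)) trick above counts non-padding tokens per example, assuming the pad token id is 0. A minimal standalone check in TF 1.x (the id values are illustrative):

import tensorflow as tf

ids = tf.constant([[101, 2769, 102, 0, 0],
                   [101, 102, 0, 0, 0]], dtype=tf.int32)
used = tf.sign(tf.abs(ids))            # 1 for real tokens, 0 for padding
lengths = tf.reduce_sum(used, axis=1)  # per-example true lengths

with tf.Session() as sess:
    print(sess.run(lengths))  # [3 2]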
Example #2
def create_model(bert_config,
                 is_training,
                 input_ids,
                 input_mask,
                 segment_ids,
                 labels,
                 num_labels,
                 use_one_hot_embeddings,
                 dropout_rate=1.0,
                 lstm_size=1,
                 cell='lstm',
                 num_layers=1):
    """
    创建模型
    :param bert_config: bert配置
    :param is_training: 是否训练
    :param input_ids: 数据的idx表示
    :param input_mask:
    :param segment_ids:
    :param labels: 标签的idx表示
    :param num_labels: 类别数量
    :param use_one_hot_embeddings:
    :param dropout_rate:
    :param lstm_size:
    :param cell:
    :param num_layers:
    :return:
    """
    # Load BertModel with the input data to obtain the per-token embeddings
    import tensorflow as tf
    from bert_base.bert import modeling
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)
    # Sequence output embeddings, shape [batch_size, seq_length, embedding_size]
    embedding = model.get_sequence_output()
    max_seq_length = embedding.shape[1].value

    # Compute the true length of each sequence (padding ids are 0)
    used = tf.sign(tf.abs(input_ids))
    lengths = tf.reduce_sum(used, axis=1)  # [batch_size] vector with the sequence lengths of the current batch

    # Add the CRF output layer
    blstm_crf = BLSTM_CRF(embedded_chars=embedding,
                          hidden_unit=lstm_size,
                          cell_type=cell,
                          num_layers=num_layers,
                          dropout_rate=dropout_rate,
                          initializers=initializers,
                          num_labels=num_labels,
                          seq_length=max_seq_length,
                          labels=labels,
                          lengths=lengths,
                          is_training=is_training)

    res = blstm_crf.add_blstm_crf_layer(crf_only=True)
    return res
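
Unlike Example #1, this variant passes crf_only=True, decoding with a CRF directly on top of the BERT output. As a point of reference, a minimal sketch of what a CRF-only head computes, built from the standard TF 1.x tf.contrib.crf primitives (shapes and names are illustrative; this is not the project's BLSTM_CRF implementation):

import tensorflow as tf

batch_size, seq_len, hidden, num_labels = 2, 8, 16, 6
embedding = tf.random_normal([batch_size, seq_len, hidden])  # stands in for BERT output
labels = tf.zeros([batch_size, seq_len], dtype=tf.int32)     # dummy gold tags
lengths = tf.constant([8, 5], dtype=tf.int32)                # true sequence lengths

logits = tf.layers.dense(embedding, num_labels)              # project to label space
log_likelihood, trans = tf.contrib.crf.crf_log_likelihood(
    inputs=logits, tag_indices=labels, sequence_lengths=lengths)
loss = tf.reduce_mean(-log_likelihood)                       # CRF negative log-likelihood
pred_ids, _ = tf.contrib.crf.crf_decode(logits, trans, lengths)  # Viterbi decoding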
Example #3
def create_model(bert_config,
                 is_training,
                 input_ids,
                 input_mask,
                 segment_ids,
                 labels,
                 num_labels,
                 use_one_hot_embeddings,
                 dropout_rate=1.0,
                 lstm_size=1,
                 cell='lstm',
                 num_layers=1):
    """
    创建X模型
    :param bert_config: bert 配置
    :param is_training:
    :param input_ids: 数据的idx 表示
    :param input_mask:
    :param segment_ids:
    :param labels: 标签的idx 表示
    :param num_labels: 类别数量
    :param use_one_hot_embeddings:
    :return:
    """
    # Load BertModel with the input data to obtain the per-token embeddings
    import tensorflow as tf
    from bert_base.bert import modeling
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)
    # Allowed-transition mask for the CRF, built from the label vocabulary (IOB2 scheme)
    label_vocab = {"O": 0, "B": 1, "I": 2, "X": 3, "[CLS]": 4, "[SEP]": 5}
    SPAN_TYPE = "IOB2"
    mask = transition_mask(label_vocab, SPAN_TYPE, label_vocab["[CLS]"],
                           label_vocab["[SEP]"])

    # Sequence output embeddings, shape [batch_size, seq_length, embedding_size]
    embedding = model.get_sequence_output()
    max_seq_length = embedding.shape[1].value
    # Compute the true length of each sequence (padding ids are 0)
    used = tf.sign(tf.abs(input_ids))
    lengths = tf.reduce_sum(used, axis=1)  # [batch_size] vector with the sequence lengths of the current batch
    # Add the CRF output layer
    blstm_crf = BLSTM_CRF(embedded_chars=embedding,
                          hidden_unit=lstm_size,
                          cell_type=cell,
                          num_layers=num_layers,
                          dropout_rate=dropout_rate,
                          initializers=initializers,
                          num_labels=num_labels,
                          seq_length=max_seq_length,
                          labels=labels,
                          lengths=lengths,
                          is_training=is_training)
    rst = blstm_crf.add_blstm_crf_layer(crf_only=True)
    return rst
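
The transition_mask helper is not shown in this listing. As a point of reference, here is a hypothetical construction of an IOB2 transition mask (1 = transition allowed, 0 = forbidden); the helper's exact name, signature, and semantics in this project are assumptions:

import numpy as np

def iob2_transition_mask(label_vocab, cls_id, sep_id):
    """Hypothetical stand-in for transition_mask."""
    n = len(label_vocab)
    mask = np.ones((n, n), dtype=np.float32)
    inv = {v: k for k, v in label_vocab.items()}
    for i in range(n):
        for j in range(n):
            prev, curr = inv[i], inv[j]
            # IOB2 rule: an I tag may only follow a B or I tag
            if curr.startswith("I") and not (prev.startswith("B") or prev.startswith("I")):
                mask[i, j] = 0.0
            # nothing transitions back into [CLS]; nothing leaves [SEP]
            if j == cls_id or i == sep_id:
                mask[i, j] = 0.0
    return mask

vocab = {"O": 0, "B": 1, "I": 2, "X": 3, "[CLS]": 4, "[SEP]": 5}
print(iob2_transition_mask(vocab, vocab["[CLS]"], vocab["[SEP]"]))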
Example #4
def create_model(bert_config,
                 is_training,
                 input_ids,
                 input_mask,
                 segment_ids,
                 labels,
                 num_labels,
                 use_one_hot_embeddings,
                 dropout_rate=1.0,
                 lstm_size=1,
                 cell='lstm',
                 num_layers=1,
                 crf_only=False,
                 lstm_only=False):
    """
    创建X模型
    :param bert_config: bert 配置
    :param is_training:
    :param input_ids: 数据的idx 表示
    :param input_mask:
    :param segment_ids:
    :param labels: 标签的idx 表示
    :param num_labels: 类别数量
    :param use_one_hot_embeddings:
    :return:
    """
    # Load BertModel with the input data to obtain the per-token embeddings
    import tensorflow as tf
    from bert_base.bert import modeling
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)
    # Sequence output embeddings, shape [batch_size, seq_length, embedding_size]
    embedding = model.get_sequence_output()
    max_seq_length = embedding.shape[1].value
    # Compute the true length of each sequence (padding ids are 0)
    used = tf.sign(tf.abs(input_ids))
    lengths = tf.reduce_sum(used, axis=1)  # [batch_size] vector with the sequence lengths of the current batch
    # Add the CRF output layer inside a dedicated variable scope
    with tf.variable_scope('finetune'):
        blstm_crf = BLSTM_CRF(embedded_chars=embedding,
                              input_mask=input_mask,
                              hidden_unit=lstm_size,
                              cell_type=cell,
                              num_layers=num_layers,
                              dropout_rate=dropout_rate,
                              initializers=initializers,
                              num_labels=num_labels,
                              seq_length=max_seq_length,
                              labels=labels,
                              lengths=lengths,
                              is_training=is_training)
        rst = blstm_crf.add_blstm_crf_layer(crf_only=crf_only,
                                            lstm_only=lstm_only)
    return rst
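
Wrapping the task head in tf.variable_scope('finetune') makes those variables addressable separately from BERT's, e.g. to train only the head or to restore only the BERT weights from a checkpoint. A minimal sketch with standard TF 1.x calls (the variables here are stand-ins):

import tensorflow as tf

with tf.variable_scope('finetune'):
    w = tf.get_variable('w', shape=[4, 2])    # stands in for the head's weights
b = tf.get_variable('bert_dummy', shape=[4])  # stands in for a BERT weight

loss = tf.reduce_sum(tf.square(w)) + tf.reduce_sum(tf.square(b))

# Collect only the head's variables and optimize those; the rest stays frozen
head_vars = tf.trainable_variables(scope='finetune')
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss, var_list=head_vars)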
Example #5
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
    """
    创建X模型
    :param bert_config: bert 配置
    :param is_training:
    :param input_ids: 数据的idx 表示
    :param input_mask:
    :param segment_ids:
    :param labels: 标签的idx 表示
    :param num_labels: 类别数量
    :param use_one_hot_embeddings
    :return:
    """
    # Load BertModel with the input data to obtain the per-token embeddings.
    # (tf, modeling, FLAGS, initializers and BLSTM_CRF are assumed to be
    # imported at module level.)
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)
    print("获取到bert的输出")
    # Sequence output embeddings, shape [batch_size, seq_length, embedding_size]
    embedding = model.get_sequence_output()
    max_seq_length = embedding.shape[1].value

    used = tf.sign(tf.abs(input_ids))
    lengths = tf.reduce_sum(used, axis=1)  # [batch_size] vector with the sequence lengths of the current batch
    # Feed the embeddings into the LSTM:
    # instantiate the BLSTM-CRF model object
    blstm_crf = BLSTM_CRF(embedded_chars=embedding,
                          hidden_unit=FLAGS.lstm_size,
                          cell_type=FLAGS.cell,
                          num_layers=FLAGS.num_layers,
                          dropout_rate=FLAGS.dropout_rate,
                          initializers=initializers,
                          num_labels=num_labels,
                          seq_length=max_seq_length,
                          labels=labels,
                          lengths=lengths,
                          is_training=is_training)
    # Get the results after adding our own network on top; they include
    # loss, logits, trans, pred_ids.
    # Because BERT already encodes bidirectionally, the LSTM is not strictly
    # necessary: the last BERT layer can be handed directly to the CRF for decoding.
    try:
        rst = blstm_crf.add_blstm_crf_layer()
    except Exception as e:
        print(str(e))
        raise  # without re-raising, `rst` would be unbound below
    return rst
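
This variant reads its hyperparameters from FLAGS instead of keyword arguments. A minimal sketch of the flag definitions it expects, using the TF 1.x flags API (the default values are illustrative):

import tensorflow as tf

flags = tf.app.flags
flags.DEFINE_integer('lstm_size', 128, 'Number of hidden units in the LSTM cell')
flags.DEFINE_string('cell', 'lstm', 'RNN cell type used by the BLSTM-CRF layer')
flags.DEFINE_integer('num_layers', 1, 'Number of stacked RNN layers')
flags.DEFINE_float('dropout_rate', 0.5, 'Dropout rate for the BLSTM-CRF layer')
FLAGS = flags.FLAGS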
Example #6
def create_model(bert_config,
                 is_training,
                 input_ids,
                 input_mask,
                 segment_ids,
                 labels,
                 num_labels,
                 use_one_hot_embeddings,
                 pos_ids,
                 dropout_rate=1.0,
                 lstm_size=1,
                 cell='lstm',
                 num_layers=1,
                 add_pos_embedding=True):
    """
    创建X模型
    :param bert_config: bert 配置
    :param is_training:
    :param input_ids: 数据的idx 表示
    :param input_mask:
    :param segment_ids:
    :param pos_ids: 词性的idx 表示
    :param labels: 标签的idx 表示
    :param num_labels: 类别数量
    :param use_one_hot_embeddings:
    :return:
    """
    # Load BertModel with the input data to obtain the per-token embeddings
    import tensorflow as tf
    from bert_base.bert import modeling
    model = modeling.BertModel(config=bert_config,
                               is_training=is_training,
                               input_ids=input_ids,
                               input_mask=input_mask,
                               token_type_ids=segment_ids,
                               use_one_hot_embeddings=use_one_hot_embeddings)
    # Sequence output embeddings, shape [batch_size, seq_length, embedding_size]
    embedding = model.get_sequence_output()
    # jzhang: optionally fuse part-of-speech embeddings into the token embeddings
    if add_pos_embedding:
        embedding = embedding_addpos(embedding, pos_ids)
    max_seq_length = embedding.shape[1].value
    # Compute the true length of each sequence (padding ids are 0)
    used = tf.sign(tf.abs(input_ids))
    lengths = tf.reduce_sum(used, axis=1)  # [batch_size] vector with the sequence lengths of the current batch
    # Add the CRF output layer
    blstm_crf = BLSTM_CRF(embedded_chars=embedding,
                          hidden_unit=lstm_size,
                          cell_type=cell,
                          num_layers=num_layers,
                          dropout_rate=dropout_rate,
                          initializers=initializers,
                          num_labels=num_labels,
                          seq_length=max_seq_length,
                          labels=labels,
                          lengths=lengths,
                          is_training=is_training)
    # jzhang: setting crf_only=True makes the model decode with the CRF alone;
    # with crf_only=False the model uses BiLSTM-CRF as the decoder.
    rst = blstm_crf.add_blstm_crf_layer(crf_only=False)
    return rst
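
embedding_addpos is not shown in this example. A hypothetical sketch of such a fusion step: look up a trainable embedding for each POS id and add it to the BERT output. The function name, vocabulary size, and initializer are assumptions:

import tensorflow as tf

def embedding_addpos_sketch(embedding, pos_ids, pos_vocab_size=64):
    """Hypothetical stand-in for embedding_addpos."""
    dim = embedding.shape[-1].value  # match BERT's hidden size so the tensors can be added
    pos_table = tf.get_variable('pos_embedding', [pos_vocab_size, dim],
                                initializer=tf.truncated_normal_initializer(stddev=0.02))
    pos_emb = tf.nn.embedding_lookup(pos_table, pos_ids)  # [batch, seq_len, dim]
    return embedding + pos_emb

# usage sketch:
# embedding = embedding_addpos_sketch(model.get_sequence_output(), pos_ids)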