Example #1

# Imports assumed for this snippet (not shown in the original listing):
import argparse
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

from bert4keras.backend import keras
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam
from bert4keras.tokenizers import Tokenizer
from keras.layers import Dense, Lambda
from keras.models import Model

def main():
    parser = argparse.ArgumentParser(description="Run feature extractor")
    parser.add_argument('--maxlen', default=512, type=int)
    parser.add_argument('--batch_size', default=4, type=int)
    parser.add_argument('--source_data', default='./data/labeled_data.csv')
    parser.add_argument(
        '--pretrain_model',
        default='/home/david/pretrain_model/google_bert/chinese_L-12_H-768_A-12'
    )
    parser.add_argument('--finetune_model', default='./best_model.weights')
    # argparse's type=bool would treat any non-empty string as True, so use a flag
    parser.add_argument('--finetune', action='store_true')
    parser.add_argument('--layer_name',
                        default='Transformer-11-FeedForward-Norm')
    parser.add_argument('--task', default='labeled')
    args = parser.parse_args()
    print(args)

    maxlen = args.maxlen
    source_data_path = args.source_data

    pretrain_model = args.pretrain_model
    config_path = os.path.join(pretrain_model, 'bert_config.json')
    checkpoint_path = os.path.join(pretrain_model, 'bert_model.ckpt')
    dict_path = os.path.join(pretrain_model, 'vocab.txt')

    label2id = {'时政': 0, '房产': 1, '财经': 2, '科技': 3, '时尚': 4, '教育': 5, '家居': 6}
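    # keys above are news categories: politics, real estate, finance, technology,
    # fashion, education, home furnishing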

    def build_model():
        learning_rate = 1e-5
        bert = build_transformer_model(
            config_path,
            checkpoint_path,
            return_keras_model=False,
        )

        output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
        output = Dense(units=len(label2id),
                       activation='softmax',
                       kernel_initializer=bert.initializer)(output)

        model = keras.models.Model(bert.model.input, output)
        # model.summary()

        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=Adam(learning_rate),  # use a sufficiently small learning rate
            metrics=['accuracy'],
        )
        return model

    def load_content(filename):
        df = pd.read_csv(filename)
        text = []
        label = []
        if args.task == 'labeled':
            for row in df.itertuples():
                text.append(row.content)
                label.append(label2id[row.class_label])
        else:
            for t in df['content']:
                text.append(t)

        return text, label

    data, label = load_content(source_data_path)

    # build the tokenizer
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

    if args.finetune:
        model = build_model()
        model.load_weights(args.finetune_model)
        model = Model(inputs=model.input,
                      outputs=model.get_layer(args.layer_name).output)

        model.summary()
    else:
        model = build_transformer_model(config_path, checkpoint_path)
        model.summary()

    cls_vectors = []
    mean_vectors = []
    for text in tqdm(data):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        # one forward pass per text; reuse the token features for both poolings
        token_features = model.predict(
            [np.array([token_ids]),
             np.array([segment_ids])])[0]
        cls_fea = token_features[0]
        mean_fea = np.mean(token_features, axis=0)
        assert cls_fea.shape[0] == mean_fea.shape[0]
        cls_vectors.append(cls_fea)
        mean_vectors.append(mean_fea)

    print('Save data')
    np.savetxt(
        './output/{}_cls_features.txt'.format(
            'pretrain' if not args.finetune else 'finetune'), cls_vectors)
    np.savetxt(
        './output/{}_mean_features.txt'.format(
            'pretrain' if not args.finetune else 'finetune'), mean_vectors)
    if args.task == 'labeled':
        np.savetxt('./output/labels.txt', np.array(label))
Example #2
def get_tokenizer(dict_path):
    """建立分词器
    """
    return Tokenizer(dict_path, do_lower_case=True)
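
# A minimal usage sketch, not part of the original example; the vocab path is a
# placeholder. bert4keras' Tokenizer.encode returns (token_ids, segment_ids).
# tokenizer = get_tokenizer('/path/to/chinese_L-12_H-768_A-12/vocab.txt')
# token_ids, segment_ids = tokenizer.encode(u'今天天气不错', maxlen=32)
# print(token_ids)    # [CLS] id, word-piece ids, [SEP] id
# print(segment_ids)  # all zeros for a single-sentence input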
Example #3
# BERT configuration
config_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/vocab.txt'

# Training samples: the THUCNews dataset, one txt file per sample.
txts = glob.glob('/root/thuctc/THUCNews/*/*.txt')

# Load and simplify the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)


class data_generator(DataGenerator):
    """数据生成器
    """
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids = [], []
        for is_end, txt in self.sample(random):
            text = open(txt, encoding='utf-8').read()
            text = text.split('\n')
            if len(text) > 1:
                title = text[0]
                content = '\n'.join(text[1:])
                token_ids, segment_ids = tokenizer.encode(content,
                                                          title,
Example #4
                kernel=self.config.kernel,
                bias=self.config.bias)
        return vecs_encode


if __name__ == '__main__':
    # save the model, etc.
    from bertWhiteConf import bert_white_config

    bert_white_model = BertSimModel(bert_white_config)
    bert_white_model.load_pretrain_model()
    bert_white_model.save_model_builder()

    from bertWhiteConf import bert_white_config
    config = Namespace(**bert_white_config)
    tokenizer = Tokenizer(os.path.join(config.bert_dir, config.dict_path),
                          do_lower_case=True)
    text = "你还会什么"
    token_id = tokenizer.encode(text, max_length=config.maxlen)
    print(token_id)
"""
# cpu
docker run -t --rm -p 8532:8501 -v "/TF-SERVING/chatbot_tf:/models/chatbot_tf" -e MODEL_NAME=chatbot_tf tensorflow/serving:latest

# gpu
docker run --runtime=nvidia -p 8532:8501 -v "/TF-SERVING/chatbot_tf:/models/chatbot_tf" -e MODEL_NAME=chatbot_tf tensorflow/serving:1.14.0-gpu

# remarks
# batch_size can also be configured via batch.cfg and similar files

# health testing
curl http://127.0.0.1:8532/v1/models/chatbot_tf
"""
#! -*- coding: utf-8 -*-
# Basic test: Chinese GPT2_ML model
# Reference: https://kexue.fm/archives/7292

import numpy as np
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import AutoRegressiveDecoder
from bert4keras.snippets import uniout

config_path = '/root/kg/bert/gpt2_ml/config.json'
checkpoint_path = '/root/kg/bert/gpt2_ml/model.ckpt-100000'
dict_path = '/root/kg/bert/gpt2_ml/vocab.txt'

tokenizer = Tokenizer(dict_path,
                      token_start=None,
                      token_end=None,
                      do_lower_case=True)  # build the tokenizer

model = build_transformer_model(config_path=config_path,
                                checkpoint_path=checkpoint_path,
                                model='gpt2_ml')  # build the model, load weights


class ArticleCompletion(AutoRegressiveDecoder):
    """基于随机采样的文章续写
    """
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids = np.concatenate([inputs[0], output_ids], 1)
        return self.last_token(model).predict(token_ids)
from bert4keras.optimizers import Adam
from bert4keras.backend import K, batch_gather, keras
from bert4keras.layers import LayerNormalization
from keras.layers import *
from keras.models import Model
# import os, sys
# sys.path.append(os.getcwd())
from test4nlp.event_extract_chq.config import event_extract_config as Config
from test4nlp.event_extract_chq.train.utils.data_utils import get_data, data_generator
from tqdm import tqdm
import json
import numpy as np

train_data, valid_data = get_data(Config.read_data_path)
# build the tokenizer
tokenizer = Tokenizer(Config.dict_path, do_lower_case=True)


def extrac_trigger(inputs):
    "根据subject_ids从output中取出subject的向量表征"
    output, trigger_ids = inputs
    trigger_ids = K.cast(trigger_ids, 'int32')
    start = batch_gather(output, trigger_ids[:, :1])
    end = batch_gather(output, trigger_ids[:, 1:])
    trigger = K.concatenate([start, end], 2)
    return trigger[:, 0]


def build_model():
    bert_model = build_transformer_model(
        config_path=Config.config_path,
Example #7
keep_tokens = [2] * 106 + keep_tokens
keep_tokens_inv = {j: i for i, j in enumerate(keep_tokens)}

compound_tokens = []
for t, _ in sorted(token_dict.items(), key=lambda s: s[1]):
    if t not in new_token_dict:
        new_token_dict[t] = len(new_token_dict)
        ids = [keep_tokens_inv.get(i, 0) for i in sp_tokenizer.encode(t)[0]]
        compound_tokens.append(ids)

save_vocab(dict_path_2, new_token_dict)

# build the tokenizer
tokenizer = Tokenizer(new_token_dict,
                      do_lower_case=True,
                      pre_tokenize=lambda s: jieba.cut(s, HMM=False))


def corpus():
    """语料生成器
    """
    while True:
        f = '/root/data_pretrain/data_shuf.json'
        with open(f) as f:
            for l in f:
                l = json.loads(l)
                for texts in text_process(l['text']):
                    yield texts

Example #8
# -*- coding: utf-8 -*-
"""
@Time    : 2021/6/1 11:05
@Author  : huangkai21
@file    : basic_feature_extract.py
"""
# ! -*- coding: utf-8 -*-
# Test that the code works: feature extraction
"""
Convert every token in the vocab into a vector and save the result as a JSON dict
"""
import numpy as np
from bert4keras.backend import keras
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
# from bert4keras.snippets import to_array

def to_array(*args):
    """批量转numpy的array
    """
    results = [np.array(a) for a in args]
    if len(args) == 1:
        return results[0]
    else:
        return results
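
# A hedged usage sketch (not in the original snippet): to_array batches a single
# encoded example right before model.predict, assuming `model` is a Keras model
# built with build_transformer_model (not shown in this extract).
# token_ids, segment_ids = tokenizer.encode(u'语言模型')
# token_ids, segment_ids = to_array([token_ids], [segment_ids])
# features = model.predict([token_ids, segment_ids])  # shape (1, seq_len, hidden)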

config_path = r'E:\code\chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = r'E:\code\chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = r'E:\code\chinese_L-12_H-768_A-12/vocab.txt'
tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
from bert4keras.tokenizers import Tokenizer
from config import config
from sklearn.preprocessing import LabelEncoder
import os
import tensorflow as tf
import numpy as np
from Load_Data import load_data
from tqdm import tqdm

tokenizer = Tokenizer(os.path.join(config.model_dir, 'vocab.txt'),
                      do_lower_case=True)


def _tokenize(text):
    token_id, seg_id = tokenizer.encode(text)
    return token_id, seg_id


def _pad_seuqences(tokens):
    return tf.keras.preprocessing.sequence.pad_sequences(
        tokens, maxlen=config.seq_maxlen, truncating='post', padding='post')


# split each pair into two token sequences
def tokenize_data(data):
    token_ids_1 = []
    token_ids_2 = []
    seg_ids_1 = []
    seg_ids_2 = []
    tags = []
    for sent in tqdm(data):
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--config_path',
                        type=str,
                        required=True,
                        help='path to the BERT config file')
    parser.add_argument('--checkpoint_path',
                        type=str,
                        required=True,
                        help='path to the BERT checkpoint')
    parser.add_argument('--dict_path',
                        type=str,
                        required=True,
                        help='path to the vocab file')
    parser.add_argument('--train_data_path',
                        type=str,
                        required=True,
                        help='path to the training data')
    parser.add_argument('--epochs',
                        default=5,
                        type=int,
                        required=False,
                        help='number of training epochs')
    parser.add_argument('--batch_size',
                        default=8,
                        type=int,
                        required=False,
                        help='training batch size')
    parser.add_argument('--lr',
                        default=1e-5,
                        type=float,
                        required=False,
                        help='learning rate')
    parser.add_argument('--topk1',
                        default=25,
                        type=int,
                        required=False,
                        help='top-k for random sampling when generating s1')
    parser.add_argument('--topk2',
                        default=2,
                        type=int,
                        required=False,
                        help='beam width for beam search when generating s2')
    parser.add_argument('--max_seq_len',
                        default=256,
                        type=int,
                        required=False,
                        help='maximum sequence length')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    maxlen = args.max_seq_len
    config_path = args.config_path
    checkpoint_path = args.checkpoint_path
    dict_path = args.dict_path
    batch_size = args.batch_size
    epochs = args.epochs
    topk1 = args.topk1
    topk2 = args.topk2
    num_classes = 2
    lr = args.lr

    train_data = args.train_data_path

    token_dict, keep_tokens = load_vocab(
        dict_path=dict_path,
        simplified=True,
        startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
    )
    tokenizer = Tokenizer(token_dict, do_lower_case=True)

    train_df = pd.read_csv(train_data, sep='\t', header=None)
    train_df.columns = ['s1', 's2', 'label']

    class data_generator(DataGenerator):
        """数据生成器
        """
        def __iter__(self, r=False):
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            batch_token_ids, batch_segment_ids, batch_labels = [], [], []
            for i in idxs:
                line = self.data.loc[i]
                if (random.random() < 0.5):
                    s1 = line['s1'].replace('***', '*')
                    s2 = line['s2'].replace('***', '*')
                else:
                    s2 = line['s1'].replace('***', '*')
                    s1 = line['s2'].replace('***', '*')
                token_ids, segment_ids = tokenizer.encode(s1,
                                                          s2,
                                                          max_length=maxlen)
                batch_token_ids.append(token_ids)
                batch_segment_ids.append(segment_ids)
                batch_labels.append([line['label']])
                if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
                    batch_token_ids = sequence_padding(batch_token_ids)
                    batch_segment_ids = sequence_padding(batch_segment_ids)
                    batch_labels = sequence_padding(batch_labels)
                    yield [batch_token_ids, batch_segment_ids,
                           batch_labels], None
                    batch_token_ids, batch_segment_ids, batch_labels = [], [], []

    class CrossEntropy(Loss):
        """交叉熵作为loss,并mask掉padding部分
        """
        def compute_loss(self, inputs, mask=None):
            y_true, y_pred = inputs
            if mask[1] is None:
                y_mask = 1.0
            else:
                y_mask = K.cast(mask[1], K.floatx())[:, 1:]
            y_true = y_true[:, 1:]  # target token_ids
            y_pred = y_pred[:, :-1]  # predicted sequence, shifted by one position
            loss = K.sparse_categorical_crossentropy(y_true, y_pred)
            loss = K.sum(loss * y_mask) / K.sum(y_mask)
            return loss

    c_in = Input(shape=(1, ))
    c = Embedding(num_classes, maxlen)(c_in)
    c = Reshape((maxlen, ))(c)

    model = build_transformer_model(
        config_path,
        checkpoint_path,
        application='lm',
        keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, shrinking the vocab
        layer_norm_cond=c,
        additional_input_layers=c_in,
    )
    output = CrossEntropy(1)([model.inputs[0], model.outputs[0]])
    model = Model(model.inputs, output)
    model.compile(optimizer=Adam(lr))
    model.summary()

    def random_generate(c=0, n=2, s1_topk=5):
        """随机采样生成句子对
        每次从最高概率的topk个token中随机采样一个
        """
        label_ids = [[c] for _ in range(n)]
        target_ids = [[2] for _ in range(n)]
        sep_index = [0 for _ in range(n)]
        R = []
        for i in range(64):
            segment_ids = []
            for t, index in zip(target_ids, sep_index):
                if index > 0:
                    segment_ids.append([0] * index + [1] * (len(t) - index))
                else:
                    segment_ids.append([0] * len(t))
            # ignore [PAD], [UNK], [CLS] below
            _probas = model.predict([target_ids, segment_ids,
                                     label_ids])[:, -1, 3:]
            for j, p in enumerate(_probas):
                p_arg_topk = p.argsort()[::-1][:s1_topk]
                #if 0 in p_arg_topk:
                #    target_ids[j].append(3)
                #else:
                p_topk = p[p_arg_topk]
                p = p_topk / sum(p_topk)
                idx = np.random.choice(len(p), p=p)
                target_ids[j].append(p_arg_topk[idx] + 3)

                if p_arg_topk[idx] + 3 == 3 and sep_index[j] == 0:
                    sep_index[j] = i
        for tokens in target_ids:
            tokens.append(3)
            cls_index = tokens.index(3)
            R.append(tokenizer.decode(tokens[:cls_index]))
            #sentences.sort(key = lambda i:len(i),reverse=True)
        return R

    def gen_sent(s, label, topk=2):
        """beam search解码
        每次只保留topk个最优候选结果;如果topk=1,那么就是贪心搜索
        """
        label_ids = [[label] for _ in range(topk)]
        token_ids, segment_ids = tokenizer.encode(s)
        target_ids = [[] for _ in range(topk)]  # candidate answer ids
        target_scores = [0] * topk  # candidate answer scores
        for i in range(64):  # force the output to be no more than max_output_len characters
            _target_ids = [token_ids + t for t in target_ids]
            _segment_ids = [segment_ids + [1] * len(t) for t in target_ids]
            # ignore [PAD], [UNK], [CLS]
            _probas = model.predict(
                [_target_ids, _segment_ids, label_ids])[:, -1, 3:]
            _log_probas = np.log(_probas + 1e-6)  # take logs for easier accumulation
            _topk_arg = _log_probas.argsort(axis=1)[:, -topk:]  # top-k per candidate
            _candidate_ids, _candidate_scores = [], []
            for j, (ids, sco) in enumerate(zip(target_ids, target_scores)):
                # When predicting the first token, the top-k inputs are all identical,
                # so only the first candidate needs to be considered.
                if i == 0 and j > 0:
                    continue
                for k in _topk_arg[j]:
                    _candidate_ids.append(ids + [k + 3])
                    _candidate_scores.append(sco + _log_probas[j][k])
            _topk_arg = np.argsort(_candidate_scores)[-topk:]  # select the new top-k
            target_ids = [_candidate_ids[k] for k in _topk_arg]
            target_scores = [_candidate_scores[k] for k in _topk_arg]
            best_one = np.argmax(target_scores)
            if target_ids[best_one][-1] == 3:
                return tokenizer.decode(target_ids[best_one])
        # if no end token appears within the length limit, return the best candidate
        return tokenizer.decode(target_ids[np.argmax(target_scores)])

    def gen_sen_pair(label, n, s1_topk, s2_topk):
        s1_pair = random_generate(label, n, s1_topk)
        output = []
        for line in s1_pair:
            s2 = gen_sent(line, label, s2_topk)
            output.append([line, s2])
        return output

    class Evaluate(keras.callbacks.Callback):
        def __init__(self):
            self.lowest = 1e10

        def on_epoch_end(self, epoch, logs=None):
            # save the best model
            if logs['loss'] <= self.lowest:
                self.lowest = logs['loss']
                model.save_weights('./best_model.weights')
            print("正样本:")
            print(gen_sen_pair(1, 2, topk1, topk2))
            print("负样本:")
            print(gen_sen_pair(0, 2, topk1, topk2))

    train_generator = data_generator(train_df, batch_size)
    evaluator = Evaluate()
    model.fit_generator(train_generator.forfit(),
                        steps_per_epoch=len(train_generator),
                        epochs=epochs,
                        callbacks=[evaluator])
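
# A hedged invocation sketch (not in the original); the script name and paths
# below are placeholders, only the flags come from the argparse setup above:
# python train_lm_pairs.py \
#     --config_path /path/to/bert_config.json \
#     --checkpoint_path /path/to/bert_model.ckpt \
#     --dict_path /path/to/vocab.txt \
#     --train_data_path /path/to/train.tsv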
def load_model(model_file, bert_model_path, do_lower_case):
    global g_model, g_tokenizer
    g_model = keras.models.load_model(model_file)
    # print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
    dict_path = '%s/vocab.txt' % bert_model_path
    g_tokenizer = Tokenizer(dict_path, do_lower_case=do_lower_case)  # build the tokenizer
Example #12
        """T5相对位置编码
        """
        if self.position_bias is None:

            x = inputs
            p = self.apply(inputs=[x, x],
                           layer=RelativePositionEmbeddingT5,
                           input_dim=32,
                           output_dim=self.num_attention_heads,
                           bidirectional=True,
                           embeddings_initializer=self.initializer,
                           name='Embedding-Relative-Position')
            self.position_bias = p

        return self.position_bias

    @classmethod
    def startswith(cls, inputs):
        return False


t5s_tokenizer = Tokenizer(pretrain_model_save_path + "vocab.txt")

t5s = build_transformer_model(
    config_path=pretrain_model_save_path + "t5s_config.json",
    model=T5SEncoder,
    # with_mlm='linear',
    return_keras_model=True,
)

t5s.load_weights(pretrain_model_save_path + "model.h5", by_name=True)
Example #13
from bert4keras.tokenizers import Tokenizer, load_vocab
import json
import numpy as np
dict_path = "vocab.txt"

tokenizer = Tokenizer(load_vocab(dict_path))
maskID = tokenizer.token_to_id(tokenizer._token_mask)



def write_Json(content, fileName):
    with open(fileName, "w") as f:
        json.dump(content, f, indent=2)


def read_json(fileName):
    with open(fileName, "r") as f:
        return json.load(f)


def cal_mask(inputs,corrupts,labels):
    assert inputs.shape == corrupts.shape and corrupts.shape == labels.shape
    masked = (labels == 1)
    correct = (inputs == corrupts)
    masked = masked.astype(float)  # np.float was removed in newer NumPy releases
    correct = correct.astype(float)
    mask = masked * correct
    return mask
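
# A small self-check, not part of the original: cal_mask keeps only the positions
# that were masked (labels == 1) AND reconstructed correctly (inputs == corrupts).
# cal_mask(np.array([[5, 7, 9]]),   # original ids
#          np.array([[5, 8, 9]]),   # corrupted/predicted ids
#          np.array([[1, 1, 0]]))   # mask labels
# -> array([[1., 0., 0.]])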
# note: the def line was missing from this extract; the function name and first
# parameter are reconstructed from the call in main() further below
def write_result_to_file(predict_result,
                         data,
                         file_path,
                         write_sentences=True):
    with open(file_path, 'w', encoding='utf-8') as f:
        written_sum = 0
        for batch in predict_result:
            for line_result in batch:
                if write_sentences:
                    f.write(data[written_sum][0] + '\n')
                f.write(str(line_result) + ' \n')
                written_sum += 1


sentiment_predictor = SentimentPredictor(CONFIG_PATH, CHECKPOINT_PATH,
                                         NUM_CLASSES)
tokenizer = Tokenizer(DICT_PATH, do_lower_case=True)
aim_list = ['motivation', 'experiment', 'readable', 'relatework', 'novel']


def main(aim):
    original_data = read_excel_data(EXCEL_DATA_PATH, aim)

    data = MyDataGenerator(original_data, tokenizer, MAX_LEN)
    sentiment_predictor.load_weights(os.path.join(MODEL_PATH, aim))
    predict_result = sentiment_predictor.predict(data)

    write_result_to_file(predict_result, original_data,
                         os.path.join(RESULT_PATH, aim + '.txt'))


if __name__ == '__main__':
Example #15
# read the schema
with open('/home/ycy/HBT/data/schema.json',encoding='utf8') as f:
    id2predicate, predicate2id, n = {}, {}, 0
    predicate2type = {}
    for l in f:
        l = json.loads(l)
        predicate2type[l['predicate']] = (l['subject_type'], l['object_type'])
        for k, _ in sorted(l['object_type'].items()):
            key = l['predicate'] + '_' + k
            id2predicate[n] = key
            predicate2id[key] = n
            n += 1
# tokenizer = OurTokenizer(vocab_file=BERT_PATH + "vocab.txt")
tokenizer = BertTokenizer.from_pretrained(model_path,do_lower=True)
tokenizer_k = Tokenizer(os.path.join(model_path,'vocab.txt'), do_lower_case=True)
def search(pattern, sequence):
    """从sequence中寻找子串pattern
    如果找到,返回第一个下标;否则返回-1。
    """
    n = len(pattern)
    for i in range(len(sequence)):
        if sequence[i:i + n] == pattern:
            return i
    return -1
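
# A hedged usage sketch (not in the original): search() is typically used to locate
# an entity's token span inside the encoded passage when building span labels.
# The variable names below are illustrative only.
# passage_ids = tokenizer_k.encode(context)[0]
# entity_ids = tokenizer_k.encode(entity)[0][1:-1]  # strip [CLS]/[SEP]
# start = search(entity_ids, passage_ids)           # -1 if the span is not found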
def sequence_padding(inputs, length=None, padding=0):
    """Numpy函数,将序列padding到同一长度
    """
    if length is None:
        length = max([len(x) for x in inputs])
Example #16
                elif this_flag=='O' and last_flag != 'O':
                    d.append([char,'O'])
                elif this_flag[0] == 'B':
                    d.append([char,this_flag[2:]])
                else:
                    d[-1][0] +=char
                last_flag = this_flag
            D.append(d)

    return D

train_data = load_data('./data/example.train')
valid_data = load_data('./data/example.dev')
test_data = load_data('./data/example.test')

tokenizer = Tokenizer(token_dict=dict_path,do_lower_case=True)


# label mapping
labels = ['PER', 'LOC', 'ORG']
id2label = dict(enumerate(labels))
label2id = {j: i for i, j in id2label.items()}
num_labels = len(labels) * 2 + 1  # B/I for each tag, plus O

class data_generator(DataGenerator):

    def __iter__(self,random=False):
        batch_token_ids, batch_labels = [], []
        for is_end, item in self.sample(random):
            token_ids, labels = [tokenizer._token_start_id], [0]
            for w, l in item:
Example #17
def get_tokenizer(dict_path, pre_tokenize=None):
    """建立分词器
    """
    return Tokenizer(dict_path, do_lower_case=True, pre_tokenize=pre_tokenize)
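
# A hedged usage sketch (not part of the original): pre_tokenize lets the word-piece
# tokenizer run on top of a word segmenter such as jieba (assumed to be installed).
# import jieba
# word_tokenizer = get_tokenizer('/path/to/vocab.txt',
#                                pre_tokenize=lambda s: jieba.cut(s, HMM=False))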
Example #18
                elif this_flag[:1] == 'B':
                    d.append([char, this_flag[2:]])
                else:
                    d[-1][0] += char
                last_flag = this_flag
            D.append(d)
    return D


# labeled data
train_data = load_data('data/china-people-daily-ner-corpus/example.train')
valid_data = load_data('data/china-people-daily-ner-corpus/example.dev')
test_data = load_data('data/china-people-daily-ner-corpus/example.test')

# build the tokenizer
tokenizer = Tokenizer(vocab_path, do_lower_case=True)


class data_generator(DataGenerator):
    """数据生成器
    """

    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_start, batch_end = [], [], [], []
        for is_end, item in self.sample(random):
            for k, v in query_mapping.items():

                query_token_ids, query_segment_ids = tokenizer.encode(v)
                token_ids = query_token_ids.copy()
                start = query_segment_ids.copy()
                end = query_segment_ids.copy()
Example #19
import numpy as np
from bert import tokenization
from tqdm import tqdm
from config import Config
import pandas as pd
import os
from bert4keras.tokenizers import Tokenizer
from bert.data_utils import split_text
vocab_file = Config().vocab_file
do_lower_case = True
re_tokenzier = Tokenizer(vocab_file, do_lower_case)
config = Config()


def load_data(data_file):
    """
    Read the data file.
    :param data_file: path to a CSV with id/question/context/answer/answer_start columns
    :return: list of (id, question, context, answer, answer_start) tuples
    """
    data_df = pd.read_csv(data_file)
    data_df.fillna('', inplace=True)
    lines = list(
        zip(list(data_df['id']), list(data_df['question']),
            list(data_df['context']), list(data_df['answer']),
            list(data_df['answer_start'])))
    return lines


def create_example(lines):
    examples = []
 def set_dict_path(self, path):
     self.dict = path
     self.tokenizer = Tokenizer(self.dict, do_lower_case=True)
Example #21
def build_tokenizer(dict_path):
    '''Load the tokenizer'''
    tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
    return tokenizer
domain_label2id, domain_id2label, intent_label2id, intent_id2label, slot_label2id, slot_id2label = json.load(
    open('data/labels.json', 'r', encoding='utf-8'))

# load the data
data = json.load(open('data/train.json', 'r', encoding='utf-8'))
random.shuffle(data)
valid_data = data[:len(data) // 8]
train_data = data[len(data) // 8:]

# load and simplify the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=vocab_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'])

tokenizer = Tokenizer(token_dict)


# data iterator
class MyDataGenerator(DataGenerator):
    def __init__(self, data, batch_size=32, buffer_size=None):
        super(MyDataGenerator, self).__init__(data, batch_size, buffer_size)

    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, Y1, Y2, Y3 = [], [], [], [], []

        for is_end, item in self.sample(random):

            text = item['text']
            token_ids, segment_ids = tokenizer.encode(first_text=text.lower())
Example #23
 def load_pretrain_model(self):
     """ 加载预训练模型, 和tokenizer """
     self.tokenizer = Tokenizer(os.path.join(self.config.bert_dir,
                                             self.config.dict_path),
                                do_lower_case=True)
     # bert-load
     if self.config.pooling == "pooler":
         bert = build_transformer_model(
             os.path.join(self.config.bert_dir, self.config.config_path),
             os.path.join(self.config.bert_dir,
                          self.config.checkpoint_path),
             model=self.config.model,
             with_pool="linear")
     else:
         bert = build_transformer_model(
             os.path.join(self.config.bert_dir, self.config.config_path),
             os.path.join(self.config.bert_dir,
                          self.config.checkpoint_path),
             model=self.config.model)
     # output-layers
     outputs, count = [], 0
     while True:
         try:
             output = bert.get_layer("Transformer-%d-FeedForward-Norm" %
                                     count).output
             outputs.append(output)
             count += 1
         except:
             break
     # pooling
     if self.config.pooling == "first-last-avg":
         outputs = [
             NonMaskingLayer()(output_i)
             for output_i in [outputs[0], outputs[-1]]
         ]
         outputs = [
             keras.layers.GlobalAveragePooling1D()(fs) for fs in outputs
         ]
         output = keras.layers.Average()(outputs)
     elif self.config.pooling == "first-last-max":
         outputs = [
             NonMaskingLayer()(output_i)
             for output_i in [outputs[0], outputs[-1]]
         ]
         outputs = [keras.layers.GlobalMaxPooling1D()(fs) for fs in outputs]
         output = keras.layers.Average()(outputs)
     elif self.config.pooling == "cls-max-avg":
         outputs = [
             NonMaskingLayer()(output_i)
             for output_i in [outputs[0], outputs[-1]]
         ]
         outputs_cls = [
             keras.layers.Lambda(lambda x: x[:, 0])(fs) for fs in outputs
         ]
         outputs_max = [
             keras.layers.GlobalMaxPooling1D()(fs) for fs in outputs
         ]
         outputs_avg = [
             keras.layers.GlobalAveragePooling1D()(fs) for fs in outputs
         ]
         output = keras.layers.Concatenate()(outputs_cls + outputs_avg)
     elif self.config.pooling == "last-avg":
         output = keras.layers.GlobalAveragePooling1D()(outputs[-1])
     elif self.config.pooling == "cls-3":
         outputs = [
             keras.layers.Lambda(lambda x: x[:, 0])(fs)
             for fs in [outputs[0], outputs[-1], outputs[-2]]
         ]
         output = keras.layers.Concatenate()(outputs)
     elif self.config.pooling == "cls-2":
         outputs = [
             keras.layers.Lambda(lambda x: x[:, 0])(fs)
             for fs in [outputs[0], outputs[-1]]
         ]
         output = keras.layers.Concatenate()(outputs)
     elif self.config.pooling == "cls-1":
         output = keras.layers.Lambda(lambda x: x[:, 0])(outputs[-1])
     elif self.config.pooling == "pooler":
         output = bert.output
      # load the sentence vectors of the standard FAQ questions and treat them as
      # constants in the cosine-similarity computation
     docs_encode = np.loadtxt(
         os.path.join(self.config.save_dir, self.config.path_docs_encode))
      # cosine-similarity layer
     score_cosine = CosineLayer(docs_encode)(output)
      # final encoder
     self.bert_white_encoder = Model(bert.inputs, score_cosine)
     print("load bert_white_encoder success!")
Example #24
from window_layers import WindowEmbedding, WindowEmbeddingforword

epochs = 500
batch_size = 1024
# bert_layers = 12
learing_rate = 1e-4  # the fewer BERT layers used, the larger the learning rate should be
seq_crf_lr_multiplier = 1e-2  # enlarge the CRF layer's learning rate when necessary
tag_crf_lr_multiplier = 1e-2
vocab_size = 21128

# BERT configuration
# config_path = '../../Q_A/publish/bert_config.json'
# checkpoint_path = '../../Q_A/publish/bert_model.ckpt'
dict_path = '../../Q_A/publish/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=False)

labels = [
    'O', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R20', 'R21', 'R22', 'R23',
    'R24', 'R25', 'R30', 'R31', 'R90', 'R99', 'X'
]
seg_labels = ['O', 'B', 'I', 'E']

id2label = dict(enumerate(labels))
label2id = {j: i for i, j in id2label.items()}
num_labels = len(labels)

id2seglabel = dict(enumerate(seg_labels))
seglabel2id = {j: i for i, j in id2seglabel.items()}
num_seglabels = len(seg_labels)
Example #25
compound_tokens = []

for t, i in sorted(token_dict.items(), key=lambda s: s[1]):
    # Two cases are handled here: 1) first letter capitalized; 2) whole word uppercased.
    # Under Python 2 this adds 5594 new tokens; under Python 3, 5596.
    tokens = []
    if t.isalpha():
        tokens.extend([t[:1].upper() + t[1:], t.upper()])
    elif t[:2] == '##' and t[2:].isalpha():
        tokens.append(t.upper())
    for token in tokens:
        if token not in new_token_dict:
            compound_tokens.append([i])
            new_token_dict[token] = len(new_token_dict)

tokenizer = Tokenizer(new_token_dict, do_lower_case=False)

model = build_transformer_model(
    config_path,
    checkpoint_path,
    compound_tokens=compound_tokens,  # add new tokens, initialized from the average of the old ones
)

text = u'Welcome to BEIJING.'
tokens = tokenizer.tokenize(text)
print(tokens)
"""
Output: ['[CLS]', u'Welcome', u'to', u'BE', u'##I', u'##JING', u'.', '[SEP]']
"""

token_ids, segment_ids = tokenizer.encode(text)
Example #26
import jieba
from roformer import RoFormerTokenizer
from bert4keras.tokenizers import Tokenizer

dict_path = 'pretrained_models/chinese_roformer_base'
text = "12312格ab局A B cdA,.567 861351 684!今天萨达天 气非常好王企。文保鹅按时发放了的撒这些seqetvgsa国内拉手的喀什。、]P[,./()*7656&【;,‘"
#text = "这里基本保留了唐宋遗留下来的坊巷格局和大量明清古建筑,其中各级文保单位29处,被誉为“里坊制度的活化石”“明清建筑博物馆”!"
bert4keras_tokenizer = Tokenizer(
    dict_path + "/vocab.txt",
    do_lower_case=True,
    pre_tokenize=lambda s: jieba.cut(s, HMM=False))
roformer_tokenizer = RoFormerTokenizer.from_pretrained(dict_path)

bert4keras_tokenizer_input_ids = bert4keras_tokenizer.encode(text)[0]
roformer_tokenizer_input_ids = roformer_tokenizer.encode(text)

print(bert4keras_tokenizer_input_ids == roformer_tokenizer_input_ids)
Example #27
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            text, label = l.strip().split('\t')
            D.append((text[:128], int(label)))
    return D


# load the datasets
train_data = load_data('datasets/sentiment/sentiment.train.data')
valid_data = load_data('datasets/sentiment/sentiment.valid.data')
test_data = load_data('datasets/sentiment/sentiment.test.data')

# build the tokenizer
tokenizer = Tokenizer(dict_path,
                      do_lower_case=True,
                      pre_tokenize=lambda s: jieba.cut(s, HMM=False))


class data_generator(DataGenerator):
    """数据生成器
    """
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
Example #28
    __, last_part = line.split(':')
    ignore_flag = False
    for dis_word in disallowed_words:
        if dis_word in last_part:
            ignore_flag = True
            break
    if ignore_flag:
        continue
    # the length must not exceed the maximum length
    if len(last_part) > max_len - 2:
        continue
    poetry.append(last_part)

# vocabulary and tokenizer of the pretrained model
_token_dict = load_vocab(dict_path)
_tokenizer = Tokenizer(dict_path, do_lower_case=True)

# count the frequency of every token
word_frequency_count = defaultdict(int)
for line in poetry:
    for t in _tokenizer.tokenize(line):
        word_frequency_count[t] += 1
# filter out low-frequency tokens
tokens = [(token, count) for token, count in word_frequency_count.items()
          if count >= min_word_frequency]
# sort by frequency
tokens = sorted(tokens, key=lambda x: -x[1])
# drop the counts, keeping only the token list
tokens = [token for token, count in tokens]

# build the new token->id mapping and the new vocabulary
Example #29
valid_data = load_data('dev_data/balanced_dev.json')

# read the schema (event schema)
with open('event_schema/event_schema.json') as f:
    id2label, label2id, n = {}, {}, 0
    for l in f:
        l = json.loads(l)
        for role in l['role_list']:
            key = (l['event_type'], role['role'])
            id2label[n] = key
            label2id[key] = n
            n += 1
    num_labels = len(id2label) * 2 + 1

# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)


def search(pattern, sequence):
    """从sequence中寻找子串pattern
    如果找到,返回第一个下标;否则返回-1。
    """
    n = len(pattern)
    for i in range(len(sequence)):
        if sequence[i:i + n] == pattern:
            return i
    return -1


class data_generator(DataGenerator):
    """数据生成器
 def setUpClass(cls) -> None:
     cls.my_tokenizer = BertTokenizer(dict_path, ignore_case=True)
     cls.sjl_tokenizer = Tokenizer(dict_path, do_lower_case=True)