def main():
    parser = argparse.ArgumentParser(description="Run feature extractor")
    parser.add_argument('--maxlen', default=512, type=int)
    parser.add_argument('--batch_size', default=4, type=int)
    parser.add_argument('--source_data', default='./data/labeled_data.csv')
    parser.add_argument(
        '--pretrain_model',
        default='/home/david/pretrain_model/google_bert/chinese_L-12_H-768_A-12'
    )
    parser.add_argument('--finetune_model', default='./best_model.weights')
    # store_true avoids the argparse type=bool pitfall (any non-empty string is truthy)
    parser.add_argument('--finetune', action='store_true')
    parser.add_argument('--layer_name', default='Transformer-11-FeedForward-Norm')
    parser.add_argument('--task', default='labeled')
    args = parser.parse_args()
    print(args)

    maxlen = args.maxlen
    source_data_path = args.source_data
    pretrain_model = args.pretrain_model
    config_path = os.path.join(pretrain_model, 'bert_config.json')
    checkpoint_path = os.path.join(pretrain_model, 'bert_model.ckpt')
    dict_path = os.path.join(pretrain_model, 'vocab.txt')

    label2id = {'时政': 0, '房产': 1, '财经': 2, '科技': 3, '时尚': 4, '教育': 5, '家居': 6}

    def build_model():
        learning_rate = 1e-5
        bert = build_transformer_model(
            config_path,
            checkpoint_path,
            return_keras_model=False,
        )
        output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
        output = Dense(units=len(label2id),
                       activation='softmax',
                       kernel_initializer=bert.initializer)(output)
        model = keras.models.Model(bert.model.input, output)
        # model.summary()
        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=Adam(learning_rate),  # use a sufficiently small learning rate
            metrics=['accuracy'],
        )
        return model

    def load_content(filename):
        df = pd.read_csv(filename)
        text = []
        label = []
        if args.task == 'labeled':
            for row in df.itertuples():
                text.append(row.content)
                label.append(label2id[row.class_label])
        else:
            for t in df['content']:
                text.append(t)
        return text, label

    data, label = load_content(source_data_path)

    # Build the tokenizer
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

    if args.finetune:
        model = build_model()
        model.load_weights(args.finetune_model)
        model = Model(inputs=model.input,
                      outputs=model.get_layer(args.layer_name).output)
        model.summary()
    else:
        model = build_transformer_model(config_path, checkpoint_path)
        model.summary()

    cls_vectors = []
    mean_vectors = []
    for text in tqdm(data):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        # Run the model once per sample and reuse the output for both features
        fea = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
        cls_fea = fea[0]
        mean_fea = np.mean(fea, axis=0)
        assert cls_fea.shape[0] == mean_fea.shape[0]
        cls_vectors.append(cls_fea)
        mean_vectors.append(mean_fea)

    print('Save data')
    np.savetxt(
        './output/{}_cls_features.txt'.format(
            'pretrain' if not args.finetune else 'finetune'), cls_vectors)
    np.savetxt(
        './output/{}_mean_features.txt'.format(
            'pretrain' if not args.finetune else 'finetune'), mean_vectors)
    if args.task == 'labeled':
        np.savetxt('./output/labels.txt', np.array(label))
def get_tokenizer(dict_path):
    """Build the tokenizer."""
    return Tokenizer(dict_path, do_lower_case=True)
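# A minimal usage sketch of get_tokenizer; the vocab path below is a placeholder,
# not taken from the original code.
if __name__ == '__main__':
    tokenizer = get_tokenizer('/path/to/chinese_L-12_H-768_A-12/vocab.txt')
    token_ids, segment_ids = tokenizer.encode(u'今天天气不错')
    # token_ids starts with [CLS] and ends with [SEP];
    # segment_ids is all zeros for a single-sentence input.
    print(token_ids, segment_ids)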
# BERT configuration
config_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/vocab.txt'

# Training samples: the THUCNews dataset, one txt file per sample.
txts = glob.glob('/root/thuctc/THUCNews/*/*.txt')

# Load and simplify the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)


class data_generator(DataGenerator):
    """Data generator."""
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids = [], []
        for is_end, txt in self.sample(random):
            text = open(txt, encoding='utf-8').read()
            text = text.split('\n')
            if len(text) > 1:
                title = text[0]
                content = '\n'.join(text[1:])
                token_ids, segment_ids = tokenizer.encode(content, title,
                                 kernel=self.config.kernel,
                                 bias=self.config.bias)
        return vecs_encode


if __name__ == '__main__':
    # Save the model and related assets
    from bertWhiteConf import bert_white_config
    bert_white_model = BertSimModel(bert_white_config)
    bert_white_model.load_pretrain_model()
    bert_white_model.save_model_builder()

    from bertWhiteConf import bert_white_config
    config = Namespace(**bert_white_config)
    tokenizer = Tokenizer(os.path.join(config.bert_dir, config.dict_path),
                          do_lower_case=True)
    text = "你还会什么"
    token_id = tokenizer.encode(text, max_length=config.maxlen)
    print(token_id)

    """
    # cpu
    docker run -t --rm -p 8532:8501 -v "/TF-SERVING/chatbot_tf:/models/chatbot_tf" -e MODEL_NAME=chatbot_tf tensorflow/serving:latest
    # gpu
    docker run --runtime=nvidia -p 8532:8501 -v "/TF-SERVING/chatbot_tf:/models/chatbot_tf" -e MODEL_NAME=chatbot_tf tensorflow/serving:1.14.0-gpu
    # remarks
    batch_size can also be configured via files such as batch.cfg
    # health check
    curl http://127.0.0.1:8532/v1/models/chatbot_tf
#! -*- coding: utf-8 -*-
# Basic test: the Chinese GPT2_ML model
# Introduction: https://kexue.fm/archives/7292

import numpy as np
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import AutoRegressiveDecoder
from bert4keras.snippets import uniout

config_path = '/root/kg/bert/gpt2_ml/config.json'
checkpoint_path = '/root/kg/bert/gpt2_ml/model.ckpt-100000'
dict_path = '/root/kg/bert/gpt2_ml/vocab.txt'

tokenizer = Tokenizer(dict_path,
                      token_start=None,
                      token_end=None,
                      do_lower_case=True)  # build the tokenizer

model = build_transformer_model(config_path=config_path,
                                checkpoint_path=checkpoint_path,
                                model='gpt2_ml')  # build the model and load the weights


class ArticleCompletion(AutoRegressiveDecoder):
    """Article continuation based on random sampling."""
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids = np.concatenate([inputs[0], output_ids], 1)
        return self.last_token(model).predict(token_ids)
from bert4keras.optimizers import Adam
from bert4keras.backend import K, batch_gather, keras
from bert4keras.layers import LayerNormalization
from keras.layers import *
from keras.models import Model
# import os, sys
# sys.path.append(os.getcwd())
from test4nlp.event_extract_chq.config import event_extract_config as Config
from test4nlp.event_extract_chq.train.utils.data_utils import get_data, data_generator
from tqdm import tqdm
import json
import numpy as np

train_data, valid_data = get_data(Config.read_data_path)
# Build the tokenizer
tokenizer = Tokenizer(Config.dict_path, do_lower_case=True)


def extrac_trigger(inputs):
    """Extract the trigger (subject) vector representation from `output` according to trigger_ids."""
    output, trigger_ids = inputs
    trigger_ids = K.cast(trigger_ids, 'int32')
    start = batch_gather(output, trigger_ids[:, :1])
    end = batch_gather(output, trigger_ids[:, 1:])
    trigger = K.concatenate([start, end], 2)
    return trigger[:, 0]


def build_model():
    bert_model = build_transformer_model(
        config_path=Config.config_path,
keep_tokens = [2] * 106 + keep_tokens
keep_tokens_inv = {j: i for i, j in enumerate(keep_tokens)}

compound_tokens = []
for t, _ in sorted(token_dict.items(), key=lambda s: s[1]):
    if t not in new_token_dict:
        new_token_dict[t] = len(new_token_dict)
        ids = [keep_tokens_inv.get(i, 0) for i in sp_tokenizer.encode(t)[0]]
        compound_tokens.append(ids)

save_vocab(dict_path_2, new_token_dict)

# Build the tokenizer
tokenizer = Tokenizer(new_token_dict,
                      do_lower_case=True,
                      pre_tokenize=lambda s: jieba.cut(s, HMM=False))


def corpus():
    """Corpus generator."""
    while True:
        f = '/root/data_pretrain/data_shuf.json'
        with open(f) as f:
            for l in f:
                l = json.loads(l)
                for texts in text_process(l['text']):
                    yield texts
# -*- coding: utf-8 -*-
"""
@Time    : 2021/6/1 11:05
@Author  : huangkai21
@file    : basic_feature_extract.py
"""
# ! -*- coding: utf-8 -*-
# Test code usability: feature extraction
"""
Convert every token in the vocab into vector form and save it as a json dictionary.
"""
import numpy as np
from bert4keras.backend import keras
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
# from bert4keras.snippets import to_array


def to_array(*args):
    """Batch-convert the inputs to numpy arrays."""
    results = [np.array(a) for a in args]
    if len(args) == 1:
        return results[0]
    else:
        return results


config_path = r'E:\code\chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = r'E:\code\chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = r'E:\code\chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
from bert4keras.tokenizers import Tokenizer
from config import config
from sklearn.preprocessing import LabelEncoder
import os
import tensorflow as tf
import numpy as np
from Load_Data import load_data
from tqdm import tqdm

tokenizer = Tokenizer(os.path.join(config.model_dir, 'vocab.txt'),
                      do_lower_case=True)


def _tokenize(text):
    token_id, seg_id = tokenizer.encode(text)
    return token_id, seg_id


def _pad_seuqences(tokens):
    return tf.keras.preprocessing.sequence.pad_sequences(
        tokens, maxlen=config.seq_maxlen, truncating='post', padding='post')


# Split into two token sequences
def tokenize_data(data):
    token_ids_1 = []
    token_ids_2 = []
    seg_ids_1 = []
    seg_ids_2 = []
    tags = []
    for sent in tqdm(data):
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--config_path', type=str, required=True, help='Path to the BERT config file')
    parser.add_argument('--checkpoint_path', type=str, required=True, help='Path to the BERT weights')
    parser.add_argument('--dict_path', type=str, required=True, help='Path to the vocabulary')
    parser.add_argument('--train_data_path', type=str, required=True, help='Path to the training set')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='Number of training epochs')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='Training batch size')
    parser.add_argument('--lr', default=1e-5, type=float, required=False, help='Learning rate')
    parser.add_argument('--topk1', default=25, type=int, required=False, help='top-k for random sampling of the first sentence')
    parser.add_argument('--topk2', default=2, type=int, required=False, help='beam size for generating the second sentence')
    parser.add_argument('--max_seq_len', default=256, type=int, required=False, help='Maximum sequence length')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    maxlen = args.max_seq_len
    config_path = args.config_path
    checkpoint_path = args.checkpoint_path
    dict_path = args.dict_path
    batch_size = args.batch_size
    epochs = args.epochs
    topk1 = args.topk1
    topk2 = args.topk2
    num_classes = 2
    lr = args.lr
    train_data = args.train_data_path

    token_dict, keep_tokens = load_vocab(
        dict_path=dict_path,
        simplified=True,
        startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
    )
    tokenizer = Tokenizer(token_dict, do_lower_case=True)

    train_df = pd.read_csv(train_data, sep='\t', header=None)
    train_df.columns = ['s1', 's2', 'label']

    class data_generator(DataGenerator):
        """Data generator."""
        def __iter__(self, r=False):
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            batch_token_ids, batch_segment_ids, batch_labels = [], [], []
            for i in idxs:
                line = self.data.loc[i]
                if (random.random() < 0.5):
                    s1 = line['s1'].replace('***', '*')
                    s2 = line['s2'].replace('***', '*')
                else:
                    s2 = line['s1'].replace('***', '*')
                    s1 = line['s2'].replace('***', '*')
                token_ids, segment_ids = tokenizer.encode(s1, s2, max_length=maxlen)
                batch_token_ids.append(token_ids)
                batch_segment_ids.append(segment_ids)
                batch_labels.append([line['label']])
                if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
                    batch_token_ids = sequence_padding(batch_token_ids)
                    batch_segment_ids = sequence_padding(batch_segment_ids)
                    batch_labels = sequence_padding(batch_labels)
                    yield [batch_token_ids, batch_segment_ids, batch_labels], None
                    batch_token_ids, batch_segment_ids, batch_labels = [], [], []

    class CrossEntropy(Loss):
        """Cross entropy as the loss, with the padding part masked out."""
        def compute_loss(self, inputs, mask=None):
            y_true, y_pred = inputs
            if mask[1] is None:
                y_mask = 1.0
            else:
                y_mask = K.cast(mask[1], K.floatx())[:, 1:]
            y_true = y_true[:, 1:]  # target token_ids
            y_pred = y_pred[:, :-1]  # predicted sequence, shifted by one position
            loss = K.sparse_categorical_crossentropy(y_true, y_pred)
            loss = K.sum(loss * y_mask) / K.sum(y_mask)
            return loss

    c_in = Input(shape=(1, ))
    c = Embedding(num_classes, maxlen)(c_in)
    c = Reshape((maxlen, ))(c)

    model = build_transformer_model(
        config_path,
        checkpoint_path,
        application='lm',
        keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens to shrink the original vocabulary
        layer_norm_cond=c,
        additional_input_layers=c_in,
    )

    output = CrossEntropy(1)([model.inputs[0], model.outputs[0]])

    model = Model(model.inputs, output)
    model.compile(optimizer=Adam(lr))
    model.summary()

    def random_generate(c=0, n=2, s1_topk=5):
        """Generate sentence pairs by random sampling.
        At each step, sample one token from the top-k most probable tokens.
        """
        label_ids = [[c] for _ in range(n)]
        target_ids = [[2] for _ in range(n)]
        sep_index = [0 for _ in range(n)]
        R = []
        for i in range(64):
            segment_ids = []
            for t, index in zip(target_ids, sep_index):
                if index > 0:
                    segment_ids.append([0] * index + [1] * (len(t) - index))
                else:
                    segment_ids.append([0] * len(t))
            # Ignore [PAD], [UNK], [CLS] below
            _probas = model.predict([target_ids, segment_ids, label_ids])[:, -1, 3:]
            for j, p in enumerate(_probas):
                p_arg_topk = p.argsort()[::-1][:s1_topk]
                #if 0 in p_arg_topk:
                #    target_ids[j].append(3)
                #else:
                p_topk = p[p_arg_topk]
                p = p_topk / sum(p_topk)
                idx = np.random.choice(len(p), p=p)
                target_ids[j].append(p_arg_topk[idx] + 3)
                if p_arg_topk[idx] + 3 == 3 and sep_index[j] == 0:
                    sep_index[j] = i
        for tokens in target_ids:
            tokens.append(3)
            cls_index = tokens.index(3)
            R.append(tokenizer.decode(tokens[:cls_index]))
        #sentences.sort(key=lambda i: len(i), reverse=True)
        return R

    def gen_sent(s, label, topk=2):
        """Beam-search decoding.
        Only the top-k best candidates are kept at each step; with topk=1 this is greedy search.
        """
        label_ids = [[label] for _ in range(topk)]
        token_ids, segment_ids = tokenizer.encode(s)
        target_ids = [[] for _ in range(topk)]  # candidate answer ids
        target_scores = [0] * topk  # candidate answer scores
        for i in range(64):  # force the output to stay within max_output_len tokens
            _target_ids = [token_ids + t for t in target_ids]
            _segment_ids = [segment_ids + [1] * len(t) for t in target_ids]
            _probas = model.predict([_target_ids, _segment_ids,
                                     label_ids])[:, -1, 3:]  # ignore [PAD], [UNK], [CLS]
            _log_probas = np.log(_probas + 1e-6)  # take logs for easier accumulation
            _topk_arg = _log_probas.argsort(axis=1)[:, -topk:]  # top-k per candidate
            _candidate_ids, _candidate_scores = [], []
            for j, (ids, sco) in enumerate(zip(target_ids, target_scores)):
                # When predicting the first token, the top-k inputs are in fact identical,
                # so only the first one needs to be considered.
                if i == 0 and j > 0:
                    continue
                for k in _topk_arg[j]:
                    _candidate_ids.append(ids + [k + 3])
                    _candidate_scores.append(sco + _log_probas[j][k])
            _topk_arg = np.argsort(_candidate_scores)[-topk:]  # select the new top-k
            target_ids = [_candidate_ids[k] for k in _topk_arg]
            target_scores = [_candidate_scores[k] for k in _topk_arg]
            best_one = np.argmax(target_scores)
            if target_ids[best_one][-1] == 3:
                return tokenizer.decode(target_ids[best_one])
        # If no end token is found within max_output_len tokens, return directly
        return tokenizer.decode(target_ids[np.argmax(target_scores)])

    def gen_sen_pair(label, n, s1_topk, s2_topk):
        s1_pair = random_generate(label, n, s1_topk)
        output = []
        for line in s1_pair:
            s2 = gen_sent(line, label, s2_topk)
            output.append([line, s2])
        return output

    class Evaluate(keras.callbacks.Callback):
        def __init__(self):
            self.lowest = 1e10

        def on_epoch_end(self, epoch, logs=None):
            # Save the best model
            if logs['loss'] <= self.lowest:
                self.lowest = logs['loss']
                model.save_weights('./best_model.weights')
            print("Positive samples:")
            print(gen_sen_pair(1, 2, topk1, topk2))
            print("Negative samples:")
            print(gen_sen_pair(0, 2, topk1, topk2))

    train_generator = data_generator(train_df, batch_size)
    evaluator = Evaluate()
    model.fit_generator(train_generator.forfit(),
                        steps_per_epoch=len(train_generator),
                        epochs=epochs,
                        callbacks=[evaluator])
def load_model(model_file, bert_model_path, do_lower_case):
    global g_model, g_tokenizer
    g_model = keras.models.load_model(model_file)
    # print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
    dict_path = '%s/vocab.txt' % bert_model_path
    g_tokenizer = Tokenizer(dict_path, do_lower_case=do_lower_case)  # build the tokenizer
"""T5相对位置编码 """ if self.position_bias is None: x = inputs p = self.apply(inputs=[x, x], layer=RelativePositionEmbeddingT5, input_dim=32, output_dim=self.num_attention_heads, bidirectional=True, embeddings_initializer=self.initializer, name='Embedding-Relative-Position') self.position_bias = p return self.position_bias @classmethod def startswith(cls, inputs): return False t5s_tokenizer = Tokenizer(pretrain_model_save_path + "vocab.txt") t5s = build_transformer_model( config_path=pretrain_model_save_path + "t5s_config.json", model=T5SEncoder, # with_mlm='linear', return_keras_model=True, ) t5s.load_weights(pretrain_model_save_path + "model.h5", by_name=True)
from bert4keras.tokenizers import Tokenizer, load_vocab
import json
import numpy as np

dict_path = "vocab.txt"
tokenizer = Tokenizer(load_vocab(dict_path))
maskID = tokenizer.token_to_id(tokenizer._token_mask)


def write_Json(content, fileName):
    with open(fileName, "w") as f:
        json.dump(content, f, indent=2)


def read_json(fileName):
    with open(fileName, "r") as fp:
        f = json.load(fp)
    return f


def cal_mask(inputs, corrupts, labels):
    assert inputs.shape == corrupts.shape and corrupts.shape == labels.shape
    masked = (labels == 1)
    correct = (inputs == corrupts)
    # use the builtin float; np.float is removed in recent numpy versions
    masked = masked.astype(float)
    correct = correct.astype(float)
    mask = masked * correct
    return mask
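# A minimal sketch (not part of the original script) showing what cal_mask computes:
# a position contributes only if it was masked (labels == 1) and the corrupted
# input still equals the original input at that position.
if __name__ == "__main__":
    inputs = np.array([[5, 7, 9, 2]])
    corrupts = np.array([[5, 8, 9, 2]])
    labels = np.array([[1, 1, 0, 0]])
    print(cal_mask(inputs, corrupts, labels))  # -> [[1. 0. 0. 0.]]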
                          data, file_path, write_sentences=True):
    with open(file_path, 'w', encoding='utf-8') as f:
        written_sum = 0
        for batch in predict_result:
            for line_result in batch:
                if write_sentences:
                    f.write(data[written_sum][0] + '\n')
                f.write(str(line_result) + ' \n')
                written_sum += 1


sentiment_predictor = SentimentPredictor(CONFIG_PATH, CHECKPOINT_PATH, NUM_CLASSES)
tokenizer = Tokenizer(DICT_PATH, do_lower_case=True)
aim_list = ['motivation', 'experiment', 'readable', 'relatework', 'novel']


def main(aim):
    original_data = read_excel_data(EXCEL_DATA_PATH, aim)
    data = MyDataGenerator(original_data, tokenizer, MAX_LEN)
    sentiment_predictor.load_weights(os.path.join(MODEL_PATH, aim))
    predict_result = sentiment_predictor.predict(data)
    write_result_to_file(predict_result, original_data,
                         os.path.join(RESULT_PATH, aim + '.txt'))


if __name__ == '__main__':
# Read the schema
with open('/home/ycy/HBT/data/schema.json', encoding='utf8') as f:
    id2predicate, predicate2id, n = {}, {}, 0
    predicate2type = {}
    for l in f:
        l = json.loads(l)
        predicate2type[l['predicate']] = (l['subject_type'], l['object_type'])
        for k, _ in sorted(l['object_type'].items()):
            key = l['predicate'] + '_' + k
            id2predicate[n] = key
            predicate2id[key] = n
            n += 1

# tokenizer = OurTokenizer(vocab_file=BERT_PATH + "vocab.txt")
tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True)
tokenizer_k = Tokenizer(os.path.join(model_path, 'vocab.txt'), do_lower_case=True)


def search(pattern, sequence):
    """Find the sub-sequence `pattern` inside `sequence`.
    Return the index of the first match, or -1 if not found.
    """
    n = len(pattern)
    for i in range(len(sequence)):
        if sequence[i:i + n] == pattern:
            return i
    return -1


def sequence_padding(inputs, length=None, padding=0):
    """Numpy helper that pads sequences to the same length."""
    if length is None:
        length = max([len(x) for x in inputs])
                elif this_flag == 'O' and last_flag != 'O':
                    d.append([char, 'O'])
                elif this_flag[0] == 'B':
                    d.append([char, this_flag[2:]])
                else:
                    d[-1][0] += char
                last_flag = this_flag
            D.append(d)
    return D


train_data = load_data('./data/example.train')
valid_data = load_data('./data/example.dev')
test_data = load_data('./data/example.test')

tokenizer = Tokenizer(token_dict=dict_path, do_lower_case=True)

# Label mapping
labels = ['PER', 'LOC', 'ORG']
id2label = dict(enumerate(labels))
label2id = {j: i for i, j in id2label.items()}
num_labels = len(labels) * 2 + 1  # tags * {B, I} + O


class data_generator(DataGenerator):
    def __iter__(self, random=False):
        batch_token_ids, batch_labels = [], []
        for is_end, item in self.sample(random):
            token_ids, labels = [tokenizer._token_start_id], [0]
            for w, l in item:
def get_tokenizer(dict_path, pre_tokenize=None):
    """Build the tokenizer."""
    return Tokenizer(dict_path, do_lower_case=True, pre_tokenize=pre_tokenize)
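# A minimal usage sketch (jieba and the vocab path are assumptions, not from the
# original code): with pre_tokenize, the text is first segmented by jieba and the
# tokenizer then applies WordPiece inside each word.
if __name__ == '__main__':
    import jieba
    word_tokenizer = get_tokenizer('/path/to/vocab.txt',
                                   pre_tokenize=lambda s: jieba.cut(s, HMM=False))
    print(word_tokenizer.tokenize(u'科学空间'))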
            elif this_flag[:1] == 'B':
                d.append([char, this_flag[2:]])
            else:
                d[-1][0] += char
            last_flag = this_flag
        D.append(d)
    return D


# Annotated data
train_data = load_data('data/china-people-daily-ner-corpus/example.train')
valid_data = load_data('data/china-people-daily-ner-corpus/example.dev')
test_data = load_data('data/china-people-daily-ner-corpus/example.test')

# Build the tokenizer
tokenizer = Tokenizer(vocab_path, do_lower_case=True)


class data_generator(DataGenerator):
    """Data generator."""
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_start, batch_end = [], [], [], []
        for is_end, item in self.sample(random):
            for k, v in query_mapping.items():
                query_token_ids, query_segment_ids = tokenizer.encode(v)
                token_ids = query_token_ids.copy()
                start = query_segment_ids.copy()
                end = query_segment_ids.copy()
import numpy as np
from bert import tokenization
from tqdm import tqdm
from config import Config
import pandas as pd
import os
from bert4keras.tokenizers import Tokenizer
from bert.data_utils import split_text

vocab_file = Config().vocab_file
do_lower_case = True
re_tokenzier = Tokenizer(vocab_file, do_lower_case)
config = Config()


def load_data(data_file):
    """Read the data.
    :param data_file:
    :return:
    """
    data_df = pd.read_csv(data_file)
    data_df.fillna('', inplace=True)
    lines = list(
        zip(list(data_df['id']), list(data_df['question']),
            list(data_df['context']), list(data_df['answer']),
            list(data_df['answer_start'])))
    return lines


def create_example(lines):
    examples = []
    def set_dict_path(self, path):
        self.dict = path
        self.tokenizer = Tokenizer(self.dict, do_lower_case=True)
def build_tokenizer(dict_path):
    """Load the tokenizer."""
    tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
    return tokenizer
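# A minimal usage sketch with a placeholder path: encoding a sentence pair yields
# token ids for "[CLS] a [SEP] b [SEP]", with segment_ids of 0s for the first
# sentence and 1s for the second (exact lengths depend on the tokenization).
if __name__ == '__main__':
    tokenizer = build_tokenizer('/path/to/vocab.txt')
    token_ids, segment_ids = tokenizer.encode(u'第一句', u'第二句')
    print(token_ids)
    print(segment_ids)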
domain_label2id, domain_id2label, intent_label2id, intent_id2label, slot_label2id, slot_id2label = json.load(
    open('data/labels.json', 'r', encoding='utf-8'))

# Load the data
data = json.load(open('data/train.json', 'r', encoding='utf-8'))
random.shuffle(data)
valid_data = data[:len(data) // 8]
train_data = data[len(data) // 8:]

# Load and simplify the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=vocab_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'])
tokenizer = Tokenizer(token_dict)


# Data iterator
class MyDataGenerator(DataGenerator):
    def __init__(self, data, batch_size=32, buffer_size=None):
        super(MyDataGenerator, self).__init__(data, batch_size, buffer_size)

    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, Y1, Y2, Y3 = [], [], [], [], []
        for is_end, item in self.sample(random):
            text = item['text']
            token_ids, segment_ids = tokenizer.encode(first_text=text.lower())
    def load_pretrain_model(self):
        """Load the pretrained model and the tokenizer."""
        self.tokenizer = Tokenizer(os.path.join(self.config.bert_dir,
                                                self.config.dict_path),
                                   do_lower_case=True)
        # load BERT
        if self.config.pooling == "pooler":
            bert = build_transformer_model(
                os.path.join(self.config.bert_dir, self.config.config_path),
                os.path.join(self.config.bert_dir, self.config.checkpoint_path),
                model=self.config.model,
                with_pool="linear")
        else:
            bert = build_transformer_model(
                os.path.join(self.config.bert_dir, self.config.config_path),
                os.path.join(self.config.bert_dir, self.config.checkpoint_path),
                model=self.config.model)
        # output layers
        outputs, count = [], 0
        while True:
            try:
                output = bert.get_layer(
                    "Transformer-%d-FeedForward-Norm" % count).output
                outputs.append(output)
                count += 1
            except:
                break
        # pooling
        if self.config.pooling == "first-last-avg":
            outputs = [
                NonMaskingLayer()(output_i)
                for output_i in [outputs[0], outputs[-1]]
            ]
            outputs = [
                keras.layers.GlobalAveragePooling1D()(fs) for fs in outputs
            ]
            output = keras.layers.Average()(outputs)
        elif self.config.pooling == "first-last-max":
            outputs = [
                NonMaskingLayer()(output_i)
                for output_i in [outputs[0], outputs[-1]]
            ]
            outputs = [keras.layers.GlobalMaxPooling1D()(fs) for fs in outputs]
            output = keras.layers.Average()(outputs)
        elif self.config.pooling == "cls-max-avg":
            outputs = [
                NonMaskingLayer()(output_i)
                for output_i in [outputs[0], outputs[-1]]
            ]
            outputs_cls = [
                keras.layers.Lambda(lambda x: x[:, 0])(fs) for fs in outputs
            ]
            outputs_max = [
                keras.layers.GlobalMaxPooling1D()(fs) for fs in outputs
            ]
            outputs_avg = [
                keras.layers.GlobalAveragePooling1D()(fs) for fs in outputs
            ]
            output = keras.layers.Concatenate()(outputs_cls + outputs_avg)
        elif self.config.pooling == "last-avg":
            output = keras.layers.GlobalAveragePooling1D()(outputs[-1])
        elif self.config.pooling == "cls-3":
            outputs = [
                keras.layers.Lambda(lambda x: x[:, 0])(fs)
                for fs in [outputs[0], outputs[-1], outputs[-2]]
            ]
            output = keras.layers.Concatenate()(outputs)
        elif self.config.pooling == "cls-2":
            outputs = [
                keras.layers.Lambda(lambda x: x[:, 0])(fs)
                for fs in [outputs[0], outputs[-1]]
            ]
            output = keras.layers.Concatenate()(outputs)
        elif self.config.pooling == "cls-1":
            output = keras.layers.Lambda(lambda x: x[:, 0])(outputs[-1])
        elif self.config.pooling == "pooler":
            output = bert.output
        # Load the sentence vectors of the standard FAQ questions and treat them
        # as constants in the cosine-similarity computation
        docs_encode = np.loadtxt(
            os.path.join(self.config.save_dir, self.config.path_docs_encode))
        # cosine similarity layer
        score_cosine = CosineLayer(docs_encode)(output)
        # the final encoder
        self.bert_white_encoder = Model(bert.inputs, score_cosine)
        print("load bert_white_encoder success!")
from window_layers import WindowEmbedding, WindowEmbeddingforword

epochs = 500
batch_size = 1024
# bert_layers = 12
learing_rate = 1e-4  # the smaller bert_layers is, the larger the learning rate should be
seq_crf_lr_multiplier = 1e-2  # increase the CRF layer's learning rate if necessary
tag_crf_lr_multiplier = 1e-2
vocab_size = 21128

# BERT configuration
# config_path = '../../Q_A/publish/bert_config.json'
# checkpoint_path = '../../Q_A/publish/bert_model.ckpt'
dict_path = '../../Q_A/publish/vocab.txt'
tokenizer = Tokenizer(dict_path, do_lower_case=False)

labels = [
    'O', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R20', 'R21', 'R22', 'R23',
    'R24', 'R25', 'R30', 'R31', 'R90', 'R99', 'X'
]
seg_labels = ['O', 'B', 'I', 'E']
id2label = dict(enumerate(labels))
label2id = {j: i for i, j in id2label.items()}
num_labels = len(labels)
id2seglabel = dict(enumerate(seg_labels))
seglabel2id = {j: i for i, j in id2seglabel.items()}
num_seglabels = len(seg_labels)
compound_tokens = []
for t, i in sorted(token_dict.items(), key=lambda s: s[1]):
    # Two cases are considered here: 1) first letter capitalized; 2) the whole word capitalized.
    # Under Python 2 this adds 5594 tokens; under Python 3 it adds 5596 tokens.
    tokens = []
    if t.isalpha():
        tokens.extend([t[:1].upper() + t[1:], t.upper()])
    elif t[:2] == '##' and t[2:].isalpha():
        tokens.append(t.upper())
    for token in tokens:
        if token not in new_token_dict:
            compound_tokens.append([i])
            new_token_dict[token] = len(new_token_dict)

tokenizer = Tokenizer(new_token_dict, do_lower_case=False)

model = build_transformer_model(
    config_path,
    checkpoint_path,
    compound_tokens=compound_tokens,  # add new tokens, initialized from the average of the old tokens
)

text = u'Welcome to BEIJING.'
tokens = tokenizer.tokenize(text)
print(tokens)
"""
Output: ['[CLS]', u'Welcome', u'to', u'BE', u'##I', u'##JING', u'.', '[SEP]']
"""
token_ids, segment_ids = tokenizer.encode(text)
import jieba
from roformer import RoFormerTokenizer
from bert4keras.tokenizers import Tokenizer

dict_path = 'pretrained_models/chinese_roformer_base'
text = "12312格ab局A B cdA,.567 861351 684!今天萨达天 气非常好王企。文保鹅按时发放了的撒这些seqetvgsa国内拉手的喀什。、]P[,./()*7656&【;,‘"
#text = "这里基本保留了唐宋遗留下来的坊巷格局和大量明清古建筑,其中各级文保单位29处,被誉为“里坊制度的活化石”“明清建筑博物馆”!"

bert4keras_tokenizer = Tokenizer(
    dict_path + "/vocab.txt",
    do_lower_case=True,
    pre_tokenize=lambda s: jieba.cut(s, HMM=False))
roformer_tokenizer = RoFormerTokenizer.from_pretrained(dict_path)

bert4keras_tokenizer_input_ids = bert4keras_tokenizer.encode(text)[0]
roformer_tokenizer_input_ids = roformer_tokenizer.encode(text)
print(bert4keras_tokenizer_input_ids == roformer_tokenizer_input_ids)
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            text, label = l.strip().split('\t')
            D.append((text[:128], int(label)))
    return D


# Load the datasets
train_data = load_data('datasets/sentiment/sentiment.train.data')
valid_data = load_data('datasets/sentiment/sentiment.valid.data')
test_data = load_data('datasets/sentiment/sentiment.test.data')

# Build the tokenizer
tokenizer = Tokenizer(dict_path,
                      do_lower_case=True,
                      pre_tokenize=lambda s: jieba.cut(s, HMM=False))


class data_generator(DataGenerator):
    """Data generator."""
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
    __, last_part = line.split(':')
    ignore_flag = False
    for dis_word in disallowed_words:
        if dis_word in last_part:
            ignore_flag = True
            break
    if ignore_flag:
        continue
    # The length must not exceed the maximum length
    if len(last_part) > max_len - 2:
        continue
    poetry.append(last_part)

# Vocabulary and tokenizer of the pretrained model
_token_dict = load_vocab(dict_path)
_tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Count the frequency of every token
word_frequency_count = defaultdict(int)
for line in poetry:
    for t in _tokenizer.tokenize(line):
        word_frequency_count[t] += 1
# Filter out low-frequency tokens
tokens = [(token, count) for token, count in word_frequency_count.items()
          if count >= min_word_frequency]
# Sort by frequency
tokens = sorted(tokens, key=lambda x: -x[1])
# Drop the counts and keep only the token list
tokens = [token for token, count in tokens]
# Build the new token->id mapping and the new vocabulary
valid_data = load_data('dev_data/balanced_dev.json')

# Read the event schema
with open('event_schema/event_schema.json') as f:
    id2label, label2id, n = {}, {}, 0
    for l in f:
        l = json.loads(l)
        for role in l['role_list']:
            key = (l['event_type'], role['role'])
            id2label[n] = key
            label2id[key] = n
            n += 1
    num_labels = len(id2label) * 2 + 1

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)


def search(pattern, sequence):
    """Find the sub-sequence `pattern` inside `sequence`.
    Return the index of the first match, or -1 if not found.
    """
    n = len(pattern)
    for i in range(len(sequence)):
        if sequence[i:i + n] == pattern:
            return i
    return -1


class data_generator(DataGenerator):
    """Data generator
    def setUpClass(cls) -> None:
        cls.my_tokenizer = BertTokenizer(dict_path, ignore_case=True)
        cls.sjl_tokenizer = Tokenizer(dict_path, do_lower_case=True)