import numpy as np
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer


class MaskedLM(object):
    def __init__(self, topK):
        self.topK = topK
        self.tokenizer = Tokenizer(BERT_VOCAB_PATH, do_lower_case=True)
        self.model = build_transformer_model(
            BERT_CONFIG_PATH, BERT_CHECKPOINT_PATH, with_mlm=True)

    def tokenizer_text(self, text):
        # ['[CLS]', '我', '喜', '欢', '吃', '程', '度', '的', '火', '锅', '[SEP]']
        self.tokens = self.tokenizer.tokenize(text)
        # token_ids:   [101, 2769, 1599, 3614, 1391, 4923, 2428, 4638, 4125, 7222, 102]
        # segment_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        self.token_ids, self.segment_ids = self.tokenizer.encode(text)

    def find_top_candidates(self, error_index):
        for i in error_index:
            # Replace the id of each erroneous token with the id of [MASK]
            self.token_ids[i] = self.tokenizer._token_dict['[MASK]']
            # Positions 5 and 6 are replaced by the [MASK] id 103:
            # [101, 2769, 1599, 3614, 1391, 103, 103, 4638, 4125, 7222, 102]
        # Predict the probability distribution over the vocabulary for every token,
        # probs.shape = [len(token_ids), vocab_size]
        probs = self.model.predict(
            [np.array([self.token_ids]), np.array([self.segment_ids])])[0]
        for i in range(len(error_index)):
            # Position of the erroneous token
            error_id = error_index[i]
            # Take the ids of the topK highest-probability tokens; argsort is ascending,
            # so negate the scores to sort in descending order
            top_k_probs = np.argsort(-probs[error_id])[:self.topK]
            candidates, find_prob = self.tokenizer.decode(
                top_k_probs), probs[error_id][top_k_probs]
            print(dict(zip(candidates, find_prob)))
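# A minimal usage sketch for MaskedLM (an illustration, not part of the original code).
# It assumes BERT_VOCAB_PATH, BERT_CONFIG_PATH and BERT_CHECKPOINT_PATH point at a
# Chinese BERT checkpoint. The sentence and the error positions 5 and 6 come from the
# comments above: "程度" is the misspelling being corrected, and find_top_candidates
# prints a dict mapping each candidate character to its probability.
mlm = MaskedLM(topK=5)
mlm.tokenizer_text('我喜欢吃程度的火锅')
mlm.find_top_candidates([5, 6])  # indices 5 and 6 are the tokens "程" and "度"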
import typing
from collections import Counter

from bert4keras.tokenizers import Tokenizer, load_vocab


def create_tokenizer(sentences: typing.List[str]) -> typing.Tuple[Tokenizer, typing.List]:
    """Trim the vocabulary according to the new dataset and rebuild the tokenizer.

    Args:
        sentences: list of review sentences

    Returns:
        tokenizer, keep_tokens
    """
    # Load the downloaded vocabulary
    _token_dict = load_vocab(settings.DICT_PATH)
    _tokenizer = Tokenizer(_token_dict, do_lower_case=True)

    # Count token frequencies
    counter = Counter()
    for sentence in sentences:
        _tokens = _tokenizer.tokenize(sentence)
        # Drop the [CLS] and [SEP] tokens when counting
        counter.update(_tokens[1:-1])
    # Filter out low-frequency tokens
    tokens_and_counts = [(token, count) for token, count in counter.items()
                         if count >= settings.MIN_WORD_FREQUENCY]
    # Sort by frequency in descending order
    sorted_tokens_and_counts = sorted(tokens_and_counts, key=lambda x: -x[1])
    # Drop the counts, keep only the tokens
    most_tokens = [token for token, count in sorted_tokens_and_counts]
    # Build the new vocabulary
    tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]'] + most_tokens
    keep_tokens = []
    token_dict = {}
    for token in tokens:
        if token in _token_dict:
            token_dict[token] = len(token_dict)
            keep_tokens.append(_token_dict[token])
    # Build the tokenizer from the new vocabulary
    tokenizer = Tokenizer(token_dict, do_lower_case=True)
    return tokenizer, keep_tokens
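# A sketch of how the trimmed vocabulary is typically consumed (illustration only):
# keep_tokens is passed to build_transformer_model so the embedding matrix is re-indexed
# to the smaller vocabulary. settings.CONFIG_PATH and settings.CHECKPOINT_PATH are
# assumed names for the pretrained config/checkpoint, analogous to settings.DICT_PATH;
# the sample sentences are likewise only illustrative.
sentences = ['这家火锅店的服务很好', '味道一般,不会再来了']
tokenizer, keep_tokens = create_tokenizer(sentences)
model = build_transformer_model(
    settings.CONFIG_PATH,
    settings.CHECKPOINT_PATH,
    keep_tokens=keep_tokens,  # only keep the embeddings of the retained tokens
)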
            break
    if ignore_flag:
        continue
    # The text must not exceed the maximum length
    if len(last_part) > max_len - 2:
        continue
    poetry.append(last_part)

# Vocabulary and tokenizer of the pretrained model
_token_dict = load_vocab(dict_path)
_tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Count the frequency of every token
word_frequency_count = defaultdict(int)
for line in poetry:
    for t in _tokenizer.tokenize(line):
        word_frequency_count[t] += 1
# Filter out low-frequency tokens
tokens = [(token, count) for token, count in word_frequency_count.items()
          if count >= min_word_frequency]
# Sort by frequency in descending order
tokens = sorted(tokens, key=lambda x: -x[1])
# Drop the counts, keep only the token list
tokens = [token for token, count in tokens]

# Build the new token->id mapping and the new vocabulary
token_id_dict = {}
keep_words = []
# Add the special tokens to the vocabulary first
for token in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    token_id_dict[token] = len(token_id_dict)
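# The fragment above is cut off while the new vocabulary is still being assembled.
# A plausible continuation (a sketch mirroring create_tokenizer above, not the original
# code) records the pretrained ids of the kept tokens in keep_words and rebuilds the
# tokenizer from the trimmed vocabulary:
for token in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    keep_words.append(_token_dict[token])
for token in tokens:
    if token in _token_dict and token not in token_id_dict:
        token_id_dict[token] = len(token_id_dict)
        keep_words.append(_token_dict[token])
tokenizer = Tokenizer(token_id_dict, do_lower_case=True)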
class ReextractBertTrainHandler():
    def __init__(self, params, Train=False):
        self.bert_config_path = model_root_path + "chinese_L-12_H-768_A-12/bert_config.json"
        self.bert_checkpoint_path = model_root_path + "chinese_L-12_H-768_A-12/bert_model.ckpt"
        self.bert_vocab_path = model_root_path + "chinese_L-12_H-768_A-12/vocab.txt"
        self.tokenizer = Tokenizer(self.bert_vocab_path, do_lower_case=True)
        self.model_path = model_root_path + "best_model.weights"
        self.params_path = model_root_path + 'params.json'
        gpu_id = params.get("gpu_id", None)
        self._set_gpu_id(gpu_id)  # set the GPU id used for training
        self.memory_fraction = params.get('memory_fraction')
        if Train:
            self.train_data_file_path = params.get('train_data_path')
            self.valid_data_file_path = params.get('valid_data_path')
            self.maxlen = params.get('maxlen', 128)
            self.batch_size = params.get('batch_size', 32)
            self.epoch = params.get('epoch')
            self.data_process()
        else:
            load_params = json.load(open(self.params_path, encoding='utf-8'))
            self.maxlen = load_params.get('maxlen')
            self.num_classes = load_params.get('num_classes')
            self.p2s_dict = load_params.get('p2s_dict')
            self.i2p_dict = load_params.get('i2p_dict')
            self.p2o_dict = load_params.get('p2o_dict')
        self.build_model()
        if not Train:
            self.load_model()

    def _set_gpu_id(self, gpu_id):
        if gpu_id:
            os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)

    def data_process(self):
        self.train_data, self.valid_data, self.p2s_dict, self.p2o_dict, self.i2p_dict, self.p2i_dict = data_process(
            self.train_data_file_path, self.valid_data_file_path, self.maxlen, self.params_path)
        self.num_classes = len(self.i2p_dict)
        self.train_generator = Data_Generator(self.train_data, self.batch_size, self.tokenizer,
                                              self.p2i_dict, self.maxlen)

    def extrac_subject(self, inputs):
        """Gather the subject's vector representation from the output according to subject_ids."""
        output, subject_ids = inputs
        subject_ids = K.cast(subject_ids, 'int32')
        start = batch_gather(output, subject_ids[:, :1])
        end = batch_gather(output, subject_ids[:, 1:])
        subject = K.concatenate([start, end], 2)
        return subject[:, 0]

    def build_model(self):
        import tensorflow as tf
        from keras.backend.tensorflow_backend import set_session
        config = tf.ConfigProto()
        # "Best-fit with coalescing" allocator, simplified from a version of dlmalloc
        config.gpu_options.allocator_type = 'BFC'
        if self.memory_fraction:
            config.gpu_options.per_process_gpu_memory_fraction = self.memory_fraction
            config.gpu_options.allow_growth = False
        else:
            config.gpu_options.allow_growth = True
        set_session(tf.Session(config=config))

        # Additional inputs
        subject_labels = Input(shape=(None, 2), name='Subject-Labels')
        subject_ids = Input(shape=(2,), name='Subject-Ids')
        object_labels = Input(shape=(None, self.num_classes, 2), name='Object-Labels')
        # Load the pretrained model
        bert = build_transformer_model(
            config_path=self.bert_config_path,
            checkpoint_path=self.bert_checkpoint_path,
            return_keras_model=False,
        )
        # Predict the subject
        output = Dense(units=2, activation='sigmoid',
                       kernel_initializer=bert.initializer)(bert.model.output)
        subject_preds = Lambda(lambda x: x ** 2)(output)
        self.subject_model = Model(bert.model.inputs, subject_preds)
        # Feed in the subject and predict the object:
        # the subject is fused into the object prediction via Conditional Layer Normalization
        output = bert.model.layers[-2].get_output_at(-1)
        subject = Lambda(self.extrac_subject)([output, subject_ids])
        output = LayerNormalization(conditional=True)([output, subject])
        output = Dense(units=self.num_classes * 2, activation='sigmoid',
                       kernel_initializer=bert.initializer)(output)
        output = Lambda(lambda x: x ** 4)(output)
        object_preds = Reshape((-1, self.num_classes, 2))(output)
        self.object_model = Model(bert.model.inputs + [subject_ids], object_preds)
        # Training model with both losses
        self.train_model = Model(
            bert.model.inputs + [subject_labels, subject_ids, object_labels],
            [subject_preds, object_preds])
        mask = bert.model.get_layer('Embedding-Token').output_mask
        mask = K.cast(mask, K.floatx())
        subject_loss = K.binary_crossentropy(subject_labels, subject_preds)
        subject_loss = K.mean(subject_loss, 2)
        subject_loss = K.sum(subject_loss * mask) / K.sum(mask)
        object_loss = K.binary_crossentropy(object_labels, object_preds)
        object_loss = K.sum(K.mean(object_loss, 3), 2)
        object_loss = K.sum(object_loss * mask) / K.sum(mask)
        self.train_model.add_loss(subject_loss + object_loss)
        AdamEMA = extend_with_exponential_moving_average(Adam, name='AdamEMA')
        self.optimizer = AdamEMA(lr=1e-4)
        self.train_model.compile(optimizer=self.optimizer)

    def load_model(self):
        self.train_model.load_weights(self.model_path)

    def predict(self, text):
        """Extract the (subject, predicate, object) triples contained in the input text.

        text: str, e.g. '<离开>是由张宇谱曲,演唱'
        """
        tokens = self.tokenizer.tokenize(text, max_length=self.maxlen)
        token_ids, segment_ids = self.tokenizer.encode(text, max_length=self.maxlen)
        # Extract subject candidates
        subject_preds = self.subject_model.predict([[token_ids], [segment_ids]])
        start = np.where(subject_preds[0, :, 0] > 0.6)[0]
        end = np.where(subject_preds[0, :, 1] > 0.5)[0]
        subjects = []
        for i in start:
            j = end[end >= i]
            if len(j) > 0:
                j = j[0]
                subjects.append((i, j))
        if subjects:
            spoes = []
            token_ids = np.repeat([token_ids], len(subjects), 0)
            segment_ids = np.repeat([segment_ids], len(subjects), 0)
            subjects = np.array(subjects)
            # Feed in the subjects and extract objects and predicates
            object_preds = self.object_model.predict([token_ids, segment_ids, subjects])
            for subject, object_pred in zip(subjects, object_preds):
                start = np.where(object_pred[:, :, 0] > 0.6)
                end = np.where(object_pred[:, :, 1] > 0.5)
                for _start, predicate1 in zip(*start):
                    for _end, predicate2 in zip(*end):
                        if _start <= _end and predicate1 == predicate2:
                            spoes.append((subject, predicate1, (_start, _end)))
                            break
            return [
                (
                    [self.tokenizer.decode(token_ids[0, s[0]:s[1] + 1], tokens[s[0]:s[1] + 1]),
                     self.p2s_dict[self.i2p_dict[p]]],
                    self.i2p_dict[p],
                    [self.tokenizer.decode(token_ids[0, o[0]:o[1] + 1], tokens[o[0]:o[1] + 1]),
                     self.p2o_dict[self.i2p_dict[p]]],
                    (s[0], s[1] + 1),
                    (o[0], o[1] + 1)
                )
                for s, p, o in spoes
            ]
        else:
            return []

    def train(self):
        evaluator = Evaluator(self.train_model, self.model_path, self.tokenizer, self.predict,
                              self.optimizer, self.valid_data)
        self.train_model.fit_generator(self.train_generator.forfit(),
                                       steps_per_epoch=len(self.train_generator),
                                       epochs=self.epoch,
                                       callbacks=[evaluator])
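# A minimal usage sketch for ReextractBertTrainHandler (illustration only; model_root_path
# and the params/weights files are assumed to exist). In inference mode (Train=False) the
# saved params.json and best_model.weights are loaded and predict() returns the extracted
# (subject, predicate, object) triples; the example sentence is taken from the predict docstring.
handler = ReextractBertTrainHandler({'gpu_id': '0'}, Train=False)
print(handler.predict('<离开>是由张宇谱曲,演唱'))

# Training mode (Train=True) expects the data paths and schedule in params, e.g.:
# handler = ReextractBertTrainHandler({
#     'train_data_path': '...', 'valid_data_path': '...',
#     'maxlen': 128, 'batch_size': 32, 'epoch': 10,
# }, Train=True)
# handler.train()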
        tokens.append(t.upper())
    for token in tokens:
        if token not in new_token_dict:
            compound_tokens.append([i])
            new_token_dict[token] = len(new_token_dict)

tokenizer = Tokenizer(new_token_dict, do_lower_case=False)
model = build_transformer_model(
    config_path,
    checkpoint_path,
    compound_tokens=compound_tokens,  # add the new tokens, initialized with the average of the old token embeddings
)

text = u'Welcome to BEIJING.'
tokens = tokenizer.tokenize(text)
print(tokens)
"""
Output: ['[CLS]', u'Welcome', u'to', u'BE', u'##I', u'##JING', u'.', '[SEP]']
"""

token_ids, segment_ids = tokenizer.encode(text)
token_ids, segment_ids = to_array([token_ids], [segment_ids])
print(model.predict([token_ids, segment_ids]))
"""
Output:
[[[-1.4999904e-01  1.9651388e-01 -1.7924258e-01 ...  7.8269649e-01
    2.2241375e-01  1.1325148e-01]
  [-4.5268752e-02  5.5090344e-01  7.4699545e-01 ... -4.7773960e-01
   -1.7562288e-01  4.1265407e-01]
  [ 7.0158571e-02  1.7816302e-01  3.6949167e-01 ...  9.6258509e-01
class AlbertNerModel(object):
    # model=None
    def __init__(self,
                 model_name: str,
                 path: str,
                 config_path: str,
                 checkpoint_path: str,
                 dict_path: str,
                 layers: int = 0,
                 unshared: bool = False):
        """Albert initialization parameters.

        :param model_name: model name, albert_base/albert_small/albert_tiny;
                           albertbase/albertsmall/alberttiny are not recommended
        :param path: path of the fine-tuned weights
        :param config_path: config file of the pretrained model
        :param checkpoint_path: checkpoint of the pretrained model
        :param dict_path: vocabulary of the pretrained model
        :param layers: optional custom number of layers; base has at most 12,
                       small at most 6, tiny at most 4
        :param unshared: whether to unfold the layers Bert-style (unshared weights), default False
        """
        if tf.__version__ >= '2.0':
            raise RuntimeError('tensorflow 2.0 and above is not supported yet')
        self.weight_path = path
        self.__maxlen = 256
        self.__crf_lr_multiplier = 1000
        if str(model_name).upper() == 'ALBERT_BASE' or str(model_name).upper() == 'ALBERTBASE':
            self.albert_layers = 12
        elif str(model_name).upper() == 'ALBERT_SMALL' or str(model_name).upper() == 'ALBERTSMALL':
            self.albert_layers = 6
        elif str(model_name).upper() == 'ALBERT_TINY' or str(model_name).upper() == 'ALBERTTINY':
            self.albert_layers = 4
        if layers > 0:
            self.albert_layers = layers
        self.pretrain_name = model_name
        self.config = config_path
        self.checkpoint = checkpoint_path
        self.dict = dict_path
        self.unshared = unshared
        self.tokenizer = Tokenizer(self.dict, do_lower_case=True)
        # Label mapping
        labels = ['PER', 'LOC', 'ORG']
        id2label = dict(enumerate(labels))
        # label2id = {j: i for i, j in id2label.items()}
        self.__id2label = id2label
        self.__num_labels = len(labels) * 2 + 1
        assert self.config and self.checkpoint and self.dict
        # self.__crf = ConditionalRandomField(lr_multiplier=self.crf_lr_multiplier)
        self.__crf = None
        self._model = None

    # region To make multi-model configuration and debugging easier, every configuration
    # parameter gets a setter; after changing any of them the model must be rebuilt.
    def set_layers(self, value):
        self.albert_layers = value

    def set_unshared(self, value):
        self.unshared = value

    def set_dict_path(self, path):
        self.dict = path
        self.tokenizer = Tokenizer(self.dict, do_lower_case=True)

    def set_checkpoint_path(self, path):
        self.checkpoint = path

    def set_config_path(self, path):
        self.config = path

    def set_weight_path(self, weight_path):
        self.weight_path = weight_path
    # endregion

    @property
    def maxlen(self):
        return self.__maxlen

    @maxlen.setter
    def maxlen(self, value):
        self.__maxlen = value

    @property
    def crf_lr_multiplier(self):
        return self.__crf_lr_multiplier

    @crf_lr_multiplier.setter
    def crf_lr_multiplier(self, value):
        self.__crf_lr_multiplier = value

    @property
    def albert_model(self):
        return self._model

    @albert_model.setter
    def albert_model(self, model_path: str):
        from keras.models import load_model
        from keras.utils import CustomObjectScope
        # self.__model = load_model(model_path,
        #                           custom_objects={'ConditionalRandomField': ConditionalRandomField,
        #                                           'sparse_loss': ConditionalRandomField.sparse_loss},
        #                           compile=False)  # either way of loading the custom loss works
        with CustomObjectScope({
                'ConditionalRandomField': ConditionalRandomField,
                'sparse_loss': ConditionalRandomField.sparse_loss
        }):
            self._model = load_model(model_path)
        # Important: on my machine and server the CRF layer in the model has the name below;
        # if it differs in your setup, change it to match the layer name in the model topology!
        self.__crf = self._model.get_layer('conditional_random_field_1')
        assert isinstance(self.__crf, ConditionalRandomField)

    @albert_model.deleter
    def albert_model(self):
        K.clear_session()
        del self._model

    def build_albert_model(self):
        del self.albert_model
        # For faster loading, the pretrained model is saved beforehand as an .h5 file
        file_name = f'albert_{self.pretrain_name}_pretrain.h5'
        if os.path.exists(file_name):
            pretrain_model = load_model(file_name, compile=False)
        else:
            pretrain_model = build_transformer_model(
                config_path=self.config,
                checkpoint_path=self.checkpoint,
                model='albert_unshared' if self.unshared else 'albert',
                return_keras_model=True)
        if not self.unshared:
            output_layer = 'Transformer-FeedForward-Norm'
            output = pretrain_model.get_layer(output_layer).get_output_at(self.albert_layers - 1)
        else:
            output_layer = 'Transformer-%s-FeedForward-Norm' % (self.albert_layers - 1)
            output = pretrain_model.get_layer(output_layer).output
        output = Dense(self.__num_labels)(output)
        self.__crf = ConditionalRandomField(lr_multiplier=self.crf_lr_multiplier)
        output = self.__crf(output)
        model = Model(pretrain_model.input, output)
        model.load_weights(self.weight_path)
        self._model = model

    def viterbi_decode(self, nodes, trans, starts=[0], ends=[0]):
        """Viterbi algorithm for the optimal label path."""
        num_labels = len(trans)
        non_starts = []
        non_ends = []
        if starts is not None:
            for i in range(num_labels):
                if i not in starts:
                    non_starts.append(i)
        if ends is not None:
            for i in range(num_labels):
                if i not in ends:
                    non_ends.append(i)
        # Preprocessing: forbid invalid start/end labels
        nodes[0, non_starts] -= np.inf
        nodes[-1, non_ends] -= np.inf
        labels = np.arange(num_labels).reshape((1, -1))
        scores = nodes[0].reshape((-1, 1))
        # scores[1:] -= np.inf  # the first label must be 0
        paths = labels
        for l in range(1, len(nodes)):
            M = scores + trans + nodes[l].reshape((1, -1))
            idxs = M.argmax(0)
            scores = M.max(0).reshape((-1, 1))
            paths = np.concatenate([paths[:, idxs], labels], 0)
        return paths[:, scores[:, 0].argmax()]  # optimal path

    def recognize(self, text):
        """Recognize named entities.

        :param text:
        :return: list of entities
        """
        tokens = self.tokenizer.tokenize(text)
        while len(tokens) > 512:
            tokens.pop(-2)
        try:
            mapping = self.tokenizer.rematch(text, tokens)
            token_ids = self.tokenizer.tokens_to_ids(tokens)
            segment_ids = [0] * len(token_ids)
            nodes = self._model.predict([[token_ids], [segment_ids]])[0]
            # print('nodes:', nodes)
            _trans = K.eval(self.__crf.trans)
            labels = self.viterbi_decode(nodes, trans=_trans)
            entities, starting = [], False
            for i, label in enumerate(labels):
                if label > 0:
                    if label % 2 == 1:
                        starting = True
                        entities.append([[i], self.__id2label[(label - 1) // 2]])
                    elif starting:
                        entities[-1][0].append(i)
                    else:
                        starting = False
                else:
                    starting = False
            return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
                    for w, l in entities]
        except:
            import traceback
            traceback.print_exc()
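# A minimal usage sketch for AlbertNerModel (illustration only; all paths and the input
# sentence are assumptions). build_albert_model() assembles the ALBERT + Dense + CRF
# network and loads the fine-tuned weights, after which recognize() returns (entity, label)
# tuples with labels drawn from ['PER', 'LOC', 'ORG'].
ner = AlbertNerModel(
    model_name='albert_tiny',
    path='albert_tiny_ner.weights',                   # fine-tuned weights (assumed path)
    config_path='albert_tiny/albert_config.json',     # pretrained config (assumed path)
    checkpoint_path='albert_tiny/albert_model.ckpt',  # pretrained checkpoint (assumed path)
    dict_path='albert_tiny/vocab.txt',                # pretrained vocabulary (assumed path)
)
ner.build_albert_model()
print(ner.recognize('张三在北京的华为工作'))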
    rlist = []
    ip = 0
    for it, iis in enumerate(otiis):
        while ip + 1 < len(wdiis) and wdiis[ip + 1] <= iis:
            ip += 1
        if iis == wdiis[ip]:
            rr = 'B'
        elif iis > wdiis[ip]:
            rr = 'I'
        rr += '-' + pos_list[ip]
        rlist.append(rr)
    # for rr, tt in zip(rlist, token_list): print(rr, tt)
    return rlist


def normalize_sentence(text):
    text = re.sub('[“”]', '"', text)
    text = re.sub('[—]', '-', text)
    text = re.sub('[^\u0000-\u007f\u4e00-\u9fa5\u3001-\u303f\uff00-\uffef·—]', ' \u2800 ', text)
    return text


if __name__ == '__main__':
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('hfl/chinese-bert-wwm')
    sent = '6月13日起,法国ARTE频道将推出一部12集的新迷你剧《奥德修斯》(Odysseus),是编剧Frédéric Azémar用更自由的视角对荷马史诗的一次改编和延续'
    tokens = tokenizer.tokenize(sent)
    otokens = restore_token_list(sent, tokens)
    print(gen_token_list_inv_pointer(sent, tokens))
    print(tokens)
    print(otokens)
    print('done')