# Data processor: reads the training data, builds the tag vocabulary,
# and converts raw sentences/tag sequences into BERT inputs.
import codecs
import json
import numpy as np
from bert4keras.utils import Tokenizer


class Processor(object):
    def __init__(self, train_path, token_dict):
        self.train_path = train_path
        self.tokenizer = Tokenizer(token_dict)

    def get_tags(self):
        tags = set()
        train_data = self.get_data(self.train_path)
        for item in train_data:
            for tag in item[1].split(" "):
                tags.add(tag)
        # use "X" as the PAD tag
        self.tag2id = {tag: i for i, tag in enumerate(tags)}
        self.tag2id["X"] = len(self.tag2id)
        self.id2tag = {v: k for k, v in self.tag2id.items()}
        return self.tag2id, self.id2tag

    def get_data(self, path):
        with codecs.open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return data

    def get_bert_inputs(self, path, max_len):
        srcs = self.get_data(path)
        src_data, src_tags = [], []
        for item in srcs:
            src_data.append(item[0])
            src_tags.append(item[1])
        tokens, segs, tags = [], [], []
        for item in src_data:
            res = self.tokenizer.encode(item, first_length=max_len)
            tokens.append(np.array(res[0]))
            segs.append(np.array(res[1]))
        # reserve two positions for [CLS] and [SEP]
        max_len -= 2
        for item in src_tags:
            len_item = len(item.split(" "))
            if len_item >= max_len:
                tags.append(["X"] + item.split(" ")[:max_len] + ["X"])
            else:
                tags.append(["X"] + item.split(" ") + ["X"] * (max_len - len_item + 1))
        # drop the leading "X" placeholder, then map tags to ids
        tags = [[self.tag2id[item] for item in term[1:]] for term in tags]
        tags = np.expand_dims(tags, axis=-1)
        return tokens, segs, tags
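To show how the class above is meant to be wired together, here is a minimal usage sketch. The file paths, the 128-token maximum length, and load_vocab as the source of token_dict are assumptions for illustration; they are not fixed by the code above.

from bert4keras.utils import load_vocab

# hypothetical paths, for illustration only
token_dict = load_vocab('vocab.txt')
processor = Processor('train.json', token_dict)

# build the tag vocabulary first: get_bert_inputs relies on self.tag2id
tag2id, id2tag = processor.get_tags()

# convert a data file into BERT inputs with a maximum sequence length of 128
tokens, segs, tags = processor.get_bert_inputs('train.json', max_len=128)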
# Sanity check: feature extraction
from bert4keras.bert import load_pretrained_model
from bert4keras.utils import Tokenizer
import numpy as np

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path)  # build the model and load the weights

# encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')

print('\n ===== predicting =====\n')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
"""
Output:
[[[-0.63251007  0.2030236   0.07936534 ...  0.49122632 -0.20493352  0.2575253 ]
  [-0.7588351   0.09651865  1.0718756  ... -0.6109694   0.04312154  0.03881441]
  [ 0.5477043  -0.792117    0.44435206 ...  0.42449304  0.41105673  0.08222899]
  [-0.2924238   0.6052722   0.49968526 ...  0.8604137  -0.6533166   0.5369075 ]
  [-0.7473459   0.49431565  0.7185162  ...  0.3848612  -0.74090636  0.39056838]
"""
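The model output above is one 768-dimensional vector per token. If a single fixed-length vector is needed instead, one common trick is to average the token vectors; the sketch below is not part of bert4keras itself and simply reuses the tokenizer and model objects from the snippet above.

# mean-pool the per-token vectors into one sentence vector (illustrative sketch)
token_ids, segment_ids = tokenizer.encode(u'语言模型')
token_vecs = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]  # shape: (seq_len, 768)
sentence_vec = token_vecs.mean(axis=0)
print(sentence_vec.shape)  # (768,)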
#! -*- coding: utf-8 -*-
# Sanity check: MLM
from bert4keras.bert import load_pretrained_model
from bert4keras.utils import Tokenizer, load_vocab
import numpy as np

config_path = '../../kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '../../kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '../../kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

token_dict = load_vocab(dict_path)  # load the vocabulary
tokenizer = Tokenizer(token_dict)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path, with_mlm=True)  # build the model and load the weights

token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')

# mask out "技术"
token_ids[3] = token_ids[4] = token_dict['[MASK]']

# predict the masked positions with the MLM head
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # prints exactly "技术", the masked word
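Taking the argmax only shows the single best candidate per masked position. To see how confident the MLM head is, you can also list the top-k candidates. The sketch below reuses probas and token_dict from the snippet above; reversing token_dict into an id-to-token map assumes load_vocab returns a plain token-to-id dict.

# list the top-5 candidates for each masked position (illustrative sketch)
id2token = {v: k for k, v in token_dict.items()}
for pos in (3, 4):
    top5 = probas[pos].argsort()[::-1][:5]  # indices of the 5 highest-probability tokens
    print(pos, [id2token[int(i)] for i in top5])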
dict_path = 'albert_tiny_zh_google/vocab.txt'
tokenizer = Tokenizer(dict_path)

# load the pre-trained model
bert = build_bert_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    with_pool=True,
    albert=True,
    return_keras_model=False,
)
model = Model(bert.model.input, bert.model.output)

token_ids1, segment_ids1 = tokenizer.encode(u'我想去北京')
token_ids2, segment_ids2 = tokenizer.encode(u'我想去香港')
token_ids3, segment_ids3 = tokenizer.encode(u'目前的局势,止暴制乱,刻不容缓')

sentence_vec1 = model.predict(
    [np.array([token_ids1]), np.array([segment_ids1])])[0]
sentence_vec2 = model.predict(
    [np.array([token_ids2]), np.array([segment_ids2])])[0]
sentence_vec3 = model.predict(
    [np.array([token_ids3]), np.array([segment_ids3])])[0]

print("Cosine distance between 《我想去北京》 and 《我想去香港》: %f"
      % similarity_count(sentence_vec1, sentence_vec2))
print("Euclidean distance between 《我想去北京》 and 《我想去香港》: %f"
      % similarity_count(sentence_vec1, sentence_vec2, mode='eu'))
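The helper similarity_count is not defined in this snippet and is presumably provided elsewhere. As a reference, a minimal sketch consistent with how it is called here (two vectors, plus mode='eu' for the Euclidean variant) could look like this; the exact behavior of the real helper may differ.

import numpy as np

def similarity_count(vec1, vec2, mode='cos'):
    """Hypothetical helper: cosine score by default, Euclidean distance with mode='eu'."""
    vec1 = np.asarray(vec1, dtype='float32')
    vec2 = np.asarray(vec2, dtype='float32')
    if mode == 'eu':
        return float(np.linalg.norm(vec1 - vec2))
    # cosine similarity of the two sentence vectors
    return float(np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)))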
dict_path = 'albert_tiny_zh_google/vocab.txt'
tokenizer = Tokenizer(dict_path)

# load the pre-trained model
bert = build_bert_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    with_pool=True,
    albert=True,
    return_keras_model=False,
)
model = Model(bert.model.input, bert.model.output)

token_ids1, segment_ids1 = tokenizer.encode(u'我想去北京逛一逛天安门')
token_ids2, segment_ids2 = tokenizer.encode(u'我想去香港')
token_ids3, segment_ids3 = tokenizer.encode(u'我想到天安门广场走一走')

sentence_vec1 = model.predict(
    [np.array([token_ids1]), np.array([segment_ids1])])[0]
sentence_vec2 = model.predict(
    [np.array([token_ids2]), np.array([segment_ids2])])[0]
sentence_vec3 = model.predict(
    [np.array([token_ids3]), np.array([segment_ids3])])[0]

print("Cosine distance between 《我想去北京逛一逛天安门》 and 《我想去香港》: %f"
      % similarity_count(sentence_vec1, sentence_vec2))
print("Euclidean distance between 《我想去北京逛一逛天安门》 and 《我想去香港》: %f"
      % similarity_count(sentence_vec1, sentence_vec2, mode='eu'))
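sentence_vec3 is computed above but never compared. Since the first and third sentences both mention 天安门, an obvious follow-up check (again relying on the assumed similarity_count helper) would be:

print("Cosine distance between 《我想去北京逛一逛天安门》 and 《我想到天安门广场走一走》: %f"
      % similarity_count(sentence_vec1, sentence_vec3))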