def build_trained_model(args):
    if args.device_map != "cpu":
        os.environ["CUDA_VISIBLE_DEVICES"] = args.device_map
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = ""

    token_dict = {}
    with codecs.open(args.bert_vocab, "r", encoding="utf-8") as f:
        for line in f:
            token = line.strip()
            token_dict[token] = len(token_dict)
    tokenizer = Tokenizer(token_dict)

    with codecs.open(os.path.join(args.model_path, "tag2id.pkl"), "rb") as f:
        tag2id = pickle.load(f)
    with codecs.open(os.path.join(args.model_path, "id2tag.pkl"), "rb") as f:
        id2tag = pickle.load(f)

    mask_tag = "X"
    crf_loss = CRF_Loss(tag2id=tag2id, mask_tag=mask_tag).crf_loss
    crf_accuracy = CRF_Accuracy(tag2id=tag2id, mask_tag=mask_tag).crf_accuracy

    custom_objects = {}  # custom layer / loss / metric needed to deserialize the CRF model
    custom_objects["CRF"] = CRF
    custom_objects["crf_loss"] = crf_loss
    custom_objects["crf_accuracy"] = crf_accuracy
    model = load_model(
        os.path.join(args.model_path, args.model_name),
        custom_objects=custom_objects,
    )
    viterbi_decoder = Viterbi(model, len(id2tag))
    return tokenizer, id2tag, viterbi_decoder
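# Illustrative only: a minimal sketch of calling build_trained_model with an
# argparse-style namespace. The attribute names mirror the accesses inside the
# function above; the concrete values and paths are hypothetical.
from argparse import Namespace

args = Namespace(
    device_map="0",                    # GPU id, or "cpu" to run without CUDA
    bert_vocab="path/to/vocab.txt",    # hypothetical path to the BERT vocab file
    model_path="path/to/model_dir",    # hypothetical directory holding tag2id.pkl / id2tag.pkl
    model_name="ner_crf.h5",           # hypothetical saved-model file name
)
tokenizer, id2tag, viterbi_decoder = build_trained_model(args)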
def process_data(data_file='./data/classify_data.txt'):
    with open(data_file, encoding='utf-8') as f:
        datas = f.readlines()

    chars = set()
    labels = set()
    new_datas = []
    for data in datas:
        data = data.strip()
        if not data:
            continue
        text, label = data.rsplit(maxsplit=1)
        chars.update(set(text))
        labels.add(label)
        new_datas.append([text, label])
    del datas

    label2id = {lab: i for i, lab in enumerate(list(labels))}

    _token_dict = load_vocab(dict_path)  # load the full BERT vocab
    token_dict, keep_words = {}, []
    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])
    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.append(_token_dict[c])

    tokenizer = Tokenizer(token_dict)  # build the tokenizer on the reduced vocab

    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)
    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)
    with open(os.path.join(model_save_path, 'label2id.pkl'), "wb") as f:
        pickle.dump(label2id, f)

    if not os.path.exists('./random_order.json'):
        random_order = list(range(len(new_datas)))
        random.shuffle(random_order)
        json.dump(random_order, open('./random_order.json', 'w'), indent=4)
    else:
        random_order = json.load(open('./random_order.json'))

    # split into training and validation sets at a 9:1 ratio
    train_data = [new_datas[j] for i, j in enumerate(random_order) if i % 10 != 0]
    valid_data = [new_datas[j] for i, j in enumerate(random_order) if i % 10 == 0]

    return train_data, valid_data, tokenizer, keep_words, label2id
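# A hedged sketch (not part of the original snippet) of how the values returned by
# process_data are typically consumed: keep_words shrinks BERT's embedding table to
# just the characters seen in the data. It assumes the module-level dict_path and
# model_save_path globals used inside process_data are defined, and that the older
# bert4keras load_pretrained_model API imported in the other snippets here accepts a
# keep_words argument; treat the exact call and the paths as assumptions.
from bert4keras.bert import load_pretrained_model

config_path = 'path/to/bert_config.json'     # hypothetical path
checkpoint_path = 'path/to/bert_model.ckpt'  # hypothetical path

train_data, valid_data, tokenizer, keep_words, label2id = process_data()
model = load_pretrained_model(
    config_path,
    checkpoint_path,
    keep_words=keep_words,  # keep only the retained vocabulary rows
)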
class Processor(object):
    def __init__(self, train_path, token_dict):
        self.train_path = train_path
        self.tokenizer = Tokenizer(token_dict)

    def get_tags(self):
        tags = set()
        train_data = self.get_data(self.train_path)
        for item in train_data:
            for tag in item[1].split(" "):
                tags.add(tag)
        # "X" is used as the padding tag
        self.tag2id = {tag: i for i, tag in enumerate(tags)}
        self.tag2id["X"] = len(self.tag2id)
        self.id2tag = {v: k for k, v in self.tag2id.items()}
        return self.tag2id, self.id2tag

    def get_data(self, path):
        with codecs.open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return data

    def get_bert_inputs(self, path, max_len):
        srcs = self.get_data(path)
        src_data, src_tags = [], []
        for item in srcs:
            src_data.append(item[0])
            src_tags.append(item[1])

        tokens, segs, tags = [], [], []
        for item in src_data:
            res = self.tokenizer.encode(item, first_length=max_len)
            tokens.append(np.array(res[0]))
            segs.append(np.array(res[1]))

        max_len -= 2  # reserve two positions for [CLS] and [SEP]
        for item in src_tags:
            len_item = len(item.split(" "))
            if len_item >= max_len:
                tags.append(["X"] + item.split(" ")[:max_len] + ["X"])
            else:
                tags.append(["X"] + item.split(" ") + ["X"] * (max_len - len_item + 1))
        tags = [[self.tag2id[item] for item in term[1:]] for term in tags]
        tags = np.expand_dims(tags, axis=-1)
        return tokens, segs, tags
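# Illustrative only: a minimal sketch of driving Processor end to end. The vocab and
# data paths plus max_len are hypothetical; token_dict is built the same way as in
# build_trained_model above. get_tags() must run before get_bert_inputs(), since the
# latter reads self.tag2id.
token_dict = {}
with codecs.open('path/to/vocab.txt', 'r', encoding='utf-8') as f:  # hypothetical path
    for line in f:
        token_dict[line.strip()] = len(token_dict)

processor = Processor('path/to/train.json', token_dict)  # hypothetical path
tag2id, id2tag = processor.get_tags()
tokens, segs, tags = processor.get_bert_inputs('path/to/train.json', max_len=128)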
def load_data(filename):
    D = []
    with codecs.open(filename, encoding='utf-8') as f:
        for l in f:
            text1, text2, label = l.strip().split('\t')
            D.append((text1, text2, int(label)))
    return D


# load the datasets
train_data = load_data('datasets/lcqmc/lcqmc.train.data')
valid_data = load_data('datasets/lcqmc/lcqmc.valid.data')
test_data = load_data('datasets/lcqmc/lcqmc.test.data')

# build the tokenizer
tokenizer = Tokenizer(dict_path)


def seq_padding(X, padding=0):
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x
        for x in X
    ])


class data_generator:
    def __init__(self, data, batch_size=64):
        self.data = data
        self.batch_size = batch_size
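# Quick illustration of seq_padding: every sequence in a batch is right-padded with
# zeros up to the batch maximum. The toy token-id lists below are made up.
batch = [[101, 2769, 102], [101, 2769, 812, 1372, 102]]
padded = seq_padding(batch)
# padded.shape == (2, 5):
# [[ 101 2769  102    0    0]
#  [ 101 2769  812 1372  102]]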
def read_texts():
    txts = glob.glob('../../thuctc/THUCNews/*/*.txt')
    np.random.shuffle(txts)
    for txt in txts:
        d = open(txt, encoding='utf-8').read()  # read as text directly instead of decoding bytes
        d = d.replace(u'\u3000', ' ')
        d = d.split('\n')
        if len(d) > 1:
            title = d[0].strip()
            content = '\n'.join(d[1:]).strip()
            if len(title) <= max_output_len:
                yield content[:max_input_len], title


_token_dict = load_vocab(dict_path)  # load the vocab
_tokenizer = Tokenizer(_token_dict)  # build a temporary tokenizer

if os.path.exists(seq2seq_config):
    tokens = json.load(open(seq2seq_config))
else:
    def _batch_texts():
        texts = []
        for text in read_texts():
            texts.extend(text)
            if len(texts) == 1000:
                yield texts
                texts = []
        if texts:
            yield texts
#! -*- coding: utf-8 -*-
# Test that the code works: feature extraction

from bert4keras.bert import load_pretrained_model
from bert4keras.utils import Tokenizer
from keras.models import load_model
import numpy as np

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path)  # build the model and load the weights

# encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')

print('\n ===== predicting =====\n')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
"""
Output:
[[[-0.63251007  0.2030236   0.07936534 ...  0.49122632 -0.20493352  0.2575253 ]
  [-0.7588351   0.09651865  1.0718756  ... -0.6109694   0.04312154  0.03881441]
  [ 0.5477043  -0.792117    0.44435206 ...  0.42449304  0.41105673  0.08222899]
  [-0.2924238   0.6052722   0.49968526 ...  0.8604137  -0.6533166   0.5369075 ]
  [-0.7473459   0.49431565  0.7185162  ...  0.3848612  -0.74090636
    def get_tokenizer(cls):
        if cls.tokenizer is not None:
            return cls.tokenizer
        else:
            cls.tokenizer = Tokenizer(cls.get_token_dict()[0])
            return cls.tokenizer
from bert4keras.bert import load_pretrained_model, set_gelu
from bert4keras.utils import Tokenizer, load_vocab
from bert4keras.train import PiecewiseLinearLearningRate
import pandas as pd

set_gelu('tanh')  # switch the gelu variant

maxlen = 100
config_path = '/root/kg/bert/albert_base_zh/bert_config.json'
checkpoint_path = '/root/kg/bert/albert_base_zh/bert_model.ckpt'
dict_path = '/root/kg/bert/albert_base_zh/vocab.txt'

neg = pd.read_excel('datasets/neg.xls', header=None)
pos = pd.read_excel('datasets/pos.xls', header=None)

data, tokens = [], {}
_token_dict = load_vocab(dict_path)  # load the full vocab
_tokenizer = Tokenizer(_token_dict)  # build a temporary tokenizer

for d in neg[0]:
    data.append((d, 0))
    for t in _tokenizer.tokenize(d):
        tokens[t] = tokens.get(t, 0) + 1

for d in pos[0]:
    data.append((d, 1))
    for t in _tokenizer.tokenize(d):
        tokens[t] = tokens.get(t, 0) + 1

tokens = {i: j for i, j in tokens.items() if j >= 4}

token_dict, keep_words = {}, []  # keep_words lists the vocab rows retained from the BERT checkpoint

for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    token_dict[t] = len(token_dict)
    keep_words.append(_token_dict[t])
#! -*- coding: utf-8 -*-
# Test that the code works: MLM

from bert4keras.bert import load_pretrained_model
from bert4keras.utils import Tokenizer, load_vocab
import numpy as np

config_path = '../../kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '../../kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '../../kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

token_dict = load_vocab(dict_path)  # load the vocab
tokenizer = Tokenizer(token_dict)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path, with_mlm=True)  # build the model and load the weights

token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')

# mask out "技术"
token_ids[3] = token_ids[4] = token_dict['[MASK]']

# use the MLM model to predict the masked positions
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # the result is exactly "技术"
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from keras.models import Model
# Tokenizer and build_bert_model are assumed to come from bert4keras; the exact
# module path depends on the library version in use.


def similarity_count(vec1, vec2, mode='cos'):
    if mode == 'eu':
        return euclidean_distances([vec1, vec2])[0][1]
    if mode == 'cos':
        return cosine_similarity([vec1, vec2])[0][1]


maxlen = 128
config_path = 'albert_tiny_zh_google/albert_config_tiny_g.json'
checkpoint_path = 'albert_tiny_zh_google/albert_model.ckpt'
dict_path = 'albert_tiny_zh_google/vocab.txt'

tokenizer = Tokenizer(dict_path)

# load the pretrained model
bert = build_bert_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    with_pool=True,
    albert=True,
    return_keras_model=False,
)
model = Model(bert.model.input, bert.model.output)

token_ids1, segment_ids1 = tokenizer.encode(u'我想去北京')
token_ids2, segment_ids2 = tokenizer.encode(u'我想去香港')
token_ids3, segment_ids3 = tokenizer.encode(u'目前的局势,止暴制乱,刻不容缓')
#! -*- coding: utf-8 -*-
# Test that the code works: feature extraction

from bert4keras.bert import load_pretrained_model
from bert4keras.utils import Tokenizer, load_vocab
from keras.models import load_model
import numpy as np

config_path = '../../kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '../../kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '../../kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

token_dict = load_vocab(dict_path)  # load the vocab
tokenizer = Tokenizer(token_dict)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path)  # build the model and load the weights

# encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')

print('\n ===== predicting =====\n')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
"""
Output:
[[[-0.63251007  0.2030236   0.07936534 ...  0.49122632 -0.20493352  0.2575253 ]
  [-0.7588351   0.09651865  1.0718756  ... -0.6109694   0.04312154  0.03881441]
  [ 0.5477043  -0.792117    0.44435206 ...  0.42449304  0.41105673  0.08222899]
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from keras.models import Model
# Tokenizer and build_bert_model are assumed to come from bert4keras; the exact
# module path depends on the library version in use.


def similarity_count(vec1, vec2, mode='cos'):
    if mode == 'eu':
        return euclidean_distances([vec1, vec2])[0][1]
    if mode == 'cos':
        return cosine_similarity([vec1, vec2])[0][1]


maxlen = 128
config_path = 'albert_tiny_zh_google/albert_config_tiny_g.json'
checkpoint_path = 'albert_tiny_zh_google/albert_model.ckpt'
dict_path = 'albert_tiny_zh_google/vocab.txt'

tokenizer = Tokenizer(dict_path)

# load the pretrained model
bert = build_bert_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    with_pool=True,
    albert=True,
    return_keras_model=False,
)
model = Model(bert.model.input, bert.model.output)

token_ids1, segment_ids1 = tokenizer.encode(u'我想去北京逛一逛天安门')
token_ids2, segment_ids2 = tokenizer.encode(u'我想去香港')
token_ids3, segment_ids3 = tokenizer.encode(u'我想到天安门广场走一走')
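# Illustrative continuation (not part of the original snippet): with with_pool=True
# the model outputs the pooled [CLS] vector, so each sentence maps to a single
# vector that can be passed to similarity_count. The call pattern mirrors the
# encode/predict usage in the other snippets here.
vec1 = model.predict([np.array([token_ids1]), np.array([segment_ids1])])[0]
vec2 = model.predict([np.array([token_ids2]), np.array([segment_ids2])])[0]
vec3 = model.predict([np.array([token_ids3]), np.array([segment_ids3])])[0]

print(similarity_count(vec1, vec3))  # both sentences mention 天安门, so this should score higher
print(similarity_count(vec1, vec2))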
                # so we only need to look at the first one; no need to iterate over the rest
                if i == 0 and j > 0:
                    continue
                for k in _topk_arg[j]:
                    _candidate_ids.append(ids + [k + 3])
                    _candidate_scores.append(sco + _log_probas[j][k])
            _topk_arg = np.argsort(_candidate_scores)[-topk:]  # pick the new top-k from the candidates
            target_ids = [_candidate_ids[k] for k in _topk_arg]
            target_scores = [_candidate_scores[k] for k in _topk_arg]
            best_one = np.argmax(target_scores)
            if target_ids[best_one][-1] == self.token_dict.get("[SEP]"):
                return self.tokenizer.decode(target_ids[best_one])
        # if no end token appears within max_output_len characters, return the best candidate directly
        return self.tokenizer.decode(target_ids[np.argmax(target_scores)])


def get_token_dict(token_file):
    with open(token_file, "r") as f:
        token_list = f.readlines()
    token_dict = {word.strip(): id_ for id_, word in enumerate(token_list)}
    return token_dict


if __name__ == "__main__":
    dict_path = '/opt/developer/wp/nlpapp/train/multilingual_L-12_H-768_A-12/vocab.txt'
    token_dict = get_token_dict(dict_path)
    tokenizer = Tokenizer(token_dict)
    seq_model = trans_infer(tokenizer, token_dict)
    # ans = seq_model.gen_trans(input_.lower(), topk)
    print(seq_model.gen_trans("NLP简直太神奇了".lower(), 2))
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]')  # map whitespace to the untrained [unused1] token
            else:
                R.append('[UNK]')  # map every remaining character to [UNK]
        return R


dict_path = '/opt/developer/wp/wzcq/roberta_wwm/vocab.txt'
token_dict = get_token_dict(dict_path)
tokenizer = OurTokenizer(token_dict)

trans_dic_path = '/opt/developer/wp/nlpapp/train/multilingual_L-12_H-768_A-12/vocab.txt'
token_dict_trans = get_token_dict(trans_dic_path)
trans_tokenizer = Tokenizer(token_dict_trans)


@app.route('/')
def hello_world():
    data = {}
    return render_template("ci.html", **data)


@app.route('/mc')
def machine_read():
    return render_template('mc.html')


@app.route('/ci')
def generate_ci():
    return render_template('ci.html')


@app.route('/trans')