def build_trained_model(args):
    """Load a trained CRF sequence-tagging model plus its supporting artifacts.

    Selects the compute device via CUDA_VISIBLE_DEVICES, rebuilds the
    tokenizer from the BERT vocab file, restores the tag mappings, and loads
    the Keras model with its custom CRF objects.

    Args:
        args: namespace providing device_map, bert_vocab, model_path and
            model_name attributes.

    Returns:
        Tuple of (tokenizer, id2tag, viterbi_decoder).
    """
    # An empty CUDA_VISIBLE_DEVICES hides all GPUs and forces CPU execution.
    if args.device_map != "cpu":
        os.environ["CUDA_VISIBLE_DEVICES"] = args.device_map
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = ""

    # Rebuild the vocabulary: token -> running index, in file order.
    token_dict = {}
    with codecs.open(args.bert_vocab, "r", encoding="utf-8") as f:
        for line in f:
            token = line.strip()
            token_dict[token] = len(token_dict)
    tokenizer = Tokenizer(token_dict)

    # Restore the tag <-> id mappings persisted at training time.
    with codecs.open(os.path.join(args.model_path, "tag2id.pkl"), "rb") as f:
        tag2id = pickle.load(f)
    with codecs.open(os.path.join(args.model_path, "id2tag.pkl"), "rb") as f:
        id2tag = pickle.load(f)

    mask_tag = "X"
    crf_loss = CRF_Loss(tag2id=tag2id, mask_tag=mask_tag).crf_loss
    crf_accuracy = CRF_Accuracy(tag2id=tag2id, mask_tag=mask_tag).crf_accuracy

    # BUG FIX: custom_objects was assigned into without ever being initialized,
    # which raises NameError unless an unseen module-level dict happened to
    # exist. Initialize it locally with exactly the objects load_model needs.
    custom_objects = {}
    custom_objects["CRF"] = CRF
    custom_objects["crf_loss"] = crf_loss
    custom_objects["crf_accuracy"] = crf_accuracy

    model = load_model(os.path.join(args.model_path, args.model_name),
                       custom_objects=custom_objects)
    viterbi_decoder = Viterbi(model, len(id2tag))
    return tokenizer, id2tag, viterbi_decoder
def process_data(data_file='./data/classify_data.txt'):
    """Prepare classification data for a trimmed-vocabulary BERT model.

    Reads "<text> <label>" lines, builds a reduced vocabulary covering only
    the characters that occur in the corpus (plus BERT special tokens),
    persists tokenizer artifacts, and splits the data 9:1 train/validation
    using a cached shuffle order for reproducibility.

    Args:
        data_file: UTF-8 file; one sample per line, label is the last
            whitespace-separated token.

    Returns:
        (train_data, valid_data, tokenizer, keep_words, label2id)
    """
    with open(data_file, encoding='utf-8') as f:
        datas = f.readlines()

    chars = set()    # every character observed in the corpus
    labels = set()   # every distinct label
    new_datas = []
    for data in datas:
        data = data.strip()
        if not data:
            continue
        # The label is the final whitespace-separated field.
        text, label = data.rsplit(maxsplit=1)
        chars.update(set(text))
        labels.add(label)
        new_datas.append([text, label])
    del datas  # release the raw lines early

    # NOTE(review): set iteration order varies between runs, so label ids are
    # only stable because label2id is pickled below — confirm downstream code
    # always reloads it rather than recomputing.
    label2id = {lab: i for i, lab in enumerate(list(labels))}

    _token_dict = load_vocab(dict_path)  # full pretrained vocabulary
    # Keep special tokens plus only characters that actually occur;
    # keep_words records their row indices in the original embedding matrix.
    token_dict, keep_words = {}, []
    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])
    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.append(_token_dict[c])

    tokenizer = Tokenizer(token_dict)  # tokenizer over the reduced vocab

    # Persist the artifacts needed to reload the model later.
    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)
    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)
    with open(os.path.join(model_save_path, 'label2id.pkl'), "wb") as f:
        pickle.dump(label2id, f)

    # Reuse a cached shuffle order so the split is reproducible across runs.
    if not os.path.exists('./random_order.json'):
        random_order = list(range(len(new_datas)))
        random.shuffle(random_order)
        # BUG FIX: the file handles previously passed inline to json.dump /
        # json.load were never closed; context managers guarantee the dump is
        # flushed and both handles are released.
        with open('./random_order.json', 'w') as f:
            json.dump(random_order, f, indent=4)
    else:
        with open('./random_order.json') as f:
            random_order = json.load(f)

    # 9:1 train/validation split over the shuffled indices.
    train_data = [new_datas[j] for i, j in enumerate(random_order) if i % 10 != 0]
    valid_data = [new_datas[j] for i, j in enumerate(random_order) if i % 10 == 0]
    return train_data, valid_data, tokenizer, keep_words, label2id
def load_data(filename):
    """Read a tab-separated sentence-pair file into (text1, text2, label) tuples."""
    D = []
    with codecs.open(filename, encoding='utf-8') as f:
        for l in f:
            text1, text2, label = l.strip().split('\t')
            D.append((text1, text2, int(label)))
    return D


# Load the LCQMC train/validation/test splits.
train_data = load_data('datasets/lcqmc/lcqmc.train.data')
valid_data = load_data('datasets/lcqmc/lcqmc.valid.data')
test_data = load_data('datasets/lcqmc/lcqmc.test.data')

# Build the tokenizer.
tokenizer = Tokenizer(dict_path)


def seq_padding(X, padding=0):
    # Right-pad every sequence in X with `padding` up to the longest
    # sequence in the batch, returning a rectangular numpy array.
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x
        for x in X
    ])


class data_generator:
    # Batch iterator over (text1, text2, label) triples.
    # NOTE(review): the class appears truncated in this view — only __init__
    # is visible; the iteration/len methods presumably follow elsewhere.
    def __init__(self, data, batch_size=64):
        self.data = data
        self.batch_size = batch_size
def read_texts():
    """Yield (content, title) pairs from THUCNews articles in random order.

    NOTE(review): `d.decode('utf-8')` implies Python 2 byte-string handling;
    under Python 3 `read()` already returns str and this line would raise —
    confirm the target interpreter.
    """
    txts = glob.glob('../../thuctc/THUCNews/*/*.txt')
    np.random.shuffle(txts)
    for txt in txts:
        d = open(txt).read()  # NOTE(review): file handle is never closed
        d = d.decode('utf-8').replace(u'\u3000', ' ')  # replace ideographic spaces
        d = d.split('\n')
        if len(d) > 1:
            title = d[0].strip()  # first line is the headline
            content = '\n'.join(d[1:]).strip()  # remainder is the article body
            if len(title) <= max_output_len:
                # Truncate the body to the model's maximum input length.
                yield content[:max_input_len], title


_token_dict = load_vocab(dict_path)  # load the full vocabulary
_tokenizer = Tokenizer(_token_dict)  # temporary tokenizer over it

if os.path.exists(seq2seq_config):
    # NOTE(review): the handle given to json.load here is never closed.
    tokens = json.load(open(seq2seq_config))
else:
    def _batch_texts():
        # Collect content/title strings in batches of 1000 elements
        # (each article contributes two strings via extend).
        texts = []
        for text in read_texts():
            texts.extend(text)
            if len(texts) == 1000:
                yield texts
                texts = []
        if texts:
            # NOTE(review): the chunk ends here in this view — the final yield
            # and the remainder of the else-branch are not visible.
#! -*- coding: utf-8 -*- # 测试代码可用性: 提取特征 from bert4keras.bert import load_pretrained_model from bert4keras.utils import Tokenizer from keras.models import load_model import numpy as np config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' tokenizer = Tokenizer(dict_path) # 建立分词器 model = load_pretrained_model(config_path, checkpoint_path) # 建立模型,加载权重 # 编码测试 token_ids, segment_ids = tokenizer.encode(u'语言模型') print('\n ===== predicting =====\n') print(model.predict([np.array([token_ids]), np.array([segment_ids])])) """ 输出: [[[-0.63251007 0.2030236 0.07936534 ... 0.49122632 -0.20493352 0.2575253 ] [-0.7588351 0.09651865 1.0718756 ... -0.6109694 0.04312154 0.03881441] [ 0.5477043 -0.792117 0.44435206 ... 0.42449304 0.41105673 0.08222899] [-0.2924238 0.6052722 0.49968526 ... 0.8604137 -0.6533166 0.5369075 ] [-0.7473459 0.49431565 0.7185162 ... 0.3848612 -0.74090636
def get_tokenizer(cls,):
    """Return the cached class-level tokenizer, constructing it on first use."""
    if cls.tokenizer is None:
        # Lazy initialization: build from the first element returned by
        # get_token_dict() and memoize on the class.
        cls.tokenizer = Tokenizer(cls.get_token_dict()[0])
    return cls.tokenizer
#! -*- coding: utf-8 -*- # 测试代码可用性: MLM from bert4keras.bert import load_pretrained_model from bert4keras.utils import Tokenizer, load_vocab import numpy as np config_path = '../../kg/bert/chinese_L-12_H-768_A-12/bert_config.json' checkpoint_path = '../../kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' dict_path = '../../kg/bert/chinese_L-12_H-768_A-12/vocab.txt' token_dict = load_vocab(dict_path) # 读取词典 tokenizer = Tokenizer(token_dict) # 建立分词器 model = load_pretrained_model(config_path, checkpoint_path, with_mlm=True) # 建立模型,加载权重 token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力') # mask掉“技术” token_ids[3] = token_ids[4] = token_dict['[MASK]'] # 用mlm模型预测被mask掉的部分 probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0] print(tokenizer.decode(probas[3:5].argmax(axis=1))) # 结果正是“技术”
# 所以只需要看第一个,不需要遍历后面的。 if i == 0 and j > 0: continue for k in _topk_arg[j]: _candidate_ids.append(ids + [k + 3]) _candidate_scores.append(sco + _log_probas[j][k]) _topk_arg = np.argsort(_candidate_scores)[-topk:] # 从中选出新的topk target_ids = [_candidate_ids[k] for k in _topk_arg] target_scores = [_candidate_scores[k] for k in _topk_arg] best_one = np.argmax(target_scores) if target_ids[best_one][-1] == self.token_dict.get("[SEP]"): return self.tokenizer.decode(target_ids[best_one]) # 如果max_output_len字都找不到结束符,直接返回 return self.tokenizer.decode(target_ids[np.argmax(target_scores)]) def get_token_dict(token_file): with open(token_file, "r") as f: token_list = f.readlines() token_dict = {word.strip(): id_ for id_, word in enumerate(token_list)} return token_dict if __name__ == "__main__": dict_path = '/opt/developer/wp/nlpapp/train/multilingual_L-12_H-768_A-12/vocab.txt' token_dict = get_token_dict(dict_path) tokenizer = Tokenizer(token_dict) seq_model = trans_infer(tokenizer, token_dict) # ans = seq_model.gen_trans(input_.lower(), topk) print(seq_model.gen_trans("NLP简直太神奇了".lower(), 2))
# --- NOTE(review): the fragment below is the tail of a custom tokenizer
# --- method (likely _tokenize); its def line and loop header are outside
# --- this view, so the indentation here is reconstructed.
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                # Space-like characters use the untrained [unused1] slot.
                R.append('[unused1]')
            else:
                # All remaining characters become [UNK].
                R.append('[UNK]')
        return R


# Vocabulary/tokenizer pair for the RoBERTa-wwm model.
dict_path = '/opt/developer/wp/wzcq/roberta_wwm/vocab.txt'
token_dict = get_token_dict(dict_path)
tokenizer = OurTokenizer(token_dict)

# Separate multilingual vocabulary/tokenizer for the translation model.
trans_dic_path = '/opt/developer/wp/nlpapp/train/multilingual_L-12_H-768_A-12/vocab.txt'
token_dict_trans = get_token_dict(trans_dic_path)
trans_tokenizer = Tokenizer(token_dict_trans)


@app.route('/')
def hello_world():
    # Landing page; renders the same template as /ci with no extra context.
    data = {}
    return render_template("ci.html", **data)


@app.route('/mc')
def machine_read():
    # Machine-reading-comprehension page.
    return render_template('mc.html')


@app.route('/ci')
def generate_ci():
    # Ci (classical-poetry) generation page.
    return render_template('ci.html')


@app.route('/trans')
# NOTE(review): chunk ends here — the decorated view function is not visible.
def __init__(self, train_path, token_dict):
    """Remember the training-file path and build a Tokenizer over token_dict."""
    self.train_path, self.tokenizer = train_path, Tokenizer(token_dict)