# -*- coding: utf-8 -*-
# @Date    : 2020/7/16
# @Author  : mingming.xu
# @Email   : [email protected]
# @File    : mask_language_model.py
import numpy as np

from toolkit4nlp.tokenizers import Tokenizer
from toolkit4nlp.models import build_transformer_model

config = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_config.json'
ckpt = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_model.ckpt'
vocab = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(vocab, do_lower_case=True)
# with_mlm=True keeps the masked-language-model head on top of the encoder
model = build_transformer_model(config, checkpoint_path=ckpt, with_mlm=True)

# tokens, segs = tokenizer.encode('北京网聘技术有限公司')
tokens, segs = tokenizer.encode('科学技术是第一生产力')
# mask the two characters of "技术" (positions 3 and 4; [CLS] sits at 0)
tokens[3] = tokens[4] = tokenizer._token_dict['[MASK]']

prob = model.predict([np.array([tokens]), np.array([segs])])[0]
# decode the argmax prediction at each masked position
print(tokenizer.decode(np.argmax(prob[3:5], axis=1)))
'''
The correct result should be: 技术
'''
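
# Follow-up sketch (an addition, not part of the original script): instead of
# taking only the argmax, list the top-5 candidate tokens at each masked
# position. Reuses `prob` and `tokenizer` from above; `top_k` and `pos` are
# hypothetical names introduced here.
top_k = 5
for pos in (3, 4):
    top_ids = np.argsort(prob[pos])[::-1][:top_k]  # ids sorted by probability
    print(pos, [tokenizer.decode([i]) for i in top_ids])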
# @Author  : mingming.xu
# @Email   : [email protected]
# @File    : extract_feature.py
import numpy as np

from toolkit4nlp.models import build_transformer_model
from toolkit4nlp.tokenizers import Tokenizer

config = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_config.json'
ckpt = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_model.ckpt'
vocab = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(vocab, do_lower_case=True)
# no task head: the model outputs the raw sequence of hidden states
model = build_transformer_model(config, checkpoint_path=ckpt)

token_ids, segment_ids = tokenizer.encode(u'我爱你中国')

print('\n ===== predicting =====\n')
# output shape: (1, sequence_length, hidden_size)
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
'''
[[[-0.00827767  0.52711666 -0.2616654  ...  0.7717162   0.6682844  -0.3481327 ]
  [ 0.3665638   0.35970846  0.0772187  ... -0.5211092  -0.46724823  0.07845997]
  [ 0.6985213  -0.04391993 -1.3160559  ...  1.061864    0.8293197   0.07258661]
  ...
  [ 0.25169933  0.3048255  -1.2513847  ...  0.5438095   0.46753633 -0.61883307]
  [ 0.07904327 -0.08373377 -0.3963912  ...  0.29524678  0.74877214 -0.27334687]
  [ 0.22920786  0.10579191  0.38394836 ...  0.60277367  0.02615384
'''
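
# Follow-up sketch (an addition, not part of the original script): the first
# row of the output corresponds to [CLS] and is commonly used as a crude
# sentence vector. Reuses `model`, `token_ids`, `segment_ids` from above.
features = model.predict([np.array([token_ids]), np.array([segment_ids])])
cls_vector = features[0, 0]  # shape (768,) for BERT-base (hidden_size=768)
print(cls_vector.shape)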
# Fragment of a fine-tuning script: it attaches an in-graph cross-entropy head
# and trains with weight decay plus gradient accumulation. The definitions of
# `model`, `tokenizer`, `labels`, `mask_idx` and the `CrossEntropy` layer live
# earlier in the original file; the imports below are reconstructed from the
# names used here and are an assumption about that file.
import numpy as np
from tqdm import tqdm
from toolkit4nlp.layers import Input
from toolkit4nlp.models import Model
from toolkit4nlp.optimizers import (Adam, extend_with_weight_decay,
                                    extend_with_gradient_accumulation)

# extra input carrying the target token ids; the loss is computed in-graph
target_in = Input(shape=(None,))
output = CrossEntropy(1)([target_in, model.output])

train_model = Model(model.inputs + [target_in], output)

# Adam -> AdamW (adds weight decay) -> AdamWG (adds gradient accumulation)
AdamW = extend_with_weight_decay(Adam)
AdamWG = extend_with_gradient_accumulation(AdamW)
opt = AdamWG(learning_rate=1e-5,
             exclude_from_weight_decay=['Norm', 'bias'],
             grad_accum_steps=4)
# no loss argument: CrossEntropy registers the loss via add_loss
train_model.compile(opt)
train_model.summary()

# token ids of each label word, stripped of [CLS] and [SEP]
label_ids = np.array([tokenizer.encode(l)[0][1:-1] for l in labels])


def predict(x):
    if len(x) == 3:  # drop the target ids if they are included
        x = x[:2]
    # probabilities at the masked position(s) ...
    y_pred = model.predict(x)[:, mask_idx]
    # ... restricted to the first token of each candidate label word
    y_pred = y_pred[:, 0, label_ids[:, 0]]
    y_pred = y_pred.argmax(axis=1)
    return y_pred


def evaluate(data):
    P, R, TP = 0., 0., 0.
    for d, _ in tqdm(data):
        x_true, y_true = d[:2], d[2]
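
# The fragment above relies on a `CrossEntropy` layer defined earlier in the
# original file. The sketch below shows how such a loss-in-layer is typically
# written in this code style; it is an assumed reconstruction, not the
# original definition, and uses a plain Keras Layer rather than any
# toolkit4nlp base class.
from keras.layers import Layer
import keras.backend as K


class CrossEntropy(Layer):
    """Computes sparse cross-entropy in-graph via add_loss and passes one of
    its inputs through unchanged, so compile() needs no external loss."""

    def __init__(self, output_axis, **kwargs):
        super(CrossEntropy, self).__init__(**kwargs)
        self.output_axis = output_axis  # index of the input to return

    def call(self, inputs):
        y_true, y_pred = inputs
        # y_true: (batch, seq) token ids; y_pred: (batch, seq, vocab) probs
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        self.add_loss(K.mean(loss))
        return inputs[self.output_axis]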