Example 1
# -*- coding: utf-8 -*-
# @Date    : 2020/7/16
# @Author  : mingming.xu
# @Email   : [email protected]
# @File    : mask_language_model.py
import numpy as np

from toolkit4nlp.tokenizers import Tokenizer

from toolkit4nlp.models import build_transformer_model


config = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_config.json'
ckpt = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_model.ckpt'
vocab = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(vocab, do_lower_case=True)

model = build_transformer_model(config, checkpoint_path=ckpt, with_mlm=True)

# tokens, segs = tokenizer.encode('北京网聘技术有限公司')
tokens, segs = tokenizer.encode('科学技术是第一生产力')
# Mask positions 3 and 4, i.e. the characters "技" and "术".
tokens[3] = tokens[4] = tokenizer._token_dict['[MASK]']

prob = model.predict([np.array([tokens]), np.array([segs])])[0]
# Decode the most likely token at each of the two masked positions.
print(tokenizer.decode(np.argmax(prob[3:5], axis=1)))
'''
Expected output: 技术
'''
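
For a closer look at the MLM head, a minimal sketch (reusing `prob`, `tokenizer` and `np` from the script above; the choice of top-5 is arbitrary) that lists the highest-probability candidates for each masked position instead of only the argmax:

# List the top-5 candidate tokens for each of the two masked positions.
for pos in (3, 4):
    top_ids = np.argsort(prob[pos])[::-1][:5]  # token ids, most probable first
    candidates = [tokenizer.decode([int(i)]) for i in top_ids]
    print('position %d:' % pos, candidates)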
Example 2
# @Author  : mingming.xu
# @Email   : [email protected]
# @File    : extract_feature.py
from toolkit4nlp.models import build_transformer_model
from toolkit4nlp.tokenizers import Tokenizer
import numpy as np

config = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_config.json'
ckpt = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_model.ckpt'
vocab = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(vocab, do_lower_case=True)

model = build_transformer_model(config, checkpoint_path=ckpt)

token_ids, segment_ids = tokenizer.encode(u'我爱你中国')

print('\n ===== predicting =====\n')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
'''[[[-0.00827767  0.52711666 -0.2616654  ...  0.7717162   0.6682844
   -0.3481327 ]
  [ 0.3665638   0.35970846  0.0772187  ... -0.5211092  -0.46724823
    0.07845997]
  [ 0.6985213  -0.04391993 -1.3160559  ...  1.061864    0.8293197
    0.07258661]
  ...
  [ 0.25169933  0.3048255  -1.2513847  ...  0.5438095   0.46753633
   -0.61883307]
  [ 0.07904327 -0.08373377 -0.3963912  ...  0.29524678  0.74877214
   -0.27334687]
  [ 0.22920786  0.10579191  0.38394836 ...  0.60277367  0.02615384
   ...
'''
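
The sequence output above can be reduced to a fixed-size sentence vector. A minimal sketch, reusing `model`, `token_ids` and `segment_ids` from the script above; pooling the first ([CLS]) position is the usual BERT convention, not something this script itself prescribes:

# Use the hidden state of the first token ([CLS]) as a sentence embedding.
seq_output = model.predict([np.array([token_ids]), np.array([segment_ids])])
cls_vector = seq_output[0, 0]  # shape (768,) for BERT-base
print(cls_vector.shape)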
Example 3

# NOTE: this snippet is an excerpt from a larger training script. It assumes
# earlier definitions of a custom CrossEntropy loss layer plus `model`,
# `tokenizer`, `labels` and `mask_idx`, and imports along these lines
# (module paths assumed to follow toolkit4nlp's layout):
#   import numpy as np
#   from tqdm import tqdm
#   from toolkit4nlp.layers import Input
#   from toolkit4nlp.models import Model
#   from toolkit4nlp.optimizers import (Adam, extend_with_weight_decay,
#                                       extend_with_gradient_accumulation)

# Feed the target ids in as an extra input so the loss can be computed
# inside the graph by the custom CrossEntropy layer.
target_in = Input(shape=(None,))
output = CrossEntropy(1)([target_in, model.output])

train_model = Model(model.inputs + [target_in], output)

# Compose the optimizer: Adam extended with decoupled weight decay, then
# with gradient accumulation.
AdamW = extend_with_weight_decay(Adam)
AdamWG = extend_with_gradient_accumulation(AdamW)

# grad_accum_steps=4 accumulates gradients over 4 batches before each weight
# update, emulating a 4x larger effective batch size at the memory cost of a
# single batch. LayerNorm and bias parameters are excluded from weight decay,
# as is standard for BERT fine-tuning.
opt = AdamWG(learning_rate=1e-5,
             exclude_from_weight_decay=['Norm', 'bias'],
             grad_accum_steps=4)
train_model.compile(opt)
train_model.summary()

# Token ids of each label word, stripped of the [CLS]/[SEP] ids added by encode().
label_ids = np.array([tokenizer.encode(l)[0][1:-1] for l in labels])


def predict(x):
    # Drop the target ids if they are still bundled with the model inputs.
    if len(x) == 3:
        x = x[:2]
    # MLM probability distribution at the masked position(s).
    y_pred = model.predict(x)[:, mask_idx]
    # Score each label by the probability of its first token, then pick the best.
    y_pred = y_pred[:, 0, label_ids[:, 0]]
    y_pred = y_pred.argmax(axis=1)
    return y_pred


def evaluate(data):
    # Accumulators for precision/recall statistics.
    P, R, TP = 0., 0., 0.
    for d, _ in tqdm(data):
        x_true, y_true = d[:2], d[2]
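
The excerpt stops inside `evaluate`. For orientation only, a hedged sketch of how accumulators named P/R/TP are conventionally turned into scores once such a loop has filled them; the interpretation of the names is an assumption, not the author's code:

# Hypothetical final step: P = predicted count, R = gold count,
# TP = correctly predicted count.
precision = TP / P if P else 0.
recall = TP / R if R else 0.
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.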