Example #1
"""
@File   : load_albert.py
@Author : Pengy
@Date   : 2020/9/28
@Description : Load the ALBERT base v2 model with bert4keras and attach a downstream LSTM/Dense head.
"""
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import SpTokenizer
from keras.layers import LSTM, Dense
from keras.models import Model
import numpy as np

config_path = '../Models/albert_base_v2/albert_base/albert_config.json'
checkpoint_path = '../Models/albert_base_v2/albert_base/model.ckpt-best'
vocab_path = '../Models/albert_base_v2/albert_base/30k-clean.vocab'
spm_path = '../Models/albert_base_v2/albert_base/30k-clean.model'

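# Build the SentencePiece tokenizer and load the pretrained ALBERT weights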
tokenizer = SpTokenizer(spm_path)
model = build_transformer_model(config_path=config_path,
                                checkpoint_path=checkpoint_path,
                                model='albert')
model.summary()

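# Sanity check: encode a sample sentence and run a forward pass through ALBERT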
token_ids, segment_ids = tokenizer.encode('language model')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))

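# Attach a small downstream head (LSTM + Dense) on top of ALBERT's sequence output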
output = LSTM(64)(model.output)
output = Dense(32)(output)
my_model = Model(model.input, output)
my_model.summary()
Example #2
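This snippet begins partway through a loop over the original vocabulary `token_dict` (the loop head is not shown). Each token is looked up in a SentencePiece model via `sp_tokenizer`, first with the `\u2581` word-boundary prefix and then without; id 2 appears to serve as the "not found" id, so only tokens that map successfully are added to `keep_tokens` and `new_token_dict`. A complete script would also need the corresponding imports, presumably `import jieba` and `from bert4keras.tokenizers import Tokenizer, SpTokenizer, load_vocab, save_vocab`.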
    else:
        i = sp_tokenizer.token_to_id(u'\u2581' + t)
        if i == 2:
            i = sp_tokenizer.token_to_id(t)
    if i != 2:
        keep_tokens.append(i)
        new_token_dict[t] = len(new_token_dict)

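# Prepend 106 placeholder entries (id 2), presumably reserved slots for special tokens,
# then build a reverse map from each kept SentencePiece id to its new position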
keep_tokens = [2] * 106 + keep_tokens
keep_tokens_inv = {j: i for i, j in enumerate(keep_tokens)}

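# Tokens that did not map to a single SentencePiece id are still appended to the new
# vocabulary, represented as a sequence of kept ids (compound tokens)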
compound_tokens = []
for t, _ in sorted(token_dict.items(), key=lambda s: s[1]):
    if t not in new_token_dict:
        new_token_dict[t] = len(new_token_dict)
        ids = [keep_tokens_inv.get(i, 0) for i in sp_tokenizer.encode(t)[0]]
        compound_tokens.append(ids)

save_vocab(dict_path_2, new_token_dict)

# Build the tokenizer, using jieba (HMM disabled) for pre-segmentation
tokenizer = Tokenizer(new_token_dict,
                      do_lower_case=True,
                      pre_tokenize=lambda s: jieba.cut(s, HMM=False))


def corpus():
    """语料生成器
    """
    while True:
        f = '/root/data_pretrain/data_shuf.json'