checkpoint_path = '/root/kg/bert/CPM_LM_2.6B_TF/model.ckpt'
spm_path = '/root/kg/bert/CPM_LM_2.6B_TF/chinese_vocab.model'


def pre_tokenize(text):
    """Preprocessing before tokenization: segment with jieba and protect whitespace.
    Spaces become u'\u2582' and newlines become u'\u2583' so they survive sentencepiece.
    """
    return [
        w.replace(' ', u'\u2582').replace('\n', u'\u2583')
        for w in jieba.cut(text, cut_all=False)
    ]


tokenizer = SpTokenizer(
    spm_path,
    token_start=None,
    token_end=None,
    pre_tokenize=pre_tokenize,
    # u'\u2583' stands for a newline; translate it to the special '<cls>' token.
    token_translate={u'\u2583': '<cls>'}
)  # build the tokenizer

model = build_transformer_model(
    config_path=config_path, checkpoint_path=checkpoint_path, model='gpt2'
)  # build the model and load the weights


class TextExpansion(AutoRegressiveDecoder):
    """Text continuation via random sampling.
    """
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids = np.concatenate([inputs[0], output_ids], 1)
        return model.predict(token_ids)[:, -1]
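A minimal usage sketch for the decoder above, assuming the usual bert4keras AutoRegressiveDecoder interface; the end_id, maxlen and sampling parameters below are illustrative placeholders, not values from the original script.

# Illustrative only: instantiate the decoder and continue a prompt with top-p sampling.
text_expansion = TextExpansion(
    start_id=None,
    end_id=3,       # assumed end-of-text id; adjust to the actual CPM vocabulary
    maxlen=64
)


def generate(text, n=1, topp=0.95):
    """Continue `text` with nucleus (top-p) sampling and restore spaces/newlines."""
    token_ids, _ = tokenizer.encode(text)
    results = text_expansion.random_sample([token_ids], n=n, topp=topp)
    return [
        text + tokenizer.decode(ids).replace(u'\u2582', ' ').replace(u'\u2583', '\n')
        for ids in results
    ]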
""" D = [] with open(filename, encoding='utf-8') as f: for l in f: title, content = l.strip().split('\t') D.append((title, content)) return D # 加载数据集 train_data = load_data('/root/csl/train.tsv') valid_data = load_data('/root/csl/val.tsv') test_data = load_data('/root/csl/test.tsv') # 加载分词器 tokenizer = SpTokenizer(spm_path, token_start=None, token_end='</s>') keep_tokens = json.load(open(keep_tokens_path)) class data_generator(DataGenerator): """数据生成器 """ def __iter__(self, random=False): batch_c_token_ids, batch_t_token_ids = [], [] for is_end, (title, content) in self.sample(random): c_token_ids, _ = tokenizer.encode(content, maxlen=max_c_len) t_token_ids, _ = tokenizer.encode(title, maxlen=max_t_len) batch_c_token_ids.append(c_token_ids) batch_t_token_ids.append([0] + t_token_ids) if len(batch_c_token_ids) == self.batch_size or is_end: batch_c_token_ids = sequence_padding(batch_c_token_ids)
""" @File : load_albert.py @Author : Pengy @Date : 2020/9/28 @Description : Input your description here ... """ from bert4keras.models import build_transformer_model from bert4keras.tokenizers import SpTokenizer from keras.layers import LSTM, Dense from keras.models import Model import numpy as np config_path = '../Models/albert_base_v2/albert_base/albert_config.json' checkpoint_path = '../Models/albert_base_v2/albert_base/model.ckpt-best' vocab_path = '../Models/albert_base_v2/albert_base/30k-clean.vocab' spm_path = '../Models/albert_base_v2/albert_base/30k-clean.model' tokenizer = SpTokenizer(spm_path) model = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='albert') model.summary() token_ids, segment_ids = tokenizer.encode('language model') print(model.predict([np.array([token_ids]), np.array([segment_ids])])) output = LSTM(64)(model.output) output = Dense(32)(output) my_model = Model(model.input, output) my_model.summary()
checkpoint_path = os.path.join(bert_path, 'model.ckpt-best')
dict_path = os.path.join(bert_path, '30k-clean.vocab')
spm_path = os.path.join(bert_path, '30k-clean.model')


# load data
def load_data(filename):
    D = []
    with open(filename, encoding='gb2312') as f:
        for l in f:
            text, label = l.strip().split('\t')
            D.append((text, int(label)))
    return D


# create the tokenizer
tokenizer = SpTokenizer(spm_path)
# tokenizer = Tokenizer(dict_path, do_lower_case=True)


class data_generator(DataGenerator):
    """Data generator.
    """
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == self.batch_size or is_end:
                # pad everything, emit the batch, then reset the buffers
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []
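Batches of ([token_ids, segment_ids], labels) like the ones produced above usually feed a small classification head on top of the encoder; the sketch below is an assumption for illustration (config_path, num_classes, the pooling choice and the optimizer are not taken from the excerpt).

# Hypothetical classifier head for the batches above; placeholder names are marked.
from bert4keras.models import build_transformer_model
from keras.layers import Lambda, Dense
from keras.models import Model

num_classes = 2  # placeholder

bert = build_transformer_model(
    config_path=config_path,          # assumed to be defined alongside checkpoint_path
    checkpoint_path=checkpoint_path,
    model='albert'
)
output = Lambda(lambda x: x[:, 0])(bert.output)             # use the [CLS] vector
output = Dense(num_classes, activation='softmax')(output)
model = Model(bert.input, output)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)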
epochs = 100000
summary_rate = 0.25
t_maxlen = maxlen // 4
s_maxlen = maxlen - t_maxlen

# mT5 configuration
config_path = '/root/kg/bert/mt5/mt5_base/mt5_base_config.json'
checkpoint_path = '/root/kg/bert/mt5/mt5_base/model.ckpt-1000000'
spm_path = '/root/kg/bert/mt5/sentencepiece.model'

# PEGASUS
dict_path_1 = '/root/kg/bert/chinese_pegasus_L-12_H-768_A-12/vocab.txt'
dict_path_2 = '/root/kg/bert/chinese_t5_pegasus_base/vocab.txt'

# build the vocabulary mapping
sp_tokenizer = SpTokenizer(spm_path, token_start=None, token_end=None)
token_dict = load_vocab(dict_path_1)
keep_tokens, new_token_dict, n = [], {}, 0

for t, _ in sorted(token_dict.items(), key=lambda s: s[1]):
    if n < 106:
        # keep the first 106 entries (special tokens) with fresh consecutive ids
        new_token_dict[t] = n
        n += 1
        continue
    if t.startswith('##'):
        # subword piece: look it up without the '##' prefix first
        i = sp_tokenizer.token_to_id(t[2:])
        if i == 2:  # 2 signals a failed lookup (<unk>), so retry the other form
            i = sp_tokenizer.token_to_id(u'\u2581' + t)
    else:
        # whole word: u'\u2581' is the sentencepiece word-boundary marker
        i = sp_tokenizer.token_to_id(u'\u2581' + t)
        if i == 2:
            i = sp_tokenizer.token_to_id(t)
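The excerpt stops before keep_tokens is actually filled; assuming the loop goes on to append each resolved sentencepiece id `i`, the mapping is typically consumed by passing keep_tokens to the model builder so that only the matching rows of the mT5 embedding are loaded. A hedged sketch (the model string and argument set are illustrative):

# Hedged sketch, not from the original script: load mT5 with the trimmed vocabulary.
from bert4keras.models import build_transformer_model

model = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model='mt5.1.1',                 # assumed mT5 variant string
    keep_tokens=keep_tokens          # restrict embeddings to the mapped token ids
)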
@classmethod
def setUpClass(cls) -> None:
    model_path = '../models/mt5_base/sentencepiece_cn.model'
    cls.raw_tokenizer = SpTokenizer(
        model_path, token_start=None, token_end='</s>'
    )
    cls.my_tokenizer = SentencePieceTokenizer(
        model_path, token_start=None, token_end='</s>'
    )
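A minimal follow-up test sketch, assuming SentencePieceTokenizer is meant to reproduce SpTokenizer's encode() behaviour; the sample text and method name are illustrative.

def test_encode_matches_reference(self):
    # Illustrative check: both tokenizers should produce identical ids for the same input.
    text = '今天天气不错'
    raw_ids, _ = self.raw_tokenizer.encode(text)
    my_ids, _ = self.my_tokenizer.encode(text)
    self.assertEqual(raw_ids, my_ids)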