Example 1
import os

import fire
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Sequence, Lowercase, Strip


def train(dataset_path,
          output_dir='data/tokenizer/',
          vocab_size=30_000,
          min_frequency=3):

    # Trainer configuration: target vocabulary size, frequency cutoff and
    # the special tokens that must always be present in the vocabulary
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]'])

    # BPE model with an explicit unknown token, whitespace pre-tokenization
    # and lowercase/strip normalization
    tokenizer = Tokenizer(BPE(unk_token='[UNK]'))
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.normalizer = Sequence([Lowercase(), Strip()])

    # Tokenizer.train expects the list of files first, then the trainer
    files = [dataset_path]
    tokenizer.train(files, trainer)

    # Save the raw model files (vocab.json, merges.txt) plus the full
    # tokenizer definition
    os.makedirs(output_dir, exist_ok=True)
    tokenizer.model.save(output_dir)
    tokenizer.save(f'{output_dir}/tokenizer.json')


if __name__ == '__main__':
    fire.Fire(train)
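
Once trained, the serialized tokenizer can be reloaded with Tokenizer.from_file; a minimal usage sketch, assuming the default output_dir above:

from tokenizers import Tokenizer

tok = Tokenizer.from_file('data/tokenizer/tokenizer.json')
enc = tok.encode('a quick smoke test')
print(enc.tokens)  # lowercased, whitespace-split BPE pieces
print(enc.ids)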
Example 2
# %%
from pathlib import Path

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import Sequence, Lowercase
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.decoders import ByteLevel as ByteLevelDecoder

# %%
# Create a tokenizer around an (untrained) BPE model
tokenizer = Tokenizer(BPE())

# Normalizer: lowercase all input text
tokenizer.normalizer = Sequence([
    Lowercase(),
])

# Add the byte-level pre-tokenizer
tokenizer.pre_tokenizer = ByteLevel()

# Add the matching byte-level decoder
tokenizer.decoder = ByteLevelDecoder()

# %%
from tokenizers.trainers import BpeTrainer

# %%
# Create the BPE trainer
trainer = BpeTrainer(vocab_size=25000,
                     show_progress=True,
                     initial_alphabet=ByteLevel.alphabet())

# Train the BPE model (files first, then the trainer)
tokenizer.train(['data/big.txt'], trainer)
print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

# %%
SAVE_PATH = Path('tokenizers')
PATH = SAVE_PATH / 'bytelevel-bpe-tokenizer-model'
PATH.mkdir(parents=True, exist_ok=True)

# %%
# Save the model files (vocab.json and merges.txt)
tokenizer.model.save(str(PATH))
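
To reuse the saved model, the vocab/merges pair can be loaded back into a fresh Tokenizer; a sketch assuming the PATH defined above, reattaching the pre-tokenizer and decoder since they are not part of the saved model files:

# %%
# Reload the trained BPE model from the saved vocab.json / merges.txt
restored = Tokenizer(BPE.from_file(str(PATH / 'vocab.json'),
                                   str(PATH / 'merges.txt')))
restored.pre_tokenizer = ByteLevel()
restored.decoder = ByteLevelDecoder()

enc = restored.encode('byte-level bpe round trip')
print(enc.tokens)
print(restored.decode(enc.ids))  # byte-level decoder reassembles the text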
Example 3
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.train([
    "wikitext-2-raw/wiki.train.raw", "wikitext-2-raw/wiki.valid.raw",
    "wikitext-2-raw/wiki.test.raw"
], trainer)

output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output.tokens)

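The returned Encoding carries more than the token strings; these attributes come from the same tokenizers API, reusing output and tokenizer from the example above:

print(output.ids)      # token ids, aligned one-to-one with output.tokens
print(output.offsets)  # (start, end) character spans into the input string
print(tokenizer.token_to_id("[SEP]"))  # id assigned to a special token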