def bpe_train(self, paths):
    trainer = BpeTrainer(
        vocab_size=50000,
        show_progress=True,
        initial_alphabet=ByteLevel.alphabet(),
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    )
    self.tokenizer.train(paths, trainer)

def __init__(self, vocab_size=25000, min_freq=5, lang="en", files=(None, None)) -> None:
    """
    Args:
        vocab_size: (int) target vocabulary size
        min_freq: (int) minimum pair frequency for a merge to be kept
        lang: (str) language code of the corpus
        files: (List[str]) optional ["vocab.json", "merges.txt"] of a previously trained model
    """
    super(BPETokenizer, self).__init__()
    self.tokenizer = Tokenizer(BPE(files[0], files[1]))
    self.lang = lang
    self.trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_freq,
        special_tokens=["[PAD]", "[SEP]"],
        initial_alphabet=ByteLevel.alphabet(),
    )
    # https://huggingface.co/docs/tokenizers/python/latest/components.html#normalizers
    self.tokenizer.normalizer = Sequence([NFKC(), Lowercase()])
    # https://huggingface.co/docs/tokenizers/python/latest/components.html#pre-tokenizers
    self.tokenizer.pre_tokenizer = ByteLevel()
    self.tokenizer.decoder = ByteLevelDecoder()

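# A minimal usage sketch for the class above (hedged): it assumes the enclosing
# class is BPETokenizer as referenced in __init__, that the wrapped `tokenizer`
# and `trainer` attributes are used directly, and that the current tokenizers
# `train(files, trainer)` signature applies. The corpus path is a hypothetical
# placeholder.
bpe = BPETokenizer(vocab_size=25000, min_freq=5, lang="en")
bpe.tokenizer.train(["corpus.en.txt"], bpe.trainer)  # hypothetical training file

encoded = bpe.tokenizer.encode("a simple example sentence")
print(encoded.tokens)
print(bpe.tokenizer.decode(encoded.ids))
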
def load_or_train_tokenizer(file_paths, tokenizer_model_path):
    '''
    Tries to load the saved text tokenizer.
    If there is none, trains a new tokenizer and saves it.
    '''
    import os
    from transformers import GPT2Tokenizer

    if not os.path.exists(tokenizer_model_path):
        print('Tokenizer model not found, training one')

        from tokenizers.models import BPE
        from tokenizers import Tokenizer
        from tokenizers.decoders import ByteLevel as ByteLevelDecoder
        from tokenizers.normalizers import NFKC, Sequence
        from tokenizers.pre_tokenizers import ByteLevel
        from tokenizers.trainers import BpeTrainer

        tokenizer = Tokenizer(BPE())
        tokenizer.normalizer = Sequence([NFKC()])
        tokenizer.pre_tokenizer = ByteLevel()
        tokenizer.decoder = ByteLevelDecoder()

        trainer = BpeTrainer(
            vocab_size=50000,
            show_progress=True,
            initial_alphabet=ByteLevel.alphabet(),
            special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
        )
        tokenizer.train(file_paths, trainer)

        if not os.path.exists(tokenizer_model_path):
            os.makedirs(tokenizer_model_path)
        tokenizer.model.save(tokenizer_model_path, None)

    print('Loading trained tokenizer model')
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_model_path)
    tokenizer.add_special_tokens({
        'eos_token': '</s>',
        'bos_token': '<s>',
        'unk_token': '<unk>',
        'pad_token': '<pad>',
        'mask_token': '<mask>',
    })
    return tokenizer

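# A hedged usage sketch for load_or_train_tokenizer: both paths below are
# hypothetical placeholders, and the sample sentence is illustrative only.
tokenizer = load_or_train_tokenizer(
    file_paths=["data/corpus.txt"],              # hypothetical training file
    tokenizer_model_path="models/bpe-tokenizer"  # hypothetical output directory
)
ids = tokenizer.encode("an example sentence")
print(ids)
print(tokenizer.decode(ids))
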
def bpe_train(self, paths):
    trainer = BpeTrainer(
        vocab_size=50000,
        show_progress=True,
        initial_alphabet=ByteLevel.alphabet(),
        special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",
            "<mask>",
            "<company>",
            "<label>",
            "<category>",
            "<review>",
        ],
    )
    self.tokenizer.train(paths, trainer=trainer)

def test_has_alphabet(self):
    assert isinstance(ByteLevel.alphabet(), list)
    assert len(ByteLevel.alphabet()) == 256

from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

path_data = "../../ml-datasets/wmt14/tokenizer/"
path_train_src = "../../ml-datasets/wmt14/train.en"
path_train_tgt = "../../ml-datasets/wmt14/train.de"

tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([NFKC(), Lowercase()])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(
    vocab_size=25000,
    show_progress=True,
    initial_alphabet=ByteLevel.alphabet(),
    min_frequency=2,
    special_tokens=["<pad>", "<s>", "</s>", "<unk>", "<mask>"],
)
tokenizer.train([path_train_src, path_train_tgt], trainer)

print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))
tokenizer.model.save(path_data)

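# A short follow-up sketch (hedged): reload the saved model and encode a sample
# sentence. `BPE.from_file` and the default "vocab.json"/"merges.txt" filenames
# are tokenizers library conventions; the sample sentence is illustrative only.
reloaded = Tokenizer(BPE.from_file(path_data + "vocab.json", path_data + "merges.txt"))
reloaded.normalizer = Sequence([NFKC(), Lowercase()])
reloaded.pre_tokenizer = ByteLevel()
reloaded.decoder = ByteLevelDecoder()

enc = reloaded.encode("the quick brown fox")
print(enc.tokens)
print(reloaded.decode(enc.ids))
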
# Our tokenizer also needs a pre-tokenizer responsible for converting the input
# to a ByteLevel representation.
tokenizer.pre_tokenizer = ByteLevel()

# And finally, let's plug in a decoder so we can recover the original input
# from a tokenized one.
tokenizer.decoder = ByteLevelDecoder()

from tokenizers.trainers import BpeTrainer

# We initialize our trainer, giving it the details about the vocabulary we want
# to generate.
trainer = BpeTrainer(vocab_size=25000, show_progress=True,
                     initial_alphabet=ByteLevel.alphabet())
tokenizer.train(["/Volumes/750GB-HDD/root/Question-Answering/pyData/big.txt"], trainer)

print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

# Et voilà! You trained your very first tokenizer from scratch using tokenizers.
# Of course, this covers only the basics, and you may want to have a look at the
# add_special_tokens or special_tokens parameters on the Trainer class, but the
# overall process should be very similar.

# You will see the generated files in the output.
tokenizer.model.save('/Volumes/750GB-HDD/root/Question-Answering/pyData')

# Let's tokenize a simple input
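# A minimal continuation (hedged): encode a sample sentence and decode it back.
# The sample text is illustrative only.
encoding = tokenizer.encode("This is a simple input to be tokenized")
print("Encoded tokens: {}".format(encoding.tokens))
print("Encoded ids:    {}".format(encoding.ids))

decoded = tokenizer.decode(encoding.ids)
print("Decoded string: {}".format(decoded))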