def __init__(self, vocab_size=25000, min_freq=5, lang="en", files=[None, None]) -> None:
    """
    Args:
        vocab_size: (int) target vocabulary size
        min_freq: (int) minimum pair frequency required for a merge
        lang: (str) language code, e.g. "en"
        files: (List[str]) ["vocab.json", "merge.txt"]
    """
    super(BPETokenizer, self).__init__()
    self.tokenizer = Tokenizer(BPE(files[0], files[1]))
    self.lang = lang
    self.trainer = BpeTrainer(vocab_size=vocab_size,
                              min_frequency=min_freq,
                              special_tokens=["[PAD]", "[SEP]"],
                              initial_alphabet=ByteLevel.alphabet())

    # https://huggingface.co/docs/tokenizers/python/latest/components.html#normalizers
    self.tokenizer.normalizer = Sequence([NFKC(), Lowercase()])

    # https://huggingface.co/docs/tokenizers/python/latest/components.html#pre-tokenizers
    self.tokenizer.pre_tokenizer = ByteLevel()
    self.tokenizer.decoder = ByteLevelDecoder()
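# A minimal usage sketch for the constructor above, assuming the class is named
# BPETokenizer (as its super() call suggests) and that "vocab.json" / "merge.txt"
# are placeholder names for files produced by an earlier training run.
bpe = BPETokenizer(vocab_size=25000, min_freq=5, lang="en",
                   files=["vocab.json", "merge.txt"])
encoded = bpe.tokenizer.encode("Hello, world!")
print(encoded.tokens)                     # byte-level sub-word tokens
print(bpe.tokenizer.decode(encoded.ids))  # ByteLevelDecoder restores the original spacing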
def test_can_modify(self):
    pretok = ByteLevel(add_prefix_space=False)
    assert pretok.add_prefix_space == False

    # Modify these
    pretok.add_prefix_space = True
    assert pretok.add_prefix_space == True
def __init__(self):
    self.tokenizer = Tokenizer(BPE())
    self.tokenizer.normalizer = Sequence([NFKC()])
    self.tokenizer.pre_tokenizer = ByteLevel()
    self.tokenizer.decoder = ByteLevelDecoder()
def tokenizer_pipeline():
    """
    Specific pipeline for Cebuano corpus tokenization
    - uses a Byte-Pair Encoding (BPE) tokenizer
    """
    tokenizer = Tokenizer(BPE())
    # string normalization
    tokenizer.normalizer = Sequence([NFD(), StripAccents(), Lowercase()])
    tokenizer.pre_tokenizer = ByteLevel()
    tokenizer.decoder = ByteLevelDecoder()
    return tokenizer
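# Small sketch of what the normalizer / pre-tokenizer pair configured above does to
# a sample string before any BPE merges are learned; the sample text is arbitrary,
# the normalize_str / pre_tokenize_str helpers are part of the tokenizers API.
tok = tokenizer_pipeline()
print(tok.normalizer.normalize_str("Café señor"))        # -> "cafe senor" (NFD + StripAccents + Lowercase)
print(tok.pre_tokenizer.pre_tokenize_str("cafe senor"))  # byte-level pieces with character offsets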
def bpe_train(self, paths):
    trainer = BpeTrainer(vocab_size=50000,
                         show_progress=True,
                         initial_alphabet=ByteLevel.alphabet(),
                         special_tokens=[
                             "<s>",
                             "<pad>",
                             "</s>",
                             "<unk>",
                             "<mask>",
                             "<company>",
                             "<label>",
                             "<category>",
                             "<review>",
                         ])
    self.tokenizer.train(trainer, paths)
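# Standalone sketch of the same training step. The argument order train(trainer, paths)
# above matches older tokenizers releases (<0.10); the current API is
# train(files, trainer=trainer), used below. The glob pattern is an assumption about
# where the review corpus lives, and only the standard special tokens are kept here.
from glob import glob
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel

tok = Tokenizer(BPE())
tok.pre_tokenizer = ByteLevel()
trainer = BpeTrainer(vocab_size=50000, show_progress=True,
                     initial_alphabet=ByteLevel.alphabet(),
                     special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
tok.train(glob("data/reviews/*.txt"), trainer=trainer)  # hypothetical corpus location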
def train_tokenizer_vocab(dataset, style='BPE', force_retrain=True):
    """
    if force_retrain: overwrite the stored tokenizer from tokenizers dir (by retraining)
    else: load the tokenizer if it exists
    """
    assert dataset in VALID_DATASETS
    assert style in VALID_TOKENIZATIONS

    tpath_expected = default_tpath(dataset, style)

    train = True
    if not force_retrain and os.path.isfile(tpath_expected):
        tokenizer = Tokenizer.from_file(tpath_expected)
        train = False
    else:
        print('%s tokenizer file does not exist; training new tokenizer' % tpath_expected)

    if train:
        # load data associated with one of the valid datasets (from /data/ directory)
        datafiles = load_dataset(dataset)

        # Steps for each algo (e.g. BPE):
        #  - init Tokenizer using algo
        #  - specify algo-specific trainer
        #  - specify any pre-processing of text (will affect decoding)
        #    see: https://huggingface.co/docs/tokenizers/python/latest/components.html#decoders
        #  - different training calls if it's the arxiv dataset or wikitext
        #    see https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
        if style == 'BPE':
            tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
            trainer = BpeTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            tokenizer.pre_tokenizer = ByteLevel()
            if dataset == 'arxiv':
                tokenizer.train_from_iterator(datafiles, trainer=trainer)
            else:
                tokenizer.train(datafiles, trainer=trainer)
            tokenizer.decoder = decoders.ByteLevel()
        else:
            assert style == 'WordLevel'
            tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
            trainer = WordLevelTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            tokenizer.pre_tokenizer = Whitespace()
            if dataset == 'arxiv':
                tokenizer.train_from_iterator(datafiles, trainer=trainer)
            else:
                tokenizer.train(datafiles, trainer=trainer)
            tokenizer.decoder = decoders.WordPiece()  # WordPiece seems to work (adds back spaces)

        # Save to tokenizers directory
        tokenizer.save(tpath_expected)

    # Generate vocab object based on tokenizer.decoder() method
    # ... TODO implement the same vocabulary functionality, or ensure it is present in Tokenizer and then code it elsewhere...
    # Features we need to match:
    #   from torchtext.legacy.vocab import Vocab as RetiredVocab
    #   ntokens = len(vocab.stoi) ---> ntokens = tokenizer.(...)
    #   data = [torch.tensor([vocab[token] for token in tokenizer(item)],
    #                        dtype=torch.long) for item in raw_text_iter]
    #   tokenized_text_ints = torch.tensor([vocab[token] for token in tokenized_text], dtype=torch.long)
    #   running_context_string = ' '.join([vocab.itos[src[k]] for k in range(src.shape[0])])
    #   unk_index = vocab.unk_index
    vocab = None

    return tokenizer, vocab
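# Hedged usage sketch for train_tokenizer_vocab, assuming 'arxiv' is one of the
# entries in VALID_DATASETS (the function branches on it); the round trip relies
# on the ByteLevel decoder set inside the function, and get_vocab_size() covers
# the len(vocab.stoi) use case noted in the TODO above.
tokenizer, vocab = train_tokenizer_vocab('arxiv', style='BPE', force_retrain=False)

enc = tokenizer.encode("Attention is all you need.")
print(enc.tokens)                  # sub-word pieces
print(enc.ids)                     # integer ids, usable in place of vocab[token] lookups
print(tokenizer.decode(enc.ids))   # reconstructs the original string
print(tokenizer.get_vocab_size())  # replacement for ntokens = len(vocab.stoi)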
def test_instantiate(self):
    assert ByteLevel() is not None
    assert ByteLevel(add_prefix_space=True) is not None
    assert ByteLevel(add_prefix_space=False) is not None
    assert isinstance(ByteLevel(), PreTokenizer)
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel

# First we create an empty Byte-Pair Encoding model (i.e. not a trained model)
tokenizer = Tokenizer(BPE())

# Then we enable lower-casing and unicode-normalization.
# The Sequence normalizer allows us to combine multiple Normalizers that will be
# executed in order.
tokenizer.normalizer = Sequence([NFKC(), Lowercase()])

# Our tokenizer also needs a pre-tokenizer responsible for converting the input
# to a ByteLevel representation.
tokenizer.pre_tokenizer = ByteLevel()

# And finally, let's plug in a decoder so we can recover from a tokenized input
# back to the original one
tokenizer.decoder = ByteLevelDecoder()

from tokenizers.trainers import BpeTrainer

# We initialize our trainer, giving it the details about the vocabulary we want
# to generate
trainer = BpeTrainer(vocab_size=25000, show_progress=True, initial_alphabet=ByteLevel.alphabet())
tokenizer.train(trainer, ["/Volumes/750GB-HDD/root/Question-Answering/pyData/big.txt"])
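# A quick round-trip check, assuming the training call above has completed: the
# ByteLevel pre-tokenizer maps text to byte-level symbols, and the matching decoder
# maps the ids back to the (lower-cased, NFKC-normalized) input string.
encoding = tokenizer.encode("This is a simple input to be tokenized")
print("Encoded string:", encoding.tokens)
print("Decoded string:", tokenizer.decode(encoding.ids))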
def test_manual_reload(self):
    byte_level = ByteLevel()
    state = json.loads(byte_level.__getstate__())
    reloaded = ByteLevel(**state)
    assert isinstance(reloaded, ByteLevel)
def tokenizer_pipeline():
    tokenizer = Tokenizer(BPE())
    # string normalization
    tokenizer.normalizer = Sequence([NFD(), StripAccents(), Lowercase()])
    tokenizer.pre_tokenizer = ByteLevel()
    tokenizer.decoder = ByteLevelDecoder()
    return tokenizer


if __name__ == "__main__":
    # preparing corpus for wiki
    en_vocab_size = 50257
    wiki_txt = load_text_file_json('text/AA/wiki_00.json', 'text')
    write_text_file(wiki_txt, 'wiki-corpus.txt')

    corpus_files = {
        'wiki-corpus': 'wiki-corpus.txt',
        'oscar-corpus': 'shuff-dedup/ceb/ceb_dedup.txt'
    }

    # define a trainer for the tokenizer
    trainer = BpeTrainer(vocab_size=en_vocab_size,
                         show_progress=True,
                         initial_alphabet=ByteLevel.alphabet(),
                         special_tokens=['<|endoftext|>', '<pad>'])

    for corpus, path in corpus_files.items():
        tokenizer = tokenizer_pipeline()
        tokenizer.train([path], trainer)
        tokenizer.save(f'model/{corpus}-tokenizer.json')
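# Hedged follow-up sketch: reloading one of the tokenizers saved above and checking a
# round trip. The file name assumes the 'wiki-corpus' entry of corpus_files; the input
# string is placeholder text.
from tokenizers import Tokenizer

reloaded = Tokenizer.from_file('model/wiki-corpus-tokenizer.json')
enc = reloaded.encode('sample text to encode')
print(enc.tokens)
print(reloaded.decode(enc.ids))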