def test_instantiate(self, roberta_files):
    assert isinstance(WordLevel(), Model)
    assert isinstance(WordLevel(), WordLevel)

    # The WordLevel model expects a vocab.json using the same format as roberta
    # so we can just try to load with this file
    assert isinstance(WordLevel(roberta_files["vocab"]), Model)
    assert isinstance(WordLevel(roberta_files["vocab"]), WordLevel)
def get_recurrent_tokenizer(vocab, max_context_tokens, unk_token, pad_token, device="cpu"):
    """
    Return a tokenizer to be used with recurrent-based models
    """
    question_tokenizer = Tokenizer(WordLevel(vocab, unk_token=unk_token))
    question_tokenizer.normalizer = Sequence(
        [StripAccents(), Lowercase(), Strip()])
    question_tokenizer.pre_tokenizer = PreSequence(
        [Whitespace(), Punctuation()])
    question_tokenizer.enable_padding(
        direction="right", pad_id=vocab[pad_token],
        pad_type_id=1, pad_token=pad_token)

    context_tokenizer = Tokenizer(WordLevel(vocab, unk_token=unk_token))
    context_tokenizer.normalizer = Sequence(
        [StripAccents(), Lowercase(), Strip()])
    context_tokenizer.pre_tokenizer = PreSequence(
        [Whitespace(), Punctuation()])
    context_tokenizer.enable_padding(
        direction="right",
        pad_id=vocab[pad_token],
        pad_type_id=1,
        pad_token=pad_token,
    )
    context_tokenizer.enable_truncation(max_context_tokens)

    return RecurrentSquadTokenizer(question_tokenizer, context_tokenizer, device=device)
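# Usage sketch for the normalizer/pre-tokenizer setup above (not from the original
# source). It assumes "Sequence" comes from tokenizers.normalizers, that "PreSequence"
# is an alias for the pre-tokenizers' Sequence, and uses a toy vocab.
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.normalizers import Sequence, StripAccents, Lowercase, Strip
from tokenizers.pre_tokenizers import Sequence as PreSequence, Whitespace, Punctuation

toy_vocab = {"[UNK]": 0, "[PAD]": 1, "hello": 2, "world": 3}
tok = Tokenizer(WordLevel(toy_vocab, unk_token="[UNK]"))
tok.normalizer = Sequence([StripAccents(), Lowercase(), Strip()])
tok.pre_tokenizer = PreSequence([Whitespace(), Punctuation()])
tok.enable_padding(direction="right", pad_id=toy_vocab["[PAD]"], pad_token="[PAD]")

enc = tok.encode("Hello, World")
print(enc.tokens)  # punctuation and unseen words map to "[UNK]"
print(enc.ids)     # [2, 0, 3]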
def test_can_modify(self):
    model = WordLevel(unk_token="<oov>")
    assert model.unk_token == "<oov>"

    # Modify these
    model.unk_token = "<unk>"
    assert model.unk_token == "<unk>"
def __init__(
    self,
    vocab_file: Optional[str] = None,
    unk_token: Union[str, AddedToken] = "[UNK]",
    pad_token: Union[str, AddedToken] = "[PAD]",
    mask_token: Union[str, AddedToken] = "[MASK]",
    lowercase: bool = False,
    unicode_normalizer: Optional[str] = None,
):
    if vocab_file is not None:
        logging.info(f"Initiating tokenizer at {vocab_file}")
        tokenizer = Tokenizer(
            WordLevel(vocab=vocab_file, unk_token=unk_token))
    else:
        tokenizer = Tokenizer(WordLevel(unk_token=unk_token))

    # Let the tokenizer know about special tokens if they are part of the vocab
    if tokenizer.token_to_id(str(unk_token)) is not None:
        tokenizer.add_special_tokens([str(unk_token)])
    if tokenizer.token_to_id(str(pad_token)) is not None:
        tokenizer.add_special_tokens([str(pad_token)])
    if tokenizer.token_to_id(str(mask_token)) is not None:
        tokenizer.add_special_tokens([str(mask_token)])

    # Check for Unicode normalization first (before everything else)
    normalizers = []

    if unicode_normalizer:
        normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

    if lowercase:
        normalizers += [Lowercase()]

    # Create the normalizer structure
    if len(normalizers) > 0:
        if len(normalizers) > 1:
            tokenizer.normalizer = Sequence(normalizers)
        else:
            tokenizer.normalizer = normalizers[0]

    tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

    parameters = {
        "model": "WordLevel",
        "unk_token": unk_token,
        "pad_token": pad_token,
        "mask_token": mask_token,
        "lowercase": lowercase,
        "unicode_normalizer": unicode_normalizer,
    }

    super().__init__(tokenizer, parameters)
def __init__(
    self,
    vocab_file,
    delimiter,
    lowercase,
    unk_token,
    eos_token,
    add_eos=False,
    add_double_eos=False,
    normalization: Optional[str] = None,
):
    try:
        tokenizer = WordLevel(vocab_file, unk_token=unk_token)
        tokenizer = Tokenizer(tokenizer)
    except Exception:
        raise ValueError(
            "Unable to parse file {}. Unknown format. "
            "If you tried to load a model saved through TransfoXLTokenizer, "
            "please note they are not compatible.".format(vocab_file)
        )

    # Create the correct normalization path
    normalizer = []

    # Include unicode normalization
    if normalization:
        normalizer += [unicode_normalizer_from_str(normalization)]

    # Include case normalization
    if lowercase:
        normalizer += [Lowercase()]

    # Strip normalizer at the end
    normalizer += [Strip(left=True, right=True)]

    if len(normalizer) > 0:
        tokenizer.normalizer = Sequence(normalizer) if len(normalizer) > 1 else normalizer[0]

    # Setup the splitter
    tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter) if delimiter else WhitespaceSplit()

    if add_double_eos:
        tokenizer.post_processor = BertProcessing(
            (eos_token, tokenizer.token_to_id(eos_token)),
            (eos_token, tokenizer.token_to_id(eos_token)),
        )

    parameters = {
        "model": "TransfoXLModel",
        "add_eos": add_eos,
        "add_double_eos": add_double_eos,
        "unk_token": unk_token,
        "eos_token": eos_token,
        "delimiter": delimiter,
        "lowercase": lowercase,
    }

    super().__init__(tokenizer, parameters)
def __create_tokenizer(self, files):
    # Create, train and save the tokenizer.
    print("Preparing tokenizer...")
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = WhitespaceSplit()
    trainer = WordLevelTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    tokenizer.train(files=files, trainer=trainer)
    return tokenizer
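# Not from the original source: a minimal sketch showing the same WordLevel setup
# trained from an in-memory iterator instead of files. The sample sentences are made up.
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.trainers import WordLevelTrainer

tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = WhitespaceSplit()
trainer = WordLevelTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

corpus = ["the cat sat on the mat", "the dog sat on the log"]
tokenizer.train_from_iterator(corpus, trainer=trainer)

print(tokenizer.encode("the cat barked").tokens)  # "barked" becomes "[UNK]"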
def get_model(name: str):
    if name == "wordpiece":
        return WordPiece(unk_token=UNK_TOKEN)
    elif name == "bpe":
        return BPE(unk_token=UNK_TOKEN)
    elif name == "unigram":
        return Unigram()
    elif name == "word":
        return WordLevel(unk_token=UNK_TOKEN)
    else:
        raise AssertionError(f"{name} model type is not supported.")
def test_works_in_simple_pipeline(self):
    pretok = self.dict.pre_tokenizer()
    vocab = {
        "[UNK]": 0,
        "京都": 1,
        "に": 2,
        "行く": 3,
    }
    tok = tokenizers.Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
    tok.pre_tokenizer = pretok
    res = tok.encode("京都へ行く")
    self.assertEqual(res.ids, [1, 0, 3])
def test_with_handler(self):
    def _handler(index, sentence: tokenizers.NormalizedString, ml: MorphemeList):
        return [tokenizers.NormalizedString(ml[0].part_of_speech()[0]),
                tokenizers.NormalizedString(str(len(ml)))]

    pretok = self.dict.pre_tokenizer(sudachipy.SplitMode.A, handler=_handler)
    vocab = {
        "[UNK]": 0,
        "名詞": 6,
        "4": 7,
    }
    tok = tokenizers.Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
    tok.pre_tokenizer = pretok
    res = tok.encode("外国人参政権")
    self.assertEqual(res.ids, [6, 7])
def test_works_with_different_split_mode(self):
    pretok = self.dict.pre_tokenizer(sudachipy.SplitMode.A)
    vocab = {
        "[UNK]": 0,
        "外国": 1,
        "参政": 2,
        "権": 3,
        "人": 5,
        "外国人参政権": 4,
    }
    tok = tokenizers.Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
    tok.pre_tokenizer = pretok
    res = tok.encode("外国人参政権")
    self.assertEqual(res.ids, [1, 5, 2, 3])
def __init__(
    self,
    vocab_file,
    delimiter,
    lowercase,
    unk_token,
    eos_token,
    add_eos=False,
    add_double_eos=False,
    normalization: Optional[str] = None,
):
    tokenizer = WordLevel.from_files(vocab_file, unk_token=unk_token)
    tokenizer = Tokenizer(tokenizer)

    # Create the correct normalization path
    normalizer = []

    # Include unicode normalization
    if normalization:
        normalizer += [unicode_normalizer_from_str(normalization)]

    # Include case normalization
    if lowercase:
        normalizer += [Lowercase()]

    if len(normalizer) > 0:
        tokenizer.normalizer = Sequence(
            normalizer) if len(normalizer) > 1 else normalizer[0]

    # Setup the splitter
    tokenizer.pre_tokenizer = CharDelimiterSplit(
        delimiter) if delimiter else WhitespaceSplit()

    if add_double_eos:
        tokenizer.post_processor = BertProcessing(
            (eos_token, tokenizer.token_to_id(eos_token)),
            (eos_token, tokenizer.token_to_id(eos_token)))

    parameters = {
        "model": "TransfoXLModel",
        "add_eos": add_eos,
        "add_double_eos": add_double_eos,
        "unk_token": unk_token,
        "eos_token": eos_token,
        "delimiter": delimiter,
        "lowercase": lowercase,
    }

    super().__init__(tokenizer, parameters)
def test_instantiate(self, roberta_files):
    assert isinstance(WordLevel(), Model)
    assert isinstance(WordLevel(), WordLevel)

    vocab = {"a": 0, "b": 1, "ab": 2}
    assert isinstance(WordLevel(vocab), Model)
    assert isinstance(WordLevel(vocab), WordLevel)
    assert isinstance(WordLevel.from_file(roberta_files["vocab"]), WordLevel)

    # The WordLevel model expects a vocab.json using the same format as roberta
    # so we can just try to load with this file
    with pytest.deprecated_call():
        assert isinstance(WordLevel(roberta_files["vocab"]), Model)
    with pytest.deprecated_call():
        assert isinstance(WordLevel(roberta_files["vocab"]), WordLevel)
def main(args):
    # copy from https://github.com/xinjli/allosaurus
    ipa0 = [
        'I', 'a', 'aː', 'ã', 'ă', 'b', 'bʲ', 'bʲj', 'bʷ', 'bʼ', 'bː', 'b̞', 'b̤', 'b̥',
        'c', 'd', 'dʒ', 'dʲ', 'dː', 'd̚', 'd̥', 'd̪', 'd̯', 'd͡z', 'd͡ʑ', 'd͡ʒ', 'd͡ʒː',
        'd͡ʒ̤', 'e', 'eː', 'e̞', 'f', 'fʲ', 'fʷ', 'fː', 'g', 'gʲ', 'gʲj', 'gʷ', 'gː',
        'h', 'hʷ', 'i', 'ij', 'iː', 'i̞', 'i̥', 'i̯', 'j', 'k', 'kx', 'kʰ', 'kʲ', 'kʲj',
        'kʷ', 'kʷʼ', 'kʼ', 'kː', 'k̟ʲ', 'k̟̚', 'k͡p̚', 'l', 'lʲ', 'lː', 'l̪', 'm', 'mʲ',
        'mʲj', 'mʷ', 'mː', 'n', 'nj', 'nʲ', 'nː', 'n̪', 'n̺', 'o', 'oː', 'o̞', 'o̥',
        'p', 'pf', 'pʰ', 'pʲ', 'pʲj', 'pʷ', 'pʷʼ', 'pʼ', 'pː', 'p̚', 'q', 'r', 'rː',
        's', 'sʲ', 'sʼ', 'sː', 's̪', 't', 'ts', 'tsʰ', 'tɕ', 'tɕʰ', 'tʂ', 'tʂʰ', 'tʃ',
        'tʰ', 'tʲ', 'tʷʼ', 'tʼ', 'tː', 't̚', 't̪', 't̪ʰ', 't̪̚', 't͡s', 't͡sʼ', 't͡ɕ',
        't͡ɬ', 't͡ʃ', 't͡ʃʲ', 't͡ʃʼ', 't͡ʃː', 'u', 'uə', 'uː', 'u͡w', 'v', 'vʲ', 'vʷ',
        'vː', 'v̞', 'v̞ʲ', 'w', 'x', 'x̟ʲ', 'y', 'z', 'zj', 'zʲ', 'z̪', 'ä', 'æ', 'ç',
        'çj', 'ð', 'ø', 'ŋ', 'ŋ̟', 'ŋ͡m', 'œ', 'œ̃', 'ɐ', 'ɐ̞', 'ɑ', 'ɑ̱', 'ɒ', 'ɓ',
        'ɔ', 'ɔ̃', 'ɕ', 'ɕː', 'ɖ̤', 'ɗ', 'ə', 'ɛ', 'ɛ̃', 'ɟ', 'ɡ', 'ɡʲ', 'ɡ̤', 'ɡ̥',
        'ɣ', 'ɣj', 'ɤ', 'ɤɐ̞', 'ɤ̆', 'ɥ', 'ɦ', 'ɨ', 'ɪ', 'ɫ', 'ɯ', 'ɯ̟', 'ɯ̥', 'ɰ',
        'ɱ', 'ɲ', 'ɳ', 'ɴ', 'ɵ', 'ɸ', 'ɹ', 'ɹ̩', 'ɻ', 'ɻ̩', 'ɽ', 'ɾ', 'ɾj', 'ɾʲ',
        'ɾ̠', 'ʀ', 'ʁ', 'ʁ̝', 'ʂ', 'ʃ', 'ʃʲː', 'ʃ͡ɣ', 'ʈ', 'ʉ̞', 'ʊ', 'ʋ', 'ʋʲ', 'ʌ',
        'ʎ', 'ʏ', 'ʐ', 'ʑ', 'ʒ', 'ʒ͡ɣ', 'ʔ', 'ʝ', 'ː', 'β', 'β̞', 'θ', 'χ', 'ә', 'ḁ'
    ]

    ipa1, ipa2, ipa3 = ipa0.copy(), ipa0.copy(), ipa0.copy()
    random.shuffle(ipa1)
    random.shuffle(ipa2)
    random.shuffle(ipa3)

    # randomly joined to form training data
    passage0 = ' '.join(ipa0)
    passage1 = ' '.join(ipa1)
    passage2 = ' '.join(ipa2)
    passage3 = ' '.join(ipa3)
    data = [passage0, passage1, passage2, passage3]

    # setup
    tokenizer = Tokenizer(WordLevel(unk_token="<unk>"))
    # trainer = WordLevelTrainer(vocab_size=300, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    trainer = WordLevelTrainer(
        vocab_size=300,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
    tokenizer.pre_tokenizer = Whitespace()

    # train the tokenizer
    tokenizer.train_from_iterator(data, trainer=trainer)
    tokenizer.save(args.outdir + '/ipa_tokenizer.json')
def __init__(
    self,
    vocab_file,
    sep_token="<sep>",
    cls_token="<cls>",
    pad_token="<pad>",
    mask_token="<mask>",
    unk_token="<unk>",  # referenced below; assumed default, missing from the original signature
    lowercase: bool = True,
):
    tokenizer = Tokenizer(WordLevel(vocab_file, unk_token=unk_token))
    tokenizer.normalizer = Strip()
    tokenizer.pre_tokenizer = CharDelimiterSplit(" ")

    tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)

    # Let the tokenizer know about special tokens if they are part of the vocab
    if tokenizer.token_to_id(str(unk_token)) is not None:
        tokenizer.add_special_tokens([str(unk_token)])
    if tokenizer.token_to_id(str(sep_token)) is not None:
        tokenizer.add_special_tokens([str(sep_token)])
    if tokenizer.token_to_id(str(cls_token)) is not None:
        tokenizer.add_special_tokens([str(cls_token)])
    if tokenizer.token_to_id(str(pad_token)) is not None:
        tokenizer.add_special_tokens([str(pad_token)])
    if tokenizer.token_to_id(str(mask_token)) is not None:
        tokenizer.add_special_tokens([str(mask_token)])

    parameters = {
        "model": "WordLevel",
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "pad_token": pad_token,
        "mask_token": mask_token,
        "lowercase": lowercase,
    }

    super().__init__(tokenizer, parameters)
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import WordLevel

VOCAB_FILE = "data/tx1_vocab.txt"

with open(VOCAB_FILE, "r") as f:
    words = list(set(f.read().strip().split("\n")))

vocab = {}
for i, word in enumerate(["<pad>", "<unk>"] + words):
    vocab[word] = i

tokenizer = Tokenizer(WordLevel(vocab, unk_token="<unk>"))
tokenizer.enable_padding(pad_token="<pad>")
tokenizer.pre_tokenizer = Whitespace()
tokenizer.save("data/tokenizer-LakhNES-tx1.json")
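# Not from the original source: reload the tokenizer saved above and encode a small
# batch. The input strings are placeholders, since the real tokens depend on what
# data/tx1_vocab.txt contains.
reloaded = Tokenizer.from_file("data/tokenizer-LakhNES-tx1.json")
encodings = reloaded.encode_batch(["<unk> <unk>", "<unk>"])
for enc in encodings:
    print(enc.ids, enc.attention_mask)  # shorter items are right-padded with "<pad>" (id 0)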
import json

data_path = Path('/workspace/poetry2021.gt/data/pan_tadeusz5')
dataset_path = data_path / 'dataset'
vocab_path = data_path / 'vocab.json'
tokenizer_tmp_path = data_path / 'tokenizer_tmp'
tokenizer_path = data_path / 'tokenizer'

text_tokenizer = TextTokenizer(dataset_path)
text_tokenizer.load_vocab(vocab_path)
vocab = text_tokenizer.vocab
vocab_count = len(vocab.keys())
vocab.update({'<|endoftext|>': vocab_count})

tokenizer_tmp = Tokenizer(WordLevel(text_tokenizer.vocab))
tokenizer_tmp.pre_tokenizer = CharDelimiterSplit(' ')
tokenizer_tmp.post_processor = BertProcessing(
    ("<|endoftext|>", tokenizer_tmp.token_to_id("<|endoftext|>")),
    ("<|endoftext|>", tokenizer_tmp.token_to_id("<|endoftext|>")),
)
tokenizer_tmp_path.mkdir(parents=True, exist_ok=True)
tokenizer_tmp.save(str(tokenizer_tmp_path / "tokenizer.json"))


# Re-create as GPT2 compatible tokenizer
class GPT2CompatibleTokenizer(PreTrainedTokenizerFast):
    def save_vocabulary(self,
"CHEF_CHECK": 6, "CHEF_DO": 7, "MOVE_CONTENTS": 8, } k = len(output_vocab) with open("../data/res2idx.json", 'r') as f: for w, i in json.load(f).items(): output_vocab[w] = k k += 1 with open("../data/arg2idx.json", 'r') as f: for w, i in json.load(f).items(): output_vocab[w.replace('-', '_')] = k k += 1 output_vocab = {w: i for i, w in enumerate(output_vocab)} output_tokenizer = Tokenizer(WordLevel(output_vocab, )) output_tokenizer.pre_tokenizer = Whitespace() t = output_tokenizer.encode_batch( ["SERVE MOVE_CONTENTS", "SERVE MOVE_CONTENTS PUT"]) # print (t) csv_file = '../data/seq2seq_4335716.csv' input_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') input_tokenizer.bos_token = input_tokenizer.cls_token input_tokenizer.eos_token = input_tokenizer.sep_token val_data = load_dataset('csv', data_files=csv_file, split='train[90%:]') train_data = load_dataset('csv', data_files=csv_file, split='train[:90%]') # print(val_data) # print(train_data)
from tokenizers import Tokenizer, normalizers
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer
from tokenizers.processors import TemplateProcessing

t = Tokenizer(WordLevel(unk_token="[UNK]"))
t.pre_tokenizer = Whitespace()
trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]"])

t.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    # pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 2),
        ("[SEP]", 3),
    ])

files = ['tok-train-shuf-tgt.tsv']
t.train(files, trainer)
t.save("code_tokenizer.json")
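# Not from the original source: a quick check of the template above. The trainer
# assigns ids to special tokens in the order given ([UNK]=0, [PAD]=1, [CLS]=2,
# [SEP]=3), which is why TemplateProcessing hard-codes 2 and 3.
enc = t.encode("some target tokens")  # hypothetical input; real data comes from the .tsv file
print(enc.tokens)                     # ['[CLS]', ..., '[SEP]']
print(enc.ids[0], enc.ids[-1])        # 2 ... 3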
def train_tokenizer_vocab(dataset, style='BPE', force_retrain=True):
    """
    if force_retrain: overwrite the stored tokenizer from tokenizers dir (by retraining)
    else: load the tokenizer if it exists
    """
    assert dataset in VALID_DATASETS
    assert style in VALID_TOKENIZATIONS

    tpath_expected = default_tpath(dataset, style)

    train = True
    if not force_retrain and os.path.isfile(tpath_expected):
        tokenizer = Tokenizer.from_file(tpath_expected)
        train = False
    else:
        print('%s tokenizer file does not exist; training new tokenizer' % tpath_expected)

    if train:
        # load data associated with one of the valid datasets (from /data/ directory)
        datafiles = load_dataset(dataset)

        # Steps for each algo (e.g. BPE):
        # - init Tokenizer using algo
        # - specify algo specific trainer
        # - specify any pre-processing of text (will affect decoding)
        #   see: https://huggingface.co/docs/tokenizers/python/latest/components.html#decoders
        # - different training calls if it's the arxiv dataset or wikitext
        #   see https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
        if style == 'BPE':
            tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
            trainer = BpeTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            tokenizer.pre_tokenizer = ByteLevel()
            if dataset == 'arxiv':
                tokenizer.train_from_iterator(datafiles, trainer=trainer)
            else:
                tokenizer.train(datafiles, trainer=trainer)
            tokenizer.decoder = decoders.ByteLevel()

        else:
            assert style == 'WordLevel'
            tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
            trainer = WordLevelTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            tokenizer.pre_tokenizer = Whitespace()
            if dataset == 'arxiv':
                tokenizer.train_from_iterator(datafiles, trainer=trainer)
            else:
                tokenizer.train(datafiles, trainer=trainer)
            tokenizer.decoder = decoders.WordPiece()  # WordPiece seems to work (adds back spaces)

        # Save to tokenizers directory
        tokenizer.save(tpath_expected)

    # Generate vocab object based on tokenizer.decoder() method
    # ... TODO implement the same vocabulary functionality, or ensure it is present in Tokenizer and then code it elsewhere...
    # Features we need to match:
    #   from torchtext.legacy.vocab import Vocab as RetiredVocab
    #   ntokens = len(vocab.stoi) ---> ntokens = tokenizer.(...)
    #   data = [torch.tensor([vocab[token] for token in tokenizer(item)],
    #                        dtype=torch.long) for item in raw_text_iter]
    #   tokenized_text_ints = torch.tensor([vocab[token] for token in tokenized_text], dtype=torch.long)
    #   running_context_string = ' '.join([vocab.itos[src[k]] for k in range(src.shape[0])])
    #   unk_index = vocab.unk_index
    vocab = None

    return tokenizer, vocab
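# Not from the original source: a hypothetical call, assuming 'wikitext' is one of
# the VALID_DATASETS defined elsewhere in this module.
tokenizer, _ = train_tokenizer_vocab('wikitext', style='WordLevel', force_retrain=False)
enc = tokenizer.encode("the quick brown fox")
print(enc.tokens, enc.ids)
print(tokenizer.decode(enc.ids))  # the WordPiece decoder re-inserts spaces between tokens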
def __init__(
    self,
    vocab_file: Optional[str] = None,
    unk_token: Union[str, AddedToken] = "<unk>",
    sep_token: Union[str, AddedToken] = "<sep>",
    cls_token: Union[str, AddedToken] = "<cls>",
    pad_token: Union[str, AddedToken] = "<pad>",
    mask_token: Union[str, AddedToken] = "<mask>",
    lowercase: bool = False,
    unicode_normalizer: Optional[str] = None,
):
    if vocab_file is not None:
        tokenizer = Tokenizer(WordLevel(vocab_file))
    else:
        tokenizer = Tokenizer(WordLevel())

    # Let the tokenizer know about special tokens if they are part of the vocab
    if tokenizer.token_to_id(str(unk_token)) is not None:
        tokenizer.add_special_tokens([str(unk_token)])
    if tokenizer.token_to_id(str(sep_token)) is not None:
        tokenizer.add_special_tokens([str(sep_token)])
    if tokenizer.token_to_id(str(cls_token)) is not None:
        tokenizer.add_special_tokens([str(cls_token)])
    if tokenizer.token_to_id(str(pad_token)) is not None:
        tokenizer.add_special_tokens([str(pad_token)])
    if tokenizer.token_to_id(str(mask_token)) is not None:
        tokenizer.add_special_tokens([str(mask_token)])

    # Check for Unicode normalization first (before everything else)
    normalizers = []

    if unicode_normalizer:
        normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

    if lowercase:
        normalizers += [Lowercase()]

    # Create the normalizer structure
    if len(normalizers) > 0:
        if len(normalizers) > 1:
            tokenizer.normalizer = Sequence(normalizers)
        else:
            tokenizer.normalizer = normalizers[0]

    tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

    if vocab_file is not None:
        sep_token_id = tokenizer.token_to_id(str(sep_token))
        if sep_token_id is None:
            raise TypeError("sep_token not found in the vocabulary")
        cls_token_id = tokenizer.token_to_id(str(cls_token))
        if cls_token_id is None:
            raise TypeError("cls_token not found in the vocabulary")

        tokenizer.post_processor = processors.BertProcessing(
            (str(sep_token), sep_token_id), (str(cls_token), cls_token_id))

    parameters = {
        "model": "WordLevel",
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "pad_token": pad_token,
        "mask_token": mask_token,
        "lowercase": lowercase,
        "unicode_normalizer": unicode_normalizer,
    }

    super().__init__(tokenizer, parameters)
def train_custom_tokenizer(dataset, token_model, tknzr_file, vocab_size,
                           vocab=None, pretrain_fast=False,
                           max_input_chars_per_word=None, eos_token=None,
                           bos_token=None, pad_token=None, mask_token=None,
                           unk_token=None):
    """
    Building a Tokenizer using the HuggingFace library. The pipeline seems to be:

        - Model : algorithm that tokenizes, it is a mandatory component. There are
          only 4 models implemented (BPE, Unigram, WordLevel, WordPiece)
        - Normalizer : some preprocessing that could happen before, but doesn't
          necessarily have to
        - Pre-Tokenizer : splitting the input according to some rules
        - Post-Processing : needing to add some tokens/input after (mostly seems
          to be eos, bos tokens)
        - Decoder : certain previous pipeline steps need to be reversed for proper
          decoding
        - Trainer : the corresponding training algorithm for the model

    Note : some pre-processing might need to happen beforehand in previous
    functions (might be easier using pandas before)

    Input
        token_model (str)        : algorithm to use for tokenization
        dataset (class)          : a python iterator that goes through the data to be used for training
        token_dir (str)          : directory with tokenizers
        vocab_size (int)         : size of the vocabulary to use
        tokenFilename (str)      : filename of the particular tokenizer we want to train.
                                   Will overwrite previously saved files.
        vocab (list of str)      : models other than BPE can use a non-mandatory vocab as input
        max_input_chars_per_word : used for WordPiece

    Output
        tokenizer : HuggingFace Tokenizer object, our fully trained tokenizer
    """
    special_token_lst = [
        pad_token, bos_token, eos_token, mask_token, unk_token
    ]

    # NFKC
    normalizer_lst = []
    pre_tokenizer_lst = [Whitespace, ByteLevel]
    decoder_lst = []

    bos_idx = special_token_lst.index(bos_token)
    eos_idx = special_token_lst.index(eos_token)

    if token_model == 'BPE':
        model = BPE(unk_token=unk_token)
        Trainer = BpeTrainer
    elif token_model == 'Unigram':
        model = Unigram(vocab=vocab)
        Trainer = UnigramTrainer
    elif token_model == 'WordLevel':
        model = WordLevel(unk_token=unk_token, vocab=vocab)
        Trainer = WordLevelTrainer
    elif token_model == 'WordPiece':
        model = WordPiece(unk_token=unk_token,
                          vocab=vocab,
                          max_input_chars_per_word=max_input_chars_per_word)
        Trainer = WordPieceTrainer
    else:
        error_msg = f'Error: token_model ({token_model}) not an algorithm in %s' \
            % VALID_TOKENIZATIONS
        raise SystemExit(error_msg)

    # instantiation
    tokenizer = Tokenizer(model)

    # Select a tokenization trainer
    if vocab_size is None:
        trainer = Trainer(show_progress=True, special_tokens=special_token_lst)
    else:
        trainer = Trainer(vocab_size=vocab_size,
                          show_progress=True,
                          special_tokens=special_token_lst)

    # Set the normalizer
    tokenizer.normalizer = normalizers.Sequence(
        [fcn() for fcn in normalizer_lst])

    # Set the pre-tokenizer
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [fcn() for fcn in pre_tokenizer_lst])

    # Set the post-processing
    tokenizer.post_processor = processors.TemplateProcessing(
        single=bos_token + " $A " + eos_token,
        special_tokens=[(bos_token, bos_idx), (eos_token, eos_idx)],
        # pair=bos_token + " $A " + eos_token + " $B:1 " + eos_token + ":1",
    )

    # Set the decoder
    if ByteLevel in pre_tokenizer_lst:
        tokenizer.decoder = decoders.ByteLevel()
    if Metaspace in pre_tokenizer_lst:
        tokenizer.decoder = decoders.Metaspace()
    if token_model == 'WordPiece':
        tokenizer.decoder = decoders.WordPiece()

    # creating iterator
    def batch_iterator():
        for i in np.arange(0, len(dataset)):
            yield dataset[i]

    # train call
    tokenizer.train_from_iterator(trainer=trainer,
                                  iterator=batch_iterator(),
                                  length=len(dataset))

    if Path(tknzr_file).exists():
        print(f"Warning: overwriting previously saved tokenizer with "
              f"same filename ({tknzr_file}).")
    tokenizer.save(tknzr_file)

    if pretrain_fast:
        tokenizer = PreTrainedTokenizerFast(tokenizer_file=tknzr_file)
    else:
        tokenizer = PreTrainedTokenizer(tokenizer_file=tknzr_file)
    tokenizer.pad_token = pad_token
    tokenizer.mask_token = mask_token

    return tokenizer
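# Not from the original source: a hypothetical end-to-end call with a toy in-memory
# dataset and made-up file name/special tokens.
toy_dataset = ["the cat sat on the mat", "the dog chased the cat"]
tknzr = train_custom_tokenizer(
    dataset=toy_dataset,
    token_model='WordLevel',
    tknzr_file='toy_wordlevel_tokenizer.json',
    vocab_size=None,
    pad_token='<pad>', bos_token='<bos>', eos_token='<eos>',
    mask_token='<mask>', unk_token='<unk>',
    pretrain_fast=True,
)
print(tknzr.encode("the cat"))  # ids wrapped with <bos>/<eos> by the template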
    pre_tokenizer = Whitespace()
    tokenized_texts = [[w for w, _ in pre_tokenizer.pre_tokenize_str(t)] for t in texts]

    c = Counter()
    for text in tokenized_texts:
        c.update(text)

    token2id = {
        word: i + 1
        for i, (word, count) in enumerate(c.most_common(max_vocab_size))
    }

    # usually, UNK is assigned index 0 or 1
    token2id[unk_token] = 0

    tokenizer = tokenizers.Tokenizer(WordLevel(token2id, unk_token))
    tokenizer.pre_tokenizer = pre_tokenizer
    return tokenizer


def accuracy(probs, targets):
    """Computes accuracy given predicted probabilities and expected labels.

    Args:
        probs: torch.FloatTensor[batch_size, 1], probabilities of a positive class
        targets: torch.LongTensor[batch_size, 1], true classes

    Returns:
        0 <= float <= 1, proportion of correct predictions
    """
    predictions = (probs >= 0.5).flatten()
from pathlib import Path

from tokenizers import Tokenizer
from tokenizers.models import BPE, WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import BertProcessing, TemplateProcessing
from tokenizers import trainers
from transformers import BertForMaskedLM
from transformers import BertTokenizerFast
from transformers import BertConfig
import ipdb
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "2"

uid_task_id_sequence_path = 'data/feature_sequence/uid_task_id.txt'
paths = [str(x) for x in Path(".").glob('data/feature_sequence/*.txt')]

tokenizer = Tokenizer(WordLevel())
tokenizer.pre_tokenizer = Whitespace()

# trainer = trainers.BpeTrainer(
trainer = trainers.WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.train(trainer, [uid_task_id_sequence_path])

tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)
# tokenizer.save_model("tmp")
def __init__(
    self,
    target_vocab,
):
    special_tokens = {
        "pad_token": "[PAD]",
        "unk_token": "[UNK]",
        "sep_token": "[SEP]",
        "cls_token": "[CLS]",
        "mask_token": "[MASK]",
    }

    vocab = {}
    vocab[special_tokens["pad_token"]] = 0

    tkn_idx = 1
    unused_ctr = 0

    # not sure whether that's relevant, but fill 1..99 and 105...999
    # with unused tokens to keep BERT's tokenizer style
    # as a result, one can easily identify special tokens:
    # 0 is padding
    # 1xx are other special tokens
    # any four-digit tokens are actual payload
    fill_tokens = False

    if fill_tokens:
        while tkn_idx < 100:
            vocab[f"[unused{unused_ctr}]"] = tkn_idx
            tkn_idx += 1
            unused_ctr += 1

    for token in ["unk_token", "cls_token", "sep_token", "mask_token"]:
        vocab[special_tokens[token]] = tkn_idx
        tkn_idx += 1

    if fill_tokens:
        while tkn_idx < 1000:
            vocab[f"[unused{unused_ctr}]"] = tkn_idx
            tkn_idx += 1
            unused_ctr += 1

    for word in target_vocab:
        vocab[word] = tkn_idx
        tkn_idx += 1

    tokenizer = Tokenizer(WordLevel(vocab=vocab, unk_token=special_tokens["unk_token"]))
    tokenizer.add_special_tokens(list(special_tokens.values()))
    tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

    sep_token_id = tokenizer.token_to_id(special_tokens["sep_token"])
    cls_token_id = tokenizer.token_to_id(special_tokens["cls_token"])
    tokenizer.post_processor = processors.BertProcessing(
        (special_tokens["sep_token"], sep_token_id),
        (special_tokens["cls_token"], cls_token_id),
    )

    parameters = special_tokens
    parameters["model"] = "WordLevel"
    super().__init__(tokenizer, parameters)

    tokenizer.save(PRETRAINED_TOKENIZER_FILE)
train_csv_df.to_csv(config['train_csv'], index=False, header=True)

# Labelled test CSV file
print("Save labelled csv for inference ", config['test_csv'])
test_csv_df.to_csv(config['test_csv'], index=False, header=True)

print("Setup tokenizers...")
unknown_word = 'unknown_word'
full_set = set(list(count_vector.vocabulary_.keys()) + list(word_list.keys()))
# full_set = set(list(count_vector.vocabulary_.keys()))
print("Number of words : (This has to be in config)", len(full_set) + 2)

vocab = {
    w: i
    for i, w in enumerate([unknown_word, 'dumb_token'] + list(full_set))
}

tokenizer = tokenizers.Tokenizer(WordLevel(vocab, unknown_word))
tokenizer.pre_tokenizer = Whitespace()
print("Use padding length ", config['padding_length'])
tokenizer.enable_padding(length=int(config['padding_length']))

# Save tokenizer
recompute = False
if recompute:
    print("Save tokenizer ", config['token_config'])
    tokenizer.save(config['token_config'])

tokenizer = tokenizers.Tokenizer.from_file(config['token_config'])