def converted(self) -> Tokenizer:
    ot = self.original_tokenizer
    vocab = ot.encoder
    merges = list(ot.bpe_ranks.keys())

    tokenizer = Tokenizer(
        BPE(
            vocab=vocab,
            merges=merges,
            dropout=None,
            continuing_subword_prefix="",
            end_of_word_suffix="",
            fuse_unk=False,
        ))

    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=ot.add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.TemplateProcessing(
        single="[CLS]:0 $A:0 [SEP]:0",
        pair="[CLS]:0 $A:0 [SEP]:0 $B:0 [SEP]:0",
        special_tokens=[
            ("[CLS]", self.original_tokenizer.convert_tokens_to_ids("[CLS]")),
            ("[SEP]", self.original_tokenizer.convert_tokens_to_ids("[SEP]")),
        ],
    )
    return tokenizer
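# Hedged standalone sketch (not part of the converter above): shows the effect of the same
# [CLS]/[SEP] TemplateProcessing on a tiny byte-level BPE tokenizer trained from scratch.
# The corpus and vocab size are invented for illustration only.
from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers

toy = Tokenizer(models.BPE())
toy.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
toy.decoder = decoders.ByteLevel()
toy.train_from_iterator(
    ["hello world", "hello there"],
    trainer=trainers.BpeTrainer(vocab_size=300, special_tokens=["[CLS]", "[SEP]"]))
toy.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:0 [SEP]:0",
    special_tokens=[("[CLS]", toy.token_to_id("[CLS]")),
                    ("[SEP]", toy.token_to_id("[SEP]"))],
)
enc = toy.encode("hello world")
print(enc.tokens)    # encoding is wrapped in [CLS] ... [SEP]
print(enc.type_ids)  # all zeros, as every piece in the template is ":0"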
def post_processor(self):
    eos = self.original_tokenizer.eos_token
    special_tokens = [
        (eos, self.original_tokenizer.eos_token_id),
    ]
    return processors.TemplateProcessing(single=["$A", eos],
                                         pair=["$A", "$B", eos],
                                         special_tokens=special_tokens)
def post_processor(self):
    return processors.TemplateProcessing(
        single=["$A", "</s>"],
        pair=["$A", "</s>", "$B", "</s>"],
        special_tokens=[
            ("</s>", self.original_tokenizer.convert_tokens_to_ids("</s>")),
        ],
    )
def post_processor(self):
    return processors.TemplateProcessing(
        single="$A:0 <sep>:0 <cls>:2",
        pair="$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2",
        special_tokens=[
            ("<sep>", self.original_tokenizer.convert_tokens_to_ids("<sep>")),
            ("<cls>", self.original_tokenizer.convert_tokens_to_ids("<cls>")),
        ],
    )
def post_processor(self):
    return processors.TemplateProcessing(
        single="[CLS]:0 $A:0 [SEP]:0",
        pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", self.original_tokenizer.convert_tokens_to_ids("[CLS]")),
            ("[SEP]", self.original_tokenizer.convert_tokens_to_ids("[SEP]")),
        ],
    )
def post_processor(self):
    return processors.TemplateProcessing(
        single="<s> $A </s>",
        pair="<s> $A </s> </s> $B </s>",
        special_tokens=[
            ("<s>", self.original_tokenizer.convert_tokens_to_ids("<s>")),
            ("</s>", self.original_tokenizer.convert_tokens_to_ids("</s>")),
        ],
    )
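# Hedged illustration of the string templates used by the post_processor variants above,
# applied to a toy word-level tokenizer; the vocabulary and ids below are invented for the example.
from tokenizers import Tokenizer, models, pre_tokenizers, processors

demo_vocab = {"<s>": 0, "</s>": 1, "<unk>": 2, "hello": 3, "world": 4, "again": 5}
demo = Tokenizer(models.WordLevel(demo_vocab, unk_token="<unk>"))
demo.pre_tokenizer = pre_tokenizers.Whitespace()
demo.post_processor = processors.TemplateProcessing(
    single="<s> $A </s>",
    pair="<s> $A </s> </s> $B </s>",
    special_tokens=[("<s>", demo_vocab["<s>"]), ("</s>", demo_vocab["</s>"])],
)
enc = demo.encode("hello world", "again")
print(enc.tokens)    # ['<s>', 'hello', 'world', '</s>', '</s>', 'again', '</s>']
print(enc.type_ids)  # all zeros here; pieces without an explicit ':n' default to type_id 0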
def set_tgt_lang_special_tokens(self, lang):
    self.cur_lang_code = self.convert_tokens_to_ids(lang)
    self.prefix_tokens = []
    self.suffix_tokens = [self.EOS, self.cur_lang_code]

    prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
    suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)

    self._tokenizer.post_processor = processors.TemplateProcessing(
        single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
        pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
        special_tokens=list(
            zip(prefix_tokens_str + suffix_tokens_str,
                self.prefix_tokens + self.suffix_tokens)),
    )
def set_tgt_lang_special_tokens(self, lang: str) -> None:
    """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
    self.cur_lang_code = self.convert_tokens_to_ids(lang)
    self.prefix_tokens = []
    self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]

    prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
    suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)

    self._tokenizer.post_processor = processors.TemplateProcessing(
        single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
        pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
        special_tokens=list(
            zip(prefix_tokens_str + suffix_tokens_str,
                self.prefix_tokens + self.suffix_tokens)),
    )
def set_src_lang_special_tokens(self, src_lang) -> None:
    """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, cur_lang_code]."""
    self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
    self.prefix_tokens = []
    self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]

    prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
    suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)

    self._tokenizer.post_processor = processors.TemplateProcessing(
        single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
        pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
        special_tokens=list(
            zip(prefix_tokens_str + suffix_tokens_str,
                self.prefix_tokens + self.suffix_tokens)),
    )
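# Hedged sketch of what the list-form template above evaluates to. The "</s>" / "ro_RO"
# strings and the ids 2 / 250020 are placeholders, not values from any real checkpoint.
from tokenizers import processors

prefix_tokens_str, prefix_token_ids = [], []
suffix_tokens_str, suffix_token_ids = ["</s>", "ro_RO"], [2, 250020]

post_processor = processors.TemplateProcessing(
    single=prefix_tokens_str + ["$A"] + suffix_tokens_str,      # ["$A", "</s>", "ro_RO"]
    pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,  # ["$A", "$B", "</s>", "ro_RO"]
    special_tokens=list(
        zip(prefix_tokens_str + suffix_tokens_str,
            prefix_token_ids + suffix_token_ids)),
)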
def converted(self) -> Tokenizer:
    vocab = self.original_tokenizer.vocab
    tokenizer = Tokenizer(
        WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))

    # # Let the tokenizer know about special tokens if they are part of the vocab
    # if tokenizer.token_to_id(str(self.original_tokenizer.unk_token)) is not None:
    #     tokenizer.add_special_tokens([str(self.original_tokenizer.unk_token)])
    # if tokenizer.token_to_id(str(self.original_tokenizer.sep_token)) is not None:
    #     tokenizer.add_special_tokens([str(self.original_tokenizer.sep_token)])
    # if tokenizer.token_to_id(str(self.original_tokenizer.cls_token)) is not None:
    #     tokenizer.add_special_tokens([str(self.original_tokenizer.cls_token)])
    # if tokenizer.token_to_id(str(self.original_tokenizer.pad_token)) is not None:
    #     tokenizer.add_special_tokens([str(self.original_tokenizer.pad_token)])
    # if tokenizer.token_to_id(str(self.original_tokenizer.mask_token)) is not None:
    #     tokenizer.add_special_tokens([str(self.original_tokenizer.mask_token)])

    tokenize_chinese_chars = False
    strip_accents = False
    do_lower_case = False
    if hasattr(self.original_tokenizer, "basic_tokenizer"):
        tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
        strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
        do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case

    tokenizer.normalizer = normalizers.BertNormalizer(
        clean_text=True,
        handle_chinese_chars=tokenize_chinese_chars,
        strip_accents=strip_accents,
        lowercase=do_lower_case,
    )
    tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

    cls = str(self.original_tokenizer.cls_token)
    sep = str(self.original_tokenizer.sep_token)
    cls_token_id = self.original_tokenizer.cls_token_id
    sep_token_id = self.original_tokenizer.sep_token_id

    tokenizer.post_processor = processors.TemplateProcessing(
        single=f"{cls}:2 $A:0 {sep}:0",  # token_type_id is 2 for Funnel transformer
        pair=f"{cls}:2 $A:0 {sep}:0 $B:1 {sep}:1",
        special_tokens=[
            (cls, cls_token_id),
            (sep, sep_token_id),
        ],
    )
    tokenizer.decoder = decoders.WordPiece(prefix="##")

    return tokenizer
def converted(self) -> Tokenizer:
    vocab = self.original_tokenizer.vocab
    tokenizer = Tokenizer(
        WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))

    tokenize_chinese_chars = False
    strip_accents = False
    do_lower_case = False
    if hasattr(self.original_tokenizer, "basic_tokenizer"):
        tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
        strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
        do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case

    tokenizer.normalizer = normalizers.BertNormalizer(
        clean_text=True,
        handle_chinese_chars=tokenize_chinese_chars,
        strip_accents=strip_accents,
        lowercase=do_lower_case,
    )
    tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

    cls = str(self.original_tokenizer.cls_token)
    sep = str(self.original_tokenizer.sep_token)
    question = str(self.original_tokenizer.question_token)
    dot = "."
    cls_token_id = self.original_tokenizer.cls_token_id
    sep_token_id = self.original_tokenizer.sep_token_id
    question_token_id = self.original_tokenizer.question_token_id
    dot_token_id = self.original_tokenizer.convert_tokens_to_ids(".")

    if self.original_tokenizer.padding_side == "right":
        pair = f"{cls}:0 $A:0 {question} {dot} {sep}:0 $B:1 {sep}:1"
    else:
        pair = f"{cls}:0 $A:0 {sep}:0 $B:1 {question} {dot} {sep}:1"

    tokenizer.post_processor = processors.TemplateProcessing(
        single=f"{cls}:0 $A:0 {sep}:0",
        pair=pair,
        special_tokens=[
            (cls, cls_token_id),
            (sep, sep_token_id),
            (question, question_token_id),
            (dot, dot_token_id),
        ],
    )
    tokenizer.decoder = decoders.WordPiece(prefix="##")

    return tokenizer
def train_tokenizer() -> Tuple[tokenizers.Tokenizer, tf.data.Dataset]:
    tokenizer = tokenizers.Tokenizer(models.WordPiece(unk_token="<unk>"))
    tokenizer.decoder = decoders.WordPiece()
    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFD(),  # NFD unicode normalizer
        normalizers.Lowercase(),
        normalizers.StripAccents()
    ])
    tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([
        pre_tokenizers.Whitespace(),
        pre_tokenizers.Digits(individual_digits=False)
    ])
    tokenizer.post_processor = processors.TemplateProcessing(
        single="$A </s>",
        pair="$A </s> [SEP] <s> $B:1",
        special_tokens=[("[SEP]", 1), ("<s>", 2), ("</s>", 3)])

    dataset = datasets.load_dataset("wikitext",
                                    "wikitext-103-raw-v1",
                                    split="test")

    def batch_iterator(batch_size=1000):
        for i in range(0, len(dataset), batch_size):
            yield dataset[i:i + batch_size]["text"]

    tokenizer.train_from_iterator(
        batch_iterator(),
        trainer=trainers.WordPieceTrainer(
            vocab_size=10000,
            special_tokens=["<unk>", "[SEP]", "<s>", "</s>"]))

    def generator():
        for record in dataset:
            if record['text'].strip() != '':
                for sentence in sent_tokenizer(record['text']):
                    yield sentence

    data = tf.data.Dataset.from_generator(
        generator,
        output_signature=(tf.TensorSpec(shape=(None), dtype=tf.string)))
    data = data.map(tf.strings.strip,
                    num_parallel_calls=tf.data.experimental.AUTOTUNE)

    return tokenizer, data
def train_tokenizer() -> Tuple[tokenizers.Tokenizer, Callable]:
    tokenizer = tokenizers.Tokenizer(models.WordPiece(unk_token="<unk>"))
    tokenizer.decoder = decoders.WordPiece()
    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFD(),  # NFD unicode normalizer
        normalizers.Lowercase(),
        normalizers.StripAccents()
    ])
    tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([
        pre_tokenizers.Whitespace(),
        pre_tokenizers.Digits(individual_digits=False)
    ])
    tokenizer.post_processor = processors.TemplateProcessing(
        single="$A </s>",
        pair="$A </s> [SEP] <s> $B:1",
        special_tokens=[("[SEP]", 1), ("<s>", 2), ("</s>", 3)])

    # dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train+test+validation")
    dataset = datasets.load_dataset("wikitext",
                                    "wikitext-103-raw-v1",
                                    split="validation")

    def batch_iterator(batch_size=1000):
        for i in range(0, len(dataset), batch_size):
            yield dataset[i:i + batch_size]["text"]

    tokenizer.train_from_iterator(
        batch_iterator(),
        trainer=trainers.WordPieceTrainer(
            vocab_size=10000,
            special_tokens=["<unk>", "[SEP]", "<s>", "</s>"]))

    def generator():
        for record in dataset:
            if record['text'].strip() != '':
                yield record['text']

    return tokenizer, generator
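# Hedged usage sketch for the train_tokenizer variant above (the function name comes from the
# snippet itself); running it downloads the wikitext validation split via `datasets`.
tokenizer, text_generator = train_tokenizer()
enc = tokenizer.encode("the quick brown fox", "jumps over the lazy dog")
print(enc.tokens)    # first segment, then '</s>', '[SEP]', '<s>', then the second segment
print(enc.type_ids)  # the "$B:1" piece gives the second segment type_id 1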
def converted(self) -> Tokenizer:
    vocab = self.original_tokenizer.vocab
    tokenizer = Tokenizer(
        WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))

    tokenize_chinese_chars = False
    strip_accents = False
    do_lower_case = False
    if hasattr(self.original_tokenizer, "basic_tokenizer"):
        tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
        strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
        do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case

    tokenizer.normalizer = normalizers.BertNormalizer(
        clean_text=True,
        handle_chinese_chars=tokenize_chinese_chars,
        strip_accents=strip_accents,
        lowercase=do_lower_case,
    )
    tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

    cls = str(self.original_tokenizer.cls_token)
    sep = str(self.original_tokenizer.sep_token)
    cls_token_id = self.original_tokenizer.cls_token_id
    sep_token_id = self.original_tokenizer.sep_token_id

    tokenizer.post_processor = processors.TemplateProcessing(
        single=f"{cls}:0 $A:0 {sep}:0",
        pair=f"{cls}:0 $A:0 {sep}:0 {sep}:0 $B:1 {sep}:1",  # MPNet uses two [SEP] tokens
        special_tokens=[
            (cls, cls_token_id),
            (sep, sep_token_id),
        ],
    )
    tokenizer.decoder = decoders.WordPiece(prefix="##")

    return tokenizer
def converted(self) -> Tokenizer:
    from .models.roformer.tokenization_utils import JiebaPreTokenizer

    vocab = self.original_tokenizer.vocab
    tokenizer = Tokenizer(
        WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))

    strip_accents = False
    do_lower_case = False
    if hasattr(self.original_tokenizer, "basic_tokenizer"):
        strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
        do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case

    tokenizer.normalizer = normalizers.BertNormalizer(
        clean_text=True,
        handle_chinese_chars=False,
        strip_accents=strip_accents,
        lowercase=do_lower_case,
    )
    tokenizer.pre_tokenizer = pre_tokenizers.PreTokenizer.custom(
        JiebaPreTokenizer(vocab))

    cls = str(self.original_tokenizer.cls_token)
    sep = str(self.original_tokenizer.sep_token)
    cls_token_id = self.original_tokenizer.cls_token_id
    sep_token_id = self.original_tokenizer.sep_token_id

    tokenizer.post_processor = processors.TemplateProcessing(
        single=f"{cls}:0 $A:0 {sep}:0",
        pair=f"{cls}:0 $A:0 {sep}:0 $B:1 {sep}:1",
        special_tokens=[
            (cls, cls_token_id),
            (sep, sep_token_id),
        ],
    )
    tokenizer.decoder = decoders.WordPiece(prefix="##")

    return tokenizer
def train_custom_tokenizer(dataset,
                           token_model,
                           tknzr_file,
                           vocab_size,
                           vocab=None,
                           pretrain_fast=False,
                           max_input_chars_per_word=None,
                           eos_token=None,
                           bos_token=None,
                           pad_token=None,
                           mask_token=None,
                           unk_token=None):
    """
    Build a Tokenizer using the HuggingFace library. The pipeline is:
    - Model : the algorithm that tokenizes; it is a mandatory component. Only 4 models
      are implemented (BPE, Unigram, WordLevel, WordPiece)
    - Normalizer : optional preprocessing that happens before pre-tokenization
    - Pre-Tokenizer : splits the input according to some rules
    - Post-Processing : adds tokens to the encoded output (mostly eos/bos tokens)
    - Decoder : reverses certain pipeline steps for proper decoding
    - Trainer : the training algorithm corresponding to the model

    Note : some pre-processing might need to happen beforehand in previous functions
    (might be easier using pandas before)

    Input
        token_model (str)              : algorithm to use for tokenization
        dataset (iterable)             : a python iterator that goes through the data used for training
        tknzr_file (str)               : path of the tokenizer file to train; will overwrite a previously saved file
        vocab_size (int)               : size of the vocabulary to use
        vocab (list of str)            : models other than BPE can use a non-mandatory vocab as input
        max_input_chars_per_word (int) : used for WordPiece

    Output
        tokenizer : HuggingFace Tokenizer object, the fully trained tokenizer
    """
    special_token_lst = [
        pad_token, bos_token, eos_token, mask_token, unk_token
    ]

    # NFKC
    normalizer_lst = []
    pre_tokenizer_lst = [Whitespace, ByteLevel]
    decoder_lst = []

    bos_idx = special_token_lst.index(bos_token)
    eos_idx = special_token_lst.index(eos_token)

    if token_model == 'BPE':
        model = BPE(unk_token=unk_token)
        Trainer = BpeTrainer
    elif token_model == 'Unigram':
        model = Unigram(vocab=vocab)
        Trainer = UnigramTrainer
    elif token_model == 'WordLevel':
        model = WordLevel(unk_token=unk_token, vocab=vocab)
        Trainer = WordLevelTrainer
    elif token_model == 'WordPiece':
        model = WordPiece(unk_token=unk_token,
                          vocab=vocab,
                          max_input_chars_per_word=max_input_chars_per_word)
        Trainer = WordPieceTrainer
    else:
        error_msg = f'Error: token_model ({token_model}) not an algorithm in {VALID_TOKENIZATIONS}'
        raise SystemExit(error_msg)

    # instantiation
    tokenizer = Tokenizer(model)

    # Select a tokenization trainer
    if vocab_size is None:
        trainer = Trainer(show_progress=True, special_tokens=special_token_lst)
    else:
        trainer = Trainer(vocab_size=vocab_size,
                          show_progress=True,
                          special_tokens=special_token_lst)

    # Set the normalizer
    tokenizer.normalizer = normalizers.Sequence(
        [fcn() for fcn in normalizer_lst])

    # Set the pre-tokenizer
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [fcn() for fcn in pre_tokenizer_lst])

    # Set the post-processing
    tokenizer.post_processor = processors.TemplateProcessing(
        single=bos_token + " $A " + eos_token,
        special_tokens=[(bos_token, bos_idx), (eos_token, eos_idx)],
        # pair=bos_token + " $A " + eos_token + " $B:1 " + eos_token + ":1",
    )

    # Set the decoder
    if ByteLevel in pre_tokenizer_lst:
        tokenizer.decoder = decoders.ByteLevel()
    if Metaspace in pre_tokenizer_lst:
        tokenizer.decoder = decoders.Metaspace()
    if token_model == 'WordPiece':
        tokenizer.decoder = decoders.WordPiece()

    # creating iterator
    def batch_iterator():
        for i in np.arange(0, len(dataset)):
            yield dataset[i]

    # train call
    tokenizer.train_from_iterator(trainer=trainer,
                                  iterator=batch_iterator(),
                                  length=len(dataset))

    if Path(tknzr_file).exists():
        print(f"Warning: overwriting previously saved tokenizer with the same filename ({tknzr_file}).")
    tokenizer.save(tknzr_file)

    if pretrain_fast:
        tokenizer = PreTrainedTokenizerFast(tokenizer_file=tknzr_file)
    else:
        # Note: the slow PreTrainedTokenizer base class does not load from a tokenizer_file;
        # the fast wrapper above is the branch that works with a freshly trained tokenizers file.
        tokenizer = PreTrainedTokenizer(tokenizer_file=tknzr_file)
    tokenizer.pad_token = pad_token
    tokenizer.mask_token = mask_token

    return tokenizer
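# Hedged example call for train_custom_tokenizer above. The corpus and every argument value
# are illustrative stand-ins, and the snippet assumes the module-level imports the function
# relies on (tokenizers models/trainers, numpy, pathlib.Path, transformers) are in scope.
corpus = ["hello world", "tokenizers build subword vocabularies", "hello world again"]
tknzr = train_custom_tokenizer(
    dataset=corpus,
    token_model='BPE',
    tknzr_file='toy_tokenizer.json',
    vocab_size=200,
    pretrain_fast=True,  # the fast wrapper is the branch that loads a tokenizer_file
    eos_token='</s>',
    bos_token='<s>',
    pad_token='<pad>',
    mask_token='<mask>',
    unk_token='<unk>',
)
print(tknzr("hello world")["input_ids"])  # ids wrapped by <s> ... </s> via the template above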