def converted(self) -> Tokenizer: ot = self.original_tokenizer vocab = ot.encoder merges = list(ot.bpe_ranks.keys()) tokenizer = Tokenizer( BPE( vocab=vocab, merges=merges, dropout=None, continuing_subword_prefix="", end_of_word_suffix="", fuse_unk=False, )) tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel( add_prefix_space=ot.add_prefix_space) tokenizer.decoder = decoders.ByteLevel() tokenizer.post_processor = processors.RobertaProcessing( sep=(ot.sep_token, ot.sep_token_id), cls=(ot.cls_token, ot.cls_token_id), add_prefix_space=ot.add_prefix_space, trim_offsets=True, # True by default on Roberta (historical) ) return tokenizer
def converted(self) -> Tokenizer: vocab = self.original_tokenizer.encoder merges = list(self.original_tokenizer.bpe_ranks.keys()) unk_token = self.original_tokenizer.unk_token tokenizer = Tokenizer( BPE( vocab=vocab, merges=merges, dropout=None, continuing_subword_prefix="", end_of_word_suffix="</w>", fuse_unk=False, unk_token=str(unk_token), )) tokenizer.normalizer = normalizers.Sequence([ normalizers.NFC(), normalizers.Replace(Regex(r"\s+"), " "), normalizers.Lowercase() ]) tokenizer.pre_tokenizer = pre_tokenizers.Sequence([ pre_tokenizers.Split( Regex( r"""'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""" ), behavior="removed", invert=True, ), pre_tokenizers.ByteLevel(add_prefix_space=False), ]) tokenizer.decoder = decoders.ByteLevel() # Hack to have a ByteLevel and TemplaceProcessor tokenizer.post_processor = processors.RobertaProcessing( sep=(self.original_tokenizer.eos_token, self.original_tokenizer.eos_token_id), cls=(self.original_tokenizer.bos_token, self.original_tokenizer.bos_token_id), add_prefix_space=False, trim_offsets=False, ) return tokenizer