def converted(self) -> Tokenizer:
    """Build a BPE-backed ``Tokenizer`` from ``self.original_tokenizer``.

    The vocabulary is read from ``original_tokenizer.encoder`` and the merge
    rules from the keys of ``original_tokenizer.bpe_ranks``. The resulting
    tokenizer uses ``BertNormalizer`` (no lowercasing, no accent stripping),
    ``BertPreTokenizer``, a ``BPEDecoder`` with the ``</w>`` suffix and a
    ``BertProcessing`` post-processor built from the original sep/cls tokens.

    Returns:
        The configured ``Tokenizer``.
    """
    tokenizer_info_str = "#version:"
    token_suffix = "</w>"

    vocab = self.original_tokenizer.encoder
    merges = list(self.original_tokenizer.bpe_ranks.keys())
    # Drop the leading entry when it carries the "#version:" marker
    # (presumably a header row of the merges file, not a real merge rule).
    # The ``merges`` truthiness guard avoids an IndexError when the merge
    # table is empty.
    if merges and tokenizer_info_str in merges[0][0]:
        merges = merges[1:]

    tokenizer = Tokenizer(
        BPE(
            vocab,
            merges,
            dropout=None,
            unk_token=self.original_tokenizer.unk_token,
            end_of_word_suffix=token_suffix,
        )
    )

    tokenizer.normalizer = normalizers.BertNormalizer(lowercase=False, strip_accents=False)
    tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
    # The decoder must use the same suffix as the model's end-of-word marker.
    tokenizer.decoder = decoders.BPEDecoder(suffix=token_suffix)
    tokenizer.post_processor = processors.BertProcessing(
        sep=(self.original_tokenizer.sep_token, self.original_tokenizer.sep_token_id),
        cls=(self.original_tokenizer.cls_token, self.original_tokenizer.cls_token_id),
    )

    return tokenizer
def __init__(self, vocab):
    """Store the vocabulary, a ``BertNormalizer`` and the ``rjieba`` module.

    Args:
        vocab: Vocabulary object; kept unchanged on ``self.vocab``.
    """
    self.vocab = vocab

    normalizer_options = {
        "clean_text": False,
        "handle_chinese_chars": True,
        "strip_accents": False,
        "lowercase": False,
    }
    self.normalizers = normalizers.BertNormalizer(**normalizer_options)

    # ``rjieba`` is looked up in the enclosing module's scope.
    self.jieba = rjieba
def converted(self) -> Tokenizer:
    """Build a WordPiece-backed ``Tokenizer`` from ``self.original_tokenizer``.

    Normalizer flags are copied from ``original_tokenizer.basic_tokenizer``
    when that attribute exists; otherwise they all default to ``False``.
    The post-processing template assigns type id 2 to the cls token.

    Returns:
        The configured ``Tokenizer``.
    """
    slow = self.original_tokenizer

    wordpiece_model = WordPiece(slow.vocab, unk_token=str(slow.unk_token))
    tokenizer = Tokenizer(wordpiece_model)

    # Defaults used when the slow tokenizer exposes no basic tokenizer.
    normalizer_options = {
        "handle_chinese_chars": False,
        "strip_accents": False,
        "lowercase": False,
    }
    if hasattr(slow, "basic_tokenizer"):
        basic = slow.basic_tokenizer
        normalizer_options = {
            "handle_chinese_chars": basic.tokenize_chinese_chars,
            "strip_accents": basic.strip_accents,
            "lowercase": basic.do_lower_case,
        }

    tokenizer.normalizer = normalizers.BertNormalizer(clean_text=True, **normalizer_options)
    tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

    cls = str(slow.cls_token)
    sep = str(slow.sep_token)
    special_tokens = [
        (cls, slow.cls_token_id),
        (sep, slow.sep_token_id),
    ]

    tokenizer.post_processor = processors.TemplateProcessing(
        # token_type_id is 2 for Funnel transformer
        single=f"{cls}:2 $A:0 {sep}:0",
        pair=f"{cls}:2 $A:0 {sep}:0 $B:1 {sep}:1",
        special_tokens=special_tokens,
    )
    tokenizer.decoder = decoders.WordPiece(prefix="##")

    return tokenizer
def __init__(self, vocab) -> None:
    """Store the vocabulary, a ``BertNormalizer`` and the ``rjieba`` module.

    Args:
        vocab: Vocabulary object; kept unchanged on ``self.vocab``.

    Raises:
        ImportError: If the ``rjieba`` package cannot be imported. The
            original import failure is attached as the exception's cause.
    """
    self.vocab = vocab
    self.normalizers = normalizers.BertNormalizer(
        clean_text=False,
        handle_chinese_chars=True,
        strip_accents=False,
        lowercase=False,
    )
    # Imported lazily so the dependency is only required when this class
    # is actually instantiated.
    try:
        import rjieba
    except ImportError as err:
        # Chain explicitly so the traceback shows the underlying import
        # failure as the direct cause rather than as an error raised while
        # handling another exception.
        raise ImportError(
            "You need to install rjieba to use RoFormerTokenizer. "
            "See https://pypi.org/project/rjieba/ for installation."
        ) from err
    self.jieba = rjieba
def converted(self) -> Tokenizer:
    """Build a WordPiece-backed ``Tokenizer`` from ``self.original_tokenizer``.

    Normalizer flags are copied from ``original_tokenizer.basic_tokenizer``
    when that attribute exists; otherwise they all default to ``False``.
    The pair template inserts the question token followed by ``"."``; where
    they are placed depends on ``original_tokenizer.padding_side``.

    Returns:
        The configured ``Tokenizer``.
    """
    slow = self.original_tokenizer

    wordpiece_model = WordPiece(slow.vocab, unk_token=str(slow.unk_token))
    tokenizer = Tokenizer(wordpiece_model)

    # Defaults used when the slow tokenizer exposes no basic tokenizer.
    normalizer_options = {
        "handle_chinese_chars": False,
        "strip_accents": False,
        "lowercase": False,
    }
    if hasattr(slow, "basic_tokenizer"):
        basic = slow.basic_tokenizer
        normalizer_options = {
            "handle_chinese_chars": basic.tokenize_chinese_chars,
            "strip_accents": basic.strip_accents,
            "lowercase": basic.do_lower_case,
        }

    tokenizer.normalizer = normalizers.BertNormalizer(clean_text=True, **normalizer_options)
    tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

    cls = str(slow.cls_token)
    sep = str(slow.sep_token)
    question = str(slow.question_token)
    dot = "."
    special_tokens = [
        (cls, slow.cls_token_id),
        (sep, slow.sep_token_id),
        (question, slow.question_token_id),
        (dot, slow.convert_tokens_to_ids(".")),
    ]

    # Right padding: question + dot close the first segment.
    # Otherwise: question + dot close the second segment.
    pair = (
        f"{cls}:0 $A:0 {question} {dot} {sep}:0 $B:1 {sep}:1"
        if slow.padding_side == "right"
        else f"{cls}:0 $A:0 {sep}:0 $B:1 {question} {dot} {sep}:1"
    )

    tokenizer.post_processor = processors.TemplateProcessing(
        single=f"{cls}:0 $A:0 {sep}:0",
        pair=pair,
        special_tokens=special_tokens,
    )
    tokenizer.decoder = decoders.WordPiece(prefix="##")

    return tokenizer
def converted(self) -> Tokenizer:
    """Build a WordPiece-backed ``Tokenizer`` from ``self.original_tokenizer``.

    Normalizer flags are copied from ``original_tokenizer.basic_tokenizer``
    when that attribute exists; otherwise they all default to ``False``.
    The pair template places two sep tokens between the two segments.

    Returns:
        The configured ``Tokenizer``.
    """
    slow = self.original_tokenizer

    wordpiece_model = WordPiece(slow.vocab, unk_token=str(slow.unk_token))
    tokenizer = Tokenizer(wordpiece_model)

    # Defaults used when the slow tokenizer exposes no basic tokenizer.
    normalizer_options = {
        "handle_chinese_chars": False,
        "strip_accents": False,
        "lowercase": False,
    }
    if hasattr(slow, "basic_tokenizer"):
        basic = slow.basic_tokenizer
        normalizer_options = {
            "handle_chinese_chars": basic.tokenize_chinese_chars,
            "strip_accents": basic.strip_accents,
            "lowercase": basic.do_lower_case,
        }

    tokenizer.normalizer = normalizers.BertNormalizer(clean_text=True, **normalizer_options)
    tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

    cls = str(slow.cls_token)
    sep = str(slow.sep_token)
    special_tokens = [
        (cls, slow.cls_token_id),
        (sep, slow.sep_token_id),
    ]

    tokenizer.post_processor = processors.TemplateProcessing(
        single=f"{cls}:0 $A:0 {sep}:0",
        # MPNet uses two [SEP] tokens
        pair=f"{cls}:0 $A:0 {sep}:0 {sep}:0 $B:1 {sep}:1",
        special_tokens=special_tokens,
    )
    tokenizer.decoder = decoders.WordPiece(prefix="##")

    return tokenizer
def converted(self) -> Tokenizer:
    """Build a WordPiece-backed ``Tokenizer`` with a custom pre-tokenizer.

    Accent stripping and lowercasing are copied from
    ``original_tokenizer.basic_tokenizer`` when that attribute exists and
    default to ``False`` otherwise; Chinese-character handling in the
    normalizer is always disabled. Pre-tokenization is delegated to a
    ``JiebaPreTokenizer`` constructed from the vocabulary.

    Returns:
        The configured ``Tokenizer``.
    """
    from .models.roformer.tokenization_utils import JiebaPreTokenizer

    slow = self.original_tokenizer
    vocab = slow.vocab

    wordpiece_model = WordPiece(vocab, unk_token=str(slow.unk_token))
    tokenizer = Tokenizer(wordpiece_model)

    # Defaults used when the slow tokenizer exposes no basic tokenizer.
    normalizer_options = {"strip_accents": False, "lowercase": False}
    if hasattr(slow, "basic_tokenizer"):
        basic = slow.basic_tokenizer
        normalizer_options = {
            "strip_accents": basic.strip_accents,
            "lowercase": basic.do_lower_case,
        }

    tokenizer.normalizer = normalizers.BertNormalizer(
        clean_text=True,
        handle_chinese_chars=False,
        **normalizer_options,
    )

    jieba_pre_tokenizer = JiebaPreTokenizer(vocab)
    tokenizer.pre_tokenizer = pre_tokenizers.PreTokenizer.custom(jieba_pre_tokenizer)

    cls = str(slow.cls_token)
    sep = str(slow.sep_token)
    special_tokens = [
        (cls, slow.cls_token_id),
        (sep, slow.sep_token_id),
    ]

    tokenizer.post_processor = processors.TemplateProcessing(
        single=f"{cls}:0 $A:0 {sep}:0",
        pair=f"{cls}:0 $A:0 {sep}:0 $B:1 {sep}:1",
        special_tokens=special_tokens,
    )
    tokenizer.decoder = decoders.WordPiece(prefix="##")

    return tokenizer
def converted(self) -> Tokenizer:
    """Build a BPE-backed ``Tokenizer`` from ``self.original_tokenizer``.

    The vocabulary is read from ``original_tokenizer.encoder`` and the merge
    rules from the keys of ``original_tokenizer.bpe_ranks``. The unknown
    token is registered as a special token only when it is present in the
    vocabulary. Input is lowercased by the normalizer.

    Returns:
        The configured ``Tokenizer``.
    """
    slow = self.original_tokenizer
    end_of_word = "</w>"

    vocab = slow.encoder
    merges = list(slow.bpe_ranks.keys())
    unk = str(slow.unk_token)

    bpe_model = BPE(
        vocab=vocab,
        merges=merges,
        dropout=None,
        unk_token=unk,
        end_of_word_suffix=end_of_word,
        fuse_unk=False,
    )
    tokenizer = Tokenizer(bpe_model)

    # Only mark the unknown token as special if the model knows it.
    unk_is_in_vocab = tokenizer.token_to_id(unk) is not None
    if unk_is_in_vocab:
        tokenizer.add_special_tokens([unk])

    tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
    tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
    tokenizer.decoder = decoders.BPEDecoder(suffix=end_of_word)

    return tokenizer