Example #1
import warnings

from transformers import AutoTokenizer, BertTokenizer, PreTrainedTokenizer

# get_mirror is a HanLP helper that rewrites a model id to a configured
# download mirror; its import is omitted in this excerpt.

def from_pretrained(cls,
                    pretrained_model_name_or_path,
                    use_fast=True,
                    do_basic_tokenize=True) -> PreTrainedTokenizer:
    # Accept either a model id string or an object that carries one in
    # its .transformer attribute.
    if isinstance(pretrained_model_name_or_path, str):
        transformer = pretrained_model_name_or_path
    else:
        transformer = pretrained_model_name_or_path.transformer
    additional_config = dict()
    # These ALBERT checkpoints ship a BERT vocabulary, so AutoTokenizer
    # would instantiate an incompatible tokenizer class.
    if transformer.startswith('voidful/albert_chinese_'):
        cls = BertTokenizer
    elif transformer == 'cl-tohoku/bert-base-japanese-char':
        # Since it's a char-level model, a char-level tokenizer is fine
        # instead of fugashi.
        # from hanlp.utils.lang.ja.bert_tok import BertJapaneseTokenizerFast
        # cls = BertJapaneseTokenizerFast
        from transformers import BertJapaneseTokenizer
        cls = BertJapaneseTokenizer
        # from transformers import BertTokenizerFast
        # cls = BertTokenizerFast
        additional_config['word_tokenizer_type'] = 'basic'
    else:
        cls = AutoTokenizer
    if use_fast and not do_basic_tokenize:
        warnings.warn(
            '`do_basic_tokenize=False` might not work when `use_fast=True`'
        )
    tokenizer = cls.from_pretrained(get_mirror(transformer),
                                    use_fast=use_fast,
                                    do_basic_tokenize=do_basic_tokenize,
                                    **additional_config)
    # Record the original (un-mirrored) id on the tokenizer.
    tokenizer.name_or_path = transformer
    return tokenizer
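
A minimal usage sketch, assuming the method above is a @classmethod on a tokenizer wrapper class (named AutoTokenizer_ here purely for illustration):

# AutoTokenizer_ is a hypothetical wrapper class name; the excerpt
# above does not show the enclosing class.
tokenizer = AutoTokenizer_.from_pretrained('voidful/albert_chinese_base')
# Dispatches to BertTokenizer because these checkpoints use a BERT
# vocabulary that plain AutoTokenizer would not resolve correctly.
print(type(tokenizer).__name__)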
Example #2
from transformers import AutoConfig

# get_mirror is the same HanLP mirror helper used in Example #1.

def from_pretrained(cls,
                    pretrained_model_name_or_path,
                    *model_args,
                    training=True,
                    **kwargs):
    if training:
        # Training: download the checkpoint and load its pretrained weights.
        return super().from_pretrained(pretrained_model_name_or_path,
                                       *model_args, **kwargs)
    else:
        if isinstance(pretrained_model_name_or_path, str):
            # Inference: instantiate the architecture from its config only;
            # no pretrained weights are downloaded.
            pretrained_model_name_or_path = get_mirror(
                pretrained_model_name_or_path)
            return super().from_config(
                AutoConfig.from_pretrained(pretrained_model_name_or_path,
                                           **kwargs))
        else:
            # A ready-made config object was passed in directly.
            assert not kwargs
            return super().from_config(pretrained_model_name_or_path)
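
A usage sketch under the same assumption, with the wrapper class named AutoModel_ for illustration:

# Training: fetch the checkpoint and load its pretrained weights.
encoder = AutoModel_.from_pretrained('bert-base-uncased', training=True)
# Inference: build the bare architecture from config; the fine-tuned
# weights are expected to be restored afterwards by the caller.
encoder = AutoModel_.from_pretrained('bert-base-uncased', training=False)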
Example #3
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
    # Redirect the model id through HanLP's mirror helper, then defer
    # to the parent class.
    pretrained_model_name_or_path = get_mirror(
        pretrained_model_name_or_path)
    return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
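
All three variants funnel the model id through get_mirror. A minimal sketch of what such a helper could look like, with a hypothetical mirror table (the real HanLP implementation reads its own mirror configuration):

_MIRRORS = {
    # Hypothetical entry mapping a Hub id to a mirror URL.
    'bert-base-uncased': 'https://mirrors.example.org/bert-base-uncased',
}

def get_mirror(model_id: str) -> str:
    # Fall back to the original id when no mirror is registered.
    return _MIRRORS.get(model_id, model_id)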