def get_tokenizer(
    tokenizer_name: str,
    tokenizer_model: Optional[str] = None,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    special_tokens: Optional[Dict[str, str]] = None,
    use_fast: Optional[bool] = False,
    bpe_dropout: Optional[float] = 0.0,
):
    """
    Resolve a tokenizer implementation by name and construct it.

    Args:
        tokenizer_name: sentencepiece or pretrained model from the hugging face list,
            for example: bert-base-cased
            To see the list of all HuggingFace pretrained models, use:
            nemo_nlp.modules.common.get_huggingface_pretrained_lm_models_list()
        tokenizer_model: tokenizer model file of sentencepiece or youtokentome
        vocab_file: path to vocab file
        merges_file: path to BPE merges file (auto-resolved for megatron models
            when vocab_file is not given; forwarded to the HuggingFace AutoTokenizer)
        special_tokens: dict of special tokens
        use_fast: (only for HuggingFace AutoTokenizer) set to True to use fast
            HuggingFace tokenizer
        bpe_dropout: (only supported by YTTM tokenizer) BPE dropout tries to corrupt
            the standard segmentation procedure of BPE to help model better learn
            word compositionality and become robust to segmentation errors.
            It has empirically been shown to improve inference time BLEU scores.
    """
    # Normalize to a dict so it can be safely **-expanded below.
    if special_tokens is None:
        special_tokens_dict = {}
    else:
        special_tokens_dict = special_tokens

    if 'megatron' in tokenizer_name:
        # Megatron tokenizers require Apex at runtime.
        if not HAVE_APEX:
            raise RuntimeError("Apex required to use megatron.")
        if vocab_file is None:
            # Fetch/locate the vocab and merges files for this megatron model.
            vocab_file = nemo.collections.nlp.modules.common.megatron.megatron_utils.get_megatron_vocab_file(
                tokenizer_name
            )
            merges_file = nemo.collections.nlp.modules.common.megatron.megatron_utils.get_megatron_merges_file(
                tokenizer_name
            )
        # Map the megatron model name onto the underlying tokenizer name.
        tokenizer_name = get_megatron_tokenizer(tokenizer_name)

    if tokenizer_name == 'sentencepiece':
        return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer(
            model_path=tokenizer_model, special_tokens=special_tokens, legacy=True
        )
    elif tokenizer_name == 'yttm':
        return YouTokenToMeTokenizer(model_path=tokenizer_model, bpe_dropout=bpe_dropout)
    elif tokenizer_name == 'word':
        return WordTokenizer(vocab_file=vocab_file, **special_tokens_dict)
    elif tokenizer_name == 'char':
        return CharTokenizer(vocab_file=vocab_file, **special_tokens_dict)

    # Anything else is treated as a HuggingFace pretrained model name.
    logging.info(
        f"Getting HuggingFace AutoTokenizer with pretrained_model_name: {tokenizer_name}, vocab_file: {vocab_file}, special_tokens_dict: {special_tokens_dict}, and use_fast: {use_fast}"
    )
    return AutoTokenizer(
        pretrained_model_name=tokenizer_name,
        vocab_file=vocab_file,
        merges_file=merges_file,
        **special_tokens_dict,
        use_fast=use_fast,
    )
def get_tokenizer(
    tokenizer_name: str,
    tokenizer_model: Optional[str] = None,
    vocab_file: Optional[str] = None,
    special_tokens: Optional[Dict[str, str]] = None,
    use_fast: Optional[bool] = False,
    bpe_dropout: Optional[float] = 0.0,
):
    """
    Build and return the tokenizer selected by ``tokenizer_name``.

    Args:
        tokenizer_name: sentencepiece or pretrained model from the hugging face list,
            for example: bert-base-cased
            To see the list of all HuggingFace pretrained models, use:
            nemo_nlp.modules.common.get_huggingface_pretrained_lm_models_list()
        tokenizer_model: tokenizer model file of sentencepiece or youtokentome
        special_tokens: dict of special tokens
        vocab_file: path to vocab file
        use_fast: (only for HuggingFace AutoTokenizer) set to True to use fast
            HuggingFace tokenizer
        bpe_dropout: (only supported by YTTM tokenizer) BPE dropout tries to corrupt
            the standard segmentation procedure of BPE to help model better learn
            word compositionality and become robust to segmentation errors.
    """
    # Use an empty dict (not None) so keyword expansion below is always valid.
    special_tokens_dict = special_tokens if special_tokens is not None else {}

    if 'megatron' in tokenizer_name:
        if vocab_file is None:
            # Resolve the vocab file for this megatron model automatically.
            megatron_utils = nemo.collections.nlp.modules.common.megatron.megatron_utils
            vocab_file = megatron_utils.get_megatron_vocab_file(tokenizer_name)
        # Translate the megatron model name into its tokenizer name.
        tokenizer_name = get_megatron_tokenizer(tokenizer_name)

    if tokenizer_name == 'sentencepiece':
        return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer(
            model_path=tokenizer_model, special_tokens=special_tokens
        )
    if tokenizer_name == 'yttm':
        return YouTokenToMeTokenizer(model_path=tokenizer_model, bpe_dropout=bpe_dropout)
    if tokenizer_name == 'word':
        return WordTokenizer(vocab_file=vocab_file, **special_tokens_dict)
    if tokenizer_name == 'char':
        return CharTokenizer(vocab_file=vocab_file, **special_tokens_dict)

    # Fallback: treat the name as a HuggingFace pretrained model.
    return AutoTokenizer(
        pretrained_model_name=tokenizer_name,
        vocab_file=vocab_file,
        **special_tokens_dict,
        use_fast=use_fast,
    )
def get_tokenizer(
    tokenizer_name: str,
    tokenizer_model: Optional[str] = None,
    vocab_file: Optional[str] = None,
    special_tokens: Optional[Dict[str, str]] = None,
):
    """
    Construct the tokenizer named by ``tokenizer_name``.

    Args:
        tokenizer_name: sentencepiece or pretrained model from the hugging face list,
            for example: bert-base-cased
            To see the list of all HuggingFace pretrained models, use:
            nemo_nlp.modules.common.get_huggingface_pretrained_lm_models_list()
        tokenizer_model: tokenizer model file of sentencepiece
        special_tokens: dict of special tokens
        vocab_file: path to vocab file
    """
    # Keyword-expansion below needs a real dict, never None.
    special_tokens_dict = special_tokens if special_tokens is not None else {}

    if 'megatron' in tokenizer_name:
        if vocab_file is None:
            # Locate the vocab file that belongs to this megatron model.
            megatron_utils = nemo.collections.nlp.modules.common.megatron.megatron_utils
            vocab_file = megatron_utils.get_megatron_vocab_file(tokenizer_name)
        # Swap the megatron model name for its tokenizer name.
        tokenizer_name = get_megatron_tokenizer(tokenizer_name)

    if tokenizer_name == 'sentencepiece':
        return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer(
            model_path=tokenizer_model, special_tokens=special_tokens
        )
    if tokenizer_name == 'word':
        return WordTokenizer(vocab_file=vocab_file, **special_tokens_dict)
    if tokenizer_name == 'char':
        return CharTokenizer(vocab_file=vocab_file, **special_tokens_dict)

    # Otherwise assume a HuggingFace pretrained model name.
    return AutoTokenizer(
        pretrained_model_name=tokenizer_name,
        vocab_file=vocab_file,
        **special_tokens_dict,
    )