def create_tokenizer(tokenizer_type, model_path, vocab_path):
    """Instantiate a tokenizer of the requested type.

    Parameters
    ----------
    tokenizer_type : str
        One of 'whitespace', 'spm', 'subword_nmt', 'yttm',
        'hf_bytebpe', 'hf_wordpiece', 'hf_bpe'.
    model_path : str
        Path to the tokenizer model / merges file
        (not read for 'whitespace').
    vocab_path : str
        Path to the vocabulary file (not read for 'yttm').

    Returns
    -------
    The tokenizer object produced by ``tokenizers.create``.

    Raises
    ------
    NotImplementedError
        If ``tokenizer_type`` is not one of the supported types.
    """
    if tokenizer_type == 'whitespace':
        # Whitespace tokenization only needs a pre-built vocabulary.
        return tokenizers.create(tokenizer_type, vocab=Vocab.load(vocab_path))
    elif tokenizer_type in ('spm', 'subword_nmt'):
        # Both take a serialized subword model plus a vocabulary path;
        # the two original branches were byte-identical, so they are merged.
        return tokenizers.create(tokenizer_type, model_path=model_path,
                                 vocab=vocab_path)
    elif tokenizer_type == 'yttm':
        # YouTokenToMe bundles the vocabulary inside its model file.
        return tokenizers.create(tokenizer_type, model_path=model_path)
    elif tokenizer_type in ('hf_bytebpe', 'hf_wordpiece', 'hf_bpe'):
        if huggingface.is_new_version_model_file(model_path):
            # Newer HuggingFace model files are loaded through the
            # generic 'hf_tokenizer' entry point regardless of subtype.
            return tokenizers.create('hf_tokenizer', model_path=model_path,
                                     vocab=vocab_path)
        elif tokenizer_type == 'hf_wordpiece':
            # WordPiece has no merges file, only a vocabulary.
            return tokenizers.create(tokenizer_type, vocab_file=vocab_path)
        else:
            # 'hf_bytebpe' and 'hf_bpe' both need merges + vocab;
            # the two original branches were identical, so they are merged.
            return tokenizers.create(tokenizer_type, merges_file=model_path,
                                     vocab_file=vocab_path)
    else:
        # Name the offending type so the failure is diagnosable.
        raise NotImplementedError(
            'Unsupported tokenizer_type: {}'.format(tokenizer_type))
def create_tokenizer(tokenizer_type, model_path, vocab_path):
    """Build a tokenizer instance for ``tokenizer_type``.

    Dispatches on the requested type and forwards the model/vocab paths
    to ``tokenizers.create`` with the keyword names each backend expects.
    Raises ``NotImplementedError`` for an unrecognized type.
    """
    if tokenizer_type == 'whitespace':
        # Whitespace splitting only needs the vocabulary object itself.
        return tokenizers.create(tokenizer_type, vocab=Vocab.load(vocab_path))
    if tokenizer_type == 'spm':
        return tokenizers.create(tokenizer_type,
                                 model_path=model_path,
                                 vocab=vocab_path)
    if tokenizer_type == 'subword_nmt':
        # subword-nmt names its model the "codec" file.
        return tokenizers.create(tokenizer_type,
                                 codec_path=model_path,
                                 vocab_path=vocab_path)
    if tokenizer_type == 'yttm':
        # YouTokenToMe carries its vocabulary inside the model file.
        return tokenizers.create(tokenizer_type, model_path=model_path)
    if tokenizer_type in ('hf_bytebpe', 'hf_bpe'):
        # Both BPE variants take a merges file plus a vocab file.
        return tokenizers.create(tokenizer_type,
                                 merges_file=model_path,
                                 vocab_file=vocab_path)
    if tokenizer_type == 'hf_wordpiece':
        # WordPiece has no merges file.
        return tokenizers.create(tokenizer_type, vocab_file=vocab_path)
    raise NotImplementedError