# Imports assumed for this snippet; the module paths follow GluonNLP's layout
# at the time these examples were written and may differ across versions.
from gluonnlp.data import Vocab, tokenizers
from gluonnlp.data.tokenizers import huggingface


def create_tokenizer(tokenizer_type, model_path, vocab_path):
    """Instantiate a tokenizer of the given registered type from its model/vocab files."""
    if tokenizer_type == 'whitespace':
        return tokenizers.create(tokenizer_type, vocab=Vocab.load(vocab_path))
    elif tokenizer_type == 'spm':
        return tokenizers.create(tokenizer_type,
                                 model_path=model_path,
                                 vocab=vocab_path)
    elif tokenizer_type == 'subword_nmt':
        return tokenizers.create(tokenizer_type,
                                 model_path=model_path,
                                 vocab=vocab_path)
    elif tokenizer_type == 'yttm':
        return tokenizers.create(tokenizer_type, model_path=model_path)
    elif tokenizer_type in ['hf_bytebpe', 'hf_wordpiece', 'hf_bpe']:
        # Newer HuggingFace tokenizers ship a single serialized model file;
        # those are loaded through the unified 'hf_tokenizer' wrapper instead
        # of the per-type constructors below.
        if huggingface.is_new_version_model_file(model_path):
            return tokenizers.create('hf_tokenizer',
                                     model_path=model_path,
                                     vocab=vocab_path)
        elif tokenizer_type == 'hf_bytebpe':
            return tokenizers.create(tokenizer_type,
                                     merges_file=model_path,
                                     vocab_file=vocab_path)
        elif tokenizer_type == 'hf_wordpiece':
            return tokenizers.create(tokenizer_type, vocab_file=vocab_path)
        elif tokenizer_type == 'hf_bpe':
            return tokenizers.create(tokenizer_type,
                                     merges_file=model_path,
                                     vocab_file=vocab_path)
    else:
        raise NotImplementedError(f'Unsupported tokenizer type: {tokenizer_type}')
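A minimal usage sketch for the factory above; the file paths are placeholders and the encode call assumes GluonNLP's tokenizer API, where encode on a single string returns a list of tokens.

# Hypothetical usage; 'spm.model' and 'spm.vocab' are placeholder paths.
tok = create_tokenizer('spm',
                       model_path='spm.model',
                       vocab_path='spm.vocab')
print(tok.encode('Hello world!'))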
Example #2
def create_tokenizer(tokenizer_type, model_path, vocab_path):
    """Variant of the same factory; note the different keyword names for 'subword_nmt'."""
    if tokenizer_type == 'whitespace':
        return tokenizers.create(tokenizer_type, vocab=Vocab.load(vocab_path))
    elif tokenizer_type == 'spm':
        return tokenizers.create(tokenizer_type,
                                 model_path=model_path,
                                 vocab=vocab_path)
    elif tokenizer_type == 'subword_nmt':
        return tokenizers.create(tokenizer_type,
                                 codec_path=model_path,
                                 vocab_path=vocab_path)
    elif tokenizer_type == 'yttm':
        return tokenizers.create(tokenizer_type, model_path=model_path)
    elif tokenizer_type == 'hf_bytebpe':
        return tokenizers.create(tokenizer_type,
                                 merges_file=model_path,
                                 vocab_file=vocab_path)
    elif tokenizer_type == 'hf_wordpiece':
        return tokenizers.create(tokenizer_type, vocab_file=vocab_path)
    elif tokenizer_type == 'hf_bpe':
        return tokenizers.create(tokenizer_type,
                                 merges_file=model_path,
                                 vocab_file=vocab_path)
    else:
        raise NotImplementedError(f'Unsupported tokenizer type: {tokenizer_type}')
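Both examples rely on tokenizers.create dispatching a type name to a registered class. A minimal sketch of that registry pattern follows; the names here are illustrative, not GluonNLP's actual internals.

# Illustrative registry pattern; _REGISTRY, register, and create are
# hypothetical names, not GluonNLP internals.
_REGISTRY = {}

def register(name):
    """Class decorator that records a tokenizer class under a type name."""
    def decorator(cls):
        _REGISTRY[name] = cls
        return cls
    return decorator

def create(name, **kwargs):
    """Look up a registered tokenizer class and construct it with kwargs."""
    try:
        cls = _REGISTRY[name]
    except KeyError:
        raise NotImplementedError(f'Unknown tokenizer type: {name}') from None
    return cls(**kwargs)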