Example #1
def create_tokenizer(tokenizer_type, model_path, vocab_path):
    """Build a tokenizer of the given type from the registry.

    Assumes GluonNLP-style helpers (``tokenizers.create``, ``Vocab`` and the
    ``huggingface`` module) are already in scope.
    """
    if tokenizer_type == 'whitespace':
        # Whitespace tokenization only needs a vocabulary.
        return tokenizers.create(tokenizer_type, vocab=Vocab.load(vocab_path))
    elif tokenizer_type in ('spm', 'subword_nmt'):
        # SentencePiece and subword-nmt take a model file plus a vocabulary.
        return tokenizers.create(tokenizer_type,
                                 model_path=model_path,
                                 vocab=vocab_path)
    elif tokenizer_type == 'yttm':
        # YouTokenToMe stores the vocabulary inside the model file.
        return tokenizers.create(tokenizer_type, model_path=model_path)
    elif tokenizer_type in ('hf_bytebpe', 'hf_wordpiece', 'hf_bpe'):
        if huggingface.is_new_version_model_file(model_path):
            # New-style HuggingFace model files share a single loader.
            return tokenizers.create('hf_tokenizer',
                                     model_path=model_path,
                                     vocab=vocab_path)
        elif tokenizer_type == 'hf_wordpiece':
            # Legacy WordPiece models only carry a vocab file.
            return tokenizers.create(tokenizer_type, vocab_file=vocab_path)
        else:
            # Legacy (byte-level) BPE models carry a merges file plus a vocab file.
            return tokenizers.create(tokenizer_type,
                                     merges_file=model_path,
                                     vocab_file=vocab_path)
    else:
        raise NotImplementedError(
            'Tokenizer type "{}" is not supported.'.format(tokenizer_type))
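A minimal usage sketch for the helper above. The file paths are hypothetical, and it assumes the returned tokenizer exposes an encode(sentence, output_type) method, as GluonNLP's tokenizers do:

# Hypothetical paths to a trained SentencePiece model and its vocabulary.
tokenizer = create_tokenizer('spm',
                             model_path='corpus_spm.model',
                             vocab_path='corpus_spm.vocab')
print(tokenizer.encode('Hello world!', output_type=str))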
Example #2
def main(args):
    start = time.time()
    if args.model == 'spm':
        assert args.model_path is not None, 'Must specify --model_path when using the "spm" model.'
        tokenizer_model = tokenizers.create('spm',
                                            model_path=args.model_path,
                                            vocab=args.vocab_path)
    elif args.model == 'subword_nmt':
        assert args.model_path is not None, \
            'Must specify --model_path when using the "subword_nmt" model.'
        assert args.vocab_path is not None, \
            'Must specify --vocab_path when using the "subword_nmt" model.'
        tokenizer_model = tokenizers.create('subword_nmt',
                                            model_path=args.model_path,
                                            vocab=args.vocab_path,
                                            bpe_dropout=args.bpe_dropout)
    elif args.model == 'yttm':
        assert args.model_path is not None, \
            'Must specify --model_path when using the "yttm" model.'
        # YouTokenToMe requires an explicit dropout value, so default to 0.0.
        args.bpe_dropout = args.bpe_dropout or 0.0
        tokenizer_model = tokenizers.create('yttm',
                                            model_path=args.model_path,
                                            vocab=args.vocab_path,
                                            bpe_dropout=args.bpe_dropout,
                                            n_threads=1)
    elif args.model in ('hf_bytebpe', 'hf_bpe', 'hf_wordpiece'):
        # Guard against a missing --model_path before probing the file format.
        if args.model_path is not None \
                and is_new_version_model_file(args.model_path):
            assert args.vocab_path is not None, \
                'Must specify --vocab_path when using the "{}" model.'.format(args.model)
            tokenizer_model = tokenizers.create('hf_tokenizer',
                                                model_path=args.model_path,
                                                vocab=args.vocab_path)
        else:
            if args.model == 'hf_bytebpe':
                tokenizer_model = tokenizers.create(
                    'hf_bytebpe',
                    merges_file=args.model_path,
                    vocab_file=args.vocab_path,
                    dropout=args.bpe_dropout,
                    lowercase=args.lowercase)
            elif args.model == 'hf_wordpiece':
                tokenizer_model = tokenizers.create(
                    'hf_wordpiece',
                    vocab_file=args.vocab_path,
                    lowercase=args.lowercase,
                    strip_accents=args.strip_accents)
            elif args.model == 'hf_bpe':
                tokenizer_model = tokenizers.create(
                    'hf_bpe',
                    merges_file=args.model_path,
                    vocab_file=args.vocab_path,
                    dropout=args.bpe_dropout,
                    lowercase=args.lowercase)
    else:
        raise NotImplementedError
    print('Applying "{}" to "{}" and saving to "{}"'.format(
        tokenizer_model.__class__.__name__, ', '.join(args.corpus),
        args.save_path))
    # Emit subword strings or integer token ids, depending on --output_type.
    output_type = {'subword': str, 'id': int}[args.output_type]
    # Tokenize the corpus in parallel chunks with the chosen tokenizer.
    applyer = ParallelCorpusApplyer(args.corpus, tokenizer_model, output_type)
    with open(args.save_path, 'w', encoding='utf-8', newline='\n') as fo:
        with Pool(args.num_process) as pool:
            sentence_count = token_count = unk_count = 0
            for i, (tokenized_sentences, sentence_num, token_num, unk_num) in \
                enumerate(pool.imap(applyer.process_chunk, applyer.chunk_iter())):
                fo.write('\n'.join(tokenized_sentences))
                fo.write('\n')
                sentence_count += sentence_num
                token_count += token_num
                unk_count += unk_num
                if (i + 1) % 100 == 0:
                    print('Chunk {}, #Lines processed: {}'.format(
                        i + 1, sentence_count))
    end = time.time()
    print('Done, #Lines processed: {}, Avg tokens per sentence: {:.1f}, '
          'Unknown rate: {:.1f}%, Time spent: {:.1f}s'.format(
              sentence_count, token_count / sentence_count,
              unk_count * 100 / token_count, end - start))
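The argument parser is not shown in this example; the sketch below reconstructs the flags that main() reads. The flag names come straight from the function body, but the types, defaults, and choices are assumptions:

import argparse

def get_parser():
    # Flags inferred from how main() uses args; defaults here are assumptions.
    parser = argparse.ArgumentParser(
        description='Apply a learned subword model to one or more corpus files.')
    parser.add_argument('--corpus', nargs='+', required=True)
    parser.add_argument('--model', required=True,
                        choices=['spm', 'subword_nmt', 'yttm',
                                 'hf_bytebpe', 'hf_wordpiece', 'hf_bpe'])
    parser.add_argument('--model_path', default=None)
    parser.add_argument('--vocab_path', default=None)
    parser.add_argument('--save_path', required=True)
    parser.add_argument('--output_type', choices=['subword', 'id'],
                        default='subword')
    parser.add_argument('--bpe_dropout', type=float, default=0.0)
    parser.add_argument('--lowercase', action='store_true')
    parser.add_argument('--strip_accents', action='store_true')
    parser.add_argument('--num_process', type=int, default=4)
    return parser

if __name__ == '__main__':
    main(get_parser().parse_args())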