def build_sp_pipeline(spm_file):
    tokenizer = sentencepiece_tokenizer(spm_file)
    vocab = PretrainedSPVocab(load_sp_model(spm_file))

    # Insert token in vocab to match a pretrained vocab
    vocab.insert_token('<pad>', 1)
    pipeline = TextSequentialTransforms(tokenizer, vocab)
    # In this experimental torchtext API, the eager pipeline holds Python-backed
    # state that TorchScript cannot compile directly; to_ivalue() returns a
    # copy backed by scriptable (torchbind) objects for torch.jit.script.
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit sentencepiece pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
def build_sp_pipeline(spm_file):
    tokenizer = PretrainedSPTokenizer(spm_file)
    vocab = PretrainedSPVocab(spm_file)

    # Insert token in vocab to match a pretrained vocab
    vocab.insert_token('<pad>', 1)
    pipeline = TextDataPipeline(tokenizer, vocab)
    jit_pipeline = torch.jit.script(pipeline)
    print('jit sentencepiece pipeline success!')
    return pipeline, jit_pipeline
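# The PretrainedSPTokenizer / PretrainedSPVocab helpers used above are not
# defined in this listing. The sketch below is one plausible shape for them,
# assuming only the public sentencepiece Python API and torch.nn.Module; the
# class names ending in "Sketch" are hypothetical, not the original code.
import torch
import torch.nn as nn
import sentencepiece as spm
from typing import List


class SPTokenizerSketch(nn.Module):
    """Split a raw string into subword pieces with a pretrained SP model."""

    def __init__(self, spm_file):
        super().__init__()
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(spm_file)

    def forward(self, line: str) -> List[str]:
        return self.sp_model.EncodeAsPieces(line)


class SPVocabSketch(nn.Module):
    """Map subword pieces to ids, with support for inserting special tokens."""

    def __init__(self, spm_file):
        super().__init__()
        sp = spm.SentencePieceProcessor()
        sp.Load(spm_file)
        # Materialize the piece -> id mapping so tokens can be inserted later.
        self.itos = [sp.IdToPiece(i) for i in range(sp.GetPieceSize())]
        self.unk_id = sp.unk_id()
        self.stoi = {p: i for i, p in enumerate(self.itos)}

    def insert_token(self, token: str, index: int) -> None:
        # Shift ids >= index up by one so a token like '<pad>' can take a
        # low id reserved by a pretrained checkpoint.
        self.itos.insert(index, token)
        self.stoi = {p: i for i, p in enumerate(self.itos)}

    def forward(self, pieces: List[str]) -> List[int]:
        return [self.stoi.get(p, self.unk_id) for p in pieces]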
def build_sp_pipeline(spm_file):
    tokenizer = PretrainedSPTokenizer(spm_file)
    vocab = PretrainedSPVocab(spm_file)

    # Insert token in vocab to match a pretrained vocab
    vocab.insert_token('<pad>', 1)
    # ToLongTensor converts the list of token ids into a torch.long tensor as
    # the final stage of the pipeline.
    pipeline = TextSequentialTransforms(tokenizer, vocab, ToLongTensor())
    jit_pipeline = torch.jit.script(pipeline)
    print('jit sentencepiece pipeline success!')
    return pipeline, jit_pipeline
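# ToLongTensor is likewise defined elsewhere; the minimal version below is an
# assumption based on its name and position in the pipeline, not the original
# implementation. A module this simple is also TorchScript-compilable.
import torch
from torch import Tensor
from typing import List


class ToLongTensorSketch(torch.nn.Module):
    """Convert a list of token ids to a 1-D torch.long tensor."""

    def forward(self, ids: List[int]) -> Tensor:
        return torch.tensor(ids, dtype=torch.long)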
def build_sp_pipeline(args):
    spm_file = args.spm_filename
    if spm_file in PRETRAINED_SP_MODEL:
        spm_file = download_from_url(PRETRAINED_SP_MODEL[spm_file])
    tokenizer = sentencepiece_tokenizer(spm_file)
    vocab = PretrainedSPVocab(load_sp_model(spm_file))
    pipeline = TextSequentialTransforms(tokenizer, vocab)
    jit_pipeline = torch.jit.script(pipeline)
    print('jit sentencepiece pipeline success!')
    # The eager pipeline is returned twice to preserve the three-element
    # (eager, scriptable, jit) return signature of the earlier version.
    return pipeline, pipeline, jit_pipeline
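# Usage sketch for the final version above. The argparse wiring and the
# 'm_user.model' default are placeholders, not from the original file;
# PRETRAINED_SP_MODEL is assumed to map model names to download URLs, per the
# lookup inside build_sp_pipeline.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='SentencePiece pipeline demo')
    parser.add_argument('--spm-filename', dest='spm_filename',
                        default='m_user.model')
    args = parser.parse_args()

    pipeline, _, jit_pipeline = build_sp_pipeline(args)
    # Both the eager and the scripted pipeline map raw text to token ids.
    print(pipeline('sentencepiece is a subword tokenizer'))
    print(jit_pipeline('sentencepiece is a subword tokenizer'))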