def build_sp_pipeline(spm_file):
    """Build a SentencePiece-based text pipeline and its TorchScript version.

    Args:
        spm_file: path to a serialized SentencePiece model file.

    Returns:
        A 3-tuple of (eager pipeline, ivalue pipeline, jit-scripted pipeline).
    """
    # Load the SentencePiece model once and share it between the tokenizer
    # and the vocab (the original loaded the same file from disk twice).
    sp_model = load_sp_model(spm_file)
    tokenizer = PretrainedSPTokenizer(sp_model)
    vocab = PretrainedSPVocab(sp_model)

    # Insert token in vocab to match a pretrained vocab
    vocab.insert_token('<pad>', 1)
    pipeline = TextSequentialTransforms(tokenizer, vocab, ToLongTensor())
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit sentencepiece pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
def build_text_vocab_pipeline(hf_vocab_file):
    """Build a vocab-file-based text pipeline and its TorchScript version.

    Args:
        hf_vocab_file: path to a plain-text vocabulary file (one token per line).

    Returns:
        A 3-tuple of (eager pipeline, ivalue pipeline, jit-scripted pipeline).
    """
    tokenizer = basic_english_normalize()
    # Use a context manager so the vocab file is closed even on error
    # (the original leaked the file handle).
    with open(hf_vocab_file, 'r') as f:
        vocab = vocab_from_file_object(f)

    pipeline = TextSequentialTransforms(tokenizer, VocabTransform(vocab), ToLongTensor())
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit text vocab pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
def build_pytext_vocab_pipeline(vocab_file):
    """Build a PyText-ScriptVocabulary-based pipeline and its TorchScript version.

    Args:
        vocab_file: path to a plain-text vocabulary file (one token per line).

    Returns:
        A 3-tuple of (eager pipeline, ivalue pipeline, jit-scripted pipeline).
    """
    from pytext.torchscript.vocab import ScriptVocabulary
    tokenizer = basic_english_normalize()
    # Use a context manager so the vocab file is closed even on error
    # (the original leaked the file handle).
    with open(vocab_file, 'r') as f:
        vocab_list = [line.rstrip() for line in f]

    pipeline = TextSequentialTransforms(
        tokenizer,
        PyTextVocabTransform(ScriptVocabulary(vocab_list)),
        ToLongTensor())
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit PyText pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline