def build_experimental_torchtext_pipeline(hf_vocab_file):
    """Build the experimental torchtext pipeline from a HF-style vocab file.

    Returns a tuple of (eager pipeline, ivalue pipeline, scripted pipeline).
    """
    with open(hf_vocab_file, 'r') as vocab_f:
        file_vocab = load_vocab_from_file(vocab_f)
    eager_pipeline = TextSequentialTransforms(basic_english_normalize(), file_vocab)
    scripted_pipeline = torch.jit.script(eager_pipeline.to_ivalue())
    print('jit experimental torchtext pipeline success!')
    return eager_pipeline, eager_pipeline.to_ivalue(), scripted_pipeline
def test_text_sequential_transform(self):
    """Check that a tokenizer+vocab pipeline gives identical results eagerly and scripted."""
    vocab_path = get_asset_path('vocab_test2.txt')
    with open(vocab_path, 'r') as f:
        pipeline = TextSequentialTransforms(basic_english_normalize(), vocab_from_file(f))
        jit_pipeline = torch.jit.script(pipeline.to_ivalue())
        expected = [7, 18, 24]
        self.assertEqual(pipeline('of that new'), expected)
        self.assertEqual(jit_pipeline('of that new'), expected)
def build_experimental_fasttext_vector_pipeline():
    """Build the experimental FastText vector pipeline.

    Returns a tuple of (eager pipeline, ivalue pipeline, scripted pipeline).
    """
    tokenizer = basic_english_normalize()
    vector = FastTextExperimental()
    pipeline = TextSequentialTransforms(tokenizer, vector)
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    # Fixed copy-paste error: this builds the *experimental* pipeline, so the
    # success message should not say "legacy".
    print('jit experimental fasttext pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
def build_fasttext_vector_pipeline():
    """Build the legacy FastText vector pipeline.

    Returns a tuple of (eager pipeline, ivalue pipeline, scripted pipeline).
    """
    pretrained_vectors = FastText()
    # Insert token in vocab to match a pretrained vocab
    eager_pipeline = TextSequentialTransforms(basic_english_normalize(),
                                              VectorTransform(pretrained_vectors))
    scripted_pipeline = torch.jit.script(eager_pipeline.to_ivalue())
    print('jit fasttext pipeline success!')
    return eager_pipeline, eager_pipeline.to_ivalue(), scripted_pipeline
def build_sp_pipeline(spm_file):
    """Build a sentencepiece tokenizer+vocab pipeline from a spm model file.

    Returns a tuple of (eager pipeline, ivalue pipeline, scripted pipeline).
    """
    sp_tokenizer = sentencepiece_tokenizer(spm_file)
    sp_vocab = PretrainedSPVocab(load_sp_model(spm_file))
    # Insert token in vocab to match a pretrained vocab
    sp_vocab.insert_token('<pad>', 1)
    eager_pipeline = TextSequentialTransforms(sp_tokenizer, sp_vocab)
    scripted_pipeline = torch.jit.script(eager_pipeline.to_ivalue())
    print('jit sentencepiece pipeline success!')
    return eager_pipeline, eager_pipeline.to_ivalue(), scripted_pipeline
def build_text_vocab_pipeline(hf_vocab_file):
    """Build a tokenizer + vocab + tensor-conversion pipeline from a vocab file.

    Returns a tuple of (eager pipeline, ivalue pipeline, scripted pipeline).
    """
    tokenizer = basic_english_normalize()
    # Use a context manager so the vocab file handle is closed deterministically
    # (the original left it open for the interpreter to collect).
    with open(hf_vocab_file, 'r') as f:
        vocab = vocab_from_file_object(f)
    pipeline = TextSequentialTransforms(tokenizer, VocabTransform(vocab), ToLongTensor())
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit text vocab pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
def build_pytext_vocab_pipeline(vocab_file):
    """Build a pipeline that maps tokens through a PyText ScriptVocabulary.

    Returns a tuple of (eager pipeline, ivalue pipeline, scripted pipeline).
    """
    from pytext.torchscript.vocab import ScriptVocabulary
    tokenizer = basic_english_normalize()
    # Use a context manager so the vocab file handle is closed deterministically
    # (the original left it open for the interpreter to collect).
    with open(vocab_file, 'r') as f:
        vocab_list = [line.rstrip() for line in f]
    pipeline = TextSequentialTransforms(
        tokenizer,
        PyTextVocabTransform(ScriptVocabulary(vocab_list)),
        ToLongTensor())
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit PyText pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
def test_text_sequential_transform(self):
    """Check that a tokenizer+vocab pipeline gives identical results eagerly and scripted."""
    vocab_path = get_asset_path('vocab_test2.txt')
    transform = TextSequentialTransforms(basic_english_normalize(),
                                         load_vocab_from_file(vocab_path))
    scripted_transform = torch.jit.script(transform)
    expected_ids = [7, 18, 24]
    self.assertEqual(transform('of that new'), expected_ids)
    self.assertEqual(scripted_transform('of that new'), expected_ids)
def build_legacy_pytext_script_vocab_pipeline(vocab_file):
    """Build the legacy PyText ScriptVocabulary pipeline from a vocab file.

    Returns a tuple of (eager pipeline, ivalue pipeline, scripted pipeline).
    """
    from pytext.torchscript.vocab import ScriptVocabulary

    tokenizer = basic_english_normalize()
    # Use a context manager so the vocab file handle is closed deterministically
    # (the original left it open for the interpreter to collect).
    with open(vocab_file, 'r') as f:
        # NOTE(review): iterating `line.rstrip()` yields individual *characters*,
        # so this counts a character-level vocab rather than whitespace tokens —
        # confirm that is intended.
        vocab_counter = Counter([token for line in f for token in line.rstrip()])
    sorted_by_freq_tuples = sorted(vocab_counter.items(), key=lambda x: x[1], reverse=True)
    vocab_list = [pair[0] for pair in sorted_by_freq_tuples]
    vocab_list.insert(0, "<unk>")
    pipeline = TextSequentialTransforms(
        tokenizer_func(tokenizer),
        PyTextScriptVocabTransform(ScriptVocabulary(vocab_list)))
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit legacy PyText pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
def build_experimental_pytext_script_vocab_pipeline(vocab_file):
    """Build the experimental PyText script_vocab pipeline from a vocab file.

    Returns a tuple of (eager pipeline, ivalue pipeline, scripted pipeline).
    """
    import os
    import sys
    # this is needed because we want to add 'torchtext/examples/vocab' directory to the
    # `sys.path` variable in order to import the pytext_vocab (since its not a module)
    sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "vocab"))
    from pytext_vocab import script_vocab

    tokenizer = basic_english_normalize()
    # Use a context manager so the vocab file handle is closed deterministically
    # (the original left it open for the interpreter to collect).
    with open(vocab_file, 'r') as f:
        # NOTE(review): iterating `line.rstrip()` yields individual *characters*,
        # so this counts a character-level vocab — confirm that is intended.
        vocab_counter = Counter([token for line in f for token in line.rstrip()])
    ordered_dict = OrderedDict(sorted(vocab_counter.items(), key=lambda x: x[1], reverse=True))
    pipeline = TextSequentialTransforms(
        tokenizer,
        PyTextScriptVocabTransform(script_vocab(ordered_dict)))
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    # Fixed copy-paste error: this builds the *experimental* pipeline, so the
    # success message should not say "legacy".
    print('jit experimental PyText pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
def build_sp_pipeline(spm_file):
    """Build a sentencepiece tokenizer+vocab+tensor pipeline from a spm model file.

    Returns a tuple of (eager pipeline, scripted pipeline).
    """
    sp_tokenizer = PretrainedSPTokenizer(spm_file)
    sp_vocab = PretrainedSPVocab(spm_file)
    # Insert token in vocab to match a pretrained vocab
    sp_vocab.insert_token('<pad>', 1)
    eager_pipeline = TextSequentialTransforms(sp_tokenizer, sp_vocab, ToLongTensor())
    scripted_pipeline = torch.jit.script(eager_pipeline)
    print('jit sentencepiece pipeline success!')
    return eager_pipeline, scripted_pipeline
def build_sp_pipeline(args):
    """Build a sentencepiece pipeline from CLI args, downloading a pretrained
    spm model if `args.spm_filename` names one of the known pretrained models.

    Returns a 3-tuple (pipeline, pipeline, scripted pipeline) to match the
    other builders' return arity.
    """
    model_name = args.spm_filename
    if model_name in PRETRAINED_SP_MODEL:
        model_name = download_from_url(PRETRAINED_SP_MODEL[model_name])
    sp_tokenizer = sentencepiece_tokenizer(model_name)
    sp_vocab = PretrainedSPVocab(load_sp_model(model_name))
    # Insert token in vocab to match a pretrained vocab
    eager_pipeline = TextSequentialTransforms(sp_tokenizer, sp_vocab)
    scripted_pipeline = torch.jit.script(eager_pipeline)
    print('jit sentencepiece pipeline success!')
    return eager_pipeline, eager_pipeline, scripted_pipeline