def benchmark_basic_english_normalize():
    """Print tokenization timings for three basic-English tokenizers.

    The tokenizers compared are the one returned by
    ``get_tokenizer("basic_english")``, the experimental
    ``basic_english_normalize()`` in eager mode, and the TorchScript-compiled
    version of the experimental tokenizer. Each one is run over the first
    element returned by a fresh ``AG_NEWS()`` call.
    """

    def _run_benchmark_lookup(train, tokenizer):
        # Tokenize the text of every (label, text) pair and report elapsed time.
        started = time.monotonic()
        for _label, sample in train:
            tokenizer(sample)
        print("Tokenization time:", time.monotonic() - started)

    eager_existing = get_tokenizer("basic_english")
    eager_experimental = basic_english_normalize()
    jit_experimental = torch.jit.script(eager_experimental.to_ivalue())

    runs = (
        ("BasicEnglishNormalize - Eager Mode", eager_existing),
        ("BasicEnglishNormalize Experimental - Eager Mode", eager_experimental),
        ("BasicEnglishNormalize Experimental - Jit Mode", jit_experimental),
    )
    for banner, tokenizer in runs:
        # The dataset is requested again before every run.
        train, _ = AG_NEWS()
        print(banner)
        _run_benchmark_lookup(train, tokenizer)
def legacy_vocab_from_file_object(file_like_object, **kwargs):
    r"""Build a vocabulary from the lines of a file like object.

    Every line read from `file_like_object` is tokenized with
    `basic_english_normalize()` and the resulting stream of tokens is handed
    to `build_vocab_from_iterator`.

    Format for txt file:

        token1
        token2
        ...
        token_n

    Args:
        file_like_object (FileObject): a file like object to read lines from.
        **kwargs: accepted for signature compatibility only.
            NOTE(review): these are currently NOT forwarded to the vocab
            builder; confirm whether they should be.

    Returns:
        The object returned by `build_vocab_from_iterator` (presumably a
        `Vocab`; confirm against its definition).

    Examples:
        >>> with open('vocab.txt', 'r') as f:
        >>>     v = legacy_vocab_from_file_object(f)
    """
    tokenizer = basic_english_normalize()

    def token_iterator(lines):
        # Flatten the per-line token lists into a single lazy token stream.
        for line in lines:
            yield from tokenizer(line)

    return build_vocab_from_iterator(token_iterator(file_like_object))
def test_text_sequential_transform(self):
    # The eager pipeline and its scripted counterpart must both map the
    # sample sentence to the same list of ids.
    vocab_path = get_asset_path('vocab_test2.txt')
    eager_pipeline = TextSequentialTransforms(
        basic_english_normalize(),
        load_vocab_from_file(vocab_path),
    )
    scripted_pipeline = torch.jit.script(eager_pipeline)

    sentence = 'of that new'
    expected_ids = [7, 18, 24]
    self.assertEqual(eager_pipeline(sentence), expected_ids)
    self.assertEqual(scripted_pipeline(sentence), expected_ids)
def test_vocab_from_raw_text_file(self):
    # Build a vocab from a raw text asset twice -- once with a pure-python
    # tokenizer and once with the scripted basic_english_normalize() -- and
    # check both against the same expected itos/stoi tables.
    asset_path = get_asset_path('vocab_raw_text_test.txt')

    def python_basic_english_normalize(line):
        replacements = [
            (r'\'', ' \' '),
            (r'\"', ''),
            (r'\.', ' . '),
            (r'<br \/>', ' '),
            (r',', ' , '),
            (r'\(', ' ( '),
            (r'\)', ' ) '),
            (r'\!', ' ! '),
            (r'\?', ' ? '),
            (r'\;', ' '),
            (r'\:', ' '),
            (r'\s+', ' '),
        ]
        normalizer = custom_replace(replacements)
        normalized = list(normalizer([line.lower()]))
        return normalized[0].split()

    expected_itos = [
        "'", 'after', 'talks', '.', 'are', 'at', 'disappointed', 'fears',
        'federal', 'firm', 'for', 'mogul', 'n', 'newall', 'parent',
        'pension', 'representing', 'say', 'stricken', 't', 'they', 'turner',
        'unions', 'with', 'workers',
    ]
    expected_stoi = dict(zip(expected_itos, range(len(expected_itos))))

    def check_vocab(vocab):
        self.assertEqual(vocab.get_itos(), expected_itos)
        self.assertEqual(dict(vocab.get_stoi()), expected_stoi)

    # Python-based tokenizer (basic_english_normalize() would work here too).
    python_vocab = build_vocab_from_text_file(
        asset_path, tokenizer=python_basic_english_normalize)
    check_vocab(python_vocab)

    # Scripted basic_english_normalize tokenizer.
    scripted_tokenizer = torch.jit.script(basic_english_normalize())
    scripted_vocab = build_vocab_from_text_file(
        asset_path, tokenizer=scripted_tokenizer)
    check_vocab(scripted_vocab)
def test_BasicEnglishNormalize(self):
    # Compare the experimental tokenizer (eager and scripted) and the
    # "basic_english" tokenizer from data.get_tokenizer against a fixed
    # reference, then check that the ivalue form survives torch.save/load.
    test_sample = '\'".<br />,()!?;: Basic English Normalization for a Line of Text \'".<br />,()!?;:'
    ref_results = [
        "'", '.', ',', '(', ')', '!', '?', 'basic', 'english', 'normalization',
        'for', 'a', 'line', 'of', 'text', "'", '.', ',', '(', ')', '!', '?'
    ]

    basic_eng_norm = basic_english_normalize()
    experimental_eager_tokens = basic_eng_norm(test_sample)

    jit_basic_eng_norm = torch.jit.script(basic_eng_norm.to_ivalue())
    experimental_jit_tokens = jit_basic_eng_norm(test_sample)

    basic_english_tokenizer = data.get_tokenizer("basic_english")
    eager_tokens = basic_english_tokenizer(test_sample)

    # unittest assertions are used instead of bare `assert` so the checks
    # still run when Python is started with -O.
    self.assertFalse(basic_eng_norm.is_jitable)
    self.assertTrue(basic_eng_norm.to_ivalue().is_jitable)

    self.assertEqual(experimental_jit_tokens, ref_results)
    self.assertEqual(eager_tokens, ref_results)
    self.assertEqual(experimental_eager_tokens, ref_results)

    # test load and save
    save_path = os.path.join(self.test_dir, 'basic_english_normalize.pt')
    torch.save(basic_eng_norm.to_ivalue(), save_path)
    loaded_basic_eng_norm = torch.load(save_path)

    loaded_eager_tokens = loaded_basic_eng_norm(test_sample)
    self.assertEqual(loaded_eager_tokens, ref_results)
def benchmark_experimental_vocab_construction(vocab_file_path, is_raw_text=True, is_legacy=True, num_iters=1):
    """Print how long it takes to build a vocab from `vocab_file_path`.

    Args:
        vocab_file_path: path of the text file to open for reading.
        is_raw_text: when true the file is tokenized while building the
            vocab; otherwise it is passed to `load_vocab_from_file`.
        is_legacy: only consulted when `is_raw_text` is true; selects
            `legacy_vocab_from_file_object` over the scripted
            `basic_english_normalize` + `build_vocab_from_text_file` path.
        num_iters: number of times the construction is repeated.

    The file is opened once, rewound before every iteration so that each
    repetition sees the complete contents, and closed when the benchmark
    finishes.
    """
    with open(vocab_file_path, 'r') as f:
        t0 = time.monotonic()
        if is_raw_text:
            if is_legacy:
                print("Loading from raw text file with legacy python function")
                for _ in range(num_iters):
                    f.seek(0)
                    legacy_vocab_from_file_object(f)
            else:
                print(
                    "Loading from raw text file with basic_english_normalize tokenizer"
                )
                for _ in range(num_iters):
                    f.seek(0)
                    tokenizer = basic_english_normalize()
                    jited_tokenizer = torch.jit.script(tokenizer)
                    build_vocab_from_text_file(f, jited_tokenizer, num_cpus=1)
        else:
            for _ in range(num_iters):
                f.seek(0)
                load_vocab_from_file(f)
        print("Construction time:", time.monotonic() - t0)
def test_text_sequential_transform(self):
    # The eager pipeline and the scripted ivalue pipeline must both map the
    # sample sentence to the same list of ids.
    vocab_path = get_asset_path('vocab_test2.txt')
    sentence = 'of that new'
    expected_ids = [7, 18, 24]

    with open(vocab_path, 'r') as vocab_file:
        eager_pipeline = TextSequentialTransforms(
            basic_english_normalize(),
            vocab_from_file(vocab_file),
        )
        scripted_pipeline = torch.jit.script(eager_pipeline.to_ivalue())
        self.assertEqual(eager_pipeline(sentence), expected_ids)
        self.assertEqual(scripted_pipeline(sentence), expected_ids)
def build_experimental_torchtext_pipeline(hf_vocab_file):
    """Assemble the experimental tokenizer + vocab pipeline.

    Args:
        hf_vocab_file: path of the vocab file given to `load_vocab_from_file`.

    Returns:
        A 3-tuple of the eager pipeline, a fresh `to_ivalue()` conversion of
        it, and the TorchScript-compiled pipeline.
    """
    normalizer = basic_english_normalize()
    with open(hf_vocab_file, 'r') as vocab_file:
        eager_pipeline = TextSequentialTransforms(
            normalizer, load_vocab_from_file(vocab_file))
        scripted_pipeline = torch.jit.script(eager_pipeline.to_ivalue())
        print('jit experimental torchtext pipeline success!')
        ivalue_pipeline = eager_pipeline.to_ivalue()
        return eager_pipeline, ivalue_pipeline, scripted_pipeline
def build_experimental_fasttext_vector_pipeline():
    """Assemble the experimental tokenizer + FastText vector pipeline.

    Returns:
        A 3-tuple of the eager pipeline, a fresh `to_ivalue()` conversion of
        it, and the TorchScript-compiled pipeline.
    """
    tokenizer = basic_english_normalize()
    vector = FastTextExperimental()
    pipeline = TextSequentialTransforms(tokenizer, vector)
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    # This builder scripts the experimental pipeline, so the message says
    # "experimental" rather than "legacy".
    print('jit experimental fasttext pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
def build_fasttext_vector_pipeline():
    """Assemble a tokenizer + `VectorTransform(FastText())` pipeline.

    Returns:
        A 3-tuple of the eager pipeline, a fresh `to_ivalue()` conversion of
        it, and the TorchScript-compiled pipeline.
    """
    normalizer = basic_english_normalize()
    embedding = VectorTransform(FastText())
    eager_pipeline = TextSequentialTransforms(normalizer, embedding)
    scripted_pipeline = torch.jit.script(eager_pipeline.to_ivalue())
    print('jit fasttext pipeline success!')
    return eager_pipeline, eager_pipeline.to_ivalue(), scripted_pipeline
def build_text_vocab_pipeline(hf_vocab_file):
    """Assemble a tokenizer -> vocab lookup -> long tensor pipeline.

    Args:
        hf_vocab_file: path of the vocab file given to
            `vocab_from_file_object`.

    Returns:
        A 3-tuple of the eager pipeline, a fresh `to_ivalue()` conversion of
        it, and the TorchScript-compiled pipeline.
    """
    tokenizer = basic_english_normalize()
    # The context manager closes the vocab file on every exit path. The whole
    # build stays inside it so the file remains open for as long as any step
    # might need it.
    with open(hf_vocab_file, 'r') as f:
        vocab = vocab_from_file_object(f)
        pipeline = TextSequentialTransforms(tokenizer, VocabTransform(vocab), ToLongTensor())
        jit_pipeline = torch.jit.script(pipeline.to_ivalue())
        print('jit text vocab pipeline success!')
        return pipeline, pipeline.to_ivalue(), jit_pipeline
def test_basicEnglishNormalize_load_and_save(self):
    # A tokenizer written with torch.save and read back with torch.load must
    # still produce the reference tokens.
    test_sample = '\'".<br />,()!?;: Basic English Normalization for a Line of Text \'".<br />,()!?;:'
    ref_results = [
        "'", '.', ',', '(', ')', '!', '?', 'basic', 'english', 'normalization',
        'for', 'a', 'line', 'of', 'text', "'", '.', ',', '(', ')', '!', '?',
    ]

    def check_round_trip(file_name, make_tokenizer):
        save_path = os.path.join(self.test_dir, file_name)
        torch.save(make_tokenizer(), save_path)
        reloaded = torch.load(save_path)
        self.assertEqual(reloaded(test_sample), ref_results)

    with self.subTest('pybind'):
        check_round_trip('ben_pybind.pt', basic_english_normalize)

    with self.subTest('torchscript'):
        # Call __prepare_scriptable__() to convert the building block to its
        # torchbind version. Users are not expected to use the torchbind
        # version in eager mode, but it still needs CI coverage.
        check_round_trip(
            'ben_torchscrip.pt',
            lambda: basic_english_normalize().__prepare_scriptable__(),
        )
def test_basicEnglishNormalize_load_and_save(self):
    # Both the plain tokenizer and its to_ivalue() form must survive a
    # torch.save / torch.load round trip and still yield the reference tokens.
    test_sample = '\'".<br />,()!?;: Basic English Normalization for a Line of Text \'".<br />,()!?;:'
    ref_results = [
        "'", '.', ',', '(', ')', '!', '?', 'basic', 'english', 'normalization',
        'for', 'a', 'line', 'of', 'text', "'", '.', ',', '(', ')', '!', '?',
    ]

    def check_round_trip(file_name, make_tokenizer):
        save_path = os.path.join(self.test_dir, file_name)
        torch.save(make_tokenizer(), save_path)
        reloaded = torch.load(save_path)
        self.assertEqual(reloaded(test_sample), ref_results)

    with self.subTest('pybind'):
        check_round_trip('ben_pybind.pt', basic_english_normalize)

    with self.subTest('torchscript'):
        check_round_trip(
            'ben_torchscrip.pt',
            lambda: basic_english_normalize().to_ivalue(),
        )
def test_vocab_from_raw_text_file(self):
    # Build a vocab from a raw text asset with the scripted tokenizer and a
    # custom unknown token, then check the itos/stoi tables.
    asset_path = get_asset_path('vocab_raw_text_test.txt')
    scripted_tokenizer = torch.jit.script(basic_english_normalize())
    vocab = build_vocab_from_text_file(
        asset_path, scripted_tokenizer, unk_token='<new_unk>')

    expected_itos = [
        '<new_unk>', "'", 'after', 'talks', '.', 'are', 'at', 'disappointed',
        'fears', 'federal', 'firm', 'for', 'mogul', 'n', 'newall', 'parent',
        'pension', 'representing', 'say', 'stricken', 't', 'they', 'turner',
        'unions', 'with', 'workers',
    ]
    expected_stoi = dict(zip(expected_itos, range(len(expected_itos))))

    self.assertEqual(vocab.get_itos(), expected_itos)
    self.assertEqual(dict(vocab.get_stoi()), expected_stoi)
def __init__(self, train_arrow_path='train_arrow', test_arrow_path='test_arrow', train_valid_split=0.9):
    """Store the dataset settings and build the tokenizer and vocab.

    Args:
        train_arrow_path: location passed to `ds.Dataset.load_from_disk` to
            load the training data used for the vocab.
        test_arrow_path: stored on the instance; not read here.
        train_valid_split: stored on the instance; not read here.
    """
    super().__init__()
    self.train_arrow_path = train_arrow_path
    self.test_arrow_path = test_arrow_path
    self.train_valid_split = train_valid_split

    self.tokenizer = basic_english_normalize().to_ivalue()

    # The vocab is built from the tokenized 'texts' column of the train data.
    train_ds = ds.Dataset.load_from_disk(self.train_arrow_path)
    token_lists = (self.tokenizer(text) for text in train_ds['texts'])
    vocab = build_vocab_from_iterator(token_lists)
    self.vocab = vocab.to_ivalue()
def build_pytext_vocab_pipeline(vocab_file):
    """Assemble a tokenizer -> PyText vocab lookup -> long tensor pipeline.

    Args:
        vocab_file: path of a text file; each line, with trailing whitespace
            stripped, becomes one entry of the `ScriptVocabulary`.

    Returns:
        A 3-tuple of the eager pipeline, a fresh `to_ivalue()` conversion of
        it, and the TorchScript-compiled pipeline.
    """
    from pytext.torchscript.vocab import ScriptVocabulary

    tokenizer = basic_english_normalize()
    # The list is fully materialized inside the `with`, so the file can be
    # closed before the pipeline is built.
    with open(vocab_file, 'r') as f:
        vocab_list = [line.rstrip() for line in f]

    pipeline = TextSequentialTransforms(
        tokenizer,
        PyTextVocabTransform(ScriptVocabulary(vocab_list)),
        ToLongTensor())
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit PyText pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
def build_legacy_pytext_script_vocab_pipeline(vocab_file):
    """Assemble a tokenizer -> PyText `ScriptVocabulary` lookup pipeline.

    Args:
        vocab_file: path of the text file the vocabulary is counted from.

    Returns:
        A 3-tuple of the eager pipeline, a fresh `to_ivalue()` conversion of
        it, and the TorchScript-compiled pipeline.
    """
    from pytext.torchscript.vocab import ScriptVocabulary

    tokenizer = basic_english_normalize()
    # The counter is fully built inside the `with`, so the file can be closed
    # before the pipeline is assembled.
    with open(vocab_file, 'r') as f:
        # NOTE(review): iterating `line.rstrip()` yields single characters, so
        # the counter is keyed by character rather than by whole-line token.
        # Confirm whether that is intended.
        vocab_counter = Counter(token for line in f for token in line.rstrip())

    # Most frequent entries first; "<unk>" takes index 0.
    sorted_by_freq_tuples = sorted(vocab_counter.items(), key=lambda x: x[1], reverse=True)
    vocab_list = [pair[0] for pair in sorted_by_freq_tuples]
    vocab_list.insert(0, "<unk>")

    pipeline = TextSequentialTransforms(
        tokenizer_func(tokenizer),
        PyTextScriptVocabTransform(ScriptVocabulary(vocab_list)))
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit legacy PyText pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
def benchmark_experimental_vocab_construction(vocab_file_path, is_raw_text=True, num_iters=1):
    """Print how long it takes to build a vocab from `vocab_file_path`.

    Args:
        vocab_file_path: path of the text file to open for reading.
        is_raw_text: when true the file is tokenized with a scripted
            `basic_english_normalize` via `vocab_from_raw_text_file_object`;
            otherwise it is passed to `vocab_from_file_object`.
        num_iters: number of times the construction is repeated.

    The file is opened once, rewound before every iteration so that each
    repetition sees the complete contents, and closed when the benchmark
    finishes.
    """
    with open(vocab_file_path, 'r') as f:
        t0 = time.monotonic()
        if is_raw_text:
            print(
                "Loading from raw text file with basic_english_normalize tokenizer"
            )
            for _ in range(num_iters):
                f.seek(0)
                tokenizer = basic_english_normalize()
                jited_tokenizer = torch.jit.script(tokenizer.to_ivalue())
                vocab_from_raw_text_file_object(f, jited_tokenizer)
        else:
            for _ in range(num_iters):
                f.seek(0)
                vocab_from_file_object(f)
        print("Construction time:", time.monotonic() - t0)
def build_experimental_pytext_script_vocab_pipeline(vocab_file):
    """Assemble a tokenizer -> experimental PyText `script_vocab` pipeline.

    Args:
        vocab_file: path of the text file the vocabulary is counted from.

    Returns:
        A 3-tuple of the eager pipeline, a fresh `to_ivalue()` conversion of
        it, and the TorchScript-compiled pipeline.
    """
    import os
    import sys

    # this is needed because we want to add 'torchtext/examples/vocab' directory to the
    # `sys.path` variable in order to import the pytext_vocab (since its not a module)
    sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "vocab"))
    from pytext_vocab import script_vocab

    tokenizer = basic_english_normalize()
    # The counter is fully built inside the `with`, so the file can be closed
    # before the pipeline is assembled.
    with open(vocab_file, 'r') as f:
        # NOTE(review): iterating `line.rstrip()` yields single characters, so
        # the counter is keyed by character rather than by whole-line token.
        # Confirm whether that is intended.
        vocab_counter = Counter(token for line in f for token in line.rstrip())

    # Most frequent entries first.
    ordered_dict = OrderedDict(sorted(vocab_counter.items(), key=lambda x: x[1], reverse=True))

    pipeline = TextSequentialTransforms(tokenizer, PyTextScriptVocabTransform(script_vocab(ordered_dict)))
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    # This builder scripts the experimental pipeline, so the message says
    # "experimental" rather than "legacy".
    print('jit experimental PyText pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
def test_BasicEnglishNormalize(self):
    # Compare the experimental tokenizer (eager and scripted) and the
    # "basic_english" tokenizer from data.get_tokenizer against a fixed
    # reference.
    test_sample = '\'".<br />,()!?;: Basic English Normalization for a Line of Text \'".<br />,()!?;:'
    ref_results = ["'", '.', ',', '(', ')', '!', '?', 'basic', 'english', 'normalization',
                   'for', 'a', 'line', 'of', 'text', "'", '.', ',', '(', ')', '!', '?']

    basic_eng_norm = basic_english_normalize()
    experimental_eager_tokens = basic_eng_norm(test_sample)

    jit_basic_eng_norm = torch.jit.script(basic_eng_norm)
    experimental_jit_tokens = jit_basic_eng_norm(test_sample)

    basic_english_tokenizer = data.get_tokenizer("basic_english")
    eager_tokens = basic_english_tokenizer(test_sample)

    # unittest assertions are used instead of bare `assert` so the checks
    # still run when Python is started with -O.
    self.assertFalse(basic_eng_norm.is_jitable)
    # Call __prepare_scriptable__() to convert the building block to its
    # torchbind version. Users are not expected to use the torchbind version
    # in eager mode, but it still needs CI coverage.
    self.assertTrue(basic_eng_norm.__prepare_scriptable__().is_jitable)

    self.assertEqual(experimental_jit_tokens, ref_results)
    self.assertEqual(eager_tokens, ref_results)
    self.assertEqual(experimental_eager_tokens, ref_results)