def test_errors_vectors_python(self):
    """The vectors() factory must reject malformed inputs with the right errors."""
    # Empty tokens/vectors with no user-defined unk_tensor -> ValueError.
    empty_tokens, empty_vecs = [], torch.empty(0, dtype=torch.float)
    with self.assertRaises(ValueError):
        vectors(empty_tokens, empty_vecs)

    # A vector whose dtype is not torch.float -> TypeError.
    int8_row = torch.tensor([1, 0, 0], dtype=torch.int8).unsqueeze(0)
    with self.assertRaises(TypeError):
        vectors(['a'], int8_row)

    # Incorrect filename or dim passed into GloVe -> ValueError.
    with tempfile.TemporaryDirectory() as dir_name:
        asset_name = 'glove.6B.zip'
        shutil.copy(get_asset_path(asset_name), os.path.join(dir_name, asset_name))
        with self.assertRaises(ValueError):
            # incorrect name
            GloVe(name='UNK', dim=50, root=dir_name, validate_file=False)
        with self.assertRaises(ValueError):
            # incorrect dim
            GloVe(name='6B', dim=500, root=dir_name, validate_file=False)
def test_vocab_transform(self):
    """VocabTransform maps tokens to indices, in both eager and scripted form."""
    asset_name = 'vocab_test2.txt'
    asset_path = get_asset_path(asset_name)
    # Use a context manager so the asset file is always closed; the original
    # opened the file with a bare open() and leaked the handle.
    with open(asset_path, 'r') as f:
        vocab_transform = VocabTransform(vocab_from_file(f))
    self.assertEqual(vocab_transform(['of', 'that', 'new']), [7, 18, 24])
    jit_vocab_transform = torch.jit.script(vocab_transform.to_ivalue())
    self.assertEqual(jit_vocab_transform(['of', 'that', 'new']), [7, 18, 24])
def test_text_sequential_transform(self):
    """A tokenizer chained with a vocab lookup via TextSequentialTransforms."""
    path = get_asset_path('vocab_test2.txt')
    with open(path, 'r') as fh:
        text_pipeline = TextSequentialTransforms(basic_english_normalize(), vocab_from_file(fh))
        scripted_pipeline = torch.jit.script(text_pipeline.to_ivalue())
        # Eager and scripted pipelines must agree on the same sentence.
        self.assertEqual(text_pipeline('of that new'), [7, 18, 24])
        self.assertEqual(scripted_pipeline('of that new'), [7, 18, 24])
def test_vocab_from_file(self):
    """vocab_from_file places a custom unk token at index 0, then file order."""
    with open(get_asset_path('vocab_test.txt'), 'r') as fh:
        v = vocab_from_file(fh, unk_token='<new_unk>')
        expected_itos = ['<new_unk>', 'b', 'a', 'c']
        expected_stoi = {tok: idx for idx, tok in enumerate(expected_itos)}
        self.assertEqual(v.get_itos(), expected_itos)
        self.assertEqual(dict(v.get_stoi()), expected_stoi)
def test_no_download(self):
    """download_from_url must return the pre-cached file instead of downloading.

    A copy of the asset is planted in '.data' so the (fake) download URL
    resolves to the cached path.
    """
    asset_name = 'glove.840B.300d.zip'
    asset_path = get_asset_path(asset_name)
    root = os.path.abspath('.data')
    # exist_ok=True replaces the racy exists()-then-makedirs() pattern.
    os.makedirs(root, exist_ok=True)
    data_path = os.path.abspath(os.path.join('.data', asset_name))
    shutil.copy(asset_path, data_path)
    try:
        file_path = utils.download_from_url('fakedownload/glove.840B.300d.zip')
        self.assertEqual(file_path, data_path)
    finally:
        # Remove the planted copy even if the assertion fails, so later
        # tests don't see a stale cached asset.
        conditional_remove(data_path)
def test_sentencepiece_processor(self):
    """sentencepiece_processor round-trips text <-> ids, eager and scripted."""
    model_path = get_asset_path('spm_example.model')
    transform = sentencepiece_processor(model_path)
    scripted = torch.jit.script(transform.to_ivalue())
    sample = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
    expected_ids = [15340, 4286, 981, 1207, 1681, 17, 84, 684, 8896, 5366,
                    144, 3689, 9, 5602, 12114, 6, 560, 649, 5602, 12114]
    # Encoding and decoding must agree for both the eager and scripted variants.
    for processor in (transform, scripted):
        self.assertEqual(processor(sample), expected_ids)
        self.assertEqual(processor.decode(expected_ids), sample)
def test_errors(self):
    """The Vectors constructor must reject malformed tokens/vectors inputs."""
    # Empty tokens + empty vectors without a user-defined unk_tensor -> ValueError.
    with self.assertRaises(ValueError):
        Vectors([], torch.empty(0, dtype=torch.float))

    row_a = torch.tensor([1, 0, 0], dtype=torch.float)
    row_b = torch.tensor([0, 1, 0], dtype=torch.float)

    # Token count differs from vector count -> RuntimeError.
    with self.assertRaises(RuntimeError):
        Vectors(['a', 'b', 'c'], torch.stack((row_a, row_b), 0))

    # Duplicate tokens -> RuntimeError.
    # TODO (Nayef211): use self.assertRaisesRegex() to check
    # the key of the duplicate token in the error message
    row_c = torch.tensor([0, 0, 1], dtype=torch.float)
    with self.assertRaises(RuntimeError):
        Vectors(['a', 'a', 'c'], torch.stack((row_a, row_b, row_c), 0))

    # Vector dtype is not torch.float -> TypeError.
    with self.assertRaises(TypeError):
        Vectors(['a'], torch.tensor([0, 0, 1], dtype=torch.int8).unsqueeze(0))

    # Incorrect filename or dim passed into GloVe -> ValueError.
    with tempfile.TemporaryDirectory() as dir_name:
        asset_name = 'glove.6B.zip'
        shutil.copy(get_asset_path(asset_name), os.path.join(dir_name, asset_name))
        with self.assertRaises(ValueError):
            # incorrect name
            GloVe(name='UNK', dim=50, root=dir_name, validate_file=False)
        with self.assertRaises(ValueError):
            # incorrect dim
            GloVe(name='6B', dim=500, root=dir_name, validate_file=False)
def test_glove_different_dims(self):
    """Each supported GloVe 6B dimensionality loads its own vector file."""
    # copy the asset file into the expected download location
    # note that this is just a zip file with 1 line txt files used to test that
    # the correct files are being loaded
    asset_name = 'glove.6B.zip'
    asset_path = get_asset_path(asset_name)
    # The first 3 entries of the 'the' vector for each dimensionality.
    expected_first_three = {
        50: [0.418, 0.24968, -0.41242],
        100: [-0.038194, -0.24487, 0.72812],
        200: [-0.071549, 0.093459, 0.023738],
        300: [0.04656, 0.21318, -0.0074364],
    }
    with tempfile.TemporaryDirectory() as dir_name:
        shutil.copy(asset_path, os.path.join(dir_name, asset_name))
        for dim, first_three in expected_first_three.items():
            glove = GloVe(name='6B', dim=dim, root=dir_name, validate_file=False)
            self.assertEqual(glove['the'][:3], first_three)
def test_vector_transform(self):
    """VectorTransform looks up FastText vectors, eager and scripted."""
    asset_name = 'wiki.en.vec'
    asset_path = get_asset_path(asset_name)
    with tempfile.TemporaryDirectory() as dir_name:
        shutil.copy(asset_path, os.path.join(dir_name, asset_name))
        transform = VectorTransform(FastText(root=dir_name, validate_file=False))
        scripted = torch.jit.script(transform.to_ivalue())
        # The first 3 entries in each vector.
        expected = torch.tensor([[-0.065334, -0.093031, -0.017571],
                                 [-0.32423, -0.098845, -0.0073467]])
        self.assertEqual(transform(['the', 'world'])[:, 0:3], expected)
        self.assertEqual(scripted(['the', 'world'])[:, 0:3], expected)
def test_vectors_from_file(self):
    """vectors_from_file_object parses a CSV of word vectors; unknown words
    map to a zero unk tensor."""
    asset_name = 'vectors_test.csv'
    asset_path = get_asset_path(asset_name)
    # Close the asset file deterministically; the original opened it with a
    # bare open() and leaked the handle.
    with open(asset_path, 'r') as f:
        vectors_obj = vectors_from_file_object(f)
    expected_tensorA = torch.tensor([1, 0, 0], dtype=torch.float)
    expected_tensorB = torch.tensor([0, 1, 0], dtype=torch.float)
    expected_unk_tensor = torch.tensor([0, 0, 0], dtype=torch.float)
    self.assertEqual(vectors_obj['a'], expected_tensorA)
    self.assertEqual(vectors_obj['b'], expected_tensorB)
    self.assertEqual(vectors_obj['not_in_it'], expected_unk_tensor)
def test_vocab_from_file(self):
    """With specials_first=False the special tokens are appended after the
    regular tokens read from the file."""
    asset_name = 'vocab_test.txt'
    asset_path = get_asset_path(asset_name)
    # Close the asset file deterministically; the original opened it with a
    # bare open() and leaked the handle.
    with open(asset_path, 'r') as f:
        v = vocab_from_file_object(f, specials=('<unk>', '<pad>', '<eos>'), specials_first=False)
    expected_itos = ['a', 'b', 'c', '<unk>', '<pad>', '<eos>']
    expected_stoi = {x: index for index, x in enumerate(expected_itos)}
    self.assertEqual(v.get_itos(), expected_itos)
    self.assertEqual(dict(v.get_stoi()), expected_stoi)
def test_vocab_transform(self):
    """VocabTransform handles batched (list-of-list) token input."""
    with open(get_asset_path('vocab_test2.txt'), 'r') as fh:
        transform = VocabTransform(vocab_from_file(fh))
        batch = [['of', 'that', 'new'], ['of', 'that', 'new', 'that']]
        expected = [[21, 26, 20], [21, 26, 20, 26]]
        self.assertEqual(transform(batch), expected)
        # The scripted transform must produce the same indices.
        scripted = torch.jit.script(transform.to_ivalue())
        self.assertEqual(scripted(batch), expected)
def test_sentencepiece_tokenizer(self):
    """sentencepiece_tokenizer splits text into pieces and decodes them back."""
    spm = sentencepiece_tokenizer(get_asset_path('spm_example.model'))
    scripted = torch.jit.script(spm.to_ivalue())
    sample = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
    pieces = ['\u2581Sent', 'ence', 'P', 'ie', 'ce', '\u2581is', '\u2581an', '\u2581un',
              'super', 'vis', 'ed', '\u2581text', '\u2581to', 'ken', 'izer', '\u2581and',
              '\u2581de', 'to', 'ken', 'izer']
    # Tokenize and decode must round-trip for both eager and scripted variants.
    for tokenizer in (spm, scripted):
        self.assertEqual(tokenizer(sample), pieces)
        self.assertEqual(tokenizer.decode(pieces), sample)
def test_vocab_from_raw_text_file(self):
    """Build a vocab from raw text using a scripted tokenizer to split lines."""
    jit_tokenizer = torch.jit.script(basic_english_normalize().to_ivalue())
    with open(get_asset_path('vocab_raw_text_test.txt'), 'r') as fh:
        v = vocab_from_raw_text_file(fh, jit_tokenizer, unk_token='<new_unk>')
    expected_itos = ['<new_unk>', "'", 'after', 'talks', '.', 'are', 'at',
                     'disappointed', 'fears', 'federal', 'firm', 'for', 'mogul',
                     'n', 'newall', 'parent', 'pension', 'representing', 'say',
                     'stricken', 't', 'they', 'turner', 'unions', 'with', 'workers']
    expected_stoi = {tok: idx for idx, tok in enumerate(expected_itos)}
    self.assertEqual(v.get_itos(), expected_itos)
    self.assertEqual(dict(v.get_stoi()), expected_stoi)
def test_sentencepiece_load_and_save(self):
    """Both the pybind and torchbind tokenizer variants must survive a
    torch.save/torch.load round trip and still produce the same pieces."""
    model_path = get_asset_path('spm_example.model')
    # Renamed from `input` to avoid shadowing the builtin.
    sample = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
    expected = [
        '▁Sent', 'ence', 'P', 'ie', 'ce', '▁is',
        '▁an', '▁un', 'super', 'vis', 'ed', '▁text',
        '▁to', 'ken', 'izer', '▁and', '▁de', 'to',
        'ken', 'izer',
    ]

    with self.subTest('pybind'):
        save_path = os.path.join(self.test_dir, 'spm_pybind.pt')
        # Redundant double parentheses removed from the original call.
        spm = sentencepiece_tokenizer(model_path)
        torch.save(spm, save_path)
        loaded_spm = torch.load(save_path)
        self.assertEqual(expected, loaded_spm(sample))

    with self.subTest('torchscript'):
        save_path = os.path.join(self.test_dir, 'spm_torchscript.pt')
        # Call the __prepare_scriptable__() func and convert the building block
        # to the torchbind version. Users are not expected to use the torchbind
        # variant in eager mode, but it still needs a CI test here.
        spm = sentencepiece_tokenizer(model_path).__prepare_scriptable__()
        torch.save(spm, save_path)
        loaded_spm = torch.load(save_path)
        self.assertEqual(expected, loaded_spm(sample))
def test_sentencepiece_load_and_save(self):
    """Both the pybind and to_ivalue() tokenizer variants must survive a
    torch.save/torch.load round trip and still produce the same pieces."""
    model_path = get_asset_path('spm_example.model')
    # Renamed from `input` to avoid shadowing the builtin.
    sample = 'SentencePiece is an unsupervised text tokenizer and detokenizer'
    expected = [
        '▁Sent', 'ence', 'P', 'ie', 'ce', '▁is',
        '▁an', '▁un', 'super', 'vis', 'ed', '▁text',
        '▁to', 'ken', 'izer', '▁and', '▁de', 'to',
        'ken', 'izer',
    ]

    with self.subTest('pybind'):
        save_path = os.path.join(self.test_dir, 'spm_pybind.pt')
        # Redundant double parentheses removed from the original call.
        spm = sentencepiece_tokenizer(model_path)
        torch.save(spm, save_path)
        loaded_spm = torch.load(save_path)
        self.assertEqual(expected, loaded_spm(sample))

    with self.subTest('torchscript'):
        save_path = os.path.join(self.test_dir, 'spm_torchscript.pt')
        spm = sentencepiece_tokenizer(model_path).to_ivalue()
        torch.save(spm, save_path)
        loaded_spm = torch.load(save_path)
        self.assertEqual(expected, loaded_spm(sample))
def test_glove(self):
    """GloVe 840B vectors load from a cached asset and match known values."""
    # copy the asset file into the expected download location
    # note that this is just a zip file with the first 100 entries of the GloVe 840B dataset
    asset_name = 'glove.840B.300d.zip'
    asset_path = get_asset_path(asset_name)
    with tempfile.TemporaryDirectory() as dir_name:
        shutil.copy(asset_path, os.path.join(dir_name, asset_name))
        vectors_obj = GloVe(root=dir_name, validate_file=False)
        scripted_obj = torch.jit.script(vectors_obj.to_ivalue())
        # The first 3 entries in each vector.
        expected_glove = {
            'the': [0.27204, -0.06203, -0.1884],
            'people': [-0.19686, 0.11579, -0.41091],
        }
        for word, first_three in expected_glove.items():
            self.assertEqual(vectors_obj[word][:3], first_three)
            self.assertEqual(scripted_obj[word][:3], first_three)
def test_fast_text(self):
    """FastText vectors load from a cached asset and match known values."""
    # copy the asset file into the expected download location
    # note that this is just a file with the first 100 entries of the FastText english dataset
    asset_name = 'wiki.en.vec'
    asset_path = get_asset_path(asset_name)
    with tempfile.TemporaryDirectory() as dir_name:
        shutil.copy(asset_path, os.path.join(dir_name, asset_name))
        vectors_obj = FastText(root=dir_name, validate_file=False)
        # NOTE(review): scripted directly, without .to_ivalue() as in the GloVe
        # test — confirm whether that asymmetry is intentional.
        scripted_obj = torch.jit.script(vectors_obj)
        # The first 3 entries in each vector.
        expected_fasttext_simple_en = {
            'the': [-0.065334, -0.093031, -0.017571],
            'world': [-0.32423, -0.098845, -0.0073467],
        }
        for word, first_three in expected_fasttext_simple_en.items():
            self.assertEqual(vectors_obj[word][:3], first_three)
            self.assertEqual(scripted_obj[word][:3], first_three)