def build_fasttext_vector_pipeline():
    """Assemble a tokenize-then-vectorize text pipeline backed by FastText.

    Builds an eager pipeline, scripts it with TorchScript, and returns both.

    Returns:
        tuple: (eager pipeline, TorchScript-compiled pipeline)
    """
    normalizer = BasicEnglishNormalize()
    fasttext_vectors = FastText()
    # Insert token in vocab to match a pretrained vocab
    text_pipeline = TextDataPipeline(normalizer, VectorTransform(fasttext_vectors))
    scripted_pipeline = torch.jit.script(text_pipeline)
    print('jit fasttext pipeline success!')
    return text_pipeline, scripted_pipeline
def build_fasttext_vector_pipeline():
    """Assemble a tokenize-then-vectorize text pipeline backed by FastText.

    Builds an eager pipeline, converts it once to its ivalue form, scripts
    that form with TorchScript, and returns all three objects.

    Returns:
        tuple: (eager pipeline, ivalue pipeline, TorchScript-compiled pipeline)
    """
    tokenizer = basic_english_normalize()
    vector = FastText()
    # Insert token in vocab to match a pretrained vocab
    pipeline = TextSequentialTransforms(tokenizer, VectorTransform(vector))
    # Hoist the to_ivalue() conversion: the original called it twice, paying
    # the conversion cost twice and returning a different instance than the
    # one that was scripted.
    ivalue_pipeline = pipeline.to_ivalue()
    jit_pipeline = torch.jit.script(ivalue_pipeline)
    print('jit fasttext pipeline success!')
    return pipeline, ivalue_pipeline, jit_pipeline
def test_vector_transform(self):
    """Check VectorTransform over FastText in both eager and scripted mode."""
    asset_name = 'wiki.en.vec'
    asset_path = get_asset_path(asset_name)
    with tempfile.TemporaryDirectory() as dir_name:
        # Stage the fixture file where FastText expects its download.
        shutil.copy(asset_path, os.path.join(dir_name, asset_name))
        transform = VectorTransform(FastText(root=dir_name, validate_file=False))
        scripted_transform = torch.jit.script(transform.to_ivalue())
        # The first 3 entries in each vector.
        expected_fasttext_simple_en = torch.tensor(
            [[-0.065334, -0.093031, -0.017571],
             [-0.32423, -0.098845, -0.0073467]])
        for fn in (transform, scripted_transform):
            self.assertEqual(fn(['the', 'world'])[:, 0:3], expected_fasttext_simple_en)
def benchmark_experimental_vectors():
    """Compare construction and lookup time of the existing vs. experimental FastText."""

    def _run_benchmark_lookup(tokens, vector):
        # Time one lookup per token and report the elapsed wall time.
        t0 = time.monotonic()
        for token in tokens:
            vector[token]
        print("Lookup time:", time.monotonic() - t0)

    train, = AG_NEWS(data_select='train')
    vocab = train.get_vocab()
    # Expand every training example back into its token strings.
    tokens = [vocab.itos[token_id]
              for (label, text) in train
              for token_id in text.tolist()]

    # existing FastText construction
    print("FastText Existing Construction")
    t0 = time.monotonic()
    fast_text = FastText()
    print("Construction time:", time.monotonic() - t0)

    # experimental FastText construction
    print("FastText Experimental Construction")
    t0 = time.monotonic()
    fast_text_experimental = FastTextExperimental(validate_file=False)
    print("Construction time:", time.monotonic() - t0)

    # existing FastText eager lookup
    print("FastText Existing - Eager Mode")
    _run_benchmark_lookup(tokens, fast_text)

    # experimental FastText eager lookup
    print("FastText Experimental - Eager Mode")
    _run_benchmark_lookup(tokens, fast_text_experimental)

    # experimental FastText jit lookup
    print("FastText Experimental - Jit Mode")
    jit_fast_text_experimental = torch.jit.script(fast_text_experimental.to_ivalue())
    _run_benchmark_lookup(tokens, jit_fast_text_experimental)
def test_fast_text(self):
    """Check FastText word lookups in both eager and scripted mode."""
    # copy the asset file into the expected download location
    # note that this is just a file with the first 100 entries of the FastText english dataset
    asset_name = 'wiki.en.vec'
    asset_path = get_asset_path(asset_name)
    with tempfile.TemporaryDirectory() as dir_name:
        shutil.copy(asset_path, os.path.join(dir_name, asset_name))
        vectors_obj = FastText(root=dir_name, validate_file=False)
        jit_vectors_obj = torch.jit.script(vectors_obj.to_ivalue())

        # The first 3 entries in each vector.
        expected_fasttext_simple_en = {
            'the': [-0.065334, -0.093031, -0.017571],
            'world': [-0.32423, -0.098845, -0.0073467],
        }
        for word, expected in expected_fasttext_simple_en.items():
            self.assertEqual(vectors_obj[word][:3], expected)
            self.assertEqual(jit_vectors_obj[word][:3], expected)