Beispiel #1
0
def build_fasttext_vector_pipeline():
    """Build a tokenize-then-vectorize pipeline and its TorchScript version.

    The pipeline chains a ``BasicEnglishNormalize`` tokenizer with a
    ``VectorTransform`` wrapping a default-constructed ``FastText`` object.

    Returns:
        tuple: ``(pipeline, jit_pipeline)`` -- the eager ``TextDataPipeline``
        and the result of passing it to ``torch.jit.script``.
    """
    eager_pipeline = TextDataPipeline(
        BasicEnglishNormalize(),
        VectorTransform(FastText()),
    )
    scripted_pipeline = torch.jit.script(eager_pipeline)
    # Only reached when scripting did not raise.
    print('jit fasttext pipeline success!')
    return eager_pipeline, scripted_pipeline
Beispiel #2
0
def build_fasttext_vector_pipeline():
    """Build a tokenize-then-vectorize pipeline in eager, ivalue and JIT form.

    The pipeline chains the ``basic_english_normalize()`` tokenizer with a
    ``VectorTransform`` wrapping a default-constructed ``FastText`` object.

    Returns:
        tuple: ``(pipeline, ivalue_pipeline, jit_pipeline)`` -- the eager
        ``TextSequentialTransforms``, the result of ``pipeline.to_ivalue()``,
        and the TorchScript module compiled from a ``to_ivalue()`` result.
    """
    eager_pipeline = TextSequentialTransforms(
        basic_english_normalize(),
        VectorTransform(FastText()),
    )
    # Scripting is applied to the ``to_ivalue()`` form, not the eager object.
    scripted_pipeline = torch.jit.script(eager_pipeline.to_ivalue())
    # Only reached when scripting did not raise.
    print('jit fasttext pipeline success!')
    # ``to_ivalue()`` is called a second time here; the returned value comes
    # from this call rather than from the one that was scripted above.
    return eager_pipeline, eager_pipeline.to_ivalue(), scripted_pipeline
Beispiel #3
0
    def test_vector_transform(self):
        """Check eager and scripted ``VectorTransform`` output for two tokens.

        Only the first three components of each returned vector are compared
        against the hard-coded reference values.
        """
        vec_file = 'wiki.en.vec'
        source_path = get_asset_path(vec_file)

        with tempfile.TemporaryDirectory() as tmp_root:
            # Stage the asset inside the directory that is handed to FastText
            # as ``root`` (presumably so it is read from there -- confirm
            # against the FastText loader).
            shutil.copy(source_path, os.path.join(tmp_root, vec_file))

            eager_transform = VectorTransform(FastText(root=tmp_root, validate_file=False))
            scripted_transform = torch.jit.script(eager_transform.to_ivalue())

            # Leading three components of the vectors for 'the' and 'world'.
            expected_head = torch.tensor([
                [-0.065334, -0.093031, -0.017571],
                [-0.32423, -0.098845, -0.0073467],
            ])

            # Eager first, then scripted; both must match the same reference.
            for transform in (eager_transform, scripted_transform):
                self.assertEqual(transform(['the', 'world'])[:, :3], expected_head)
def benchmark_experimental_vectors():
    """Print construction and lookup timings for two FastText implementations.

    Tokens are gathered from the AG_NEWS training split by mapping every id
    in every example back through ``vocab.itos``. The function then reports,
    via ``print``:

    * construction time of ``FastText()`` and
      ``FastTextExperimental(validate_file=False)``;
    * time to index each object once per token in eager mode;
    * the same lookup loop on the TorchScript-compiled experimental object.

    Nothing is returned.
    """
    def _time_lookups(words, vectors):
        # The looked-up value is discarded on purpose; only the elapsed time
        # of the indexing operations is reported.
        started = time.monotonic()
        for word in words:
            vectors[word]
        print("Lookup time:", time.monotonic() - started)

    (train,) = AG_NEWS(data_select='train')
    vocab = train.get_vocab()
    # Flatten every example's ids into one list of token strings.
    tokens = [
        vocab.itos[token_id]
        for _label, text in train
        for token_id in text.tolist()
    ]

    # Construction timing: existing implementation.
    print("FastText Existing Construction")
    started = time.monotonic()
    existing_vectors = FastText()
    print("Construction time:", time.monotonic() - started)

    # Construction timing: experimental implementation.
    print("FastText Experimental Construction")
    started = time.monotonic()
    experimental_vectors = FastTextExperimental(validate_file=False)
    print("Construction time:", time.monotonic() - started)

    # Eager lookups, existing first and experimental second.
    eager_runs = (
        ("FastText Existing - Eager Mode", existing_vectors),
        ("FastText Experimental - Eager Mode", experimental_vectors),
    )
    for banner, vectors in eager_runs:
        print(banner)
        _time_lookups(tokens, vectors)

    # Scripted lookup: compilation happens after the banner is printed and
    # outside the timed region.
    print("FastText Experimental - Jit Mode")
    scripted_vectors = torch.jit.script(experimental_vectors.to_ivalue())
    _time_lookups(tokens, scripted_vectors)
Beispiel #5
0
    def test_fast_text(self):
        """Check eager and scripted FastText lookups against reference values.

        Only the first three components of each word's vector are compared.
        """
        # The asset is a truncated file holding just the first 100 entries of
        # the FastText English dataset; it is copied into the location where
        # the download would otherwise be expected.
        vec_file = 'wiki.en.vec'
        source_path = get_asset_path(vec_file)

        with tempfile.TemporaryDirectory() as tmp_root:
            shutil.copy(source_path, os.path.join(tmp_root, vec_file))

            eager_vectors = FastText(root=tmp_root, validate_file=False)
            scripted_vectors = torch.jit.script(eager_vectors.to_ivalue())

            # Leading three components of each word's vector.
            expected_head = {
                'the': [-0.065334, -0.093031, -0.017571],
                'world': [-0.32423, -0.098845, -0.0073467],
            }

            for word, head in expected_head.items():
                # Eager lookup is asserted before the scripted one.
                self.assertEqual(eager_vectors[word][:3], head)
                self.assertEqual(scripted_vectors[word][:3], head)