def benchmark_basic_english_normalize():
    """Print tokenization timings for three basic-English tokenizers.

    The tokenizers compared are the one returned by
    ``get_tokenizer("basic_english")``, the object returned by
    ``basic_english_normalize()``, and a ``torch.jit.script``-ed version of
    the latter's ``to_ivalue()`` form. Each one is run over the first element
    of the pair returned by a fresh ``AG_NEWS()`` call.
    """
    def _run_benchmark_lookup(train, tokenizer):
        # Tokenize the text field of every record and report elapsed time
        # measured on the monotonic clock.
        started = time.monotonic()
        for _, text in train:
            tokenizer(text)
        print("Tokenization time:", time.monotonic() - started)

    existing_basic_english_tokenizer = get_tokenizer("basic_english")
    experimental_basic_english_normalize = basic_english_normalize()
    experimental_jit_basic_english_normalize = torch.jit.script(
        experimental_basic_english_normalize.to_ivalue())

    # (label printed before the run, tokenizer under test)
    runs = (
        ("BasicEnglishNormalize - Eager Mode",
         existing_basic_english_tokenizer),
        ("BasicEnglishNormalize Experimental - Eager Mode",
         experimental_basic_english_normalize),
        ("BasicEnglishNormalize Experimental - Jit Mode",
         experimental_jit_basic_english_normalize),
    )
    for label, tokenizer in runs:
        # The dataset is re-created for every run rather than shared.
        train, _ = AG_NEWS()
        print(label)
        _run_benchmark_lookup(train, tokenizer)
Ejemplo n.º 2
0
def legacy_vocab_from_file_object(file_like_object, **kwargs):
    r"""Build a vocabulary from the tokens found in a file-like object.

    Each line yielded by `file_like_object` is passed through the tokenizer
    returned by `basic_english_normalize()`, and the resulting stream of
    tokens is handed to `build_vocab_from_iterator`.

    Args:
        file_like_object: an iterable of text lines (e.g. an open text file).
        **kwargs: accepted by the signature.
            NOTE(review): these are not forwarded to anything in this
            function body - confirm whether that is intentional.

    Returns:
        The object returned by `build_vocab_from_iterator`.

    Examples:
        >>> with open('vocab.txt', 'r') as f:
        ...     v = legacy_vocab_from_file_object(f)
    """
    normalize = basic_english_normalize()

    def iter_tokens(lines):
        # Flatten the per-line token lists into a single token stream.
        for line in lines:
            yield from normalize(line)

    return build_vocab_from_iterator(iter_tokens(file_like_object))
Ejemplo n.º 3
0
 def test_text_sequential_transform(self):
     """Check that the tokenizer+vocab pipeline gives the same ids eagerly and scripted."""
     vocab_path = get_asset_path('vocab_test2.txt')
     pipeline = TextSequentialTransforms(basic_english_normalize(),
                                         load_vocab_from_file(vocab_path))
     jit_pipeline = torch.jit.script(pipeline)
     expected_ids = [7, 18, 24]
     # Eager pipeline first, then its scripted counterpart.
     for transform in (pipeline, jit_pipeline):
         self.assertEqual(transform('of that new'), expected_ids)
Ejemplo n.º 4
0
    def test_vocab_from_raw_text_file(self):
        """Build a vocab from a raw text asset with two tokenizers and compare.

        The vocab is built once with a Python tokenizer assembled from
        ``custom_replace`` and once with a ``torch.jit.script``-ed
        ``basic_english_normalize()``; both must produce the same
        index-to-string and string-to-index mappings.
        """
        asset_path = get_asset_path('vocab_raw_text_test.txt')

        def python_basic_english_normalize(line):
            # Lower-case the line, apply the (pattern, replacement) pairs
            # through custom_replace, then split on whitespace.
            replacements = [
                (r'\'', ' \'  '),
                (r'\"', ''),
                (r'\.', ' . '),
                (r'<br \/>', ' '),
                (r',', ' , '),
                (r'\(', ' ( '),
                (r'\)', ' ) '),
                (r'\!', ' ! '),
                (r'\?', ' ? '),
                (r'\;', ' '),
                (r'\:', ' '),
                (r'\s+', ' '),
            ]
            normalize = custom_replace(replacements)
            normalized = list(normalize([line.lower()]))
            return normalized[0].split()

        expected_itos = [
            "'", 'after', 'talks', '.', 'are', 'at', 'disappointed', 'fears',
            'federal', 'firm', 'for', 'mogul', 'n', 'newall', 'parent',
            'pension', 'representing', 'say', 'stricken', 't', 'they',
            'turner', 'unions', 'with', 'workers'
        ]
        expected_stoi = {token: idx for idx, token in enumerate(expected_itos)}

        # Python-based tokenizer (basic_english_normalize() would also work).
        python_vocab = build_vocab_from_text_file(
            asset_path, tokenizer=python_basic_english_normalize)
        self.assertEqual(python_vocab.get_itos(), expected_itos)
        self.assertEqual(dict(python_vocab.get_stoi()), expected_stoi)

        # Scripted basic_english_normalize tokenizer.
        scripted_tokenizer = torch.jit.script(basic_english_normalize())
        jit_vocab = build_vocab_from_text_file(asset_path,
                                               tokenizer=scripted_tokenizer)
        self.assertEqual(jit_vocab.get_itos(), expected_itos)
        self.assertEqual(dict(jit_vocab.get_stoi()), expected_stoi)
Ejemplo n.º 5
0
    def test_BasicEnglishNormalize(self):
        """Compare eager, scripted and legacy tokenization, then round-trip via torch.save.

        The experimental tokenizer, its scripted ``to_ivalue()`` form and the
        ``"basic_english"`` tokenizer from ``data.get_tokenizer`` must all
        yield the reference token list; a saved-and-reloaded ``to_ivalue()``
        object must do so as well.
        """
        test_sample = '\'".<br />,()!?;:   Basic English Normalization for a Line of Text   \'".<br />,()!?;:'
        ref_results = [
            "'", '.', ',', '(', ')', '!', '?', 'basic', 'english',
            'normalization', 'for', 'a', 'line', 'of', 'text', "'", '.', ',',
            '(', ')', '!', '?'
        ]

        basic_eng_norm = basic_english_normalize()
        experimental_eager_tokens = basic_eng_norm(test_sample)

        scripted_norm = torch.jit.script(basic_eng_norm.to_ivalue())
        experimental_jit_tokens = scripted_norm(test_sample)

        legacy_tokenizer = data.get_tokenizer("basic_english")
        eager_tokens = legacy_tokenizer(test_sample)

        # Only the to_ivalue() form reports itself as jitable.
        assert not basic_eng_norm.is_jitable
        assert basic_eng_norm.to_ivalue().is_jitable

        for tokens in (experimental_jit_tokens, eager_tokens,
                       experimental_eager_tokens):
            self.assertEqual(tokens, ref_results)

        # Save the to_ivalue() form, load it back and tokenize again.
        save_path = os.path.join(self.test_dir, 'basic_english_normalize.pt')
        torch.save(basic_eng_norm.to_ivalue(), save_path)
        reloaded_norm = torch.load(save_path)

        self.assertEqual(reloaded_norm(test_sample), ref_results)
Ejemplo n.º 6
0
def benchmark_experimental_vocab_construction(vocab_file_path,
                                              is_raw_text=True,
                                              is_legacy=True,
                                              num_iters=1):
    """Time vocabulary construction from a file and print the elapsed time.

    Args:
        vocab_file_path: path of the file to open in text read mode.
        is_raw_text: when true, the file is tokenized while building the
            vocab; otherwise it is passed to ``load_vocab_from_file``.
        is_legacy: only consulted when ``is_raw_text`` is true. Selects
            ``legacy_vocab_from_file_object`` over the scripted
            ``basic_english_normalize`` tokenizer combined with
            ``build_vocab_from_text_file``.
        num_iters: how many times the construction is repeated inside the
            timed region.

    The file is opened with a context manager so the handle is always
    released, and it is rewound before every iteration so that repeated
    iterations do not read from an already exhausted handle.
    """
    with open(vocab_file_path, 'r') as f:
        # Timing starts after the file is opened.
        t0 = time.monotonic()
        if is_raw_text:
            if is_legacy:
                print("Loading from raw text file with legacy python function")
                for _ in range(num_iters):
                    f.seek(0)
                    legacy_vocab_from_file_object(f)
            else:
                print(
                    "Loading from raw text file with basic_english_normalize tokenizer"
                )
                for _ in range(num_iters):
                    f.seek(0)
                    tokenizer = basic_english_normalize()
                    jited_tokenizer = torch.jit.script(tokenizer)
                    build_vocab_from_text_file(f, jited_tokenizer, num_cpus=1)
        else:
            for _ in range(num_iters):
                f.seek(0)
                load_vocab_from_file(f)
        print("Construction time:", time.monotonic() - t0)
Ejemplo n.º 7
0
 def test_text_sequential_transform(self):
     """Check the tokenizer+vocab pipeline eagerly and after scripting its to_ivalue() form."""
     vocab_path = get_asset_path('vocab_test2.txt')
     expected_ids = [7, 18, 24]
     with open(vocab_path, 'r') as vocab_fh:
         tokenizer = basic_english_normalize()
         pipeline = TextSequentialTransforms(tokenizer, vocab_from_file(vocab_fh))
         jit_pipeline = torch.jit.script(pipeline.to_ivalue())
         # Both variants are exercised while the vocab file is still open.
         for transform in (pipeline, jit_pipeline):
             self.assertEqual(transform('of that new'), expected_ids)
Ejemplo n.º 8
0
def build_experimental_torchtext_pipeline(hf_vocab_file):
    """Assemble a tokenizer+vocab pipeline from a vocab file.

    Args:
        hf_vocab_file: path of the vocab file, opened in text read mode and
            passed to ``load_vocab_from_file``.

    Returns:
        A 3-tuple: the pipeline, a fresh ``pipeline.to_ivalue()`` result, and
        the ``torch.jit.script``-ed form of another ``to_ivalue()`` result.
    """
    normalizer = basic_english_normalize()
    with open(hf_vocab_file, 'r') as vocab_fh:
        text_pipeline = TextSequentialTransforms(
            normalizer, load_vocab_from_file(vocab_fh))
        scripted_pipeline = torch.jit.script(text_pipeline.to_ivalue())
        print('jit experimental torchtext pipeline success!')
        return text_pipeline, text_pipeline.to_ivalue(), scripted_pipeline
Ejemplo n.º 9
0
def build_experimental_fasttext_vector_pipeline():
    """Assemble a tokenizer + ``FastTextExperimental`` pipeline.

    Returns:
        A 3-tuple: the pipeline, a fresh ``pipeline.to_ivalue()`` result, and
        the ``torch.jit.script``-ed form of another ``to_ivalue()`` result.
    """
    text_pipeline = TextSequentialTransforms(basic_english_normalize(),
                                             FastTextExperimental())
    scripted_pipeline = torch.jit.script(text_pipeline.to_ivalue())

    print('jit legacy fasttext pipeline success!')
    return text_pipeline, text_pipeline.to_ivalue(), scripted_pipeline
Ejemplo n.º 10
0
def build_fasttext_vector_pipeline():
    """Assemble a tokenizer + ``VectorTransform(FastText())`` pipeline.

    Returns:
        A 3-tuple: the pipeline, a fresh ``pipeline.to_ivalue()`` result, and
        the ``torch.jit.script``-ed form of another ``to_ivalue()`` result.
    """
    normalizer = basic_english_normalize()
    fasttext_vectors = FastText()

    # The vectors are wrapped in VectorTransform before joining the pipeline.
    text_pipeline = TextSequentialTransforms(
        normalizer, VectorTransform(fasttext_vectors))
    scripted_pipeline = torch.jit.script(text_pipeline.to_ivalue())
    print('jit fasttext pipeline success!')
    return text_pipeline, text_pipeline.to_ivalue(), scripted_pipeline
Ejemplo n.º 11
0
def build_text_vocab_pipeline(hf_vocab_file):
    """Assemble a tokenizer -> vocab lookup -> ``ToLongTensor`` pipeline.

    Args:
        hf_vocab_file: path of the vocab file, opened in text read mode and
            passed to ``vocab_from_file_object``.

    Returns:
        A 3-tuple: the pipeline, a fresh ``pipeline.to_ivalue()`` result, and
        the ``torch.jit.script``-ed form of another ``to_ivalue()`` result.
    """
    tokenizer = basic_english_normalize()
    # Use a context manager so the vocab file handle is released; previously
    # it was opened and never closed. The handle stays open until the
    # pipeline has been fully built and scripted.
    with open(hf_vocab_file, 'r') as f:
        vocab = vocab_from_file_object(f)

        pipeline = TextSequentialTransforms(tokenizer, VocabTransform(vocab), ToLongTensor())
        jit_pipeline = torch.jit.script(pipeline.to_ivalue())
        print('jit text vocab pipeline success!')
        return pipeline, pipeline.to_ivalue(), jit_pipeline
Ejemplo n.º 12
0
    def test_basicEnglishNormalize_load_and_save(self):
        """Round-trip the tokenizer through torch.save/torch.load in two forms.

        Both the object returned by ``basic_english_normalize()`` and the
        result of its ``__prepare_scriptable__()`` must tokenize the sample
        correctly after being saved and reloaded.
        """
        test_sample = '\'".<br />,()!?;:   Basic English Normalization for a Line of Text   \'".<br />,()!?;:'
        ref_results = ["'", '.', ',', '(', ')', '!', '?', 'basic', 'english', 'normalization',
                       'for', 'a', 'line', 'of', 'text', "'", '.', ',', '(', ')', '!', '?']

        with self.subTest('pybind'):
            pybind_path = os.path.join(self.test_dir, 'ben_pybind.pt')
            torch.save(basic_english_normalize(), pybind_path)
            reloaded = torch.load(pybind_path)
            self.assertEqual(reloaded(test_sample), ref_results)

        with self.subTest('torchscript'):
            script_path = os.path.join(self.test_dir, 'ben_torchscrip.pt')
            # __prepare_scriptable__() converts the building block to its
            # torchbind version. Users are not expected to use the torchbind
            # version in eager mode, but it still needs CI coverage.
            torchbind_ben = basic_english_normalize().__prepare_scriptable__()
            torch.save(torchbind_ben, script_path)
            reloaded = torch.load(script_path)
            self.assertEqual(reloaded(test_sample), ref_results)
Ejemplo n.º 13
0
    def test_basicEnglishNormalize_load_and_save(self):
        """Round-trip the tokenizer through torch.save/torch.load in two forms.

        Both the object returned by ``basic_english_normalize()`` and its
        ``to_ivalue()`` form must tokenize the sample correctly after being
        saved and reloaded.
        """
        test_sample = '\'".<br />,()!?;:   Basic English Normalization for a Line of Text   \'".<br />,()!?;:'
        ref_results = [
            "'", '.', ',', '(', ')', '!', '?', 'basic', 'english',
            'normalization', 'for', 'a', 'line', 'of', 'text', "'", '.', ',',
            '(', ')', '!', '?'
        ]

        # (subTest label, file name under test_dir, factory for the object)
        variants = (
            ('pybind', 'ben_pybind.pt',
             lambda: basic_english_normalize()),
            ('torchscript', 'ben_torchscrip.pt',
             lambda: basic_english_normalize().to_ivalue()),
        )
        for label, file_name, make_ben in variants:
            with self.subTest(label):
                save_path = os.path.join(self.test_dir, file_name)
                torch.save(make_ben(), save_path)
                reloaded = torch.load(save_path)
                self.assertEqual(reloaded(test_sample), ref_results)
Ejemplo n.º 14
0
 def test_vocab_from_raw_text_file(self):
     """Build a vocab from a raw text asset with a scripted tokenizer and a custom unk token."""
     asset_path = get_asset_path('vocab_raw_text_test.txt')
     jit_tokenizer = torch.jit.script(basic_english_normalize())
     vocab = build_vocab_from_text_file(asset_path, jit_tokenizer, unk_token='<new_unk>')

     # The custom unknown token is expected at index 0.
     expected_itos = ['<new_unk>', "'", 'after', 'talks', '.', 'are', 'at', 'disappointed',
                      'fears', 'federal', 'firm', 'for', 'mogul', 'n', 'newall', 'parent',
                      'pension', 'representing', 'say', 'stricken', 't', 'they', 'turner',
                      'unions', 'with', 'workers']
     expected_stoi = {token: idx for idx, token in enumerate(expected_itos)}

     self.assertEqual(vocab.get_itos(), expected_itos)
     self.assertEqual(dict(vocab.get_stoi()), expected_stoi)
Ejemplo n.º 15
0
 def __init__(self,
              train_arrow_path='train_arrow',
              test_arrow_path='test_arrow',
              train_valid_split=0.9):
     """Store the dataset settings and build the tokenizer and vocabulary.

     Args:
         train_arrow_path: location passed to ``ds.Dataset.load_from_disk``
             to load the training dataset.
         test_arrow_path: stored on the instance; not read in this method.
         train_valid_split: stored on the instance; not read in this method.
     """
     super().__init__()
     self.train_arrow_path = train_arrow_path
     self.test_arrow_path = test_arrow_path
     self.train_valid_split = train_valid_split
     self.tokenizer = basic_english_normalize().to_ivalue()

     # The vocabulary is built lazily from the tokenized 'texts' column of
     # the training dataset, then converted with to_ivalue().
     train_ds = ds.Dataset.load_from_disk(self.train_arrow_path)
     tokenized_lines = (self.tokenizer(line) for line in train_ds['texts'])
     self.vocab = build_vocab_from_iterator(tokenized_lines).to_ivalue()
0
def build_pytext_vocab_pipeline(vocab_file):
    """Assemble a tokenizer -> PyText vocab lookup -> ``ToLongTensor`` pipeline.

    Args:
        vocab_file: path of a text file; each line, with trailing whitespace
            stripped, becomes one entry of the ``ScriptVocabulary``.

    Returns:
        A 3-tuple: the pipeline, a fresh ``pipeline.to_ivalue()`` result, and
        the ``torch.jit.script``-ed form of another ``to_ivalue()`` result.
    """
    from pytext.torchscript.vocab import ScriptVocabulary
    tokenizer = basic_english_normalize()
    # The file is read eagerly into a list, so the handle can be closed
    # immediately; previously it was opened and never closed.
    with open(vocab_file, 'r') as f:
        vocab_list = [line.rstrip() for line in f]

    pipeline = TextSequentialTransforms(
        tokenizer, PyTextVocabTransform(ScriptVocabulary(vocab_list)),
        ToLongTensor())
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit PyText pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
Ejemplo n.º 17
0
def build_legacy_pytext_script_vocab_pipeline(vocab_file):
    """Assemble a tokenizer -> PyText ``ScriptVocabulary`` lookup pipeline.

    The vocabulary list is ordered by descending count and has ``"<unk>"``
    inserted at index 0.

    Args:
        vocab_file: path of the text file the vocabulary is counted from.

    Returns:
        A 3-tuple: the pipeline, a fresh ``pipeline.to_ivalue()`` result, and
        the ``torch.jit.script``-ed form of another ``to_ivalue()`` result.
    """
    from pytext.torchscript.vocab import ScriptVocabulary

    tokenizer = basic_english_normalize()
    # The Counter consumes the file eagerly, so the handle can be closed
    # right away; previously it was opened and never closed.
    with open(vocab_file, 'r') as f:
        # NOTE(review): iterating over `line.rstrip()` yields individual
        # characters, not whole-line tokens - confirm this is intended.
        vocab_counter = Counter([token for line in f for token in line.rstrip()])
    sorted_by_freq_tuples = sorted(vocab_counter.items(), key=lambda x: x[1], reverse=True)
    vocab_list = [pair[0] for pair in sorted_by_freq_tuples]
    vocab_list.insert(0, "<unk>")

    pipeline = TextSequentialTransforms(tokenizer_func(tokenizer),
                                        PyTextScriptVocabTransform(ScriptVocabulary(vocab_list)))
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit legacy PyText pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
Ejemplo n.º 18
0
def benchmark_experimental_vocab_construction(vocab_file_path,
                                              is_raw_text=True,
                                              num_iters=1):
    """Time vocabulary construction from a file and print the elapsed time.

    Args:
        vocab_file_path: path of the file to open in text read mode.
        is_raw_text: when true, the file is tokenized by a scripted
            ``basic_english_normalize`` tokenizer through
            ``vocab_from_raw_text_file_object``; otherwise it is passed to
            ``vocab_from_file_object``.
        num_iters: how many times the construction is repeated inside the
            timed region.

    The file is opened with a context manager so the handle is always
    released, and it is rewound before every iteration so that repeated
    iterations do not read from an already exhausted handle.
    """
    with open(vocab_file_path, 'r') as f:
        # Timing starts after the file is opened.
        t0 = time.monotonic()
        if is_raw_text:
            print(
                "Loading from raw text file with basic_english_normalize tokenizer"
            )
            for _ in range(num_iters):
                f.seek(0)
                tokenizer = basic_english_normalize()
                jited_tokenizer = torch.jit.script(tokenizer.to_ivalue())
                vocab_from_raw_text_file_object(f, jited_tokenizer)
        else:
            for _ in range(num_iters):
                f.seek(0)
                vocab_from_file_object(f)
        print("Construction time:", time.monotonic() - t0)
Ejemplo n.º 19
0
def build_experimental_pytext_script_vocab_pipeline(vocab_file):
    """Assemble a tokenizer -> ``script_vocab`` lookup pipeline.

    The vocabulary is an ``OrderedDict`` of counts sorted in descending
    order, passed to ``script_vocab`` from the ``pytext_vocab`` script.

    Args:
        vocab_file: path of the text file the vocabulary is counted from.

    Returns:
        A 3-tuple: the pipeline, a fresh ``pipeline.to_ivalue()`` result, and
        the ``torch.jit.script``-ed form of another ``to_ivalue()`` result.
    """
    import os
    import sys
    # this is needed because we want to add 'torchtext/examples/vocab' directory to the
    # `sys.path` variable in order to import the pytext_vocab (since it's not a module)
    sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "vocab"))
    from pytext_vocab import script_vocab

    tokenizer = basic_english_normalize()
    # The Counter consumes the file eagerly, so the handle can be closed
    # right away; previously it was opened and never closed.
    with open(vocab_file, 'r') as f:
        # NOTE(review): iterating over `line.rstrip()` yields individual
        # characters, not whole-line tokens - confirm this is intended.
        vocab_counter = Counter([token for line in f for token in line.rstrip()])
    ordered_dict = OrderedDict(sorted(vocab_counter.items(), key=lambda x: x[1], reverse=True))

    pipeline = TextSequentialTransforms(tokenizer,
                                        PyTextScriptVocabTransform(script_vocab(ordered_dict)))
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit legacy PyText pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
Ejemplo n.º 20
0
    def test_BasicEnglishNormalize(self):
        """Compare eager, scripted and legacy tokenization of one sample.

        The experimental tokenizer, its ``torch.jit.script``-ed form and the
        ``"basic_english"`` tokenizer from ``data.get_tokenizer`` must all
        yield the reference token list.
        """
        test_sample = '\'".<br />,()!?;:   Basic English Normalization for a Line of Text   \'".<br />,()!?;:'
        ref_results = ["'", '.', ',', '(', ')', '!', '?', 'basic', 'english', 'normalization',
                       'for', 'a', 'line', 'of', 'text', "'", '.', ',', '(', ')', '!', '?']

        basic_eng_norm = basic_english_normalize()
        experimental_eager_tokens = basic_eng_norm(test_sample)

        scripted_norm = torch.jit.script(basic_eng_norm)
        experimental_jit_tokens = scripted_norm(test_sample)

        legacy_tokenizer = data.get_tokenizer("basic_english")
        eager_tokens = legacy_tokenizer(test_sample)

        assert not basic_eng_norm.is_jitable
        # __prepare_scriptable__() converts the building block to its
        # torchbind version. Users are not expected to use the torchbind
        # version in eager mode, but it still needs CI coverage.
        assert basic_eng_norm.__prepare_scriptable__().is_jitable

        for tokens in (experimental_jit_tokens, eager_tokens,
                       experimental_eager_tokens):
            self.assertEqual(tokens, ref_results)