Esempio n. 1
0
def benchmark_experimental_vocab_construction(vocab_file_path,
                                              is_raw_text=True,
                                              num_iters=1):
    """Time repeated construction of an experimental vocab from a file.

    Args:
        vocab_file_path: Path of the file the vocab is built from.
        is_raw_text: When true, the file is passed to
            ``vocab_from_raw_text_file_object`` together with a scripted
            ``basic_english_normalize`` tokenizer; otherwise it is passed to
            ``vocab_from_file_object``.
        num_iters: Number of times the vocab is constructed.

    The elapsed wall-clock time for all iterations is printed. In the raw
    text mode, building and scripting the tokenizer happens inside the timed
    loop, so it is included in the reported time.
    """
    # The context manager guarantees the handle is released even if vocab
    # construction raises.
    with open(vocab_file_path, 'r') as f:
        t0 = time.monotonic()
        if is_raw_text:
            print(
                "Loading from raw text file with basic_english_normalize tokenizer"
            )
            for _ in range(num_iters):
                # Rewind so that every iteration sees the file from the
                # start rather than an already-consumed handle.
                f.seek(0)
                tokenizer = basic_english_normalize()
                jited_tokenizer = torch.jit.script(tokenizer.to_ivalue())
                vocab_from_raw_text_file_object(f, jited_tokenizer)
        else:
            for _ in range(num_iters):
                f.seek(0)
                vocab_from_file_object(f)
        print("Construction time:", time.monotonic() - t0)
Esempio n. 2
0
def build_huggingface_vocab_pipeline(hf_vocab_file):
    """Build an eager and a scripted text pipeline from a vocab file.

    Args:
        hf_vocab_file: Path of the vocab file handed to
            ``vocab_from_file_object``.

    Returns:
        A ``(pipeline, jit_pipeline)`` tuple: the eager ``TextDataPipeline``
        (tokenizer followed by a ``VocabTransform``) and the result of
        ``torch.jit.script`` applied to it.
    """
    tokenizer = BasicEnglishNormalize()
    # Close the vocab file as soon as the vocab has been built.
    with open(hf_vocab_file, 'r') as f:
        vocab = vocab_from_file_object(f)

    # Chain tokenization and vocab lookup, then script the whole pipeline.
    pipeline = TextDataPipeline(tokenizer, VocabTransform(vocab))
    jit_pipeline = torch.jit.script(pipeline)
    print('jit Hugging Face pipeline success!')
    return pipeline, jit_pipeline
Esempio n. 3
0
def build_text_vocab_pipeline(hf_vocab_file):
    """Build eager, ivalue and scripted text pipelines from a vocab file.

    Args:
        hf_vocab_file: Path of the vocab file handed to
            ``vocab_from_file_object``.

    Returns:
        A ``(pipeline, ivalue_pipeline, jit_pipeline)`` tuple: the eager
        ``TextSequentialTransforms`` (tokenizer, ``VocabTransform``,
        ``ToLongTensor``), a fresh ``pipeline.to_ivalue()`` conversion, and
        the result of ``torch.jit.script`` on a separate ``to_ivalue()``
        conversion.
    """
    tokenizer = basic_english_normalize()
    # Close the vocab file as soon as the vocab has been built.
    with open(hf_vocab_file, 'r') as f:
        vocab = vocab_from_file_object(f)

    # Chain tokenization, vocab lookup and tensor conversion.
    pipeline = TextSequentialTransforms(tokenizer, VocabTransform(vocab), ToLongTensor())
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit text vocab pipeline success!')
    # to_ivalue() is called again on purpose so the returned ivalue pipeline
    # is a distinct object from the one that was scripted.
    return pipeline, pipeline.to_ivalue(), jit_pipeline
Esempio n. 4
0
    def test_vocab_from_file(self):
        """Check that a vocab loaded from file places a custom unk token first."""
        asset_name = 'vocab_test.txt'
        asset_path = get_asset_path(asset_name)
        # Use a context manager so the asset file is closed even when the
        # vocab construction fails.
        with open(asset_path, 'r') as f:
            v = vocab_from_file_object(f, unk_token='<new_unk>')

        expected_itos = ['<new_unk>', 'a', 'b', 'c']
        expected_stoi = {x: index for index, x in enumerate(expected_itos)}

        self.assertEqual(v.get_itos(), expected_itos)
        self.assertEqual(dict(v.get_stoi()), expected_stoi)
Esempio n. 5
0
    def test_vocab_from_file(self):
        """Check that specials are appended after file tokens when specials_first is False."""
        asset_name = 'vocab_test.txt'
        asset_path = get_asset_path(asset_name)
        # Use a context manager so the asset file is closed even when the
        # vocab construction fails.
        with open(asset_path, 'r') as f:
            v = vocab_from_file_object(f,
                                       specials=('<unk>', '<pad>', '<eos>'),
                                       specials_first=False)

        expected_itos = ['a', 'b', 'c', '<unk>', '<pad>', '<eos>']
        expected_stoi = {x: index for index, x in enumerate(expected_itos)}

        self.assertEqual(v.get_itos(), expected_itos)
        self.assertEqual(dict(v.get_stoi()), expected_stoi)
Esempio n. 6
0
        def getVocab(tokenizer):
            """Build a vocab from a tokenizer's saved vocab with renamed special tokens.

            The tokenizer is saved to the current directory, the text of
            ``vocab.txt`` has its bracketed special tokens (``[PAD]``,
            ``[UNK]``, ...) replaced by angle-bracket forms (``<pad>``,
            ``<unk>``, ...), the result is written to ``vocab_adapted.txt``
            and a vocab is built from that file.

            Args:
                tokenizer: Object exposing ``save_pretrained``; presumably a
                    Hugging Face tokenizer whose save writes ``vocab.txt`` --
                    TODO confirm.

            Returns:
                The vocab returned by ``vocab_from_file_object``.
            """
            tokenizer.save_pretrained("./")

            token_old = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']
            token_new = ['<pad>', '<unk>', '<cls>', '<sep>', '<mask>']

            # Context managers close each handle even if reading, writing or
            # vocab construction raises.
            with open("vocab.txt", "rt") as fin:
                data = fin.read()
            for old, new in zip(token_old, token_new):
                data = data.replace(old, new)

            with open("vocab_adapted.txt", "wt") as fout:
                fout.write(data)

            with open('vocab_adapted.txt', 'r') as f:
                return vocab_from_file_object(f)
Esempio n. 7
0
def benchmark_experimental_vocab_lookup(vocab_file_path=None):
    """Compare construction and lookup times of the existing and experimental vocabs.

    Tokens are recovered from the AG_NEWS training split by mapping each
    example's ids back through the dataset vocab. Both vocab implementations
    are then built -- from ``vocab_file_path`` when given, otherwise from the
    AG_NEWS token counts -- and timed on single-token lookup, one large list
    lookup and per-example list lookups, in eager mode and (experimental
    vocab only) scripted mode.

    Args:
        vocab_file_path: Optional path of a vocab file to build both vocabs
            from. When falsy, the vocabs are built from AG_NEWS tokens.
    """
    def _run_benchmark_lookup(tokens, vocab):
        """Print the time taken to look up ``tokens`` in ``vocab``."""
        t0 = time.monotonic()
        # list lookup (the emptiness check avoids indexing into an empty list)
        if isinstance(tokens, list) and tokens and isinstance(tokens[0], list):
            for tokens_list in tokens:
                vocab.lookup_indices(tokens_list)
        # single token lookup
        elif isinstance(tokens, list):
            for token in tokens:
                # Result intentionally discarded: only the lookup is timed.
                vocab[token]
        else:
            raise RuntimeError("Received tokens of incorrect type {}.".format(
                type(tokens)))
        print("Lookup time:", time.monotonic() - t0)

    tokens = []
    tokens_lists = []

    train, = AG_NEWS(data_select='train')
    vocab = train.get_vocab()
    for (_, text) in train:
        # Map the example's ids back to their string tokens.
        cur_tokens = [vocab.itos[token_id] for token_id in text.tolist()]
        tokens_lists.append(cur_tokens)
        tokens.extend(cur_tokens)

    if vocab_file_path:
        print("Loading Vocab from file {}".format(vocab_file_path))

        def token_iterator(file_path):
            """Yield the lines of ``file_path``, closing the file afterwards."""
            with open(file_path, 'r') as token_file:
                for token in token_file:
                    yield token

        # existing Vocab construction
        print("Vocab")
        t0 = time.monotonic()
        v_existing = build_vocab_from_iterator(token_iterator(vocab_file_path))
        print("Construction time:", time.monotonic() - t0)

        # experimental Vocab construction
        print("Vocab Experimental")
        t0 = time.monotonic()
        with open(vocab_file_path, 'r') as f:
            v_experimental = vocab_from_file_object(f)
            print("Construction time:", time.monotonic() - t0)
    else:
        print("Loading Vocab from AG News")
        counter = Counter(tokens)
        sorted_by_freq_tuples = sorted(counter.items(),
                                       key=lambda x: x[1],
                                       reverse=True)
        ordered_dict = OrderedDict(sorted_by_freq_tuples)

        # existing Vocab construction
        print("Vocab")
        t0 = time.monotonic()
        v_existing = Vocab(counter)
        print("Construction time:", time.monotonic() - t0)

        # experimental Vocab construction
        print("Vocab Experimental")
        t0 = time.monotonic()
        v_experimental = VocabExperimental(ordered_dict)
        print("Construction time:", time.monotonic() - t0)

    # existing Vocab eager lookup
    print("Vocab - Eager Mode")
    _run_benchmark_lookup(tokens, v_existing)
    _run_benchmark_lookup([tokens], v_existing)
    _run_benchmark_lookup(tokens_lists, v_existing)

    # experimental Vocab eager lookup
    print("Vocab Experimental - Eager Mode")
    _run_benchmark_lookup(tokens, v_experimental)
    _run_benchmark_lookup([tokens], v_experimental)
    _run_benchmark_lookup(tokens_lists, v_experimental)

    # The vocab is scripted exactly once, via its to_ivalue() form, right
    # before the jit benchmarks that use it.
    jit_v_experimental = torch.jit.script(v_experimental.to_ivalue())
    # experimental Vocab jit lookup
    print("Vocab Experimental - Jit Mode")
    _run_benchmark_lookup(tokens, jit_v_experimental)
    _run_benchmark_lookup([tokens], jit_v_experimental)
    _run_benchmark_lookup(tokens_lists, jit_v_experimental)
Esempio n. 8
0
def benchmark_experimental_vocab_construction(vocab_file_path, num_iters=100):
    """Time repeated construction of an experimental vocab from a file.

    Args:
        vocab_file_path: Path of the vocab file passed to
            ``vocab_from_file_object``.
        num_iters: Number of times the vocab is constructed.

    The elapsed wall-clock time for all iterations is printed.
    """
    # The context manager guarantees the handle is released even if vocab
    # construction raises.
    with open(vocab_file_path, 'r') as f:
        t0 = time.monotonic()
        for _ in range(num_iters):
            # Rewind so that every iteration sees the file from the start
            # rather than an already-consumed handle.
            f.seek(0)
            vocab_from_file_object(f)
        print("Construction time:", time.monotonic() - t0)