Example #1
def benchmark_experimental_vectors():
    def _run_benchmark_lookup(tokens, vector):
        t0 = time.monotonic()
        for token in tokens:
            vector[token]
        print("Lookup time:", time.monotonic() - t0)

    train, = AG_NEWS(data_select='train')
    vocab = train.get_vocab()
    tokens = []
    for (label, text) in train:
        for id in text.tolist():
            tokens.append(vocab.itos[id])

    # existing FastText construction
    print("Existing FastText - Not Jit Mode")
    t0 = time.monotonic()
    fast_text = FastText()
    print("Construction time:", time.monotonic() - t0)
    _run_benchmark_lookup(tokens, fast_text)

    # experimental FastText construction
    print("FastText Experimental")
    t0 = time.monotonic()
    fast_text_experimental = FastTextExperimental(validate_file=False)
    print("Construction time:", time.monotonic() - t0)

    # not jit lookup
    print("FastText Experimental - Not Jit Mode")
    _run_benchmark_lookup(tokens, fast_text_experimental)

    # jit lookup
    print("FastText Experimental - Jit Mode")
    jit_fast_text_experimental = torch.jit.script(fast_text_experimental)
    _run_benchmark_lookup(tokens, jit_fast_text_experimental)
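
The function above is self-contained apart from its imports. A minimal setup sketch is shown below; the import paths are an assumption based on the torchtext "experimental" era and may differ between releases.

import time

import torch
from torchtext.experimental.datasets import AG_NEWS
from torchtext.experimental.vectors import FastText as FastTextExperimental  # assumed path
from torchtext.vocab import FastText

if __name__ == "__main__":
    benchmark_experimental_vectors()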
Example #2
def benchmark_experimental_vectors():
    def _run_benchmark(tokens, vector):
        t0 = time.monotonic()
        for token in tokens:
            vector[token]
        print("Time:", time.monotonic() - t0)

    train, = AG_NEWS(data_select='train')
    vocab = train.get_vocab()
    tokens = []
    for (label, text) in train:
        for id in text.tolist():
            tokens.append(vocab.itos[id])

    # existing FastText
    fast_text = FastText()

    print("FastText - Not Jit Mode")
    _run_benchmark(tokens, fast_text)

    # experimental FastText
    fast_text_experimental = FastTextExperimental()
    jit_fast_text_experimental = torch.jit.script(fast_text_experimental)

    print("FastText Experimental - Not Jit Mode")
    _run_benchmark(tokens, fast_text_experimental)
    print("FastText Experimental - Jit Mode")
    _run_benchmark(tokens, jit_fast_text_experimental)
Example #3
    def test_text_classification(self):
        from torchtext.experimental.datasets import AG_NEWS
        # smoke test to ensure ag_news dataset works properly
        datadir = os.path.join(self.project_root, ".data")
        if not os.path.exists(datadir):
            os.makedirs(datadir)
        train_dataset, test_dataset = AG_NEWS(root=datadir, ngrams=3)
        self._helper_test_func(
            len(train_dataset), 120000, train_dataset[-1][1][:10],
            [3525, 319, 4053, 34, 5407, 3607, 70, 6798, 10599, 4053])
        self._helper_test_func(
            len(test_dataset), 7600, test_dataset[-1][1][:10],
            [2351, 758, 96, 38581, 2351, 220, 5, 396, 3, 14786])

        # Add test for the subset of the standard datasets
        train_dataset, = AG_NEWS(data_select='train')
        self._helper_test_func(
            len(train_dataset), 120000, train_dataset[-1][1][:10],
            [2155, 223, 2405, 30, 3010, 2204, 54, 3603, 4930, 2405])
        train_iter, test_iter = torchtext.experimental.datasets.raw.AG_NEWS()
        self._helper_test_func(len(train_iter), 120000,
                               next(iter(train_iter))[1][:25],
                               'Wall St. Bears Claw Back ')
        self._helper_test_func(len(test_iter), 7600,
                               next(iter(test_iter))[1][:25],
                               'Fears for T N pension aft')
        del train_iter, test_iter
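
The helper _helper_test_func is not part of this excerpt. Judging only from how it is called, a plausible minimal version compares a dataset length and a short slice of its contents against hard-coded expected values (hypothetical reconstruction, not the actual torchtext test helper; assumes torch is imported in the test module):

    def _helper_test_func(self, actual_len, expected_len, actual_data, expected_data):
        # Compare the dataset size and a small sample of its contents.
        self.assertEqual(actual_len, expected_len)
        if torch.is_tensor(actual_data):
            actual_data = actual_data.tolist()
        self.assertEqual(actual_data, expected_data)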
Example #4
    def test_text_classification(self):
        from torchtext.experimental.datasets import AG_NEWS
        # smoke test to ensure ag_news dataset works properly
        datadir = os.path.join(self.project_root, ".data")
        if not os.path.exists(datadir):
            os.makedirs(datadir)
        train_dataset, test_dataset = AG_NEWS(root=datadir, ngrams=3)
        self._helper_test_func(len(train_dataset), 120000, train_dataset[-1][1][:10],
                               [3525, 319, 4053, 34, 5407, 3607, 70, 6798, 10599, 4053])
        self._helper_test_func(len(test_dataset), 7600, test_dataset[-1][1][:10],
                               [2351, 758, 96, 38581, 2351, 220, 5, 396, 3, 14786])
        # Add test for the subset of the standard datasets
        train_dataset = AG_NEWS(split='train')
        self._helper_test_func(len(train_dataset), 120000, train_dataset[-1][1][:10],
                               [2155, 223, 2405, 30, 3010, 2204, 54, 3603, 4930, 2405])
Example #5
def benchmark_experimental_vocab_lookup():
    def _run_benchmark_lookup(tokens, vocab):
        t0 = time.monotonic()
        for token in tokens:
            vocab[token]
        print("Lookup time:", time.monotonic() - t0)

    train, = AG_NEWS(data_select='train')
    vocab = train.get_vocab()
    tokens = []
    for (label, text) in train:
        for id in text.tolist():
            tokens.append(vocab.itos[id])

    counter = Counter(tokens)
    sorted_by_freq_tuples = sorted(counter.items(),
                                   key=lambda x: x[1],
                                   reverse=True)
    ordered_dict = OrderedDict(sorted_by_freq_tuples)

    # existing Vocab construction
    print("Vocab")
    t0 = time.monotonic()
    v_existing = Vocab(counter)
    print("Construction time:", time.monotonic() - t0)

    # experimental Vocab construction
    print("Vocab Experimental")
    t0 = time.monotonic()
    v_experimental = VocabExperimental(ordered_dict)
    print("Construction time:", time.monotonic() - t0)
    jit_v_experimental = torch.jit.script(v_experimental)

    # existing Vocab eager lookup
    print("Vocab - Eager Mode")
    _run_benchmark_lookup(tokens, v_existing)

    # experimental Vocab eager lookup
    print("Vocab Experimental - Eager Mode")
    _run_benchmark_lookup(tokens, v_experimental)

    # experimental Vocab jit lookup
    print("Vocab Experimental - Jit Mode")
    _run_benchmark_lookup(tokens, jit_v_experimental)
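
The frequency-ordered OrderedDict consumed by the experimental Vocab can be illustrated on a toy corpus using only the standard library:

from collections import Counter, OrderedDict

toy_tokens = ["the", "cat", "sat", "on", "the", "mat", "the"]
counter = Counter(toy_tokens)
# Sort tokens by descending frequency, then keep that order in an OrderedDict.
sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)
print(ordered_dict)
# OrderedDict([('the', 3), ('cat', 1), ('sat', 1), ('on', 1), ('mat', 1)])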
Example #6
def benchmark_experimental_vocab_lookup(vocab_file_path=None):
    def _run_benchmark_lookup(tokens, vocab):
        t0 = time.monotonic()
        # list lookup
        if isinstance(tokens, list) and isinstance(tokens[0], list):
            for tokens_list in tokens:
                vocab.lookup_indices(tokens_list)
        # single token lookup
        elif isinstance(tokens, list):
            for token in tokens:
                vocab[token]
        else:
            raise RuntimeError("Received tokens of incorrect type {}.".format(
                type(tokens)))
        print("Lookup time:", time.monotonic() - t0)

    tokens = []
    tokens_lists = []

    train, = AG_NEWS(data_select='train')
    vocab = train.get_vocab()
    for (_, text) in train:
        cur_tokens = []
        for id in text.tolist():
            cur_tokens.append(vocab.itos[id])
        tokens_lists.append(cur_tokens)
        tokens += cur_tokens

    if vocab_file_path:
        print("Loading Vocab from file {}".format(vocab_file_path))

        def token_iterator(file_path):
            # Yield one token per line; the context manager closes the file.
            with open(file_path, 'r') as f:
                for token in f:
                    yield token

        # existing Vocab construction
        print("Vocab")
        t0 = time.monotonic()
        v_existing = build_vocab_from_iterator(token_iterator(vocab_file_path))
        print("Construction time:", time.monotonic() - t0)

        # experimental Vocab construction
        print("Vocab Experimental")
        t0 = time.monotonic()
        with open(vocab_file_path, 'r') as f:
            v_experimental = load_vocab_from_file(f)
        print("Construction time:", time.monotonic() - t0)
    else:
        print("Loading Vocab from AG News")
        counter = Counter(tokens)
        sorted_by_freq_tuples = sorted(counter.items(),
                                       key=lambda x: x[1],
                                       reverse=True)
        ordered_dict = OrderedDict(sorted_by_freq_tuples)

        # existing Vocab construction
        print("Vocab")
        t0 = time.monotonic()
        v_existing = Vocab(counter)
        print("Construction time:", time.monotonic() - t0)

        # experimental Vocab construction
        print("Vocab Experimental")
        t0 = time.monotonic()
        v_experimental = VocabExperimental(ordered_dict)
        print("Construction time:", time.monotonic() - t0)
    jit_v_experimental = torch.jit.script(v_experimental)

    # existing Vocab eager lookup
    print("Vocab - Eager Mode")
    _run_benchmark_lookup(tokens, v_existing)
    _run_benchmark_lookup([tokens], v_existing)
    _run_benchmark_lookup(tokens_lists, v_existing)

    # experimental Vocab eager lookup
    print("Vocab Experimental - Eager Mode")
    _run_benchmark_lookup(tokens, v_experimental)
    _run_benchmark_lookup([tokens], v_experimental)
    _run_benchmark_lookup(tokens_lists, v_experimental)

    jit_v_experimental = torch.jit.script(v_experimental)
    # experimental Vocab jit lookup
    print("Vocab Experimental - Jit Mode")
    _run_benchmark_lookup(tokens, jit_v_experimental)
    _run_benchmark_lookup([tokens], jit_v_experimental)
    _run_benchmark_lookup(tokens_lists, jit_v_experimental)
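
A hypothetical invocation of this benchmark is sketched below. Judging from token_iterator above, the optional vocab file is assumed to contain one token per line; the file name and contents are purely illustrative.

# Build a small token-per-line vocab file (illustrative name and contents).
with open("vocab.txt", "w") as f:
    f.write("\n".join(["the", "cat", "sat", "on", "mat"]))

benchmark_experimental_vocab_lookup()                             # vocab built from AG News
benchmark_experimental_vocab_lookup(vocab_file_path="vocab.txt")  # vocab loaded from file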
Example #7
def benchmark_experimental_vocab():
    train, = AG_NEWS(data_select='train')
    vocab = train.get_vocab()
    tokens: List[str] = []
    tokens_lists: List[List[str]] = []

    for (_, text) in train:
        cur_tokens = []
        for id in text.tolist():
            cur_tokens.append(vocab.itos[id])
        tokens_lists.append(cur_tokens)
        tokens += cur_tokens

    print("Tokens size:", len(tokens))
    print("Tokens list size:", len(tokens_lists))

    counter = Counter(tokens)
    sorted_by_freq_tuples = sorted(counter.items(),
                                   key=lambda x: x[1],
                                   reverse=True)

    vocab_list = [pair[0] for pair in sorted_by_freq_tuples]
    vocab_list.insert(0, "<unk>")
    ordered_dict = OrderedDict(sorted_by_freq_tuples)

    # pytext vocab construction
    print("Pytext Vocabulary")
    t0 = time.monotonic()
    pytext_vocab = PytextVocabulary(vocab_list)
    print("Construction time:", time.monotonic() - t0)

    # pytext ScriptVocab construction
    print("Pytext Script Vocabulary")
    t0 = time.monotonic()
    pytext_script_vocab = PytextScriptVocabulary(vocab_list)
    print("Construction time:", time.monotonic() - t0)
    jit_pytext_script_vocab = torch.jit.script(pytext_script_vocab)

    # experimental ScriptVocab construction
    print("Experimental Script Vocabulary")
    t0 = time.monotonic()
    experimental_script_vocab = ExperimentalScriptVocabulary(ordered_dict,
                                                             unk_token="<unk>")
    print("Construction time:", time.monotonic() - t0)
    jit_experimental_script_vocab = torch.jit.script(experimental_script_vocab)

    # pytext Vocab eager lookup
    print("Pytext Vocabulary - Eager Mode")
    _run_benchmark_lookup(tokens, pytext_vocab)
    _run_benchmark_lookup([tokens], pytext_vocab)
    _run_benchmark_lookup(tokens_lists, pytext_vocab)

    # pytext ScriptVocab eager lookup
    print("Pytext ScriptVocab - Eager Mode")
    _run_benchmark_lookup(tokens, pytext_script_vocab)
    _run_benchmark_lookup([tokens], pytext_script_vocab)
    _run_benchmark_lookup(tokens_lists, pytext_script_vocab)

    # experimental ScriptVocab eager lookup
    print("Experimental ScriptVocab - Eager Mode")
    _run_benchmark_lookup(tokens, experimental_script_vocab)
    _run_benchmark_lookup([tokens], experimental_script_vocab)
    _run_benchmark_lookup(tokens_lists, experimental_script_vocab)

    # pytext ScriptVocab jit lookup
    print("Pytext ScriptVocab - Jit Mode")
    _run_benchmark_lookup(tokens, jit_pytext_script_vocab)
    _run_benchmark_lookup([tokens], jit_pytext_script_vocab)
    _run_benchmark_lookup(tokens_lists, jit_pytext_script_vocab)

    # experimental ScriptVocab jit lookup
    print("Experimental ScriptVocab - Jit Mode")
    _run_benchmark_lookup(tokens, jit_experimental_script_vocab)
    _run_benchmark_lookup([tokens], jit_experimental_script_vocab)
    _run_benchmark_lookup(tokens_lists, jit_experimental_script_vocab)

    # pytext ScriptVocab JITed for loop
    print("Pytext ScriptVocab - Jit For Loop")
    _run_benchmark_lookup_jit_for_loop(tokens, jit_pytext_script_vocab)
    _run_benchmark_lookup_jit_for_loop([tokens], jit_pytext_script_vocab)
    _run_benchmark_lookup_jit_for_loop(tokens_lists, jit_pytext_script_vocab)

    # experimental ScriptVocab JITed for loop
    print("Experimental ScriptVocab - Jit For Loop")
    _run_benchmark_lookup_jit_for_loop(tokens, jit_experimental_script_vocab)
    _run_benchmark_lookup_jit_for_loop([tokens], jit_experimental_script_vocab)
    _run_benchmark_lookup_jit_for_loop(tokens_lists,
                                       jit_experimental_script_vocab)
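
Neither _run_benchmark_lookup nor _run_benchmark_lookup_jit_for_loop is defined in this excerpt; the latter's name suggests that the lookup loop itself runs inside TorchScript rather than in Python. A self-contained sketch of that idea, using a toy module instead of the real ScriptVocab classes:

from typing import List

import torch

class ToyVocab(torch.nn.Module):
    # Toy stand-in for a scriptable vocabulary: unknown tokens map to index 0.
    def __init__(self, tokens: List[str]):
        super().__init__()
        self.stoi = {tok: i + 1 for i, tok in enumerate(tokens)}

    def forward(self, tokens: List[str]) -> List[int]:
        # Once the module is scripted, this loop executes inside TorchScript,
        # which is what a "Jit For Loop" measurement would exercise.
        out: List[int] = []
        for tok in tokens:
            if tok in self.stoi:
                out.append(self.stoi[tok])
            else:
                out.append(0)
        return out

jit_vocab = torch.jit.script(ToyVocab(["hello", "world"]))
print(jit_vocab(["hello", "foo", "world"]))  # [1, 0, 2]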
Example #8
import torch
import torchtext
from fastNLP.models import BertForSequenceClassification
from torchtext.experimental.datasets import AG_NEWS
from torch.utils.data import DataLoader
import time
from torch.utils.data.dataset import random_split
import argparse

train_dataset, test_dataset = AG_NEWS(ngrams=1)
NUM_LABELS = 4

parser = argparse.ArgumentParser(
    description=
    'Train a text classification model on text classification datasets.')
parser.add_argument('--batch-size',
                    type=int,
                    default=16,
                    help='batch size (default=16)')
parser.add_argument('--embed-dim',
                    type=int,
                    default=32,
                    help='embed dim. (default=32)')
parser.add_argument('--epochs',
                    type=int,
                    default=5,
                    help='num epochs (default=5)')
parser.add_argument('--torchscript',
                    type=bool,
                    default=False,
                    help='torchscript the model')
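
One detail worth flagging: argparse's type=bool converts any non-empty string to True, so --torchscript False still enables the flag. A common workaround is a presence-only flag; a minimal sketch (the flag then takes no value, which differs slightly from the original interface):

parser.add_argument('--torchscript',
                    action='store_true',
                    help='torchscript the model')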