Example no. 1
def benchmark_basic_english_normalize():
    def _run_benchmark_lookup(train, tokenizer):
        t0 = time.monotonic()
        for (_, text) in train:
            tokenizer(text)
        print("Tokenization time:", time.monotonic() - t0)

    existing_basic_english_tokenizer = get_tokenizer("basic_english")
    experimental_basic_english_normalize = basic_english_normalize()
    experimental_jit_basic_english_normalize = torch.jit.script(experimental_basic_english_normalize)

    # existing eager lookup
    train = AG_NEWS(split='train')
    print("BasicEnglishNormalize - Eager Mode")
    _run_benchmark_lookup(train, existing_basic_english_tokenizer)

    # experimental eager lookup
    train = AG_NEWS(split='train')
    print("BasicEnglishNormalize Experimental - Eager Mode")
    _run_benchmark_lookup(train, experimental_basic_english_normalize)

    # experimental jit lookup
    train = AG_NEWS(split='train')
    print("BasicEnglishNormalize Experimental - Jit Mode")
    _run_benchmark_lookup(train, experimental_jit_basic_english_normalize)
Example no. 2
    def setup(self, stage=None):
        self.data_test = list(AG_NEWS(split='test'))
        data_full = list(AG_NEWS(split='train'))
        # Train / Validation Set split
        threshold = round(len(data_full) * 0.8)
        self.data_train, self.data_val = random_split(
            data_full, [threshold, len(data_full) - threshold])
        # Vocab and Tokenizer for data processing in collate batch
        self.tokenizer = get_tokenizer('basic_english')
        self.vocab = self.get_vocab(data_full, self.tokenizer)
        self.collate_batch = MyCollator(self.vocab, self.tokenizer)
Example no. 3
    def test_text_classification(self):
        # smoke test to ensure ag_news dataset works properly

        datadir = os.path.join(self.project_root, ".data")
        if not os.path.exists(datadir):
            os.makedirs(datadir)
        ag_news_train, ag_news_test = AG_NEWS(root=datadir, ngrams=3)
        self.assertEqual(len(ag_news_train), 120000)
        self.assertEqual(len(ag_news_test), 7600)
        assert_allclose(
            ag_news_train[-1][1][:10],
            torch.tensor(
                [3525, 319, 4053, 34, 5407, 3607, 70, 6798, 10599,
                 4053]).long())
        assert_allclose(
            ag_news_test[-1][1][:10],
            torch.tensor([2351, 758, 96, 38581, 2351, 220, 5, 396, 3,
                          14786]).long())

        # Delete the dataset after we're done to save disk space on CI
        datafile = os.path.join(self.project_root, ".data", "ag_news_csv")
        conditional_remove(datafile)
        datafile = os.path.join(self.project_root, ".data",
                                "ag_news_csv.tar.gz")
        conditional_remove(datafile)
Example no. 4
    def test_text_classification(self):
        # smoke test to ensure ag_news dataset works properly

        datadir = os.path.join(self.project_root, ".data")
        ag_news_cls = AG_NEWS(root=datadir, ngrams=3)
        self.assertEqual(len(ag_news_cls.train_examples), 120000)
        self.assertEqual(len(ag_news_cls.test_examples), 7600)

        # Delete the dataset after we're done to save disk space on CI
        if os.environ.get("TRAVIS") == "true":
            datafile = os.path.join(self.project_root, ".data", "AG_NEWS")
            conditional_remove(datafile)
Example no. 5
    def test_text_classification(self):
        # smoke test to ensure ag_news dataset works properly

        datadir = os.path.join(self.project_root, ".data")
        if not os.path.exists(datadir):
            os.makedirs(datadir)
        ag_news_train, ag_news_test = AG_NEWS(root=datadir, ngrams=3)
        self.assertEqual(len(ag_news_train), 120000)
        self.assertEqual(len(ag_news_test), 7600)
        assert_allclose(ag_news_train[-1][1][:10],
                        torch.tensor([3525, 319, 4053, 34, 5407, 3607, 70, 6798, 10599, 4053]).long())
        assert_allclose(ag_news_test[-1][1][:10],
                        torch.tensor([2351, 758, 96, 38581, 2351, 220, 5, 396, 3, 14786]).long())
Example no. 6
    def test_text_classification(self):
        # smoke test to ensure ag_news dataset works properly

        datadir = os.path.join(self.project_root, ".data")
        if not os.path.exists(datadir):
            os.makedirs(datadir)
        ag_news_train, ag_news_test = AG_NEWS(root=datadir, ngrams=3)
        self.assertEqual(len(ag_news_train), 120000)
        self.assertEqual(len(ag_news_test), 7600)

        # Delete the dataset after we're done to save disk space on CI
        datafile = os.path.join(self.project_root, ".data", "ag_news_csv")
        conditional_remove(datafile)
        datafile = os.path.join(self.project_root, ".data", "ag_news_csv.tar.gz")
        conditional_remove(datafile)
Example no. 7
    def prepare_data(self):
        """
        Downloads the AG_NEWS or 20newsgroups dataset and initializes the BERT tokenizer
        """
        np.random.seed(self.RANDOM_SEED)
        torch.manual_seed(self.RANDOM_SEED)

        if self.dataset == "20newsgroups":
            num_samples = self.args["num_samples"]
            self.news_group_df = (
                get_20newsgroups(num_samples)
                if self.args["dataset"] == "20newsgroups"
                else get_ag_news(num_samples)
            )
        else:
            train_iter, test_iter = AG_NEWS()
            self.train_dataset = to_map_style_dataset(train_iter)
            self.test_dataset = to_map_style_dataset(test_iter)

        self.tokenizer = BertTokenizer.from_pretrained(self.PRE_TRAINED_MODEL_NAME)
Example no. 8
   - Access to the raw data as an iterator
   - Build data processing pipeline to convert the raw text strings into ``torch.Tensor`` that can be used to train the model
   - Shuffle and iterate the data with `torch.utils.data.DataLoader <https://pytorch.org/docs/stable/data.html?highlight=dataloader#torch.utils.data.DataLoader>`__ (a minimal sketch of this step appears at the end of this excerpt)
"""


######################################################################
# Access to the raw dataset iterators
# -----------------------------------
#
# The torchtext library provides a few raw dataset iterators, which yield the raw text strings. For example, the ``AG_NEWS`` dataset iterators yield the raw data as a tuple of label and text.

import torch
from torchtext.datasets import AG_NEWS
train_iter = AG_NEWS(split='train')


######################################################################
# ::
#
#     next(train_iter)
#     >>> (3, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - 
#     Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green 
#     again.")
# 
#     next(train_iter)
#     >>> (3, 'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private 
#     investment firm Carlyle Group,\\which has a reputation for making well-timed 
#     and occasionally\\controversial plays in the defense industry, has quietly 
#     placed\\its bets on another part of the market.')
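

######################################################################
# The docstring above also lists shuffling and iterating with
# ``torch.utils.data.DataLoader``, a step not reproduced in this excerpt.
# A minimal sketch, assuming the default collate function is acceptable for
# the raw ``(label, text)`` tuples (no custom ``collate_fn``):

from torch.utils.data import DataLoader

dataloader = DataLoader(train_iter, batch_size=8, shuffle=False)
labels, texts = next(iter(dataloader))  # labels: int64 tensor of shape [8]; texts: list of 8 raw strings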
Example no. 9
def get_model_params(vocab):
    print('Setup model params...')
    train_iter = AG_NEWS(root='../dataset', split='train')
    num_class = len(set([label for (label, text) in train_iter]))
    vocab_size = len(vocab)
    return vocab_size, EMSIZE, num_class
Example no. 10
# %% [markdown]
# # 0️⃣Access to the raw dataset iterators
# Build the dataset for the text classification analysis using the torchtext library
# ---
# - `AG_NEWS` dataset iterators yield the raw data as a tuple of label and text
# - `AG_NEWS` dataset has four labels
#   - 1 : World
#   - 2 : Sports
#   - 3 : Business
#   - 4 : Sci/Tec
import torch
from torchtext.datasets import AG_NEWS
train_iter = AG_NEWS(root='data', split='train')
print(next(train_iter))
print(next(train_iter))
print(next(train_iter))
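
# The four labels above map to names; a small hand-written dict (a sketch, not a
# torchtext API) makes the printed examples easier to read:
ag_news_label = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tec"}
label, text = next(train_iter)
print(ag_news_label[label], '->', text[:60])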

# %% [markdown]
# # 1️⃣Prepare data processing pipelines
# ---
# - very basic components of torchtext, including vocab, word vectors, and tokenizer
# - build a vocabulary from the raw training dataset with the factory function `build_vocab_from_iterator`, which accepts an iterator that yields a list or iterator of tokens; users can also pass any special symbols to be added to the vocabulary
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')
train_iter = AG_NEWS(root='data', split='train')


def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)
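
# With the generator above, the vocabulary described in the markdown cell can be
# built through the factory function (a minimal sketch; "<unk>" as the default
# index follows the standard torchtext text-classification tutorial):
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])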
Example no. 11
def main():

    num_args = len(sys.argv)

    # Checking if filename input is specified
    if num_args < 2:
        sys.exit("Please specify an input file")

    filename = str(sys.argv[1])
    p = Path(filename)

    # Checking if filepath is valid and/or file exists
    if not (p.exists()):
        sys.exit("File not found")

    # Prepare data processing pipelines
    tokenizer = get_tokenizer('basic_english')
    train_iter = AG_NEWS(split='train')

    vocab = build_vocab_from_iterator(yield_tokens(train_iter, tokenizer),
                                      specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])

    text_pipeline = lambda x: vocab(tokenizer(x))
    label_pipeline = lambda x: int(x) - 1

    # Generate data batch and iterator
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def collate_batch(batch):
        label_list, text_list, offsets = [], [], [0]
        for (_label, _text) in batch:
            label_list.append(label_pipeline(_label))
            processed_text = torch.tensor(text_pipeline(_text),
                                          dtype=torch.int64)
            text_list.append(processed_text)
            offsets.append(processed_text.size(0))
        label_list = torch.tensor(label_list, dtype=torch.int64)
        offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
        text_list = torch.cat(text_list)
        return label_list.to(device), text_list.to(device), offsets.to(device)

    # Re-create train_iter: the iterator above was exhausted while building the
    # vocabulary, and reusing it here would raise an IndexError
    train_iter = AG_NEWS(split='train')
    dataloader = DataLoader(train_iter,
                            batch_size=8,
                            shuffle=False,
                            collate_fn=collate_batch)

    # Build an instance
    num_class = len(set([label for (label, text) in train_iter]))
    vocab_size = len(vocab)
    emsize = 64
    model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

    # Split the dataset and run the model
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=LR)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
    total_accu = None
    train_iter, test_iter = AG_NEWS()
    train_dataset = to_map_style_dataset(train_iter)
    test_dataset = to_map_style_dataset(test_iter)
    num_train = int(len(train_dataset) * 0.95)
    split_train_, split_valid_ = random_split(
        train_dataset, [num_train, len(train_dataset) - num_train])

    train_dataloader = DataLoader(split_train_,
                                  batch_size=BATCH_SIZE,
                                  shuffle=True,
                                  collate_fn=collate_batch)
    valid_dataloader = DataLoader(split_valid_,
                                  batch_size=BATCH_SIZE,
                                  shuffle=True,
                                  collate_fn=collate_batch)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=BATCH_SIZE,
                                 shuffle=True,
                                 collate_fn=collate_batch)

    # Run epochs
    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()
        train(train_dataloader, model, optimizer, criterion, epoch)
        accu_val = evaluate(valid_dataloader, model, criterion)
        if total_accu is not None and total_accu > accu_val:
            scheduler.step()
        else:
            total_accu = accu_val
        print('-' * 59)
        print('| end of epoch {:3d} | time: {:5.2f}s | '
              'valid accuracy {:8.3f} '.format(epoch,
                                               time.time() - epoch_start_time,
                                               accu_val))
        print('-' * 59)

    print('Checking the results of test dataset.')
    accu_test = evaluate(test_dataloader, model, criterion)
    print('test accuracy {:8.3f}'.format(accu_test))

    # Run article prediction
    ag_news_label = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tec"}

    with p.open() as readfile:
        ex_text_str = readfile.read()

    model = model.to("cpu")

    print("This is a %s news" %
          ag_news_label[predict(ex_text_str, text_pipeline, model)])
Example no. 12
File: skn.py Project: isaacrob/SKN
            learning_rate=1e-4)

    # USPS_data_train = USPS("./", train = True, download = True)
    # USPS_data_test = USPS("./", train = False, download = True)
    # USPS_data = ConcatDataset([USPS_data_test, USPS_data_train])
    # X, y = zip(*USPS_data)

    # y_numpy = np.array(y[:n])
    # X_numpy = np.array([np.asarray(X[i]) for i in range(n if n is not None else len(X))])
    # X = torch.Tensor(X_numpy).unsqueeze(1)

    # which = np.random.choice(len(y_numpy), int((1-semisupervised_proportion)*len(y_numpy)), replace = False)
    # y_for_verification = copy.deepcopy(y_numpy)
    # y_numpy[which] = -1

    news_train, news_test = AG_NEWS('./', ngrams=1)
    X, y = zip(*([item[1], item[0]] for item in news_test))
    X = X[:n]
    y = y[:n]
    y_numpy = np.array(y)
    y_for_verification = copy.deepcopy(y_numpy)

    # X_numpy = np.load("shekhar_data_pca_40.npy")[:n]
    # y_numpy_strs = np.load("shekhar_labels.npy", allow_pickle = True)[:n]
    # str_to_ind = {name:i for i, name in enumerate(np.unique(y_numpy_strs))}
    # y_numpy = np.array([str_to_ind[name] for name in y_numpy_strs])
    # X = torch.Tensor(X_numpy)
    # which = y_numpy < 16 # to just focus on interesting stuff
    # X = X[which]
    # y_numpy = y_numpy[which]
    # y_for_verification = copy.deepcopy(y_numpy)
Example no. 13
def main_sample():
    # train_iter = AG_NEWS(split='train')
    BATCH_SIZE = 64 # batch size for training
    train_iter, test_iter = AG_NEWS()
    train_dataset = list(train_iter)
    test_dataset = list(test_iter)
    # num_train = int(len(train_dataset))
    # split_train_, split_valid_ = random_split(train_dataset, [num_train, len(train_dataset) - num_train])
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                shuffle=True, collate_fn=collate_batch)
    # valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
    #                             shuffle=True, collate_fn=collate_batch)
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                                shuffle=True, collate_fn=collate_batch)

    print(train_dataset[0])
    print(len(train_dataset))
    # print(train_iter)
    # print(type[train_iter])
    # print(next(train_iter))


    tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')

    # Tokenize input
    text = 'テレビでサッカーの試合を見る。'
    tokenized_text = tokenizer.tokenize(text)
    print(tokenized_text)

    # Mask a token that we will try to predict back with `BertForMaskedLM`
    masked_index = 2
    tokenized_text[masked_index] = '[MASK]'
    # ['テレビ', 'で', '[MASK]', 'の', '試合', 'を', '見る', '。']
    print(tokenized_text)

    # Convert token to vocabulary indices
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # [571, 12, 4, 5, 608, 11, 2867, 8]
    print(indexed_tokens)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    # tensor([[ 571,   12,    4,    5,  608,   11, 2867,    8]])
    print(tokens_tensor)

    # # Load pre-trained model
    # model = BertForMaskedLM.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')
    # model.eval()

    # # Predict
    # with torch.no_grad():
    #     outputs = model(tokens_tensor)
    #     predictions = outputs[0][0, masked_index].topk(5) # extract the top 5 predictions

    # # Show results
    # for i, index_t in enumerate(predictions.indices):
    #     index = index_t.item()
    #     token = tokenizer.convert_ids_to_tokens([index])[0]
    #     print(i, token)

    # print(random_split(range(10), [3, 7], generator=torch.Generator().manual_seed(42))[0])

    from transformers import BertForSequenceClassification, Trainer, TrainingArguments

    model = BertForSequenceClassification.from_pretrained("bert-large-uncased")
    tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased", use_fast=True)

    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=3,              # total # of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=64,   # batch size for evaluation
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
    )

    trainer = Trainer(
        model=model,                         # the instantiated 🤗 Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=train_dataloader,         # training dataset
        eval_dataset=test_dataloader,            # evaluation dataset
        tokenizer=tokenizer
    )

    trainer.train()
    trainer.evaluate()
Example no. 14
    return total_acc / total_count


if __name__ == '__main__':
    tokenizer, vocab = get_tokenizer_vocab()
    text_pipeline, label_pipeline = get_pipeline(tokenizer, vocab)
    vocab_size, emsize, num_class = get_model_params(vocab)
    model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

    summary(model)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=LR)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
    total_accu = None

    train_iter, test_iter = AG_NEWS(root='../dataset')
    test_dataset = list(test_iter)
    split_train_, split_valid_ = get_train_valid_split(train_iter)

    train_data_loader = DataLoader(split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
    valid_data_loader = DataLoader(split_valid_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
    test_data_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()
        train(model, train_data_loader, optimizer, criterion, epoch)
        accu_val = evaluate(model, valid_data_loader, criterion)
        if total_accu is not None and total_accu > accu_val:
            scheduler.step()
        else:
            total_accu = accu_val
Example no. 15
import torch
from torch import nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torchtext.datasets import AG_NEWS
import time
# ------------------------------------------------------------------------------
train_iter = list(AG_NEWS(split='train'))
test_iter = list(AG_NEWS(split='test'))
print(train_iter[0])
# ------------------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = get_tokenizer('basic_english')


def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)


vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])
print(vocab(['here', 'is', 'an', 'example']))
# ------------------------------------------------------------------------------
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) - 1
print(text_pipeline('here is an example'))
print(label_pipeline('10'))