def benchmark_basic_english_normalize():
    def _run_benchmark_lookup(train, tokenizer):
        t0 = time.monotonic()
        for (_, text) in train:
            tokenizer(text)
        print("Tokenization time:", time.monotonic() - t0)

    existing_basic_english_tokenizer = get_tokenizer("basic_english")
    experimental_basic_english_normalize = basic_english_normalize()
    experimental_jit_basic_english_normalize = torch.jit.script(experimental_basic_english_normalize)

    # existing eager lookup
    train = AG_NEWS(split='train')
    print("BasicEnglishNormalize - Eager Mode")
    _run_benchmark_lookup(train, existing_basic_english_tokenizer)

    # experimental eager lookup
    train = AG_NEWS(split='train')
    print("BasicEnglishNormalize Experimental - Eager Mode")
    _run_benchmark_lookup(train, experimental_basic_english_normalize)

    # experimental jit lookup
    train = AG_NEWS(split='train')
    print("BasicEnglishNormalize Experimental - Jit Mode")
    _run_benchmark_lookup(train, experimental_jit_basic_english_normalize)
def setup(self, stage=None):
    self.data_test = list(AG_NEWS(split='test'))
    data_full = list(AG_NEWS(split='train'))

    # Train / Validation Set split
    threshold = round(len(data_full) * 0.8)
    self.data_train, self.data_val = random_split(
        data_full, [threshold, len(data_full) - threshold])

    # Vocab and Tokenizer for data processing in collate batch
    self.tokenizer = get_tokenizer('basic_english')
    self.vocab = self.get_vocab(data_full, self.tokenizer)
    self.collate_batch = MyCollator(self.vocab, self.tokenizer)
def test_text_classification(self):
    # smoke test to ensure ag_news dataset works properly
    datadir = os.path.join(self.project_root, ".data")
    if not os.path.exists(datadir):
        os.makedirs(datadir)
    ag_news_train, ag_news_test = AG_NEWS(root=datadir, ngrams=3)
    self.assertEqual(len(ag_news_train), 120000)
    self.assertEqual(len(ag_news_test), 7600)
    assert_allclose(
        ag_news_train[-1][1][:10],
        torch.tensor(
            [3525, 319, 4053, 34, 5407, 3607, 70, 6798, 10599, 4053]).long())
    assert_allclose(
        ag_news_test[-1][1][:10],
        torch.tensor([2351, 758, 96, 38581, 2351, 220, 5, 396, 3, 14786]).long())

    # Delete the dataset after we're done to save disk space on CI
    datafile = os.path.join(self.project_root, ".data", "ag_news_csv")
    conditional_remove(datafile)
    datafile = os.path.join(self.project_root, ".data", "ag_news_csv.tar.gz")
    conditional_remove(datafile)
def test_text_classification(self):
    # smoke test to ensure ag_news dataset works properly
    datadir = os.path.join(self.project_root, ".data")
    ag_news_cls = AG_NEWS(root=datadir, ngrams=3)
    self.assertEqual(len(ag_news_cls.train_examples), 120000)
    self.assertEqual(len(ag_news_cls.test_examples), 7600)

    # Delete the dataset after we're done to save disk space on CI
    if os.environ.get("TRAVIS") == "true":
        datafile = os.path.join(self.project_root, ".data", "AG_NEWS")
        conditional_remove(datafile)
def test_text_classification(self):
    # smoke test to ensure ag_news dataset works properly
    datadir = os.path.join(self.project_root, ".data")
    if not os.path.exists(datadir):
        os.makedirs(datadir)
    ag_news_train, ag_news_test = AG_NEWS(root=datadir, ngrams=3)
    self.assertEqual(len(ag_news_train), 120000)
    self.assertEqual(len(ag_news_test), 7600)
    assert_allclose(ag_news_train[-1][1][:10],
                    torch.tensor([3525, 319, 4053, 34, 5407, 3607, 70, 6798, 10599, 4053]).long())
    assert_allclose(ag_news_test[-1][1][:10],
                    torch.tensor([2351, 758, 96, 38581, 2351, 220, 5, 396, 3, 14786]).long())
def test_text_classification(self):
    # smoke test to ensure ag_news dataset works properly
    datadir = os.path.join(self.project_root, ".data")
    if not os.path.exists(datadir):
        os.makedirs(datadir)
    ag_news_train, ag_news_test = AG_NEWS(root=datadir, ngrams=3)
    self.assertEqual(len(ag_news_train), 120000)
    self.assertEqual(len(ag_news_test), 7600)

    # Delete the dataset after we're done to save disk space on CI
    datafile = os.path.join(self.project_root, ".data", "ag_news_csv")
    conditional_remove(datafile)
    datafile = os.path.join(self.project_root, ".data", "ag_news_csv.tar.gz")
    conditional_remove(datafile)
def prepare_data(self):
    """
    Downloads the ag_news or 20newsgroups dataset and initializes the BERT tokenizer
    """
    np.random.seed(self.RANDOM_SEED)
    torch.manual_seed(self.RANDOM_SEED)

    if self.dataset == "20newsgroups":
        num_samples = self.args["num_samples"]
        self.news_group_df = (
            get_20newsgroups(num_samples)
            if self.args["dataset"] == "20newsgroups"
            else get_ag_news(num_samples)
        )
    else:
        train_iter, test_iter = AG_NEWS()
        self.train_dataset = to_map_style_dataset(train_iter)
        self.test_dataset = to_map_style_dataset(test_iter)

    self.tokenizer = BertTokenizer.from_pretrained(self.PRE_TRAINED_MODEL_NAME)
   - Access to the raw data as an iterator
   - Build data processing pipeline to convert the raw text strings into ``torch.Tensor`` that can be used to train the model
   - Shuffle and iterate the data with `torch.utils.data.DataLoader <https://pytorch.org/docs/stable/data.html?highlight=dataloader#torch.utils.data.DataLoader>`__
"""

######################################################################
# Access to the raw dataset iterators
# -----------------------------------
#
# The torchtext library provides a few raw dataset iterators, which yield the
# raw text strings. For example, the ``AG_NEWS`` dataset iterators yield the
# raw data as a tuple of label and text.

import torch
from torchtext.datasets import AG_NEWS

train_iter = AG_NEWS(split='train')

######################################################################
# ::
#
#     next(train_iter)
#     >>> (3, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters -
#     Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green
#     again.")
#
#     next(train_iter)
#     >>> (3, 'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private
#     investment firm Carlyle Group,\\which has a reputation for making well-timed
#     and occasionally\\controversial plays in the defense industry, has quietly
#     placed\\its bets on another part of the market.')
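######################################################################
# A minimal sketch of the two remaining bullets above: converting the raw
# strings into ``torch.Tensor`` and iterating with ``DataLoader``. This is an
# illustrative sketch rather than the full recipe; it assumes the
# ``basic_english`` tokenizer, a ``<unk>`` default token, and a toy batch size
# of 8, and the ``simple_collate`` helper is only for illustration. The label
# and text pipelines used for real training appear in the later examples.

from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(
    (tokenizer(text) for _, text in AG_NEWS(split='train')), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])


def simple_collate(batch):
    # Turn a list of (label, text) pairs into a label tensor and a list of
    # token-id tensors (AG_NEWS labels are 1-4, so shift them to 0-3).
    labels = torch.tensor([int(label) - 1 for label, _ in batch], dtype=torch.int64)
    texts = [torch.tensor(vocab(tokenizer(text)), dtype=torch.int64) for _, text in batch]
    return labels, texts


loader = DataLoader(list(AG_NEWS(split='train')), batch_size=8, shuffle=True,
                    collate_fn=simple_collate)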
def get_model_params(vocab):
    print('Setup model params...')
    train_iter = AG_NEWS(root='../dataset', split='train')
    num_class = len(set([label for (label, text) in train_iter]))
    vocab_size = len(vocab)
    return vocab_size, EMSIZE, num_class
# %% [markdown]
# # 0️⃣ Access to the raw dataset iterators
# Build the dataset for the text classification analysis using the torchtext library
# ---
# - `AG_NEWS` dataset iterators yield the raw data as a tuple of label and text
# - `AG_NEWS` dataset has four labels
#     - 1 : World
#     - 2 : Sports
#     - 3 : Business
#     - 4 : Sci/Tec

# %%
import torch
from torchtext.datasets import AG_NEWS

train_iter = AG_NEWS(root='data', split='train')
print(next(train_iter))
print(next(train_iter))
print(next(train_iter))

# %% [markdown]
# # 1️⃣ Prepare data processing pipelines
# ---
# - very basic components of torchtext, including vocab, word vectors, and the tokenizer
# - build a vocabulary from the raw training dataset with the factory function `build_vocab_from_iterator`, which accepts an iterator yielding a list or iterator of tokens; users can also pass any special symbols to be added to the vocabulary

# %%
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')
train_iter = AG_NEWS(root='data', split='train')


def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)
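# %% [markdown]
# A minimal sketch of the vocabulary construction the cell above describes,
# assuming the `<unk>` special token and default-index setup used in the other
# AG_NEWS examples in this collection; the choice of specials is an assumption,
# not part of the original cell.

# %%
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])
print(vocab(['here', 'is', 'an', 'example']))  # token ids; unseen tokens map to <unk>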
def main():
    num_args = len(sys.argv)

    # Checking if filename input is specified
    if num_args < 2:
        sys.exit("Please specify an input file")

    filename = str(sys.argv[1])
    p = Path(filename)

    # Checking if filepath is valid and/or file exists
    if not p.exists():
        sys.exit("File not found")

    # Prepare data processing pipelines
    tokenizer = get_tokenizer('basic_english')
    train_iter = AG_NEWS(split='train')
    vocab = build_vocab_from_iterator(yield_tokens(train_iter, tokenizer), specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])
    text_pipeline = lambda x: vocab(tokenizer(x))
    label_pipeline = lambda x: int(x) - 1

    # Generate data batch and iterator
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def collate_batch(batch):
        label_list, text_list, offsets = [], [], [0]
        for (_label, _text) in batch:
            label_list.append(label_pipeline(_label))
            processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
            text_list.append(processed_text)
            offsets.append(processed_text.size(0))
        label_list = torch.tensor(label_list, dtype=torch.int64)
        offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
        text_list = torch.cat(text_list)
        return label_list.to(device), text_list.to(device), offsets.to(device)

    # Re-create the training iterator: building the vocabulary above exhausted
    # it, and reusing the spent iterator raises an IndexError later on
    train_iter = AG_NEWS(split='train')
    dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)

    # Build an instance
    num_class = len(set([label for (label, text) in train_iter]))
    vocab_size = len(vocab)
    emsize = 64
    model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

    # Split the dataset and run the model
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=LR)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
    total_accu = None
    train_iter, test_iter = AG_NEWS()
    train_dataset = to_map_style_dataset(train_iter)
    test_dataset = to_map_style_dataset(test_iter)
    num_train = int(len(train_dataset) * 0.95)
    split_train_, split_valid_ = \
        random_split(train_dataset, [num_train, len(train_dataset) - num_train])

    train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                                  shuffle=True, collate_fn=collate_batch)
    valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                                  shuffle=True, collate_fn=collate_batch)
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                                 shuffle=True, collate_fn=collate_batch)

    # Run epochs
    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()
        train(train_dataloader, model, optimizer, criterion, epoch)
        accu_val = evaluate(valid_dataloader, model, criterion)
        if total_accu is not None and total_accu > accu_val:
            scheduler.step()
        else:
            total_accu = accu_val
        print('-' * 59)
        print('| end of epoch {:3d} | time: {:5.2f}s | '
              'valid accuracy {:8.3f} '.format(epoch,
                                               time.time() - epoch_start_time,
                                               accu_val))
        print('-' * 59)

    print('Checking the results of test dataset.')
    accu_test = evaluate(test_dataloader, model, criterion)
    print('test accuracy {:8.3f}'.format(accu_test))

    # Run article prediction
    ag_news_label = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tec"}
    with p.open() as readfile:
        ex_text_str = readfile.read()
    model = model.to("cpu")
    print("This is a %s news" % ag_news_label[predict(ex_text_str, text_pipeline, model)])
    learning_rate=1e-4)

# USPS_data_train = USPS("./", train = True, download = True)
# USPS_data_test = USPS("./", train = False, download = True)
# USPS_data = ConcatDataset([USPS_data_test, USPS_data_train])
# X, y = zip(*USPS_data)
# y_numpy = np.array(y[:n])
# X_numpy = np.array([np.asarray(X[i]) for i in range(n if n is not None else len(X))])
# X = torch.Tensor(X_numpy).unsqueeze(1)
# which = np.random.choice(len(y_numpy), int((1-semisupervised_proportion)*len(y_numpy)), replace = False)
# y_for_verification = copy.deepcopy(y_numpy)
# y_numpy[which] = -1

news_train, news_test = AG_NEWS('./', ngrams=1)
X, y = zip(*([item[1], item[0]] for item in news_test))
X = X[:n]
y = y[:n]
y_numpy = np.array(y)
y_for_verification = copy.deepcopy(y_numpy)

# X_numpy = np.load("shekhar_data_pca_40.npy")[:n]
# y_numpy_strs = np.load("shekhar_labels.npy", allow_pickle = True)[:n]
# str_to_ind = {name:i for i, name in enumerate(np.unique(y_numpy_strs))}
# y_numpy = np.array([str_to_ind[name] for name in y_numpy_strs])
# X = torch.Tensor(X_numpy)
# which = y_numpy < 16  # to just focus on interesting stuff
# X = X[which]
# y_numpy = y_numpy[which]
# y_for_verification = copy.deepcopy(y_numpy)
def main_sample():
    # train_iter = AG_NEWS(split='train')
    BATCH_SIZE = 64  # batch size for training

    train_iter, test_iter = AG_NEWS()
    train_dataset = list(train_iter)
    test_dataset = list(test_iter)
    # num_train = int(len(train_dataset))
    # split_train_, split_valid_ = random_split(train_dataset, [num_train, len(train_dataset) - num_train])

    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                  shuffle=True, collate_fn=collate_batch)
    # valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
    #                               shuffle=True, collate_fn=collate_batch)
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                                 shuffle=True, collate_fn=collate_batch)

    print(train_dataset[0])
    print(len(train_dataset))
    # print(train_iter)
    # print(type[train_iter])
    # print(next(train_iter))

    tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')

    # Tokenize input
    text = 'テレビでサッカーの試合を見る。'
    tokenized_text = tokenizer.tokenize(text)
    print(tokenized_text)

    # Mask a token that we will try to predict back with `BertForMaskedLM`
    masked_index = 2
    tokenized_text[masked_index] = '[MASK]'
    # ['テレビ', 'で', '[MASK]', 'の', '試合', 'を', '見る', '。']
    print(tokenized_text)

    # Convert tokens to vocabulary indices
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # [571, 12, 4, 5, 608, 11, 2867, 8]
    print(indexed_tokens)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    # tensor([[ 571, 12, 4, 5, 608, 11, 2867, 8]])
    print(tokens_tensor)

    # # Load pre-trained model
    # model = BertForMaskedLM.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')
    # model.eval()

    # # Predict
    # with torch.no_grad():
    #     outputs = model(tokens_tensor)
    #     predictions = outputs[0][0, masked_index].topk(5)  # take the top-5 predicted tokens

    # # Show results
    # for i, index_t in enumerate(predictions.indices):
    #     index = index_t.item()
    #     token = tokenizer.convert_ids_to_tokens([index])[0]
    #     print(i, token)

    # print(random_split(range(10), [3, 7], generator=torch.Generator().manual_seed(42))[0])

    from transformers import BertForSequenceClassification, Trainer, TrainingArguments

    model = BertForSequenceClassification.from_pretrained("bert-large-uncased")
    tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased", use_fast=True)

    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=3,              # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=64,   # batch size for evaluation
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
    )

    trainer = Trainer(
        model=model,                     # the instantiated 🤗 Transformers model to be trained
        args=training_args,              # training arguments, defined above
        train_dataset=train_dataloader,  # training dataset
        eval_dataset=test_dataloader,    # evaluation dataset
        tokenizer=tokenizer
    )

    trainer.train()
    trainer.evaluate()
    return total_acc / total_count


if __name__ == '__main__':
    tokenizer, vocab = get_tokenizer_vocab()
    text_pipeline, label_pipeline = get_pipeline(tokenizer, vocab)
    vocab_size, emsize, num_class = get_model_params(vocab)
    model = TextClassificationModel(vocab_size, emsize, num_class).to(device)
    summary(model)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=LR)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
    total_accu = None
    train_iter, test_iter = AG_NEWS(root='../dataset')
    test_dataset = list(test_iter)
    split_train_, split_valid_ = get_train_valid_split(train_iter)

    train_data_loader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                                   shuffle=True, collate_fn=collate_batch)
    valid_data_loader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                                   shuffle=True, collate_fn=collate_batch)
    test_data_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                                  shuffle=True, collate_fn=collate_batch)

    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()
        train(model, train_data_loader, optimizer, criterion, epoch)
        accu_val = evaluate(model, valid_data_loader, criterion)
        if total_accu is not None and total_accu > accu_val:
            scheduler.step()
        else:
            total_accu = accu_val
import torch
from torch import nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torchtext.datasets import AG_NEWS
import time

# ------------------------------------------------------------------------------
train_iter = list(AG_NEWS(split='train'))
test_iter = list(AG_NEWS(split='test'))
print(train_iter[0])

# ------------------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = get_tokenizer('basic_english')


def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)


vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])
print(vocab(['here', 'is', 'an', 'example']))

# ------------------------------------------------------------------------------
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) - 1
print(text_pipeline('here is the an example'))
print(label_pipeline('10'))