def create_datasets(self):
    field = Field(tokenize=list)
    train, val, test = WikiText2.splits(field, root='wikitext2_data')
    field.build_vocab(train, vectors=None)
    trains, vals, _ = BPTTIterator.splits(
        (train, val, test),
        batch_size=self.args.batch,
        bptt_len=self.args.bptt_len,
        device=torch.device('cpu'))
    return trains, vals
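# A minimal usage sketch for the method above: each batch yielded by a BPTTIterator
# exposes `.text` and `.target`, where `.target` is `.text` shifted by one token,
# both shaped [bptt_len, batch_size]. `dataset_builder` is a hypothetical stand-in
# for whatever object defines create_datasets().
train_iter, val_iter = dataset_builder.create_datasets()
for batch in train_iter:
    inputs, targets = batch.text, batch.target  # LongTensors of shape [bptt_len, batch_size]
    assert inputs.shape == targets.shape
    break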
def WikiTexts(batch_size=32, bptt=30, vectors="glove.6B.100d"):
    my_tok = spacy.load('en')
    # my_tok.tokenizer.add_special_case('<eos>', [{ORTH: '<eos>'}])
    # my_tok.tokenizer.add_special_case('<bos>', [{ORTH: '<bos>'}])
    # my_tok.tokenizer.add_special_case('<unk>', [{ORTH: '<unk>'}])

    def spacy_tok(x):
        # Tokenize with the loaded spaCy pipeline.
        return [tok.text for tok in my_tok.tokenizer(x)]

    TEXT = data.Field(lower=True, tokenize=spacy_tok)
    train, valid, test = WikiText2.splits(TEXT)
    TEXT.build_vocab(train, vectors=vectors)

    train_loader, val_loader, test_loader = data.BPTTIterator.splits(
        (train, valid, test),
        batch_size=batch_size,
        bptt_len=bptt,  # this is where we specify the sequence length
        # device=(0 if USE_GPU else -1),
        repeat=False)
    return train_loader, val_loader, test_loader, TEXT
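# Hedged follow-up sketch: once build_vocab() has attached the GloVe vectors, the
# resulting matrix can seed an embedding layer. The names `pretrained` and
# `embedding` are illustrative, not part of the snippet above.
import torch.nn as nn

train_loader, val_loader, test_loader, TEXT = WikiTexts()
pretrained = TEXT.vocab.vectors                      # [vocab_size, emb_dim] float tensor
embedding = nn.Embedding.from_pretrained(pretrained, freeze=False)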
def get_data(self):
    '''
    Load WikiText2 and return everything needed for training, batched for BPTT.

    Returns
    -------
    edict
        Dictionary with keys 'train_loader', 'valid_loader', 'train_iter',
        'vocab_size', and 'vocab'.
    '''
    TEXT = Field(tokenize=self.tokenizer, lower=True)
    train, valid, test = WikiText2.splits(TEXT)
    TEXT.build_vocab(train)
    vocab_size = len(TEXT.vocab)

    train_iter, valid_iter = BPTTIterator.splits(
        (train, valid),
        batch_size=self.config.batch_size,
        bptt_len=8,
        device=self.device,
        repeat=False)

    train_loader = Batch(dl=train_iter, x_var='text')
    valid_loader = Batch(dl=valid_iter, x_var='text')
    print(len(train_loader))  # number of batches per epoch

    data_dict = edict({
        'train_loader': train_loader,
        'valid_loader': valid_loader,
        'train_iter': train_iter,
        'vocab_size': vocab_size,
        'vocab': TEXT.vocab
    })
    return data_dict
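# The `Batch` wrapper used above is not shown in the snippet. A minimal sketch of
# what such a wrapper typically does (an assumed interface, not the original class)
# is to pull the named attribute off each torchtext batch and pair it with the
# language-model target.
class Batch:
    """Wrap a torchtext iterator so it yields (input, target) tensor pairs."""

    def __init__(self, dl, x_var):
        self.dl = dl          # underlying BPTTIterator
        self.x_var = x_var    # attribute name holding the input tokens, e.g. 'text'

    def __len__(self):
        return len(self.dl)

    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var)
            yield x, batch.target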
def evaluate_lm(model_path):
    """
    Evaluate a language model against WikiText2.

    Arguments
    ---------
    model_path: string
        Path to a saved model (an "RNN" or "QRNN" model).
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, TEXT = load_model(model_path, device)

    train, valid, test = WikiText2.splits(TEXT)

    BATCH_SIZE = 32
    BPTT_LEN = 30

    train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
        (train, valid, test),
        batch_size=BATCH_SIZE,
        bptt_len=BPTT_LEN,  # this is where we specify the sequence length
        device=device,
        repeat=False)

    criterion = nn.CrossEntropyLoss()

    model.eval()
    valid_loss, valid_perplexity = evaluate(model, valid_iter, criterion)
    test_loss, test_perplexity = evaluate(model, test_iter, criterion)

    print(f"Valid loss      : {valid_loss:.3f}")
    print(f"Valid perplexity: {valid_perplexity:.2f}\n")
    print(f"Test loss       : {test_loss:.3f}")
    print(f"Test perplexity : {test_perplexity:.2f}")
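# `evaluate` is referenced above but not defined in this snippet. A minimal sketch,
# assuming the model returns per-step logits directly (adjust if it also returns a
# hidden state) and that perplexity is exp(mean cross-entropy):
import math
import torch

def evaluate(model, data_iter, criterion):
    model.eval()
    total_loss, n_batches = 0.0, 0
    with torch.no_grad():
        for batch in data_iter:
            output = model(batch.text)                      # [bptt_len, batch, vocab]
            loss = criterion(output.view(-1, output.size(-1)),
                             batch.target.view(-1))
            total_loss += loss.item()
            n_batches += 1
    mean_loss = total_loss / max(n_batches, 1)
    return mean_loss, math.exp(mean_loss)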
def main(args):
    if args.device:
        device = args.device
    else:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    text_field = data.Field(tokenize=list)
    datasets = WikiText2.splits(text_field)
    text_field.build_vocab(datasets[0])
    train_iter, val_iter, test_iter = data.BPTTIterator.splits(
        datasets, batch_size=32, bptt_len=512, device=device)

    vocab = text_field.vocab
    print(f'Vocab size: {len(vocab)}')

    model_args = dict(rnn_type='lstm', ntoken=args.num_latents, ninp=256,
                      nhid=1024, nlayers=2)
    if args.model_args:
        model_args.update(dict(eval(args.model_args)))

    model = SHARNN(**model_args).to(device)
    model.train()

    criterion = nn.NLLLoss()
    # optim = torch.optim.SGD(model.parameters(), lr=5.0)
    optim = torch.optim.Adam(model.parameters(), lr=2e-3)

    for epoch in range(10):
        hidden = None
        mems = None
        total_loss = 0
        for step, batch in enumerate(train_iter):
            optim.zero_grad()
            if hidden is not None:
                hidden = repackage_hidden(hidden)
            if mems is not None:
                mems = repackage_hidden(mems)

            output, hidden, mems, attn_outs, _ = model(
                batch.text, hidden, return_h=True, mems=mems)
            logits = model.decoder(output)
            logits = F.log_softmax(logits, dim=-1)

            assert logits.size(1) == batch.target.size(1)
            loss = criterion(logits.view(-1, logits.size(-1)),
                             batch.target.view(-1))
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optim.step()

            total_loss += loss.data

            if step % args.log_interval == 0 and step > 0:
                cur_loss = total_loss / args.log_interval
                print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | '
                      'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                          epoch, step, len(train_iter),
                          optim.param_groups[0]['lr'],
                          cur_loss, math.exp(cur_loss), cur_loss / math.log(2)))
                total_loss = 0
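# `repackage_hidden` is used above but not defined in the snippet. The conventional
# implementation (as in the PyTorch word-language-model example) detaches the hidden
# state from the previous graph so gradients do not flow across BPTT batches; this is
# a sketch of that helper, not necessarily the author's exact version.
import torch

def repackage_hidden(h):
    """Detach hidden states (possibly nested in tuples) from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)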
def segment(doc):
    """Tokenize with spaCy: split a document into individual tokens."""
    tokenizer = tokenize.tokenizer
    return [token.text for token in tokenizer(doc)]

# Define the field representing a piece of text: tokenize with the function above
# and lowercase the dataset during preprocessing.
TEXT = data.Field(lower=True, tokenize=segment)

# torchtext.datasets ships several ready-made datasets, e.g. WikiText2 below.
# This call also creates a .data directory under the project root and downloads
# the data (4.4 MB); a copy is kept in the local data folder to save readers
# any confusion.
train_set, valid_set, test_set = WikiText2.splits(TEXT)

# How many examples are in train/valid/test (before tokenization)?
print(len(train_set), len(valid_set), len(test_set), end="\n\n")

# Pretrained word vectors could be attached while building the vocabulary;
# that option is commented out here.
TEXT.build_vocab(train_set)  # vectors="data/glove.6B.200d"

# The core of a language-model pipeline is the iterator, and BPTTIterator is the
# relevant subclass. Its special job is to cut the running text into consecutive
# equal-length sequences and batch them (back-propagation through time, bptt).
# For example, given the text:
#
#   "Machine learning is a field of computer science
#    that gives computers the ability to learn without
#    being explicitly programmed"
#
# with a segment length of 5, the text is cut into a list like the following:
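# (One possible illustration of the chunks; the last one simply runs on into
# whatever text follows in the corpus.)
#
#   ["Machine", "learning", "is", "a", "field"]
#   ["of", "computer", "science", "that", "gives"]
#   ["computers", "the", "ability", "to", "learn"]
#   ["without", "being", "explicitly", "programmed", ...]
#
# The matching targets are the same chunks shifted one token to the right, which
# is exactly what batch.text and batch.target hold.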
        loss = criterion(outs.view(-1, outs.size(-1)), targets.view(-1))
        epoch_loss += loss.item()
    return epoch_loss / len(devLoader)

###############################################################################
# Load data
###############################################################################
configfile = open('./config.yaml')
config = AttrDict(yaml.load(configfile, Loader=yaml.FullLoader))
device = torch.device(args.device)

# ? include lengths
TEXT = Field(lower=True, include_lengths=False, batch_first=False)  # TEXT: split string into tokens
trainSet, devSet, testSet = WikiText2.splits(text_field=TEXT,
                                             root=config.data.data_root)

if config.model.rnn.pretrained_embedding:
    vec = torchtext.vocab.FastText(language='en', cache=config.data.fasttext_root)
    assert vec.dim == config.model.rnn.nemd
else:
    vec = None
TEXT.build_vocab(trainSet, vectors=vec)  # TEXT: numericalize, pad, add init_token and eos_token

trainLoader, devLoader, testLoader = BPTTIterator.splits(
    (trainSet, devSet, testSet),
    batch_size=config.data.BSZ,
    bptt_len=config.data.bptt_len,
    device=device)
assert len(TEXT.vocab) == config.data.vocabSize
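# The YAML config itself is not shown. Judging only from the keys accessed above,
# it needs roughly the structure sketched below (values are illustrative
# assumptions, and AttrDict is assumed to expose nested dictionaries as attributes,
# as the snippet's own usage implies).
config = AttrDict({
    'data': {
        'data_root': '.data',
        'fasttext_root': '.vector_cache',
        'BSZ': 32,
        'bptt_len': 35,
        'vocabSize': 0,       # placeholder: set to len(TEXT.vocab) so the final assert passes
    },
    'model': {
        'rnn': {
            'pretrained_embedding': True,
            'nemd': 300,      # FastText English vectors are 300-dimensional
        },
    },
})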
def train_lm(
        model_name, output_path, epochs=5, batch_size=32, bptt_len=35, lr=1e-3,
        optimizer="adam", min_freq=5, model_args={}, scheduler_patience=5,
        scheduler_threshold=1e-4, early_stopping_tolerance=5):
    """
    Train and save a language model.

    Arguments
    ---------
    model_name: string
        Can be "RNN", "QRNN"
    output_path: a path
        Where to save the model
    lr: float
        Learning rate, default = 1e-3
    model_args: dict
        Arguments to be passed to the created model
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    TEXT = data.Field(
        tokenizer_language='en',
        lower=True,
        init_token='<sos>',
        eos_token='<eos>',
        batch_first=True,
    )

    train, valid, test = WikiText2.splits(TEXT)
    TEXT.build_vocab(train, min_freq=min_freq)
    print(f"We have {len(TEXT.vocab)} tokens in our vocabulary")

    train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
        (train, valid, test),
        batch_size=batch_size,
        bptt_len=bptt_len,  # this is where we specify the sequence length
        device=device,
        repeat=False
    )

    model = create_model(model_name, TEXT, model_args=model_args)

    if "awd" in model_name:
        optimizer = "asgd"

    optimizer = create_optimizer(model, optimizer, lr)
    criterion = nn.CrossEntropyLoss()

    print(f"Using LR Scheduler with patience {scheduler_patience} and threshold {scheduler_threshold}")
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, 'min',
        patience=scheduler_patience,
        threshold=scheduler_threshold
    )

    model = model.to(device)
    criterion = criterion.to(device)

    model_path = output_path

    training_cycle(
        epochs=epochs,
        model=model,
        train_iter=train_iter,
        valid_iter=valid_iter,
        optimizer=optimizer,
        criterion=criterion,
        scheduler=lr_scheduler,
        model_path=model_path,
        early_stopping_tolerance=early_stopping_tolerance
    )

    model.load_state_dict(torch.load(model_path))
    model.eval()

    valid_loss, valid_perplexity = evaluate(model, valid_iter, criterion)
    test_loss, test_perplexity = evaluate(model, test_iter, criterion)

    print(f"Valid loss      : {valid_loss:.2f}")
    print(f"Valid perplexity: {valid_perplexity:.2f}\n")
    print(f"Test loss       : {test_loss:.2f}")
    print(f"Test perplexity : {test_perplexity:.2f}")

    save_model(model, TEXT, output_path)
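# `create_optimizer` is not defined in the snippet. A plausible sketch that maps the
# optimizer names used above ("adam", "sgd", "asgd") onto torch.optim classes; this
# is an assumption about the helper, not its actual implementation.
import torch.optim as optim

def create_optimizer(model, name, lr):
    optimizers = {
        "adam": lambda params: optim.Adam(params, lr=lr),
        "sgd": lambda params: optim.SGD(params, lr=lr),
        "asgd": lambda params: optim.ASGD(params, lr=lr),  # used for AWD-style models above
    }
    return optimizers[name.lower()](model.parameters())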
import spacy
from spacy.symbols import ORTH

my_tok = spacy.load('en')

def spacy_tok(x):
    # Tokenize with the spaCy tokenizer; the Field below handles lowercasing.
    return [tok.text for tok in my_tok.tokenizer(x)]

TEXT = data.Field(lower=True, tokenize=spacy_tok)

from torchtext.datasets import WikiText2

# Loading custom datasets requires passing in the field, but nothing else.
train, valid, test = WikiText2.splits(TEXT)

TEXT.build_vocab(train, vectors="glove.6B.200d")

train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
    (train, valid, test),
    batch_size=BATCH_SIZE,
    bptt_len=30,  # this is where we specify the sequence length
    device=(0 if USE_GPU else -1),
    repeat=False)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable as V
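# A quick sanity check one might add after the block above (not part of the original
# excerpt): decode one batch back into tokens with TEXT.vocab.itos to confirm that
# batch.target is batch.text shifted by one position.
batch = next(iter(train_iter))
first_seq_text = [TEXT.vocab.itos[i] for i in batch.text[:, 0]]
first_seq_target = [TEXT.vocab.itos[i] for i in batch.target[:, 0]]
print(first_seq_text[:10])
print(first_seq_target[:10])   # same tokens, offset by one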
LABELS.build_vocab(train)

a = next(iter(data.BPTTIterator(train, 20, 20)))

train_iter, dev_iter, test_iter = data.BPTTIterator.splits(
    ([i.text for i in train], dev, test),
    bptt_len=13, batch_size=7,
    sort_key=lambda x: len(x.text),
    device='cpu')

# https://mlexplained.com/2018/02/15/language-modeling-tutorial-in-torchtext-practical-torchtext-part-2/
from torchtext.datasets import WikiText2

train, valid, test = WikiText2.splits(TEXT)  # loading custom datasets

len(train)

data.Example?  # IPython help lookup
from torch.optim import Adam
import torch

from nntoolbox.callbacks import *
from nntoolbox.metrics import *

MAX_VOCAB_SIZE = 25000
BATCH_SIZE = 16

TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

# train_iterator, val_iterator, test_iterator = WikiText2.iters()
# for tmp in train_iterator:
#     print(tmp)

train_data, val_data, test_data = WikiText2.splits(TEXT)

train_iterator = data.BPTTIterator(
    train_data, batch_size=BATCH_SIZE, sort_within_batch=True,
    device=get_device(), bptt_len=35, shuffle=True)
val_iterator = data.BPTTIterator(
    val_data, batch_size=BATCH_SIZE, sort_within_batch=True,
    device=get_device(), bptt_len=35, shuffle=True)

TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE, vectors="glove.6B.100d")