def __init__(self, train_batch_size=20, eval_batch_size=10, pred_batch_size=1, bptt=35):
    # Split sizes: 'train': 36718, 'valid': 3760, 'test': 4358
    self.bptt = bptt
    train_iter = WikiText2(split='train')
    self.tokenizer = get_tokenizer('basic_english')
    counter = Counter()
    txtline = []
    for line in train_iter:
        txtline.append(line)
        counter.update(self.tokenizer(line))
    self.vocab = Vocab(counter)
    train_iter, val_iter, test_iter = WikiText2()
    train_data = self.data_process(train_iter)
    val_data = self.data_process(val_iter)
    test_data = self.data_process(test_iter)
    pred_data = train_data
    self.train_data = self.batchify(train_data, train_batch_size)
    self.val_data = self.batchify(val_data, eval_batch_size)
    self.test_data = self.batchify(test_data, eval_batch_size)
    self.pred_data = self.batchify(pred_data, pred_batch_size)  # used for single-line prediction
    self.text = txtline
def get_data():
    train_iter = WikiText2(split='train')
    counter = Counter()
    for line in train_iter:
        counter.update(tokenizer(line))
    vocab = Vocab(counter)
    train_iter, val_iter, test_iter = WikiText2()
    '''
    i = 0
    for item in train_iter:
        print(item)
        if i == 5:
            break
        i += 1
    '''
    train_data = data_process(train_iter, vocab)
    val_data = data_process(val_iter, vocab)
    test_data = data_process(test_iter, vocab)
    train_data = batchify(train_data, batch_size)
    val_data = batchify(val_data, eval_batch_size)
    test_data = batchify(test_data, eval_batch_size)
    return train_data, val_data, test_data, vocab
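# get_data() above relies on data_process and batchify helpers that are not
# shown in this snippet. A minimal sketch consistent with how they are called
# here (an assumption, not the original implementations):
import torch

def data_process(raw_text_iter, vocab):
    # Tokenize each line, map tokens to ids, and concatenate everything into
    # one flat LongTensor, skipping empty lines.
    data = [torch.tensor([vocab[token] for token in tokenizer(item)], dtype=torch.long)
            for item in raw_text_iter]
    return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

def batchify(data, bsz):
    # Trim tokens that would not fill a full column, then reshape to
    # [seq_len, bsz] so that each column is an independent stream.
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    return data.view(bsz, -1).t().contiguous()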
def __init__(self, device):
    train_iter, val_iter, test_iter = WikiText2()
    self.device = device
    self.tokenizer = get_tokenizer('basic_english')
    self.counter = Counter()
    self.counter.update(self.tokenizer('<sos>'))
    for line in train_iter:
        self.counter.update(self.tokenizer(line))
    for line in val_iter:
        self.counter.update(self.tokenizer(line))
    for line in test_iter:
        self.counter.update(self.tokenizer(line))
    self.vocab = Vocab(self.counter)
    train_iter, val_iter, test_iter = WikiText2()
    self.train = self.data_process(train_iter).to(self.device)
    self.val = self.data_process(val_iter).to(self.device)
    self.test = self.data_process(test_iter).to(self.device)
def get_wiki2(conf):
    """Return WikiText2 iterators."""
    # raw data
    train_iter, test_iter, valid_iter = WikiText2(split=('train', 'test', 'valid'))
    train_iter_copy, test_iter_copy, valid_iter_copy = WikiText2(split=('train', 'test', 'valid'))
    # loader
    train, test, valid, vocab = load_dataset(train_iter, test_iter, valid_iter,
                                             train_iter_copy, test_iter_copy,
                                             valid_iter_copy, conf)
    return train, test, valid, vocab
def get_accuracy(ps_rref, data_dir, test_batch_size, job_name, target_loss):
    logger = Logger(job_name=job_name,
                    file_dir=f'./measurement/logs/{job_name}_tester.log').logger
    train_iter = WikiText2(root=data_dir, split='train')
    tokenizer = get_tokenizer('basic_english')
    vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])
    bptt = 35
    train_iter, val_iter, test_iter = WikiText2(root=data_dir)
    val_data = data_process(val_iter, vocab, tokenizer)
    val_data = batchify(val_data, test_batch_size)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss()
    t0 = time.time()
    logger.info("Start!")
    init = t0
    while True:
        t1 = time.time()
        if t1 - t0 > 20:
            t0 = t1
            m = ps_rref.rpc_sync().get_model().to(device)
            test_loss = 0.
            with torch.no_grad():
                hidden = m.init_hidden(test_batch_size)
                for batch_idx, i in enumerate(range(0, val_data.size(0) - 1, bptt)):
                    data, targets = get_batch(val_data, i, bptt)
                    data, targets = data.to(device), targets.to(device)
                    hidden = repackage_hidden(hidden)
                    output, hidden = m(data, hidden)
                    loss = criterion(output, targets)
                    test_loss += len(data) * loss.item()
            test_loss /= (len(val_data) - 1)
            logger.info("Test Loss: {:7.3f} | Time: {:7.2f} seconds".format(
                test_loss, (t1 - init)))
            if test_loss < target_loss:
                ps_rref.rpc_sync().stop()
                break
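# repackage_hidden is called above but not defined in this snippet; the helper
# below is the standard idiom from the PyTorch word_language_model example
# (an assumption about the original):
def repackage_hidden(h):
    # Detach hidden states from their history so backprop stops at batch
    # boundaries; recurse for tuples such as an LSTM's (h, c).
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)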
def test_wikitext2(self):
    from torchtext.experimental.datasets import WikiText2
    cachedir = os.path.join(self.project_root, ".data", "wikitext-2")
    conditional_remove(cachedir)
    cachefile = os.path.join(self.project_root, ".data", "wikitext-2-v1.zip")
    conditional_remove(cachefile)
    train_dataset, valid_dataset, test_dataset = WikiText2()
    train_data = torch.cat(tuple(filter(lambda t: t.numel() > 0, train_dataset)))
    valid_data = torch.cat(tuple(filter(lambda t: t.numel() > 0, valid_dataset)))
    test_data = torch.cat(tuple(filter(lambda t: t.numel() > 0, test_dataset)))
    self._helper_test_func(len(train_data), 2049990, train_data[20:25],
                           [5024, 89, 21, 3, 1838])
    self._helper_test_func(len(test_data), 241859, test_data[30:35],
                           [914, 4, 36, 11, 569])
    self._helper_test_func(len(valid_data), 214417, valid_data[40:45],
                           [925, 8, 2, 150, 8575])
    vocab = train_dataset.get_vocab()
    tokens_ids = [vocab[token] for token in 'the player characters rest'.split()]
    self.assertEqual(tokens_ids, [2, 286, 503, 700])

    # Add test for the subset of the standard datasets
    train_iter, valid_iter, test_iter = torchtext.experimental.datasets.raw.WikiText2(
        data_select=('train', 'valid', 'test'))
    self._helper_test_func(len(train_iter), 36718, next(iter(train_iter)), ' \n')
    self._helper_test_func(len(valid_iter), 3760, next(iter(valid_iter)), ' \n')
    self._helper_test_func(len(test_iter), 4358, next(iter(test_iter)), ' \n')
    del train_iter, valid_iter, test_iter
    train_dataset, test_dataset = WikiText2(data_select=('train', 'test'))
    train_data = torch.cat(tuple(filter(lambda t: t.numel() > 0, train_dataset)))
    test_data = torch.cat(tuple(filter(lambda t: t.numel() > 0, test_dataset)))
    self._helper_test_func(len(train_data), 2049990, train_data[20:25],
                           [5024, 89, 21, 3, 1838])
    self._helper_test_func(len(test_data), 241859, test_data[30:35],
                           [914, 4, 36, 11, 569])
    conditional_remove(cachedir)
    conditional_remove(cachefile)
def test_wikitext2(self):
    from torchtext.experimental.datasets import WikiText2
    # Smoke test to ensure wikitext2 works properly.
    # NOTE: test_wikitext2 and test_wikitext2_legacy have some cache
    # incompatibility. Keeping one's cache makes the other fail, so we need
    # to clean up the cache dir.
    cachedir = os.path.join(self.project_root, ".data", "wikitext-2")
    conditional_remove(cachedir)
    cachefile = os.path.join(self.project_root, ".data", "wikitext-2-v1.zip")
    conditional_remove(cachefile)
    train_dataset, test_dataset, valid_dataset = WikiText2()
    self.assertEqual(len(train_dataset), 2049990)
    self.assertEqual(len(test_dataset), 241859)
    self.assertEqual(len(valid_dataset), 214417)
    vocab = train_dataset.get_vocab()
    tokens_ids = [vocab[token] for token in 'the player characters rest'.split()]
    self.assertEqual(tokens_ids, [2, 286, 503, 700])
    conditional_remove(cachedir)
    conditional_remove(cachefile)
def __init__(self, train_batch_size=20, eval_batch_size=10, bptt=35):
    self.bptt = bptt
    train_iter = WikiText2(split='train')
    self.tokenizer = get_tokenizer('basic_english')
    counter = Counter()
    for line in train_iter:
        counter.update(self.tokenizer(line))
    self.vocab = Vocab(counter)
    train_iter, val_iter, test_iter = WikiText2()
    train_data = self.data_process(train_iter)
    val_data = self.data_process(val_iter)
    test_data = self.data_process(test_iter)
    self.train_data = self.batchify(train_data, train_batch_size)
    self.val_data = self.batchify(val_data, eval_batch_size)
    self.test_data = self.batchify(test_data, eval_batch_size)
def gen_tokenizer_and_vocab():
    train_iter = WikiText2(split='train')
    tokenizer = get_tokenizer('basic_english')
    counter = Counter()
    for line in train_iter:
        counter.update(tokenizer(line))
    vocab = RetiredVocab(counter)
    return tokenizer, vocab
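# Usage sketch for the pair returned above (the variable names are
# illustrative, not from the original; legacy-style vocabs expose stoi/itos
# lookup tables):
tokenizer, vocab = gen_tokenizer_and_vocab()
tokens = tokenizer('the player characters rest')
ids = [vocab.stoi[token] for token in tokens]  # token ids depend on the vocab build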
def create_datasets(self):
    field = Field(tokenize=list)
    train, val, test = WikiText2.splits(field, root='wikitext2_data')
    field.build_vocab(train, vectors=None)
    trains, vals, _ = BPTTIterator.splits((train, val, test),
                                          batch_size=self.args.batch,
                                          bptt_len=self.args.bptt_len,
                                          device=torch.device('cpu'))
    return trains, vals
def get_data():
    train_iter = WikiText2(split='train')  # download the train iterator
    counter = Counter()  # instantiate a Counter instance
    # update the counter with the tokens (kind of a dictionary)
    for line in train_iter:
        counter.update(tokenizer(line))
    vocab = Vocab(counter)  # create a Vocab from the counter
    train_iter, val_iter, test_iter = WikiText2()
    train_data = preprocess(train_iter, vocab)
    val_data = preprocess(val_iter, vocab)
    test_data = preprocess(test_iter, vocab)
    train_data = batchify(train_data, batch_size)
    val_data = batchify(val_data, eval_batch_size)
    test_data = batchify(test_data, eval_batch_size)
    return train_data, val_data, test_data, vocab
def load_data(device):
    train_iter = WikiText2(split='train')
    tokenizer = get_tokenizer('basic_english')
    counter = Counter()
    for line in train_iter:
        counter.update(tokenizer(line))
    vocab = Vocab(counter)
    train_iter, val_iter, test_iter = WikiText2()
    train_data = data_process(train_iter, tokenizer, vocab)
    val_data = data_process(val_iter, tokenizer, vocab)
    test_data = data_process(test_iter, tokenizer, vocab)
    batch_size = 20
    eval_batch_size = 10
    train_data = batchfy(train_data, batch_size, device)
    val_data = batchfy(val_data, eval_batch_size, device)
    test_data = batchfy(test_data, eval_batch_size, device)
    return vocab, train_data, val_data, test_data
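# batchfy (sic) is called above with a device argument, unlike the batchify
# variants elsewhere in this section; a minimal sketch consistent with that
# signature (an assumption, not the original):
def batchfy(data, bsz, device):
    # Same trim-and-reshape as batchify, but move the result to the target device.
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    return data.view(bsz, -1).t().contiguous().to(device)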
def WikiTexts(batch_size=32, bptt=30, vectors="glove.6B.100d"):
    my_tok = spacy.load('en')
    # my_tok.tokenizer.add_special_case('<eos>', [{ORTH: '<eos>'}])
    # my_tok.tokenizer.add_special_case('<bos>', [{ORTH: '<bos>'}])
    # my_tok.tokenizer.add_special_case('<unk>', [{ORTH: '<unk>'}])
    TEXT = data.Field(lower=True, tokenize=spacy_tok)
    train, valid, test = WikiText2.splits(TEXT)
    TEXT.build_vocab(train, vectors=vectors)
    train_loader, val_loader, test_loader = data.BPTTIterator.splits(
        (train, valid, test),
        batch_size=batch_size,
        bptt_len=bptt,  # this is where we specify the sequence length
        # device=(0 if USE_GPU else -1),
        repeat=False)
    return train_loader, val_loader, test_loader, TEXT
def get_data(self):
    '''
    Retrieves data in a format that can be used in training by loading in batches.

    Returns
    -------
    obj
        Object loaded with language data.
    obj
        Torchtext data iterator.
    int
        Vocab size in the text dataset.
    obj
        Field object from Torchtext.
    obj
        Vocabulary taken from Torchtext Field.
    '''
    TEXT = Field(tokenize=self.tokenizer, lower=True)
    train, valid, test = WikiText2.splits(TEXT)
    TEXT.build_vocab(train)  # build the vocab from the training split
    vocab_size = len(TEXT.vocab)
    train_iter, valid_iter = BPTTIterator.splits((train, valid),
                                                 batch_size=self.config.batch_size,
                                                 bptt_len=8,
                                                 device=self.device,
                                                 repeat=False)
    train_loader = Batch(dl=train_iter, x_var='text')
    valid_loader = Batch(dl=valid_iter, x_var='text')
    print(len(train_loader))
    data_dict = edict({
        'train_loader': train_loader,
        'valid_loader': valid_loader,
        'train_iter': train_iter,
        'vocab_size': vocab_size,
        'vocab': TEXT.vocab
    })
    return data_dict
def test_wikitext2(self):
    from torchtext.experimental.datasets import WikiText2
    # Smoke test to ensure wikitext2 works properly.
    train_dataset, test_dataset, valid_dataset = WikiText2()
    self.assertEqual(len(train_dataset), 2049990)
    self.assertEqual(len(test_dataset), 241859)
    self.assertEqual(len(valid_dataset), 214417)
    vocab = train_dataset.get_vocab()
    tokens_ids = [vocab[token] for token in 'the player characters rest'.split()]
    self.assertEqual(tokens_ids, [2, 286, 503, 700])

    # Delete the dataset after we're done to save disk space on CI.
    datafile = os.path.join(self.project_root, ".data", "wikitext-2")
    conditional_remove(datafile)
    datafile = os.path.join(self.project_root, ".data", "wikitext-2-v1.zip")
    conditional_remove(datafile)
def evaluate_lm(model_path):
    """
    Evaluate a language model against WikiText-2.

    Arguments
    ---------
    model_path: string
        Path to a saved model checkpoint (e.g. an "RNN" or "QRNN" model).
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, TEXT = load_model(model_path, device)
    train, valid, test = WikiText2.splits(TEXT)
    BATCH_SIZE = 32
    BPTT_LEN = 30
    train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
        (train, valid, test),
        batch_size=BATCH_SIZE,
        bptt_len=BPTT_LEN,  # this is where we specify the sequence length
        device=device,
        repeat=False)
    criterion = nn.CrossEntropyLoss()
    model.eval()
    valid_loss, valid_perplexity = evaluate(model, valid_iter, criterion)
    test_loss, test_perplexity = evaluate(model, test_iter, criterion)
    print(f"Valid loss : {valid_loss:.3f}")
    print(f"Valid perplexity: {valid_perplexity:.2f}\n")
    print(f"Test loss : {test_loss:.3f}")
    print(f"Test perplexity: {test_perplexity:.2f}")
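# evaluate() is shared with train_lm below but not defined in this section.
# A minimal sketch of a BPTTIterator-based evaluation loop returning
# (loss, perplexity), assuming the model maps a token tensor straight to
# logits (the real model may also take and return hidden state):
import math

def evaluate(model, data_iter, criterion):
    model.eval()
    total_loss, n_batches = 0.0, 0
    with torch.no_grad():
        for batch in data_iter:
            output = model(batch.text)  # assumed: logits with vocab as last dim
            loss = criterion(output.view(-1, output.size(-1)), batch.target.view(-1))
            total_loss += loss.item()
            n_batches += 1
    avg_loss = total_loss / max(n_batches, 1)
    return avg_loss, math.exp(avg_loss)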
for epoch in range(20):
    train(model)
    val_loss = evaluate(model, val_data)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
    scheduler.step()
best_val_ppl = math.exp(best_val_loss)
# Report the best validation ppl to NNI as the final result of one trial.
nni.report_final_result(best_val_ppl)


if __name__ == "__main__":
    train_iter = WikiText2(split='train')
    tokenizer = get_tokenizer('basic_english')
    vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
    vocab.set_default_index(vocab['<unk>'])
    n_token = len(vocab)
    base_model = Transformer(n_token)
    evaluator = FunctionalEvaluator(fit)
    exp = RetiariiExperiment(base_model, evaluator, [], strategy.Random())
    exp_config = RetiariiExeConfig('local')
    exp_config.experiment_name = 'transformer tuning'
    exp_config.trial_concurrency = 3  # please change configurations accordingly
    exp_config.max_trial_number = 25
    exp_config.trial_gpu_number = 1
LABELS.build_vocab(train)
a = next(iter(data.BPTTIterator(train, 20, 20)))
train_iter, dev_iter, test_iter = data.BPTTIterator.splits(
    ([i.text for i in train], dev, test),
    bptt_len=13,
    batch_size=7,
    sort_key=lambda x: len(x.text),
    device='cpu')

# https://mlexplained.com/2018/02/15/language-modeling-tutorial-in-torchtext-practical-torchtext-part-2/
from torchtext.datasets import WikiText2

train, valid, test = WikiText2.splits(TEXT)  # loading custom datasets
len(train)
data.Example?
def main(args):
    if args.device:
        device = args.device
    else:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    text_field = data.Field(tokenize=list)
    datasets = WikiText2.splits(text_field)
    text_field.build_vocab(datasets[0])
    train_iter, test_iter, val_iter = data.BPTTIterator.splits(datasets,
                                                               batch_size=32,
                                                               bptt_len=512,
                                                               device=device)
    vocab = text_field.vocab
    print(f'Vocab size: {len(vocab)}')
    model_args = dict(rnn_type='lstm', ntoken=args.num_latents, ninp=256,
                      nhid=1024, nlayers=2)
    if args.model_args:
        model_args.update(dict(eval(args.model_args)))
    model = SHARNN(**model_args).to(device)
    model.train()
    criterion = nn.NLLLoss()
    # optim = torch.optim.SGD(model.parameters(), lr=5.0)
    optim = torch.optim.Adam(model.parameters(), lr=2e-3)
    for epoch in range(10):
        hidden = None
        mems = None
        total_loss = 0
        for step, batch in enumerate(train_iter):
            optim.zero_grad()
            if hidden is not None:
                hidden = repackage_hidden(hidden)
            if mems is not None:
                mems = repackage_hidden(mems)
            output, hidden, mems, attn_outs, _ = model(batch.text, hidden,
                                                       return_h=True, mems=mems)
            logits = model.decoder(output)
            logits = F.log_softmax(logits, dim=-1)
            assert logits.size(1) == batch.target.size(1)
            loss = criterion(logits.view(-1, logits.size(-1)), batch.target.view(-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optim.step()
            total_loss += loss.data
            if step % args.log_interval == 0 and step > 0:
                cur_loss = total_loss / args.log_interval
                print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | '
                      'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                          epoch, step, len(train_iter),
                          optim.param_groups[0]['lr'], cur_loss,
                          math.exp(cur_loss), cur_loss / math.log(2)))
                total_loss = 0
def main():
    args = parser.parse_args()
    tqdm.monitor_interval = 0
    tmp = os.environ.get('SLURM_TMPDIR')
    scratch = os.environ.get('SCRATCH')
    project = os.environ.get('project')
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    random.seed(1)
    if args.dataset == 'WikiText103':
        train_iter = WikiText103(root=tmp, split='train')
        print(f'dataset {args.dataset}')
    elif args.dataset == 'WikiText2':
        train_iter = WikiText2(root=tmp, split='train')
        print(f'dataset {args.dataset}')
    else:
        print('dataset not implemented!')
    tokenizer = get_tokenizer('basic_english')
    vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
    vocab.set_default_index(vocab['<unk>'])
    if args.dataset == 'WikiText103':
        train_iter, val_iter, test_iter = torchtext.datasets.WikiText103(
            root=tmp, split=('train', 'valid', 'test'))
    elif args.dataset == 'WikiText2':
        train_iter, val_iter, test_iter = torchtext.datasets.WikiText2(
            root=tmp, split=('train', 'valid', 'test'))
    else:
        print('dataset not implemented!')
    path = Path.cwd()
    if args.dataset == 'WikiText103':
        pathLog = path / 'logs/wikitext103'
        pathSaved = path / 'saved'
    else:
        pathLog = path / 'logs/wikitext2'
        pathSaved = path / 'saved/wikitext2'

    def data_process(raw_text_iter):
        """Converts raw text into a flat Tensor."""
        data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long)
                for item in raw_text_iter]
        return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

    train_data = data_process(train_iter)
    val_data = data_process(val_iter)
    test_data = data_process(test_iter)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def batchify(data, bsz):
        """Divides the data into bsz separate sequences, removing extra elements
        that wouldn't cleanly fit.

        Args:
            data: Tensor, shape [N]
            bsz: int, batch size

        Returns:
            Tensor of shape [N // bsz, bsz]
        """
        seq_len = data.size(0) // bsz
        data = data[:seq_len * bsz]
        data = data.view(bsz, seq_len).t().contiguous()
        return data

    batch_size = args.batch_size
    eval_batch_size = int(args.batch_size // 2)
    train_data = batchify(train_data, batch_size)  # shape [seq_len, batch_size]
    val_data = batchify(val_data, eval_batch_size)
    test_data = batchify(test_data, eval_batch_size)
    bptt = args.bptt

    def get_batch(source, i):
        """
        Args:
            source: Tensor, shape [full_seq_len, batch_size]
            i: int

        Returns:
            tuple (data, target), where data has shape [seq_len, batch_size]
            and target has shape [seq_len * batch_size]
        """
        seq_len = min(bptt, len(source) - 1 - i)
        data = source[i:i + seq_len]
        target = source[i + 1:i + 1 + seq_len].reshape(-1)
        return data, target

    ntokens = len(vocab)  # size of vocabulary
    emsize = args.emsize  # embedding dimension
    d_hid = args.d_hid  # dimension of the feedforward network model in nn.TransformerEncoder
    nlayers = args.nlayers  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
    nhead = args.nhead  # number of heads in nn.MultiheadAttention
    dropout = args.dropout  # dropout probability
    model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout).to(device)
    criterion = nn.CrossEntropyLoss()
    n_gpus = torch.cuda.device_count()
    print(f'batch size: {batch_size}, bptt: {bptt}, seed: {args.seed}, ngpus: {n_gpus}')
    print(f'len vocab: {ntokens}, embeddingSize: {emsize}, hiddenDim: {d_hid}')
    print(f'nlayers: {nlayers}, nAttentionHead: {nhead}, dropout: {dropout}')

    def train(model, train_data, bptt):
        model.train()  # turn on train mode
        total_loss = 0.
        count = 0
        log_interval = 5000
        # start_time = time.time()
        src_mask = generate_square_subsequent_mask(bptt).to(device)
        num_batches = len(range(0, train_data.size(0) - 1, bptt))
        progress = tqdm(total=num_batches)
        for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
            # if batch < 140000: continue
            data, targets = get_batch(train_data, i)
            data = data.to(device)
            targets = targets.to(device)
            batch_size = data.size(0)
            if batch_size != bptt:  # only on last batch
                src_mask = src_mask[:batch_size, :batch_size]
            output = model(data, src_mask)
            loss = criterion(output.view(-1, ntokens), targets)
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()
            total_loss += loss.detach() * batch_size
            count += 1
            if batch % log_interval == 0 and batch != 0:
                progress.update(log_interval)
        return total_loss / (len(train_data) - 1), count

    def evaluate(model, eval_data, bptt):
        model.eval()  # turn on evaluation mode
        total_loss = 0.
        src_mask = generate_square_subsequent_mask(bptt).to(device)
        with torch.no_grad():
            for batch, i in enumerate(range(0, eval_data.size(0) - 1, bptt)):
                data, targets = get_batch(eval_data, i)
                data = data.to(device)
                targets = targets.to(device)
                batch_size = data.size(0)
                if batch_size != bptt:
                    src_mask = src_mask[:batch_size, :batch_size]
                output = model(data, src_mask)
                output_flat = output.view(-1, ntokens)
                total_loss += batch_size * criterion(output_flat, targets)
        return total_loss / (len(eval_data) - 1)

    stepSize = len(range(0, train_data.size(0) - 1, bptt))
    lr = args.lr
    if args.optim == 'AdamW':
        optimizer = O.AdamW(model.parameters(), lr, weight_decay=args.wd)
    elif args.optim == 'Cons':
        optimizer = ConsciousLR(model.parameters(), stepSize, lr, weight_decay=args.wd)
    elif args.optim == 'Agg':
        optimizer = ConsciousLR(model.parameters(), stepSize, lr,
                                weight_decay=args.wd, lrHigh=2., lrLow=.5)
    elif args.optim == 'RAdamCons':
        optimizer = RAdamConsciousLR(model.parameters(), stepSize, lr,
                                     weight_decay=args.wd)
    elif args.optim == 'RAdamAgg':
        optimizer = RAdamConsciousLR(model.parameters(), stepSize, lr,
                                     weight_decay=args.wd, lrHigh=2., lrLow=.5)
    elif args.optim == 'RAdam':
        optimizer = RAdam(model.parameters(), lr, weight_decay=args.wd)
    elif args.optim == 'AdaBelief':
        optimizer = AdaBelief(model.parameters(), lr, weight_decay=args.wd)
    else:
        print('optimizer not implemented!!!')
    print(optimizer)
    best_test_loss = float('inf')
    epochs = args.max_epochs
    train_losses = []
    val_ppls = []
    test_ppls = []
    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        trainLoss, count = train(model, train_data, bptt)
        trainLoss = trainLoss.item()
        train_losses.append(trainLoss)
        val_loss = evaluate(model, val_data, bptt)
        val_loss = val_loss.item()
        val_ppl = math.exp(val_loss)
        val_ppls.append(val_ppl)
        test_loss = evaluate(model, test_data, bptt)
        test_loss = test_loss.item()
        test_ppl = math.exp(test_loss)
        test_ppls.append(test_ppl)
        elapsed = time.time() - epoch_start_time
        print('-' * 89)
        print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | trainLoss: {trainLoss:5.2f}'
              f' | valid ppl {val_ppl:8.2f}| test ppl {test_ppl:8.2f} |')
        print('-' * 89)
        if test_loss < best_test_loss:
            dic = {
                'model': model,
                'epoch': epoch,
                'val_loss': val_loss,
                'val_ppl': val_ppl,
                'train_loss': trainLoss,
                'test_ppl': test_ppl
            }
            if args.dataset == 'WikiText103':
                torch.save(dic, pathSaved / f'{args.optim}_{args.lr}_103model.pt')
            else:
                torch.save(dic, pathSaved / f'{args.optim}_{args.lr}_2model.pt')
            best_val_loss = val_loss
            best_val_ppl = val_ppl
            best_epoch = epoch
            best_train_loss = trainLoss
            best_test_loss = test_loss
            best_test_ppl = test_ppl
    log = {
        'train_losses': train_losses,
        'val_ppls': val_ppls,
        'test_ppls': test_ppls,
        'best_epoch': best_epoch,
        'best_val_ppl': best_val_ppl,
        'best_val_loss': best_val_loss,
        'best_train_loss': best_train_loss,
        'best_test_ppl': best_test_ppl,
        'best_test_loss': best_test_loss
    }
    if args.dataset == 'WikiText103':
        with open(pathLog / f'{args.optim}_{args.lr}_103.json', 'w') as fp:
            json.dump(log, fp)
    else:
        with open(pathLog / f'{args.optim}_{args.lr}_2.json', 'w') as fp:
            json.dump(log, fp)
    print(f'best test ppl: {best_test_ppl}')
    print(log)
# .. math::
#    \begin{bmatrix}
#    \text{A} & \text{B} & \text{C} & \ldots & \text{X} & \text{Y} & \text{Z}
#    \end{bmatrix}
#    \Rightarrow
#    \begin{bmatrix}
#    \begin{bmatrix}\text{A} \\ \text{B} \\ \text{C} \\ \text{D} \\ \text{E} \\ \text{F}\end{bmatrix} &
#    \begin{bmatrix}\text{G} \\ \text{H} \\ \text{I} \\ \text{J} \\ \text{K} \\ \text{L}\end{bmatrix} &
#    \begin{bmatrix}\text{M} \\ \text{N} \\ \text{O} \\ \text{P} \\ \text{Q} \\ \text{R}\end{bmatrix} &
#    \begin{bmatrix}\text{S} \\ \text{T} \\ \text{U} \\ \text{V} \\ \text{W} \\ \text{X}\end{bmatrix}
#    \end{bmatrix}
#
# These columns are treated as independent by the model, which means that
# the dependence of ``G`` and ``F`` can not be learned, but allows more
# efficient batch processing.

import torch
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

train_iter = WikiText2(split='train')
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])


def data_process(raw_text_iter):
    data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long)
            for item in raw_text_iter]
    return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))


train_iter, val_iter, test_iter = WikiText2()
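# The comment above describes the columnar layout produced by batchify, but
# this excerpt stops before defining it; a minimal sketch matching that
# description (assumed to mirror the tutorial's version):
def batchify(data, bsz):
    # Keep only as many tokens as fill bsz equal-length columns, then reshape
    # so that data[:, j] is the j-th independent stream.
    seq_len = data.size(0) // bsz
    data = data[:seq_len * bsz]
    return data.view(bsz, seq_len).t().contiguous()

train_data = batchify(data_process(train_iter), 20)  # e.g. 20 columns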
def segment(doc):
    """Tokenize with the spaCy library, splitting a document into tokens."""
    tokenizer = tokenize.tokenizer
    return [token.text for token in tokenizer(doc)]


# Define the text field representing a span of text: tokenize it with the
# rule above and lowercase it as preprocessing.
TEXT = data.Field(lower=True, tokenize=segment)

# torchtext.datasets ships some ready-made datasets, e.g. WikiText2 below.
# This call automatically creates a .data directory under the project root
# and downloads the data (4.4M); to spare the reader any confusion, an
# identical copy is kept under the data folder.
train_set, valid_set, test_set = WikiText2.splits(TEXT)

# See how many examples are in train/valid/test respectively (untokenized).
print(len(train_set), len(valid_set), len(test_set), end="\n\n")

# Pretrained word vectors can also be attached while building the vocab;
# commented out here.
TEXT.build_vocab(train_set)  # vectors="data/glove.6B.200d"

# The core of a language-model pipeline is the Iterator, whose subclass
# BPTTIterator cuts the text into consecutive equal-length sequences and
# batches them (the length is called bptt). For example:
#
#   "Machine learning is a field of computer science
#    that gives computers the ability to learn without
#    being explicitly programmed"
#
# With a cut length of 5, the text above yields the following list:
#
#   ["machine", "learning", "is", "a", "field"]
#   ["of", "computer", "science", "that", "gives"]
#   ["computers", "the", "ability", "to", "learn"]
#   ...
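# The sketch below (assumed, not from the original) shows what BPTTIterator
# yields: each batch carries aligned `text` and `target` tensors, where
# `target` is `text` shifted forward by one token.
train_it = data.BPTTIterator(train_set, batch_size=32, bptt_len=5, device='cpu')
batch = next(iter(train_it))
print(batch.text.shape, batch.target.shape)  # both [bptt_len, batch_size]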
def run_worker(rank, world_size):
    ######################################################################
    # Load and batch data
    # -------------------
    ######################################################################
    # The training process uses the Wikitext-2 dataset from ``torchtext``. The
    # vocab object is built based on the train dataset and is used to numericalize
    # tokens into tensors. Starting from sequential data, the ``batchify()``
    # function arranges the dataset into columns, trimming off any tokens remaining
    # after the data has been divided into batches of size ``batch_size``.
    # For instance, with the alphabet as the sequence (total length of 26)
    # and a batch size of 4, we would divide the alphabet into 4 sequences of
    # length 6:
    #
    # .. math::
    #    \begin{bmatrix}
    #    \text{A} & \text{B} & \text{C} & \ldots & \text{X} & \text{Y} & \text{Z}
    #    \end{bmatrix}
    #    \Rightarrow
    #    \begin{bmatrix}
    #    \begin{bmatrix}\text{A} \\ \text{B} \\ \text{C} \\ \text{D} \\ \text{E} \\ \text{F}\end{bmatrix} &
    #    \begin{bmatrix}\text{G} \\ \text{H} \\ \text{I} \\ \text{J} \\ \text{K} \\ \text{L}\end{bmatrix} &
    #    \begin{bmatrix}\text{M} \\ \text{N} \\ \text{O} \\ \text{P} \\ \text{Q} \\ \text{R}\end{bmatrix} &
    #    \begin{bmatrix}\text{S} \\ \text{T} \\ \text{U} \\ \text{V} \\ \text{W} \\ \text{X}\end{bmatrix}
    #    \end{bmatrix}
    #
    # These columns are treated as independent by the model, which means that
    # the dependence of ``G`` and ``F`` can not be learned, but allows more
    # efficient batch processing.

    # In 'run_worker'
    def print_with_rank(msg):
        print('[RANK {}]: {}'.format(rank, msg))

    from torchtext.datasets import WikiText2
    from torchtext.data.utils import get_tokenizer
    from torchtext.vocab import build_vocab_from_iterator

    train_iter = WikiText2(split='train')
    tokenizer = get_tokenizer('basic_english')
    vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])

    def data_process(raw_text_iter):
        data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long)
                for item in raw_text_iter]
        return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

    train_iter, val_iter, test_iter = WikiText2()
    train_data = data_process(train_iter)
    val_data = data_process(val_iter)
    test_data = data_process(test_iter)
    device = torch.device(2 * rank)

    def batchify(data, bsz, rank, world_size, is_train=False):
        # Divide the dataset into bsz parts.
        nbatch = data.size(0) // bsz
        # Trim off any extra elements that wouldn't cleanly fit (remainders).
        data = data.narrow(0, 0, nbatch * bsz)
        # Evenly divide the data across the bsz batches.
        data = data.view(bsz, -1).t().contiguous()
        # Divide the data across the ranks only for training data.
        if is_train:
            data_per_rank = data.size(0) // world_size
            data = data[rank * data_per_rank:(rank + 1) * data_per_rank]
        return data.to(device)

    batch_size = 20
    eval_batch_size = 10
    train_data = batchify(train_data, batch_size, rank, world_size, True)
    val_data = batchify(val_data, eval_batch_size, rank, world_size)
    test_data = batchify(test_data, eval_batch_size, rank, world_size)

    ######################################################################
    # Functions to generate input and target sequence
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    ######################################################################
    # ``get_batch()`` generates the input and target sequence for
    # the transformer model. It subdivides the source data into chunks of
    # length ``bptt``.
    # For the language modeling task, the model needs the
    # following words as ``Target``. For example, with a ``bptt`` value of 2,
    # we'd get the following two Variables for ``i`` = 0:
    #
    # .. image:: ../_static/img/transformer_input_target.png
    #
    # It should be noted that the chunks are along dimension 0, consistent
    # with the ``S`` dimension in the Transformer model. The batch dimension
    # ``N`` is along dimension 1.

    # In 'run_worker'
    bptt = 35

    def get_batch(source, i):
        seq_len = min(bptt, len(source) - 1 - i)
        data = source[i:i + seq_len]
        target = source[i + 1:i + 1 + seq_len].view(-1)
        # Need batch dimension first for pipeline parallelism.
        return data.t(), target

    ######################################################################
    # Model scale and Pipe initialization
    # -----------------------------------
    ######################################################################
    # To demonstrate training large Transformer models using pipeline parallelism,
    # we scale up the Transformer layers appropriately. We use an embedding
    # dimension of 4096, hidden size of 4096, 16 attention heads and 8 total
    # transformer layers (``nn.TransformerEncoderLayer``). This creates a model with
    # **~1 billion** parameters.
    #
    # We need to initialize the `RPC Framework <https://pytorch.org/docs/stable/rpc.html>`__
    # since Pipe depends on the RPC framework via `RRef <https://pytorch.org/docs/stable/rpc.html#rref>`__
    # which allows for future expansion to cross host pipelining. We need to
    # initialize the RPC framework with only a single worker since we're using a
    # single process to drive multiple GPUs.
    #
    # The pipeline is then initialized with 8 transformer layers on one GPU and 8
    # transformer layers on the other GPU. One pipe is set up across GPUs 0 and 1
    # and another across GPUs 2 and 3. Both pipes are then replicated using
    # DistributedDataParallel.

    # In 'run_worker'
    ntokens = len(vocab)  # the size of vocabulary
    emsize = 4096  # embedding dimension
    nhid = 4096  # the dimension of the feedforward network model in nn.TransformerEncoder
    nlayers = 8  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
    nhead = 16  # the number of heads in the multiheadattention models
    dropout = 0.2  # the dropout value

    from torch.distributed import rpc
    tmpfile = tempfile.NamedTemporaryFile()
    rpc.init_rpc(
        name="worker",
        rank=0,
        world_size=1,
        rpc_backend_options=rpc.TensorPipeRpcBackendOptions(
            init_method="file://{}".format(tmpfile.name),
            # Specifying _transports and _channels is a workaround; we will no
            # longer have to specify _transports and _channels for PyTorch
            # versions >= 1.8.1
            _transports=["ibv", "uv"],
            _channels=["cuda_ipc", "cuda_basic"],
        ))

    # Num gpus for model parallelism.
    num_gpus = 2
    partition_len = ((nlayers - 1) // num_gpus) + 1

    # Add encoder in the beginning.
    tmp_list = [Encoder(ntokens, emsize, dropout).cuda(2 * rank)]
    module_list = []

    # Add all the necessary transformer blocks.
    for i in range(nlayers):
        transformer_block = TransformerEncoderLayer(emsize, nhead, nhid, dropout)
        if i != 0 and i % partition_len == 0:
            module_list.append(nn.Sequential(*tmp_list))
            tmp_list = []
        device = i // partition_len
        tmp_list.append(transformer_block.to(2 * rank + device))

    # Add decoder in the end.
    tmp_list.append(Decoder(ntokens, emsize).cuda(2 * rank + num_gpus - 1))
    module_list.append(nn.Sequential(*tmp_list))

    # Need to use 'checkpoint=never' since as of PyTorch 1.8, Pipe checkpointing
    # doesn't work with DDP.
    from torch.distributed.pipeline.sync import Pipe

    chunks = 8
    model = Pipe(torch.nn.Sequential(*module_list), chunks=chunks, checkpoint="never")

    # Initialize process group and wrap model in DDP.
    from torch.nn.parallel import DistributedDataParallel
    import torch.distributed as dist
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    model = DistributedDataParallel(model)

    def get_total_params(module: torch.nn.Module):
        total_params = 0
        for param in module.parameters():
            total_params += param.numel()
        return total_params

    print_with_rank('Total parameters in model: {:,}'.format(get_total_params(model)))

    ######################################################################
    # Run the model
    # -------------
    ######################################################################
    # `CrossEntropyLoss <https://pytorch.org/docs/master/nn.html?highlight=crossentropyloss#torch.nn.CrossEntropyLoss>`__
    # is applied to track the loss and
    # `SGD <https://pytorch.org/docs/master/optim.html?highlight=sgd#torch.optim.SGD>`__
    # implements the stochastic gradient descent method as the optimizer. The initial
    # learning rate is set to 5.0. `StepLR <https://pytorch.org/docs/master/optim.html?highlight=steplr#torch.optim.lr_scheduler.StepLR>`__ is
    # applied to adjust the learning rate through epochs. During
    # training, we use the
    # `nn.utils.clip_grad_norm\_ <https://pytorch.org/docs/master/nn.html?highlight=nn%20utils%20clip_grad_norm#torch.nn.utils.clip_grad_norm_>`__
    # function to scale all the gradients together to prevent exploding.

    # In 'run_worker'
    criterion = nn.CrossEntropyLoss()
    lr = 5.0  # learning rate
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

    import time

    def train():
        model.train()  # Turn on the train mode
        total_loss = 0.
        start_time = time.time()
        ntokens = len(vocab)

        # Train only for 50 batches to keep script execution time low.
        nbatches = min(50 * bptt, train_data.size(0) - 1)
        for batch, i in enumerate(range(0, nbatches, bptt)):
            data, targets = get_batch(train_data, i)
            optimizer.zero_grad()
            # Since the Pipe is only within a single host and process, the ``RRef``
            # returned by the forward method is local to this node and can simply
            # be retrieved via ``RRef.local_value()``.
            output = model(data).local_value()
            # Need to move targets to the device where the output of the
            # pipeline resides.
            loss = criterion(output.view(-1, ntokens), targets.cuda(2 * rank + 1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

            total_loss += loss.item()
            log_interval = 10
            if batch % log_interval == 0 and batch > 0:
                cur_loss = total_loss / log_interval
                elapsed = time.time() - start_time
                print_with_rank('| epoch {:3d} | {:5d}/{:5d} batches | '
                                'lr {:02.2f} | ms/batch {:5.2f} | '
                                'loss {:5.2f} | ppl {:8.2f}'.format(
                                    epoch, batch, nbatches // bptt,
                                    scheduler.get_last_lr()[0],
                                    elapsed * 1000 / log_interval,
                                    cur_loss, math.exp(cur_loss)))
                total_loss = 0
                start_time = time.time()

    def evaluate(eval_model, data_source):
        eval_model.eval()  # Turn on the evaluation mode
        total_loss = 0.
        ntokens = len(vocab)
        # Evaluate only for 50 batches to keep script execution time low.
        nbatches = min(50 * bptt, data_source.size(0) - 1)
        with torch.no_grad():
            for i in range(0, nbatches, bptt):
                data, targets = get_batch(data_source, i)
                output = eval_model(data).local_value()
                output_flat = output.view(-1, ntokens)
                # Need to move targets to the device where the output of the
                # pipeline resides.
                total_loss += len(data) * criterion(
                    output_flat, targets.cuda(2 * rank + 1)).item()
        return total_loss / (len(data_source) - 1)

    ######################################################################
    # Loop over epochs. Save the model if the validation loss is the best
    # we've seen so far. Adjust the learning rate after each epoch.

    # In 'run_worker'
    best_val_loss = float("inf")
    epochs = 3  # The number of epochs
    best_model = None

    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(model, val_data)
        print_with_rank('-' * 89)
        print_with_rank('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                        'valid ppl {:8.2f}'.format(epoch,
                                                   (time.time() - epoch_start_time),
                                                   val_loss, math.exp(val_loss)))
        print_with_rank('-' * 89)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model
        scheduler.step()

    ######################################################################
    # Evaluate the model with the test dataset
    # ----------------------------------------
    # Apply the best model to check the result with the test dataset.

    # In 'run_worker'
    test_loss = evaluate(best_model, test_data)
    print_with_rank('=' * 89)
    print_with_rank('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)))
    print_with_rank('=' * 89)
from src.model.train_evaluate import train, evaluate
from src.model.model_utils import data_process, batchify, gen_tokenizer_and_vocab
from src.settings import DIR_MODELS

"""
Note: the ipynb has a modified version of the code below; this should be
functionalized and integrated
"""

if __name__ == '__main__':
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer, vocab = gen_tokenizer_and_vocab()
    train_iter, val_iter, test_iter = WikiText2()
    train_data = data_process(train_iter, vocab, tokenizer)
    val_data = data_process(val_iter, vocab, tokenizer)
    test_data = data_process(test_iter, vocab, tokenizer)

    batch_size = 20
    eval_batch_size = 10
    train_data = batchify(train_data, batch_size, device)
    val_data = batchify(val_data, eval_batch_size, device)
    test_data = batchify(test_data, eval_batch_size, device)

    ntokens = len(vocab.stoi)  # the size of vocabulary
    emsize = 200  # embedding dimension
    nhid = 200  # the dimension of the feedforward network model in nn.TransformerEncoder
    nlayers = 2  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
    nhead = 2  # the number of heads in the multiheadattention models
import spacy
from spacy.symbols import ORTH


def spacy_tok(x):
    # NOTE: iterating over a string yields characters, so despite its name this
    # tokenizer lowercases the text character by character.
    return [tok.lower() for tok in x]


TEXT = data.Field(lower=True, tokenize=spacy_tok)

from torchtext.datasets import WikiText2

# Loading built-in datasets requires passing in the field, but nothing else.
train, valid, test = WikiText2.splits(TEXT)

TEXT.build_vocab(train, vectors="glove.6B.200d")

train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
    (train, valid, test),
    batch_size=BATCH_SIZE,
    bptt_len=30,  # this is where we specify the sequence length
    device=(0 if USE_GPU else -1),
    repeat=False)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable as V
def forward(self, x):
    # x has shape [x.size(0), 1, d_model]; each element of x gets the
    # corresponding positional encoding added to it.
    x = x + self.pe[:x.size(0), :]
    return self.dropout(x)


# %% [markdown]
# # 1️⃣Load and batch data

import os
import torch
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, vocab

train_iter = WikiText2('data', split='train')
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])


def data_process(raw_text_iter):
    # Use the tokenizer to segment the text and the vocab to convert tokens
    # to ids.
    data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long)
            for item in raw_text_iter]
    # Discard zero-element lines and concatenate the rest; numel() gives the
    # number of elements in a tensor.
    return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))
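# A quick illustration (assumed, not from the original notebook): the
# basic_english tokenizer lowercases and splits the text, and the callable
# vocab maps a token list to integer ids.
tokens = tokenizer('The Valkyria Chronicles !')
ids = vocab(tokens)  # list of ints; the actual ids depend on the vocab build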
def train_lm(model_name, output_path, epochs=5, batch_size=32, bptt_len=35,
             lr=1e-3, optimizer="adam", min_freq=5, model_args={},
             scheduler_patience=5, scheduler_threshold=1e-4,
             early_stopping_tolerance=5):
    """
    Train and save a language model.

    Arguments
    ---------
    model_name: string
        Can be "RNN", "QRNN"
    output_path: a path
        Where to save the model
    lr: float
        Learning rate, default = 1e-3
    model_args: dict
        Arguments to be passed to the created model
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    TEXT = data.Field(
        tokenizer_language='en',
        lower=True,
        init_token='<sos>',
        eos_token='<eos>',
        batch_first=True,
    )
    train, valid, test = WikiText2.splits(TEXT)
    TEXT.build_vocab(train, min_freq=min_freq)
    print(f"We have {len(TEXT.vocab)} tokens in our vocabulary")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
        (train, valid, test),
        batch_size=batch_size,
        bptt_len=bptt_len,  # this is where we specify the sequence length
        device=device,
        repeat=False)

    model = create_model(model_name, TEXT, model_args=model_args)
    if "awd" in model_name:
        optimizer = "asgd"
    optimizer = create_optimizer(model, optimizer, lr)
    criterion = nn.CrossEntropyLoss()

    print(f"Using LR Scheduler with patience {scheduler_patience} and threshold {scheduler_threshold}")
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, 'min', patience=scheduler_patience, threshold=scheduler_threshold)

    model = model.to(device)
    criterion = criterion.to(device)
    model_path = output_path
    training_cycle(
        epochs=epochs,
        model=model,
        train_iter=train_iter,
        valid_iter=valid_iter,
        optimizer=optimizer,
        criterion=criterion,
        scheduler=lr_scheduler,
        model_path=model_path,
        early_stopping_tolerance=early_stopping_tolerance)

    model.load_state_dict(torch.load(model_path))
    model.eval()
    valid_loss, valid_perplexity = evaluate(model, valid_iter, criterion)
    test_loss, test_perplexity = evaluate(model, test_iter, criterion)
    print(f"Valid loss : {valid_loss:.2f}")
    print(f"Valid perplexity: {valid_perplexity:.2f}\n")
    print(f"Test loss : {test_loss:.2f}")
    print(f"Test perplexity: {test_perplexity:.2f}")
    save_model(model, TEXT, output_path)
def fit(model_cls):
    train_iter = WikiText2(split='train')
    tokenizer = get_tokenizer('basic_english')
    vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
    vocab.set_default_index(vocab['<unk>'])

    def process_data(raw_text_iter):
        """Converts raw text into a flat Tensor."""
        data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long)
                for item in raw_text_iter]
        return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

    train_iter, val_iter, _ = WikiText2()
    train_data = process_data(train_iter)
    val_data = process_data(val_iter)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def generate_batches(data, bsz):
        """Divides the data into bsz separate sequences."""
        seq_len = data.size(0) // bsz
        data = data[:seq_len * bsz]
        data = data.view(bsz, seq_len).t().contiguous()
        return data.to(device)

    batch_size = 20
    eval_batch_size = 10
    train_data = generate_batches(train_data, batch_size)
    val_data = generate_batches(val_data, eval_batch_size)
    seq_len = 35

    def get_seq(source, i):
        """
        Args:
            source: Tensor, with size [full_seq_len, batch_size]
            i: int

        Returns:
            tuple (data, target): data has size [seq_len, batch_size] and
            target has size [seq_len * batch_size]
        """
        part_len = min(seq_len, len(source) - 1 - i)
        data = source[i:i + part_len]
        target = source[i + 1:i + 1 + part_len].reshape(-1)
        return data, target

    def generate_square_subsequent_mask(sz):
        """Generates an upper-triangular matrix of -inf, with zeros on diag."""
        return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

    model = model_cls().to(device)
    lr = 5.0
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

    def train(model):
        model.train()
        src_mask = generate_square_subsequent_mask(seq_len).to(device)
        for i in range(0, train_data.size(0) - 1, seq_len):
            data, target = get_seq(train_data, i)
            part_len = data.size(0)
            if part_len != seq_len:
                src_mask = src_mask[:part_len, :part_len]
            output = model(data, src_mask)
            loss = F.cross_entropy(output.view(-1, output.size(-1)), target)
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

    def evaluate(model, eval_data):
        model.eval()
        src_mask = generate_square_subsequent_mask(seq_len).to(device)
        total_loss = 0.
        with torch.no_grad():
            for i in range(0, eval_data.size(0) - 1, seq_len):
                data, target = get_seq(eval_data, i)
                part_len = data.size(0)
                if part_len != seq_len:
                    src_mask = src_mask[:part_len, :part_len]
                output = model(data, src_mask)
                output_flat = output.view(-1, output.size(-1))
                total_loss += part_len * F.cross_entropy(output_flat, target).item()
        return total_loss / (len(eval_data) - 1)

    best_val_loss = float('inf')
    for epoch in range(20):
        train(model)
        val_loss = evaluate(model, val_data)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
        scheduler.step()
    best_val_ppl = math.exp(best_val_loss)
    # Report the best validation ppl to NNI as the final result of one trial.
    nni.report_final_result(best_val_ppl)
def run_worker(ps_rref, data_dir, batch_size, num_epochs, worker, job_name):
    worker_rank = int(worker[-1])
    info_socketm = znet.SocketMsger.tcp_connect(DORKER0_IP, INFO_PORT)
    info_socketm.send("WORKER")
    info_socketm.send(f"1.0\n/home/ubuntu/measurement/logs/{job_name}_info{worker_rank}.log\n{job_name}")
    logger = Logger(job_name=job_name,
                    file_dir=f"./measurement/logs/{job_name}_{worker}.log").logger
    train_iter = WikiText2(root=data_dir, split='train')
    tokenizer = get_tokenizer('basic_english')
    vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])
    ntokens = len(vocab)
    bptt = 35
    train_iter, val_iter, test_iter = WikiText2(root=data_dir)
    train_data = data_process(train_iter, vocab, tokenizer)
    train_data = batchify(train_data, batch_size)
    device_id = 0
    device = torch.device(f"cuda:{device_id}" if torch.cuda.is_available() else "cpu")
    name = rpc.get_worker_info().name
    ps_rref.rpc_sync().set_ps_launched_to_true()
    m = ps_rref.rpc_sync().get_model().to(device)
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)
    stop_flag = False
    info_socketm.send("START")
    if info_socketm.recv() != "CONFIRM":
        return
    cm_t1_end = time.time()
    tt0 = time.time()
    for epoch in range(num_epochs):
        for batch_idx, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
            data, target = get_batch(train_data, i, bptt)
            data, target = data.to(device), target.to(device)
            output = m(data)
            output = output.view(-1, ntokens)
            loss = criterion(output, target)
            loss.backward()
            cm_t0_start = time.time()
            cp_t = 1000 * (cm_t0_start - cm_t1_end)
            logger.info("{:8s} | Epoch: {:3d} | Batch: {:3d} | Loss: {:6.2f} | Computation Time: {:7.2f} ms"
                        .format(name, (epoch + 1), (batch_idx + 1), loss.item(), cp_t))
            m, stop_flag = rpc.rpc_sync(
                to=ps_rref.owner(),
                func=ParameterServer.update_and_fetch_model,
                args=(ps_rref, [p.grad for p in m.cpu().parameters()], name,
                      epoch, batch_idx, cm_t0_start, cm_t1_end))
            m.to(device)
            cm_t1_end = time.time()
            if stop_flag:
                break
        if stop_flag:
            break
    tt1 = time.time()
    info_socketm.send("END")
    logger.info("Time: {:.2f} seconds".format((tt1 - tt0)))
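# Both this worker and get_accuracy above call get_batch(source, i, bptt)
# without defining it; a minimal sketch consistent with that three-argument
# signature (an assumption, not the original):
def get_batch(source, i, bptt):
    # Slice a bptt-long chunk starting at i; the target is the same chunk
    # shifted forward by one token and flattened.
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].reshape(-1)
    return data, target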
        loss = criterion(outs.view(-1, outs.size(-1)), targets.view(-1))
        epoch_loss += loss.item()
    return epoch_loss / len(devLoader)


###############################################################################
# Load data
###############################################################################
configfile = open('./config.yaml')
config = AttrDict(yaml.load(configfile, Loader=yaml.FullLoader))
device = torch.device(args.device)

# ? include lengths
# TEXT: split string into tokens
TEXT = Field(lower=True, include_lengths=False, batch_first=False)
trainSet, devSet, testSet = WikiText2.splits(text_field=TEXT, root=config.data.data_root)
if config.model.rnn.pretrained_embedding:
    vec = torchtext.vocab.FastText(language='en', cache=config.data.fasttext_root)
    assert vec.dim == config.model.rnn.nemd
else:
    vec = None
# TEXT: numericalize, pad, add init_token and eos_token
TEXT.build_vocab(trainSet, vectors=vec)
trainLoader, devLoader, testLoader = BPTTIterator.splits(
    (trainSet, devSet, testSet),
    batch_size=config.data.BSZ,
    bptt_len=config.data.bptt_len,
    device=device)
assert len(TEXT.vocab) == config.data.vocabSize