def evaluate(data_source, batch_size=10):
    """Return the length-weighted average loss of the global ``model`` on ``data_source``.

    ``data_source`` is walked in windows of ``args.bptt`` rows; each window is
    fetched with ``ds.get_batch`` and its loss is weighted by ``len(data)``
    before being summed. The sum is divided by ``len(data_source)``.

    Args:
        data_source: Batched evaluation data; must support ``.size(0)`` and
            ``len()``.
        batch_size: Batch size handed to ``model.init_hidden``.

    Returns:
        The average loss as a Python ``float``. When ``data_source`` yields no
        window at all the result is ``0.0``.
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    if args.model == 'QRNN':
        model.reset()
    # Starts as a plain int and becomes a tensor after the first window.
    total_loss = 0
    hidden = model.init_hidden(batch_size)
    # Nothing is back-propagated here, so do not record an autograd graph.
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, args.bptt):
            data, targets = ds.get_batch(data_source, i)
            targets = targets.view(-1)
            output, hidden = model(data, hidden)
            total_loss += len(data) * criterion(
                model.decoder.weight, model.decoder.bias, output, targets).data
            hidden = repackage_hidden(hidden)
    # float() accepts both the initial int and a one-element tensor, so an
    # empty loop no longer fails on ``int.item()``.
    return float(total_loss) / len(data_source)
def evaluate_scores(epoch, batch_size):
    """Score every batch of ``ds.train_seq()`` in eval mode and record the losses.

    Each batch loss is passed to ``sk.add_prior_sample(epoch, loss)``; after
    the last batch ``sk.save_prior_epoch()`` is called.

    Args:
        epoch: Epoch index forwarded to ``sk.add_prior_sample``.
        batch_size: Batch size handed to ``model.init_hidden``.

    Returns:
        The sum of ``len(data) * loss`` over all batches divided by
        ``ds.data_size``, as a Python ``float``.
    """
    model.eval()
    if args.model == 'QRNN':
        model.reset()
    total_loss = 0.0
    hidden = model.init_hidden(batch_size)
    # Nothing is back-propagated here, so do not record an autograd graph.
    with torch.no_grad():
        for data, targets in ds.train_seq():
            targets = targets.view(-1).contiguous()
            output, hidden = model(data, hidden)
            loss = criterion(model.decoder.weight, model.decoder.bias,
                             output, targets)
            # Fetch the scalar once and reuse it for both the recorder and
            # the running total.
            loss_value = loss.item()
            sk.add_prior_sample(epoch, loss_value)
            total_loss += len(data) * loss_value
            hidden = repackage_hidden(hidden)
    sk.save_prior_epoch()
    # total_loss is a plain float, so an empty train_seq() yields 0.0 instead
    # of failing on ``int.item()``.
    return total_loss / ds.data_size
def _embedding_input_grad():
    """Project the gradient stored on ``embed.last_oh`` through the reciprocal embedding weights.

    Computes ``grad @ (1 / (embed.last_weight * args.emsize + eps))`` for every
    (token, batch) position, where ``eps`` is ``sys.float_info.epsilon``.

    Returns:
        A tensor of shape ``(args.bptt, args.batch_size, 1, args.emsize)``.
    """
    with torch.no_grad():
        # shape (bptt, batch_size, voc_size)
        grad = embed.last_oh.grad
        # Identical for every position, so compute the reciprocal only once.
        inv_weight = 1 / (embed.last_weight * args.emsize
                          + sys.float_info.epsilon)
        # (bptt, batch_size, 1, voc_size) @ (voc_size, emsize)
        #   -> (bptt, batch_size, 1, emsize)
        res = torch.matmul(
            grad[:args.bptt, :args.batch_size].unsqueeze(2), inv_weight)
    assert list(res.shape) == [args.bptt, args.batch_size, 1, args.emsize], \
        "unexpected embedding-gradient shape {}".format(list(res.shape))
    return res


def train(epoch):
    """Train the global ``model`` on the batches yielded by ``train_seq()``.

    For each batch this runs a forward/backward pass, optionally adds the
    activation regularisation terms (``args.alpha`` / ``args.beta``), clips
    gradients when ``args.clip`` is set, steps ``optimizer`` and reports the
    loss to ``sk``, ``logger`` and ``save_tb``. On epoch 1 and every
    ``args.grad_interval``-th epoch, embedding-input gradients are also
    recorded under "grad" / "gradPure" when ``args.save_grad`` /
    ``args.save_gradPure`` are set.

    The global step counter ``tot_steps`` is incremented once per batch. When
    it hits a value in ``args.when_steps`` the model is saved and the learning
    rate divided by 10; when it reaches ``args.max_steps`` the loop stops.

    Args:
        epoch: Current epoch number; used for gradient-collection scheduling,
            logging and the checkpoint file name.
    """
    global tot_steps
    if args.model == 'QRNN':
        model.reset()
    total_loss = 0
    start_time = time.time()
    hidden = model.init_hidden(ds.batch_size)
    i = 0

    # Decide whether embedding gradients are collected during this epoch.
    if (epoch % args.grad_interval == 0 or epoch == 1) and \
            (args.save_grad or args.save_gradPure):
        embed.requires_grad = True
        save_grad, save_gradPure = args.save_grad, args.save_gradPure
    else:
        if embed and embed.requires_grad:
            embed.requires_grad = False
        save_grad, save_gradPure = False, False

    # NOTE(review): evaluate_scores iterates ``ds.train_seq()`` while this
    # loop calls a bare ``train_seq()`` - confirm that name exists at module
    # level.
    for data, targets in train_seq():
        # shape of data is (bptt, batch_size)
        targets = targets.view(-1).contiguous()

        # seq_len always equals args.bptt here, so this rescale multiplies the
        # learning rate by 1; the saved value is restored after the step.
        seq_len = args.bptt
        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt

        # Turn on training mode which enables dropout.
        model.train()
        # Starting each batch, we detach the hidden state from how it was
        # previously produced. If we didn't, the model would try
        # backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()

        output, hidden, rnn_hs, dropped_rnn_hs = model(
            data, hidden, return_h=True)
        raw_loss = criterion(model.decoder.weight, model.decoder.bias,
                             output, targets)

        loss = raw_loss
        # Activation Regularization
        if args.alpha:
            loss = loss + sum(
                args.alpha * dropped_rnn_h.pow(2).mean()
                for dropped_rnn_h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (slowness)
        if args.beta:
            loss = loss + sum(
                args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                for rnn_h in rnn_hs[-1:])
        # Keep the graph only when the second backward pass further down will
        # actually run for this epoch.
        loss.backward(retain_graph=save_gradPure)
        sk.add_sample(epoch, i, loss.item())

        if save_grad:
            res = _embedding_input_grad()
            sk.add_data("grad", epoch, i,
                        res.detach().cpu().numpy().tolist())

        # `clip_grad_norm` helps prevent the exploding gradient problem in
        # RNNs / LSTMs.
        if args.clip:
            torch.nn.utils.clip_grad_norm_(params, args.clip)
        optimizer.step()

        total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if i % args.log_interval == 0 and i > 0:
            cur_loss = total_loss.item() / args.log_interval
            elapsed = time.time() - start_time
            ppl = math.exp(cur_loss)
            bpc = cur_loss / math.log(2)
            logger.info(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                    epoch, i, ds.nbatch, optimizer.param_groups[0]['lr'],
                    elapsed * 1000 / args.log_interval, cur_loss, ppl, bpc))
            save_tb(tb, "train/loss", tot_steps, cur_loss)
            save_tb(tb, "train/ppl", tot_steps, ppl)
            total_loss = 0
            start_time = time.time()

        if save_gradPure:
            # Second backward pass through the retained graph: gradient of the
            # summed raw outputs instead of the loss.
            optimizer.zero_grad()
            embed.last_oh.grad.zero_()
            output.sum().backward()
            res = _embedding_input_grad()
            sk.add_data("gradPure", epoch, i,
                        res.detach().cpu().numpy().tolist())

        tot_steps += 1
        i += 1
        if tot_steps in args.when_steps:
            logger.info(f'(Step {tot_steps}) Saving model before learning '
                        'rate decreased')
            model_save('{}.e{}'.format("model.pt", epoch))
            logger.info('Dividing learning rate by 10')
            optimizer.param_groups[0]['lr'] /= 10.
        if tot_steps >= args.max_steps:
            logger.info(f"Reached max-steps at tot step {tot_steps}, breaking "
                        "the train function")
            break