def val_loss_func(data_source):
    """Return the cross-entropy loss on a single leading batch of `data_source`.

    Uses a freshly initialized hidden state of size `args.batch_size` and a
    single segment of length `args.bptt` starting at position 0.

    NOTE(review): `model.eval()` is intentionally left commented out (original
    TODO), so the model stays in whatever dropout mode it is currently in.
    """
    hidden = model.init_hidden(args.batch_size)
    # model.eval()  # TODO: Do we want evaluation mode (dropout disabled) here?
    vocab_size = len(corpus.dictionary)
    segment_len = args.bptt
    batch_data, batch_targets = get_batch(data_source, 0, args, seq_len=segment_len)
    # Detach the hidden state so no stale graph is dragged into this forward pass.
    hidden = repackage_hidden(hidden)
    logits, hidden, rnn_hs, dropped_rnn_hs = model(batch_data, hidden, return_h=True)
    return criterion(logits.view(-1, vocab_size), batch_targets)
def evaluate(data_source, batch_size=10):
    """Return the mean per-timestep cross-entropy loss over `data_source`.

    Iterates the corpus in `args.bptt`-sized segments, weighting each
    segment's mean loss by its length (the last segment may be shorter).

    Args:
        data_source: token tensor of shape (num_steps, batch_size).
        batch_size: batch size used to initialize the recurrent hidden state.

    Returns:
        float: total weighted loss divided by ``len(data_source)``.
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    with torch.no_grad():
        # Accumulate into a plain float: avoids the deprecated `.data` access
        # and the AttributeError the old code hit on an empty data_source
        # (int 0 has no `.item()`).
        total_loss = 0.0
        ntokens = len(corpus.dictionary)
        hidden = model.init_hidden(batch_size)
        for i in range(0, data_source.size(0) - 1, args.bptt):
            data, targets = get_batch(data_source, i, args, evaluation=True)
            output, hidden = model(data, hidden)
            # Weight each segment's mean loss by its length.
            total_loss += len(data) * criterion(output.view(-1, ntokens), targets).item()
            # Detach so the graph from this segment is not retained.
            hidden = repackage_hidden(hidden)
        return total_loss / len(data_source)
def train_loss_func():
    """Compute the training loss on the leading batch of `train_data`.

    Returns:
        tuple: ``(xentropy_loss, loss)`` where `loss` additionally includes
        the model's L2 penalty when `args.wdecay_type` selects one.
    """
    hidden = model.init_hidden(args.batch_size)
    vocab_size = len(corpus.dictionary)
    segment_len = args.bptt
    # Turn on training mode which enables dropout.
    model.train()
    batch_data, batch_targets = get_batch(train_data, 0, args, seq_len=segment_len)
    logits, hidden, rnn_hs, dropped_rnn_hs = model(batch_data, hidden, return_h=True)
    xentropy_loss = criterion(logits.view(-1, vocab_size), batch_targets)
    total = xentropy_loss
    if args.wdecay_type in ['global', 'per_layer', 'per_param']:
        total = total + model.L2_loss()
    return xentropy_loss, total
def train():
    """Run one epoch of truncated-BPTT training over `train_data`.

    Walks the corpus in `args.bptt`-sized segments, optionally perturbing
    dropout hyperparameters per step, applies AR/TAR regularization, clips
    gradients, and logs train/val perplexity every `args.log_interval`
    batches. Increments the module-level `global_iteration` counter.

    Returns:
        float: mean raw (unregularized) cross-entropy loss over the epoch.
    """
    global global_iteration
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    batch, i = 0, 0
    train_losses = []
    while i < train_data.size(0) - 1 - 1:
        seq_len = args.bptt
        # Save the base lr, then rescale it by segment length so shorter
        # final segments contribute proportionally; restored after step().
        # (With seq_len == args.bptt this is currently a no-op scale of 1.)
        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()

        # Optionally perturb hyperparameter values (via Gaussian or sinusoid perturbations)
        # ---------------------------------------------------------------------------------
        for hparam in ['dropoute', 'dropouti', 'dropouth', 'dropouto']:
            if getattr(args, 'perturb_' + hparam):
                if args.perturb_type == 'gaussian':
                    # Additive Gaussian noise around the configured value.
                    gaussian_perturbation = args.perturb_std * np.random.randn()
                    use_hparam_value = getattr(args, hparam) + gaussian_perturbation
                elif args.perturb_type == 'sinusoid':
                    # Sinusoidal schedule; `multipliers` tracks per-hparam phase.
                    use_hparam_value = getattr(args, hparam) + args.amplitude * np.sin(multipliers[hparam] * 2 * np.pi)
                    multipliers[hparam] += args.multiplier_increment
                # Keep the perturbed dropout rate in a valid [0, 0.99] range.
                use_hparam_value = np.clip(use_hparam_value, 0.0, 0.99)
                setattr(model, hparam, use_hparam_value)
        # ---------------------------------------------------------------------------------

        output, hidden, rnn_hs, dropped_rnn_hs = model(data, hidden, return_h=True)
        raw_loss = criterion(output.view(-1, ntokens), targets)
        loss = raw_loss
        # Activation Regularization (penalize large post-dropout activations).
        if args.alpha:
            loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (slowness between adjacent steps).
        if args.beta:
            loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        if args.clip:
            torch.nn.utils.clip_grad_norm_(params, args.clip)
        optimizer.step()

        total_loss += raw_loss.data
        train_losses.append(raw_loss.item())
        # Restore the un-rescaled learning rate for the next iteration.
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss.item() / args.log_interval
            elapsed = time.time() - start_time
            val_loss = evaluate(val_data, eval_batch_size)
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | trn ppl {:8.2f} | val ppl {:8.2f} | bpc {:8.3f}'
                .format(epoch, batch, len(train_data) // args.bptt,
                        optimizer.param_groups[0]['lr'],
                        elapsed * 1000 / args.log_interval, cur_loss,
                        math.exp(cur_loss), math.exp(val_loss),
                        cur_loss / math.log(2)))
            total_loss = 0
            start_time = time.time()
        ###
        batch += 1
        i += seq_len
        global_iteration += 1
    return np.mean(train_losses)