def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    # hidden = model.init_hidden(args.batch_size)
    count = 0
    for i, batch in enumerate(train_data):
        data, targets, seq = batch['inp'], batch['tar'], batch['lens']
        hidden = model.init_hidden(args.batch_size)
        model.zero_grad()
        output, hidden = model(data, seq, hidden)
        loss = criterion(output.view(-1, ntokens), targets.view(-1))
        loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(p.grad.data, alpha=-lr)
        total_loss += loss.item()
        if i % args.log_interval == 0 and i > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, i, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def train(global_step):
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    if args.model != 'Transformer':
        hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        batch_start = time.time()
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()
        if args.model == 'Transformer':
            output = model(data)
        else:
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(p.grad.data, alpha=-lr)
        total_loss += loss.item()
        elapsed_secs = time.time() - batch_start
        global_step += 1
        if hvd.rank() == 0 and global_step >= 1000:
            print(" %d %.6f " % (global_step, elapsed_secs))
        if global_step >= 1600:
            break
    return global_step
def train():
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(train_batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        model.zero_grad()
        hidden = repackage_hidden(hidden)
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        total_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    optimizer = torch.optim.Adam(model.parameters(), args.lr, amsgrad=True)
    if args.model != 'Transformer':
        hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        model.zero_grad()
        output = model(data)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()
        for p in model.parameters():
            p.data.add_(p.grad, alpha=-lr)
        total_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        if args.dry_run:
            break
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    for batch, i in enumerate(range(0, len(train_data))):
        data, lens, targets = get_batch(train_data, train_lens, train_tgt, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()
        output = model(data, lens)
        acc, recall, prec = metric(output, targets)
        loss = criterion(output, targets)
        loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        # for p in model.parameters():
        #     p.data.add_(-lr, p.grad.data)
        optim.step()
        total_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print(lr, prec)
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | acc {:5.2f} | recall {:5.2f} | prec {:5.2f}'.format(
                      epoch, batch, len(train_data), lr,
                      elapsed * 1000 / args.log_interval, cur_loss, acc, recall, prec))
            total_loss = 0
            start_time = time.time()
def train():
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        clipped_lr = lr * clip_gradient(model, args.clip)
        for p in model.parameters():
            p.data.add_(p.grad.data, alpha=-clipped_lr)
        total_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def get_onehot_grad(model, batch):
    # Computes a per-token saliency score: the gradient of the loss w.r.t. each
    # question embedding, multiplied elementwise by the embedding and summed over
    # the embedding dimension.
    extracted_grads = {}

    def get_grad_hook(name):
        def hook(grad):
            extracted_grads[name] = grad
        return hook

    model.eval()
    v, q, a, idx, q_len = batch
    batch_size, length = q.shape
    v = Variable(v).cuda()
    q = Variable(q).cuda()
    q_len = Variable(q_len).cuda()
    out = model(v, q, q_len, embed_grad_hook=get_grad_hook('embed'))
    embed = model.module.text.embedding(q)
    pred = torch.max(out, 1)[1]
    loss = F.nll_loss(F.log_softmax(out, 1), pred)
    model.zero_grad()
    loss.backward()
    embed_grad = extracted_grads['embed']
    onehot_grad = embed.view(-1) * embed_grad.view(-1)
    onehot_grad = onehot_grad.view(batch_size, length, -1).sum(-1)
    return onehot_grad
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    # hidden = model.reset_hidden()
    s = 0
    total = sum(tic_marks)
    for i in range(len(tic_marks)):
        data, targets = get_batch(train_data, i)
        s += data.size(0)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(p.grad.data, alpha=-lr)
        total_loss += loss.item()
        print('%d/%d' % (s, total))
def train():
    total_loss = 0
    start_time = time.time()
    ntokens = corpus.vocabulary.num_words
    hidden = model.init_hidden(BATCH_SIZE)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, SEQUENCE_LEN)):
        data, targets = get_batch(train_data, i)
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        optimizer.step()
        # clipped_lr = lr * clip_gradient(model, GRADIENT_CLIP)
        # for p in model.parameters():
        #     p.data.add_(-clipped_lr, p.grad.data)
        total_loss += loss.data
        if batch % LOG_INTERVAL == 0 and batch > 0:
            cur_loss = total_loss.item() / LOG_INTERVAL
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // SEQUENCE_LEN, lr,
                      elapsed * 1000 / LOG_INTERVAL, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def train():
    # Turn on training mode which enables dropout and batch norm.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(p.grad.data, alpha=-lr)
        total_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(p.grad.data, alpha=-lr)
        total_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)  # calls the forward method
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(p.grad.data, alpha=-lr)
        total_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:04.4f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def train():
    global lr
    global best_val_loss
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    eval_start_time = time.time()
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(p.grad.data, alpha=-lr)
        total_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        # Evaluate every args.eval_interval batches.
        if batch % args.eval_interval == 0 and batch > 0:
            val_t_loss = evaluate(val_t_data)
            val_f_loss = evaluate(val_f_data)
            logging('-' * 89)
            logging('| eval {:3d} in epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} {:5.2f} | '
                    'valid ppl {:8.2f} {:8.2f}'.format(
                        batch // args.eval_interval, epoch, (time.time() - eval_start_time),
                        val_t_loss, val_f_loss, math.exp(val_t_loss), math.exp(val_f_loss)))
            logging('-' * 89)
            eval_start_time = time.time()
            # Save the model if the validation loss is the best we've seen so far.
            if best_val_loss is None or val_t_loss < best_val_loss:
                torch.save(model, os.path.join(args.save, 'model.pt'))
                best_val_loss = val_t_loss
            else:
                # Anneal the learning rate if no improvement has been seen in the validation dataset.
                lr /= 4.0
            model.train()
def train(model, train_vars, train_labels, val_vars, test_labels):
    iterations = 400
    model.training = True
    loss_fn = torch.nn.BCELoss()
    learning_rate = 1e-3
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.00012)
    for t in range(iterations):
        prediction = model(train_vars)
        loss = loss_fn(prediction, train_labels)
        model.zero_grad()  # Zero out the previous gradient computation
        loss.backward()    # Compute the gradient
        optimizer.step()   # Use the gradient information to make a step
        if t % 10 == 0:
            print("iter: {} --------".format(t))
            print("  loss: {:.4f}".format(loss.item()))
            print("  acc: {:.4f}".format(testNN(model, val_vars, test_labels)))
    return model
def train():
    total_loss = 0.
    start_time = time.time()
    for i in range(0, train_data.size(1) - args.seq_size - 1):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()
        output = model(data)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if i % args.log_interval == 0 and i > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:.2e} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, i, train_data.size(1) - args.seq_size, args.lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        if args.dry_run:
            break
def train():
    model.train()  # Turn on training mode, which enables dropout.
    total_loss = 0.
    start_time = time.time()
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data_seq, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data_seq, hidden)
        loss = criterion(output.view(-1, len(corpus.dictionary)), targets)
        loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(p.grad.data, alpha=-lr)
        total_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print(f'| epoch {epoch:3d} | {batch:5d}/{len(train_data) // args.bptt:5d} batches '
                  f'| lr {lr:02.2f} | ms/batch {elapsed * 1000 / args.log_interval:5.2f} '
                  f'| loss {cur_loss:5.2f} | ppl {math.exp(cur_loss):8.2f}')
            total_loss = 0
            start_time = time.time()
def train(self, model, train_data):
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(self.corpus.dictionary)
    hidden = model.init_hidden(self.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, self.bptt)):
        data, targets = self.get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()
        hidden = self.repackage_hidden(hidden)
        output, hidden = model(data, hidden)
        loss = self.criterion(output, targets)
        loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), self.clip)
        for p in model.parameters():
            p.data.add_(p.grad, alpha=-self.lr)
        total_loss += loss.item()
        if batch % self.log_interval == 0 and batch > 0:
            cur_loss = total_loss / self.log_interval
            elapsed = time.time() - start_time
            print('| {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      batch, len(train_data) // self.bptt, self.lr,
                      elapsed * 1000 / self.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    # ntokens is 33278.
    ntokens = len(corpus.dictionary)
    # Initialise the hidden state with batch_size.
    # The following holds for an RNN; for an LSTM, hidden is a tuple.
    # type(hidden) ==> Tensor, hidden.size() ==> [2, 20, 200]
    # args.batch_size is 20.
    hidden = model.init_hidden(args.batch_size)
    # args.bptt: sequence length.
    # train_data.size(0) is the total number of batches (not the same as `batch` below),
    # train_data.size(1) is the batch size; bptt is the sequence length, i.e. how many
    # batches are taken at a time.
    count = 0
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        # type(data) ==> Tensor, data.size() ==> 35*20, not zero
        # type(targets) ==> Tensor, targets.size() ==> 700, not zero
        # args.bptt is 35.
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        # hidden has size (nlayers, bsz, nhid); the hidden obtained here is effectively a
        # copy of the original hidden.
        hidden = repackage_hidden(hidden)
        # Sets gradients of all model parameters to zero.
        model.zero_grad()
        # Arguments: (35*20, 2*20*200); output = (35, 20, 33278), hidden = (2, 20, 200).
        # 35 is the sequence length, 20 the batch_size, 33278 is num_directions*hidden_size
        # (the statement above may be inaccurate).
        output, hidden = model(data, hidden)
        # output.view(-1, ntokens) ==> 700*33278
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(p.grad.data, alpha=-lr)
        total_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        print("Done!")
        count += 1
        if count == 10:
            break
def train_batch(model, source, target, optimizer, criterion, attn=False):
    loss = 0
    model.zero_grad()
    if attn:
        output, hidden, attention = model(source, target, use_target=True)
    else:
        output, hidden = model(source, target, use_target=True)
    # Take output minus the last character.
    output = output[:-1, :, :]
    # Take target without the first character.
    target = target[1:, :]
    output_flat = output.view(-1, model.output_size)  # [(tg_len x batch) x en_vocab_sz]
    # not sure whether to use ground truth target or network's prediction
    loss = criterion(output_flat, target.view(-1))
    loss.backward()
    # figure out how to do this:
    # if L2 norm of gradient / 128 > 5, then g = 5g/s
    nn.utils.clip_grad_norm_(model.parameters(), CLIP)
    optimizer.step()
    # Subtract one so that the padding count becomes zero; count the number of non-padding
    # tokens in the output.
    non_padding = (target.view(-1) - 1.0).nonzero().size(0)
    return loss.item(), non_padding
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(p.grad.data, alpha=-lr)
        total_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('[epoch %d, batch %d/%d], lr %.2f, avg_batch_cost: %.5f s, '
                  'loss %.2f, ppl %.2f' % (epoch, batch, len(train_data) // args.bptt, lr,
                                           elapsed / args.log_interval, cur_loss,
                                           math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, labels = get_batch(train_data, train_labels, corpus.train_seq_lens, i)
        hidden = model.init_hidden(args.batch_size)
        model.zero_grad()
        output, _ = model(data, hidden)
        mask = (data >= 0).float()
        loss, _ = model.loss(output, labels, mask)
        loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(p.grad.data, alpha=-lr)
        total_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def train(num_epochs, model, device, train_loader, val_loader, images, texts, lengths,
          converter, optimizer, lr_scheduler, prediction_dir, print_iter, opt):
    # criterion = CTCLoss()
    # criterion.to(device)
    criterion = torch.nn.CrossEntropyLoss(ignore_index=0).to(device)
    images = images.to(device)
    model.to(device)
    for epoch in range(num_epochs):
        count = 0
        model.train()
        for i, datas in enumerate(train_loader):
            datas, targets = datas
            batch_size = datas.size(0)
            count += batch_size
            dataloader.loadData(images, datas)
            t, l = converter.encode(targets, opt.batch_max_length)
            dataloader.loadData(texts, t)
            dataloader.loadData(lengths, l)
            preds = model(images, t[:, :-1])
            # preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
            cost = criterion(preds.view(-1, preds.shape[-1]), t[:, 1:].contiguous().view(-1))
            model.zero_grad()
            cost.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip)
            optimizer.step()
            if count % print_iter < train_loader.batch_size:
                print('epoch {} [{}/{}] loss: {}'.format(epoch, count,
                                                         len(train_loader.dataset), cost))
        if (epoch % 3 == 0) and (epoch != 0):
            res = validation(model, device, val_loader, images, texts, lengths, converter,
                             prediction_dir, opt)
            save_model(opt.save_dir, f'{epoch}_{round(float(res), 3)}', model, optimizer,
                       lr_scheduler, opt)
def train():
    # Turn on training mode which enables dropout.
    model.train()
    random.shuffle(corpus.textind['train'])
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    for batch, i in enumerate(range(0, len(corpus.textind['train']), args.batch_size)):
        data, lengths = corpus.get_batch(args.batch_size, False, i, 'train')
        data = data.to(device)
        lengths = lengths.to(device)
        hidden = model.init_hidden(data.shape[1])
        loss = 0
        masksum = 0
        model.zero_grad()
        # data.shape[0] - 1 so that EOU is never fed in as input.
        for seqind, j in enumerate(range(0, data.shape[0] - 1, args.bptt)):
            # Starting each batch, we detach the hidden state from how it was previously produced.
            # If we didn't, the model would try backpropagating all the way to start of the dataset.
            ei = min(j + args.bptt, data.shape[0] - 1)
            hidden = repackage_hidden(hidden)
            partoutput, hidden = model(data[j:ei], hidden)
            lossmat = criterion(partoutput.transpose(1, 2), data[j + 1:ei + 1])
            if (lengths >= ei).sum() == lengths.shape[0]:
                temploss = lossmat.sum()
                tempmasksum = lossmat.shape[0] * lossmat.shape[1]
            else:
                mask = (torch.arange(ei - j).to(device).expand(len(lengths), ei - j)
                        < (lengths - j).unsqueeze(1)).t().float()
                temploss = (lossmat * mask).sum()
                tempmasksum = mask.sum()
            loss += temploss.data
            masksum += tempmasksum.data
            (temploss / tempmasksum).backward()
        loss /= masksum
        # loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(p.grad.data, alpha=-lr)
        total_loss += loss
        if batch % args.log_interval == 0 and batch != 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.3f} | ppl {:8.3f}'.format(
                      epoch, batch, len(corpus.textind['train']) // args.batch_size, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            sys.stdout.flush()
            total_loss = 0
            start_time = time.time()
def train(model, train_data, lr):
    # Turn on training mode which enables dropout.
    model.train()
    model.set_mode('train')
    total_loss = 0.
    start_time = time.time()
    ntokens = len(dictionary)
    hidden = model.init_hidden(args.batch_size)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=args.wdecay)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        # gs534 add sentence resetting
        eosidx = dictionary.get_eos()
        if args.loss == 'nce':
            output, hidden = model(data, hidden, eosidx, targets)
            loss = criterion(output)
            loss.backward()
        else:
            output, hidden = model(data, hidden, separate=args.reset, eosidx=eosidx)
            loss = criterion(output.view(-1, ntokens), targets)
            loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        # for p in model.parameters():
        #     p.data.add_(-lr, p.grad.data)
        optimizer.step()
        total_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            if args.loss == 'nce':
                print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                      'nce_loss {:5.2f}'.format(
                          epoch, batch, len(train_data) // args.bptt, lr,
                          elapsed * 1000 / args.log_interval, cur_loss))
            else:
                print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                      'loss {:5.2f} | ppl {:8.2f}'.format(
                          epoch, batch, len(train_data) // args.bptt, lr,
                          elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    if (not args.single) and (torch.cuda.device_count() > 1):
        # "module" is necessary when using DataParallel.
        hidden = model.module.init_hidden(args.batch_size)
    else:
        hidden = model.init_hidden(args.batch_size)
    # UNCOMMENT FOR DEBUGGING
    # random.seed(10)
    if train_ccg_data is None:
        order = list(range(0, train_lm_data.size(0) - 1, args.bptt))
    else:
        order = list(range(0, train_lm_data.size(0) + train_ccg_data.size(0) - 1, args.bptt))
    random.shuffle(order)
    for batch, i in enumerate(order):
        # if batch >= 1000: break
        if i > train_lm_data.size(0):
            # TAG
            data, targets = get_batch(train_ccg_data, i - train_lm_data.size(0))
        else:
            # LM
            data, targets = get_batch(train_lm_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(p.grad.data, alpha=-lr)
        total_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(order), lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def train():
    batch_cnt = 0
    train_loss = 0
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    if args.model != 'Transformer':
        hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()
        if args.model == 'Transformer':
            output = model(data)
            output = output.view(-1, ntokens)
        else:
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)
        loss = criterion(output, targets)
        loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(p.grad, alpha=-lr)
        total_loss += loss.item()
        batch_cnt += 1
        train_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            train_loss_per_x_batch.add_data_point({
                'epoch_idx': epoch,
                'batch_idx': batch,
                'training_loss': cur_loss
            })
            total_loss = 0
            start_time = time.time()
        if args.dry_run:
            break
    train_loss /= batch_cnt
    return train_loss
def train(adam_lr):
    # Turn on training mode which enables dropout.
    optimizer = optim.Adam(model.parameters(), lr=adam_lr)
    model.train()
    total_loss = 0.
    total_epoch_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    shuffle_x, shuffle_y = shuffle_train(train_data)
    # if args.model != 'Transformer':
    #     hidden = model.init_hidden(args.batch_size)
    # for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
    # for batch, i in enumerate(range(0, train_data.size(0) - args.bptt + 1)):  # for sliding window of step 1
    for idx in range(shuffle_x.size(0)):
        # data, targets = get_batch(train_data, i)
        # data, targets = get_batch_ffn(train_data, i)
        batch = idx
        data = shuffle_x[idx]
        targets = shuffle_y[idx]
        # print(data[:1])
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()
        # optimizer.zero_grad()
        if args.model == 'Transformer':
            output = model(data)
        else:
            # hidden = repackage_hidden(hidden)  # what is hidden?
            # pdb.set_trace()
            # output, hidden = model(data, hidden)
            output = model(data)
        # loss = criterion(output.view(-1, ntokens), targets)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        # torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        # for p in model.parameters():
        #     p.data.add_(-lr, p.grad.data)
        total_loss += loss.item()
        total_epoch_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            # print('cur_loss: ', cur_loss)
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.4f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, shuffle_x.size(0), lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            # pdb.set_trace()
            total_loss = 0
            start_time = time.time()
    return total_epoch_loss / (batch + 1)
def train(optimizer=None, h_sp=[0., 0.], h_th=[0., 0.], block=-1, lr=args.lr):
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        if not optimizer:
            model.zero_grad()
        else:
            optimizer.zero_grad()
        output, hidden = model(data, hidden, sparse=True, h_sp=h_sp, h_th=h_th, block=block)
        loss = criterion(output.view(-1, ntokens), targets)
        # Add L1 regularization.
        if args.l1:
            l1_regularization = torch.tensor(0, dtype=torch.float32).to(device)
            lambda_l1 = torch.tensor(args.l1_lambda).to(device)
            for param in model.parameters():
                l1_regularization += torch.norm(param, 1).to(device)
            loss = loss + lambda_l1 * l1_regularization
        loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        if not optimizer:
            for p in model.parameters():
                p.data.add_(p.grad.data, alpha=-lr)
        else:
            optimizer.step()
            lr = get_lr(optimizer)
        total_loss += loss.item()
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            logger.info('| epoch {:3d} | {:5d}/{:5d} batches | lr {:5.2e} | ms/batch {:5.2f} | '
                        'loss {:5.2f} | ppl {:8.2f}'.format(
                            epoch, batch, len(train_data) // args.bptt, lr,
                            elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def train(device, dataset, dataloader, model):
    print("in train")
    model = model.to(device)
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

    # Training loop
    images_per_batch = {}
    batch_count, images_per_batch['train'], images_per_batch['test'] = 0, [], []
    with tqdm(dataloader, total=config.num_batches) as pbar:
        for batch_idx, batch in enumerate(pbar):
            model.zero_grad()

            train_inputs, train_targets = batch['train']
            train_inputs = train_inputs.to(device=device)
            train_targets = train_targets.to(device=device)
            train_embeddings = model(train_inputs)

            test_inputs, test_targets = batch['test']
            test_inputs = test_inputs.to(device=device)
            test_targets = test_targets.to(device=device)
            test_embeddings = model(test_inputs)

            prototypes = get_prototypes(train_embeddings, train_targets,
                                        dataset.num_classes_per_task)
            loss = prototypical_loss(prototypes, test_embeddings, test_targets)
            loss.backward()
            optimizer.step()

            # Just keeping the count here
            batch_count += 1
            images_per_batch['train'].append(train_inputs.shape[1])
            images_per_batch['test'].append(test_inputs.shape[1])

            with torch.no_grad():
                accuracy = get_accuracy(prototypes, test_embeddings, test_targets)
                pbar.set_postfix(accuracy='{0:.4f}'.format(accuracy.item()))
            if batch_idx >= config.num_batches:
                break
    print("Number of batches in the dataloader: ", batch_count)

    # Save model
    if check_dir() is not None:
        filename = os.path.join(
            'saved_models',
            'protonet_cifar_fs_{0}shot_{1}way.pt'.format(config.k, config.n))
        with open(filename, 'wb') as f:
            state_dict = model.state_dict()
            torch.save(state_dict, f)
        print("Model saved")
    return batch_count, images_per_batch
def train():
    global lr, best_val_loss
    # Turn on training mode which enables dropout.
    model.train()
    total_loss, nbatches = 0, 0
    start_time = time.time()
    hidden = model.init_hidden(args.batch_size)
    if not args.lm1b:
        data_gen = corpus.iter('train', args.batch_size, args.bptt, use_cuda=args.cuda)
    else:
        data_gen = train_corpus.batch_generator(seq_length=args.bptt, batch_size=args.batch_size)
    for b, batch in enumerate(data_gen):
        model.train()
        if args.lm1b:
            source, target, word_cnt, batch_len = get_batch(batch)
        else:
            source, target = batch
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        # optimizer.zero_grad()
        model.softmax.set_target(target.data.view(-1))
        output, hidden = model(source, hidden)
        loss = criterion(output, target.view(-1))
        loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        # for p in model.parameters():
        #     if p.grad is not None:
        #         p.data.add_(-lr, p.grad.data)
        total_loss += loss.data.cpu()
        # logging.info(total_loss)
        if b % args.log_interval == 0 and b > 0:
            cur_loss = total_loss.item() / args.log_interval
            elapsed = time.time() - start_time
            if not args.valid_per_epoch:
                val_loss = evaluate('valid')
                logging.info('| epoch {:3d} | batch {:5d} | lr {:02.5f} | ms/batch {:5.2f} | '
                             'loss {:5.2f} | ppl {:8.2f} | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
                                 epoch, b, lr, elapsed * 1000 / args.log_interval,
                                 cur_loss, math.exp(cur_loss), val_loss, math.exp(val_loss)))
            else:
                logging.info('| epoch {:3d} | batch {:5d} | lr {:02.5f} | ms/batch {:5.2f} | '
                             'loss {:5.2f} | ppl {:8.2f} '.format(
                                 epoch, b, lr, elapsed * 1000 / args.log_interval,
                                 cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def generate_targeted(model, lr, label, n_try, target_image, target_label, idx):
    if n_try == 0:  # try to generate an example n number of times
        return
    lr_orig = lr
    lambda_img = 5.0
    criterion = nn.MSELoss().to(device)
    # having a well centered gaussian stops the output gaussian from saturating early
    data = torch.randn((1, 1, size_input, size_input))
    data = (data - torch.min(data)) / (torch.max(data) - torch.min(data))
    # data = torch.zeros((1, 1, size_input, size_input))
    target = torch.zeros((1, size_output))
    target[0][label] = 1
    data, target = data.to(device), target.to(device)
    for i in range(epochs):
        data = Variable(data.data, requires_grad=True)
        output = torch.sigmoid(model(data))
        pred = output.max(1, keepdim=True)[1]
        # if i % report_every == 0 or i == epochs - 1:
        #     print(output.data.cpu().numpy()[0])
        if (i + 1) % (epochs / (num_sched_lr + 1)) == 0 and label == pred:
            lr /= 10.0
            print("LR sched lap")
        loss = criterion(output, target) + lambda_img * criterion(data, target_image)
        model.zero_grad()
        loss.backward()
        data = data - lr * data.grad.data
        data = (data - torch.min(data)) / (torch.max(data) - torch.min(data))
    if model(data).max(1, keepdim=True)[1] == label:
        print("Success. " + str(label))
    else:
        print("Fail. Retry. " + str(label))
        return generate_targeted(model, lr_orig, label, n_try - 1, target_image,
                                 target_label, idx)
    data = data.view(1, size_input, size_input).detach().cpu()
    trans = torchvision.transforms.ToPILImage()
    data = trans(data)
    data.save("adversarial_outputs_conv/targeted/" + str(idx) + str(target_label) +
              "_to_" + str(label) + ".jpg")