def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    stats = adl.Accumulator()
    for inputs, targets in trainloader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        stats["loss_sum"] += loss.item() * targets.size(0)
        _, predicted = outputs.max(1)
        stats["total"] += targets.size(0)
        stats["correct"] += predicted.eq(targets).sum().item()

    # Log AdaptDL's dataloader and model statistics once per epoch.
    trainloader.to_tensorboard(writer, epoch, tag_prefix="AdaptDL/Data/")
    net.to_tensorboard(writer, epoch, tag_prefix="AdaptDL/Model/")

    with stats.synchronized():
        stats["loss_avg"] = stats["loss_sum"] / stats["total"]
        stats["accuracy"] = stats["correct"] / stats["total"]
        writer.add_scalar("Loss/Train", stats["loss_avg"], epoch)
        writer.add_scalar("Accuracy/Train", stats["accuracy"], epoch)
        print("Train:", stats)

def valid(epoch):
    net.eval()
    stats = adl.Accumulator()
    with torch.no_grad():
        for inputs, targets in validloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            stats["loss_sum"] += loss.item() * targets.size(0)
            _, predicted = outputs.max(1)
            stats["total"] += targets.size(0)
            stats["correct"] += predicted.eq(targets).sum().item()

    with stats.synchronized():
        stats["loss_avg"] = stats["loss_sum"] / stats["total"]
        stats["accuracy"] = stats["correct"] / stats["total"]
        writer.add_scalar("Loss/Valid", stats["loss_avg"], epoch)
        writer.add_scalar("Accuracy/Valid", stats["accuracy"], epoch)
        if adaptdl.env.replica_rank() == 0:
            # Report to NNI from a single replica to avoid duplicate results.
            nni.report_intermediate_result(stats["accuracy"])
        print("Valid:", stats)
        return stats["accuracy"]

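# A minimal driver sketch for the two functions above (assumed, not part of
# the source excerpt; `trainset`, `validset`, `net`, `optimizer`, and `args`
# are placeholders defined elsewhere in the example). AdaptDL wraps the
# model/optimizer and the dataloaders once, and its elastic epoch iterator
# resumes at the correct epoch after a rescale or restart.
import torch
import adaptdl.torch as adl

adl.init_process_group("nccl" if torch.cuda.is_available() else "gloo")
net = adl.AdaptiveDataParallel(net, optimizer)
trainloader = adl.AdaptiveDataLoader(trainset, batch_size=128,
                                     shuffle=True, drop_last=True)
validloader = adl.AdaptiveDataLoader(validset, batch_size=100)

for epoch in adl.remaining_epochs_until(args.epochs):
    train(epoch)
    valid(epoch)
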
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    stats = adl.Accumulator()
    for inputs, targets in trainloader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        stats["loss_sum"] += loss.item() * targets.size(0)
        _, predicted = outputs.max(1)
        stats["total"] += targets.size(0)
        stats["correct"] += predicted.eq(targets).sum().item()

    # Log AdaptDL's estimated speedup and the current adaptive batch size.
    writer.add_scalar("Throughput/Gain", net.gain, epoch)
    writer.add_scalar("Throughput/Global_Batchsize",
                      trainloader.current_batch_size, epoch)

    with stats.synchronized():
        stats["loss_avg"] = stats["loss_sum"] / stats["total"]
        stats["accuracy"] = stats["correct"] / stats["total"]
        writer.add_scalar("Loss/Train", stats["loss_avg"], epoch)
        writer.add_scalar("Accuracy/Train", stats["accuracy"], epoch)
        print("Train:", stats)

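# The Throughput/* scalars above only vary when adaptive batch sizing is
# enabled on the loader; a sketch of that setup (the 4096 global limit and
# the per-replica bounds are illustrative values, not from the source):
trainloader.autoscale_batch_size(4096, local_bsz_bounds=(32, 512))
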
def evaluate(eval_model, val_iter, epoch=0):
    eval_model.eval()  # Turn on the evaluation mode
    stats = adl.Accumulator()
    ntokens = len(TEXT.vocab.stoi)
    with torch.no_grad():
        for batch in val_iter:
            output = eval_model(batch.text.to(device))
            output_flat = output.view(-1, ntokens)
            stats["loss_sum"] += batch.text.size(1) * \
                criterion(output_flat, batch.target.view(-1).to(device)).item()
            stats["total"] += batch.target.size(1)
    with stats.synchronized():
        loss_avg = stats["loss_avg"] = stats["loss_sum"] / stats["total"]
        writer.add_scalar("Loss/Valid", stats["loss_avg"], epoch)
        print("Valid:", stats)
    return loss_avg

def train(self, train_data, epoch, writer):
    stats = adl.Accumulator()
    self.model.train()  # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    ntokens = len(TEXT.vocab.stoi)
    for i, batch in enumerate(
            AdaptiveBPTTIterator(train_data,
                                 batch_size=args.bs,
                                 bptt_len=args.bptt,
                                 max_batch_size=self.max_batch_size,
                                 local_bsz_bounds=self.local_bsz_bounds)):
        self.optimizer.zero_grad()
        output = self.model(batch.text.to(device))
        loss = self.criterion(output.view(-1, ntokens),
                              batch.target.view(-1).to(device))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.5)
        self.optimizer.step()

        total_loss += loss.item()
        stats["loss_sum"] += loss.item() * batch.target.size(1)
        stats["total"] += batch.target.size(1)
        writer.add_scalar("Throughput/Gain", self.model.gain, epoch)

        log_interval = 10
        if i % log_interval == 0 and i > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print(f'| epoch {epoch:3d} | batch {i:5d} | '
                  f'lr {self.scheduler.get_lr()[0]:02.2f} | '
                  f'ms/batch {elapsed * 1000 / log_interval:7.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {np.exp(cur_loss):8.2f}')
            total_loss = 0
            start_time = time.time()

    with stats.synchronized():
        stats["loss_avg"] = stats["loss_sum"] / stats["total"]
        writer.add_scalar("Loss/Train", stats["loss_avg"], epoch)
        print("Train:", stats)

def evaluate(self, eval_model, data_source, epoch=0, writer=None):
    eval_model.eval()  # Turn on the evaluation mode
    stats = adl.Accumulator()
    ntokens = len(TEXT.vocab.stoi)
    with torch.no_grad():
        for batch in AdaptiveBPTTIterator(data_source,
                                          batch_size=args.bs,
                                          bptt_len=args.bptt):
            output = eval_model(batch.text.to(device))
            output_flat = output.view(-1, ntokens)
            stats["loss_sum"] += batch.text.size(1) * \
                self.criterion(output_flat,
                               batch.target.view(-1).to(device)).item()
            stats["total"] += batch.target.size(1)
    with stats.synchronized():
        loss_avg = stats["loss_avg"] = stats["loss_sum"] / stats["total"]
        if writer:
            writer.add_scalar("Loss/Valid", stats["loss_avg"], epoch)
        print("Valid:", stats)
    return loss_avg

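# Hypothetical driver method for the same trainer class (a sketch, not from
# the source; `args.epochs` and the data splits are assumed to be defined
# elsewhere in the example):
def fit(self, train_data, val_data, writer):
    for epoch in adl.remaining_epochs_until(args.epochs):
        self.train(train_data, epoch, writer)
        val_loss = self.evaluate(self.model, val_data, epoch, writer)
        print(f'| end of epoch {epoch:3d} | valid loss {val_loss:5.2f} | '
              f'valid ppl {np.exp(val_loss):8.2f}')
        self.scheduler.step()
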
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    stats = adl.Accumulator()
    for inputs, targets in trainloader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        if args.mixed_precision:
            with torch.cuda.amp.autocast():
                outputs = net(inputs)
                loss = criterion(outputs, targets)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

        stats["loss_sum"] += loss.item() * targets.size(0)
        _, predicted = outputs.max(1)
        stats["total"] += targets.size(0)
        stats["correct"] += predicted.eq(targets).sum().item()

    trainloader.to_tensorboard(writer, epoch, tag_prefix="AdaptDL/Data/")
    net.to_tensorboard(writer, epoch, tag_prefix="AdaptDL/Model/")
    if args.mixed_precision:
        writer.add_scalar("MixedPrecision/scale", scaler.get_scale(), epoch)

    with stats.synchronized():
        stats["loss_avg"] = stats["loss_sum"] / stats["total"]
        stats["accuracy"] = stats["correct"] / stats["total"]
        writer.add_scalar("Loss/Train", stats["loss_avg"], epoch)
        writer.add_scalar("Accuracy/Train", stats["accuracy"], epoch)
        print("Train:", stats)

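# The mixed-precision branch above assumes a GradScaler created once at
# startup; a minimal sketch (standard PyTorch AMP, not AdaptDL-specific).
# If your AdaptDL version supports it, the scaler can also be handed to
# AdaptiveDataParallel (the mp_scaler argument) so its state is checkpointed
# across restarts.
scaler = torch.cuda.amp.GradScaler(enabled=args.mixed_precision)
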
for user, item, label in train_loader:  # enclosing training loop, reconstructed from the variables used below
    network.zero_grad()
    prediction = network(user, item)
    loss = loss_function(prediction, label)
    loss.backward()
    optimizer.step()
    count += 1

gain = network.gain
batchsize = train_loader.current_batch_size
accumulation_steps = train_loader.accumulation_steps
train_loader.to_tensorboard(writer, epoch, tag_prefix="AdaptDL/Data/")
network.to_tensorboard(writer, epoch, tag_prefix="AdaptDL/Model/")

network.eval()
stats = adl.Accumulator()
HR, NDCG = evaluate.metrics(network, test_loader, args.top_k)
stats['HR'] += HR
stats['replicas'] += 1.0
with stats.synchronized():
    # Average the hit ratio across replicas before logging.
    writer.add_scalar('Loss/HR', stats['HR'] / stats['replicas'], epoch)

elapsed_time = time.time() - start_time
print("Elapsed time for epoch {:03d}: ".format(epoch) +
      time.strftime("%H: %M: %S", time.gmtime(elapsed_time)))
print("HR: {:.3f}\tNDCG: {:.3f}".format(np.mean(HR), np.mean(NDCG)))

if HR > best_hr:
    best_hr, best_ndcg, best_epoch = HR, NDCG, epoch
    if args.out and adaptdl.env.replica_rank() == 0:
        ...  # model checkpointing elided in this excerpt

def train(epoch):
    iters = 0
    # For each batch in the dataloader
    stats = adl.Accumulator()
    for i, data in enumerate(dataloader, 0):
        data = data[0]

        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ############################
        ## Train with all-real batch
        netD.zero_grad()
        # Format batch
        real_cpu = data.to(device)
        b_size = real_cpu.size(0)
        label = torch.full((b_size,), real_label, device=device)
        # Forward pass real batch through D
        output = netD(real_cpu).view(-1)
        # Calculate loss on all-real batch
        errD_real = criterion(output, label)
        # Calculate gradients for D in backward pass
        errD_real.backward()
        D_x = output.mean().item()

        ## Train with all-fake batch
        # Generate batch of latent vectors
        noise = torch.randn(b_size, nz, 1, 1, device=device)
        # Generate fake image batch with G
        fake = netG(noise)
        label.fill_(fake_label)
        # Classify all fake batch with D
        output = netD(fake.detach()).view(-1)
        # Calculate D's loss on the all-fake batch
        errD_fake = criterion(output, label)
        # Calculate the gradients for this batch
        errD_fake.backward()
        D_G_z1 = output.mean().item()
        # Add the gradients from the all-real and all-fake batches
        errD = errD_real + errD_fake
        # Update D
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ############################
        netG.zero_grad()
        label.fill_(real_label)  # fake labels are real for generator cost
        # Since we just updated D, perform another forward pass of the
        # all-fake batch through D
        output = netD(fake).view(-1)
        # Calculate G's loss based on this output
        errG = criterion(output, label)
        # Calculate gradients for G
        errG.backward()
        D_G_z2 = output.mean().item()
        # Update G
        optimizerG.step()

        # Save losses for plotting later
        G_losses.append(errG.item())
        D_losses.append(errD.item())
        stats["g_loss_sum"] += errG.item()
        stats["d_loss_sum"] += errD.item()
        # Gradient squared-norm and variance from AdaptDL's internal
        # metrics state (a private API).
        stats["norm"] += metrics._metrics_state().grad_params[0]
        stats["var"] += metrics._metrics_state().grad_params[1]
        stats["replicas"] += 1.0

    scheduleD.step()
    scheduleG.step()

    with stats.synchronized():
        with SummaryWriter(adaptdl.get_tensorboard_dir()) as writer:
            writer.add_scalar("Loss/G", stats["g_loss_sum"] / stats["replicas"], epoch)
            writer.add_scalar("Loss/D", stats["d_loss_sum"] / stats["replicas"], epoch)
            writer.add_scalar("Performance/GlobalBatchsize", b_size * stats["replicas"], epoch)
            writer.add_scalar("Performance/Replicas", stats["replicas"], epoch)
            writer.add_scalar("Stats/Norm", stats["norm"] / stats["replicas"], epoch)
            writer.add_scalar("Stats/Variance", stats["var"] / stats["replicas"], epoch)

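# Assumed setup for the GAN loop above (a sketch, not part of the excerpt):
# each network/optimizer/scheduler triple gets its own AdaptiveDataParallel
# wrapper, and the dataloader becomes an AdaptiveDataLoader, before train()
# runs once per remaining epoch. `dataset` and `num_epochs` are placeholders.
adl.init_process_group("nccl" if torch.cuda.is_available() else "gloo")
netD = adl.AdaptiveDataParallel(netD, optimizerD, scheduleD)
netG = adl.AdaptiveDataParallel(netG, optimizerG, scheduleG)
dataloader = adl.AdaptiveDataLoader(dataset, batch_size=128,
                                    shuffle=True, drop_last=True)

for epoch in adl.remaining_epochs_until(num_epochs):
    train(epoch)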