def test_lookahead_pred2():
    """Lookahead must be a no-op before any step() has recorded a delta."""
    net = MockNet()
    x = Variable(torch.randn((5, 3)))

    # Random weights, zero biases (the original comment claimed zeros,
    # but normal_() draws from N(0, 1)).
    net.fc.weight.data.normal_(0.0, 1.0)
    net.fc.bias.data.fill_(0.0)

    pred = PredOpt(net.parameters())
    result1 = net(x)

    # step() has never been called, so even a large lookahead step
    # must leave the effective parameters — and the output — unchanged.
    with pred.lookahead(2.0):
        result2 = net(x)

    assert np.all(np.isclose(result1.data.numpy(), result2.data.numpy()))
def test_lookahead_pred():
    """Lookahead extrapolates linearly along the last recorded parameter step."""
    net = MockNet()
    x = Variable(torch.randn((5, 3)))

    # Start from all-zero weights and biases.
    net.fc.weight.data.fill_(0.0)
    net.fc.bias.data.fill_(0.0)

    pred = PredOpt(net.parameters())

    # Move the weights (0 -> W) and let the optimizer record the delta.
    net.fc.weight.data.normal_(0, 1.0)
    pred.step()

    result1 = net(x)

    # Step 0.0: parameters unchanged, output identical.
    with pred.lookahead(0.0):
        result2 = net(x)
    assert np.all(np.isclose(result1.data.numpy(), result2.data.numpy()))

    # Step 1.0: weights extrapolate 0 -> W -> 2W, so the linear layer's
    # output doubles (bias is zero).
    with pred.lookahead(1.0):
        result3 = net(x)
    assert np.all(
        np.isclose((2.0 * result1).data.numpy(), result3.data.numpy()))

    # Leaving the context restores the true parameters.
    result4 = net(x)
    assert np.all(np.isclose(result1.data.numpy(), result4.data.numpy()))
def test_param_update():
    """Predicted parameter values are param + step * last_delta."""
    net = MockNet()
    net.fc.weight.data.fill_(0.0)
    net.fc.bias.data.fill_(0.0)

    pred = PredOpt(net.parameters())

    # Weights: 0.0 -> 1.0 (delta 1.0); biases: 0.0 -> 0.5 (delta 0.5).
    net.fc.weight.data.fill_(1.0)
    net.fc.bias.data.fill_(0.5)
    pred.step()

    with pred.lookahead(1.0):
        assert net.fc.weight.data[0, 0] == 2.0  # 1.0 + 1.0 * 1.0
        assert net.fc.bias.data[0] == 1.0       # 0.5 + 0.5 * 1.0

    # Outside the context the true values are restored.
    assert net.fc.weight.data[1, 1] == 1.0
    assert net.fc.bias.data[1] == 0.5  # back to 0.5 (source comment said 1.0)

    with pred.lookahead(5.0):
        assert net.fc.weight.data[2, 2] == 6.0  # 1.0 + 1.0 * 5.0
        assert net.fc.bias.data[2] == 3.0       # 0.5 + 0.5 * 5.0
fake_label = 0 if opt.cuda: netD.cuda() netG.cuda() criterion.cuda() input, label = input.cuda(), label.cuda() noise, fixed_noise = noise.cuda(), fixed_noise.cuda() fixed_noise = Variable(fixed_noise) # setup optimizer optimizerD = optim.Adam(netD.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999)) optimizerG = optim.Adam(netG.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999)) optimizer_predD = PredOpt(netD.parameters()) optimizer_predG = PredOpt(netG.parameters()) if opt.pred: print( 'Prediction of D and G is enabled (see https://openreview.net/forum?id=Skj8Kag0Z¬eId=rkLymJTSf)' ) lookahead_step = 1.0 else: lookahead_step = 0.0 for epoch in range(opt.niter): for i, data in enumerate(dataloader, 0): ############################ # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z))) ###########################
def __init__(self, opt, nc):
    """Build the DCGAN: networks, loss, optimizers and optional DDP wrappers.

    Args:
        opt: parsed options namespace; fields read here: nz, ngf, ndf, cuda,
            ngpu, netG, netD, verbose, distributed, lr, beta1.
        nc: number of image channels for the generator/discriminator.
    """
    self.opt = opt

    ################################################################
    # Initializing Generator and Discriminator Networks
    ################################################################
    self.nz = int(opt.nz)
    ngf = int(opt.ngf)
    ndf = int(opt.ndf)
    self.device = 'cuda' if opt.cuda else 'cpu'

    self.G = _netG(opt.ngpu, self.nz, ngf, nc).to(self.device)
    self.G.apply(weights_init)
    if opt.netG != '':
        # Resume the generator from a checkpoint path.
        self.G.load_state_dict(torch.load(opt.netG))

    self.D = _netD(opt.ngpu, ndf, nc).to(self.device)
    self.D.apply(weights_init)
    if opt.netD != '':
        # Resume the discriminator from a checkpoint path.
        self.D.load_state_dict(torch.load(opt.netD))

    # Only rank 0 prints the model summaries when running distributed.
    if opt.verbose and (not self.opt.distributed or dist.get_rank() == 0):
        print(self.G)
        print(self.D)

    ################################################################
    # Initialize Loss Function
    ################################################################
    self.criterion = nn.BCELoss().to(self.device)

    ################################################################
    # Set Prediction Enabled Adam Optimizer settings
    ################################################################
    # Earlier revision used AdamPre with per-network LR ratios:
    # self.optimizerD = AdamPre(self.D.parameters(), lr=opt.lr/opt.DLRatio,
    #                           betas=(opt.beta1, 0.999), name='optD')
    # self.optimizerG = AdamPre(self.G.parameters(), lr=opt.lr/opt.GLRatio,
    #                           betas=(opt.beta1, 0.999), name='optG')
    self.optimizerD = optim.Adam(self.D.parameters(),
                                 lr=opt.lr,
                                 betas=(opt.beta1, 0.999))
    self.optimizerG = optim.Adam(self.G.parameters(),
                                 lr=opt.lr,
                                 betas=(opt.beta1, 0.999))
    # Prediction (lookahead) wrappers hold references to the same parameters.
    self.optimizer_predD = PredOpt(self.D.parameters())
    self.optimizer_predG = PredOpt(self.G.parameters())

    ################################################################
    # Handle special Distributed training modes
    ################################################################
    self.verbose = opt.verbose
    if opt.distributed:
        if opt.cuda:
            self.D = torch.nn.parallel.DistributedDataParallel(self.D)
            self.G = torch.nn.parallel.DistributedDataParallel(self.G)
            # Silence logging on all ranks except 0.
            self.verbose = opt.verbose and dist.get_rank() == 0
        else:
            self.D = torch.nn.parallel.DistributedDataParallelCPU(self.D)
            self.G = torch.nn.parallel.DistributedDataParallelCPU(self.G)
            self.verbose = opt.verbose and dist.get_rank() == 0
class DCGAN():
    """DCGAN trainer using prediction-step (lookahead) optimizers for D and G.

    Both networks are trained with plain Adam; a PredOpt wrapper around each
    parameter set provides a temporary "predicted" (extrapolated) copy of the
    opponent's weights while the gradient of the other network is computed.
    """

    def __init__(self, opt, nc):
        """Build networks, loss, optimizers and optional DDP wrappers.

        Args:
            opt: parsed options namespace; fields read here: nz, ngf, ndf,
                cuda, ngpu, netG, netD, verbose, distributed, lr, beta1.
            nc: number of image channels for the networks.
        """
        self.opt = opt

        ################################################################
        # Initializing Generator and Discriminator Networks
        ################################################################
        self.nz = int(opt.nz)
        ngf = int(opt.ngf)
        ndf = int(opt.ndf)
        self.device = 'cuda' if opt.cuda else 'cpu'

        self.G = _netG(opt.ngpu, self.nz, ngf, nc).to(self.device)
        self.G.apply(weights_init)
        if opt.netG != '':
            # Resume the generator from a checkpoint path.
            self.G.load_state_dict(torch.load(opt.netG))

        self.D = _netD(opt.ngpu, ndf, nc).to(self.device)
        self.D.apply(weights_init)
        if opt.netD != '':
            self.D.load_state_dict(torch.load(opt.netD))

        # Only rank 0 prints the model summaries when running distributed.
        if opt.verbose and (not self.opt.distributed
                            or dist.get_rank() == 0):
            print(self.G)
            print(self.D)

        ################################################################
        # Initialize Loss Function
        ################################################################
        self.criterion = nn.BCELoss().to(self.device)

        ################################################################
        # Set Prediction Enabled Adam Optimizer settings
        ################################################################
        self.optimizerD = optim.Adam(self.D.parameters(),
                                     lr=opt.lr,
                                     betas=(opt.beta1, 0.999))
        self.optimizerG = optim.Adam(self.G.parameters(),
                                     lr=opt.lr,
                                     betas=(opt.beta1, 0.999))
        self.optimizer_predD = PredOpt(self.D.parameters())
        self.optimizer_predG = PredOpt(self.G.parameters())

        ################################################################
        # Handle special Distributed training modes
        ################################################################
        self.verbose = opt.verbose
        if opt.distributed:
            if opt.cuda:
                self.D = torch.nn.parallel.DistributedDataParallel(self.D)
                self.G = torch.nn.parallel.DistributedDataParallel(self.G)
                self.verbose = opt.verbose and dist.get_rank() == 0
            else:
                self.D = torch.nn.parallel.DistributedDataParallelCPU(self.D)
                self.G = torch.nn.parallel.DistributedDataParallelCPU(self.G)
                self.verbose = opt.verbose and dist.get_rank() == 0

    def train(self,
              niter,
              dataset,
              lookahead_step=1.0,
              plotLoss=False,
              n_batches_viz=1):
        """Custom DCGAN training function using prediction steps.

        Args:
            niter: number of epochs.
            dataset: iterable yielding (images, labels) batches.
            lookahead_step: prediction step size for both PredOpt wrappers
                (0.0 disables prediction).
            plotLoss: when True, collect one [D_loss, G_loss] pair per batch.
            n_batches_viz: unused here; kept for interface compatibility.

        Returns:
            List of per-batch [D_loss, G_loss] pairs (empty unless plotLoss;
            previously collected but never returned).
        """
        # Float labels: with integer fill values torch.full infers an
        # integer dtype on modern PyTorch, which BCELoss rejects.
        real_label = 1.
        fake_label = 0.
        fs = []
        for epoch in range(niter):
            for i, data in enumerate(dataset):
                if self.verbose:
                    # time.clock() was removed in Python 3.8; perf_counter
                    # is its documented replacement for interval timing.
                    c1 = time.perf_counter()

                ############################
                # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
                ###########################
                self.D.zero_grad()

                # Train on a real batch first. ('real_input' avoids
                # shadowing the builtin 'input'.)
                real_cpu, _ = data
                b_size = real_cpu.size(0)
                real_input = real_cpu.to(self.device)
                label = torch.full((b_size, ),
                                   real_label,
                                   device=self.device)
                output = self.D(real_input)
                errD_real = self.criterion(output, label)
                errD_real.backward()
                D_x = output.data.mean()

                # Train with fake data.
                noise = torch.randn(b_size,
                                    self.nz,
                                    1,
                                    1,
                                    device=self.device)

                # Compute gradient of D w/ predicted (looked-ahead) G.
                with self.optimizer_predG.lookahead(step=lookahead_step):
                    fake = self.G(noise)
                    label.fill_(fake_label)
                    output = self.D(fake.detach())
                    errD_fake = self.criterion(output, label)
                    errD_fake.backward()
                    D_G_z1 = output.data.mean()
                errD = errD_real + errD_fake
                self.optimizerD.step()
                self.optimizer_predD.step()

                ############################
                # (2) Update G network: maximize -log(1 - D(G(z)))
                ###########################
                self.G.zero_grad()
                label.fill_(real_label)

                # Compute gradient of G w/ predicted (looked-ahead) D.
                with self.optimizer_predD.lookahead(step=lookahead_step):
                    fake = self.G(noise)
                    output = self.D(fake)
                    errG = self.criterion(output, label)
                    errG.backward()
                    D_G_z2 = output.data.mean()
                self.optimizerG.step()
                self.optimizer_predG.step()

                if plotLoss:
                    # .item() replaces the removed 0-dim `tensor.data[0]`
                    # indexing.
                    fs.append([errD.item(), errG.item()])

                if self.verbose:
                    print('[%d/%d][%d/%d] Loss_D:%.4f Loss_G:%.4f D(x)'
                          ': %.4f D(G(z)): %.4f / %.4f' %
                          (epoch, niter, i, len(dataset), errD.item(),
                           errG.item(), D_x, D_G_z1, D_G_z2))
                    print("itr=", epoch, "clock time elapsed=",
                          time.perf_counter() - c1)

            # if i % self.opt.viz_every == 0 or epoch == niter - 1:
            #     iterViz(self.opt, i, self.G, self.fixed_noise)
            if self.verbose:
                # Save per-epoch checkpoints (rank 0 only in distributed
                # mode, since self.verbose is rank-gated in __init__).
                torch.save(
                    self.G.state_dict(),
                    '{0}/netG_epoch_{1}.pth'.format(self.opt.outf, epoch))
                torch.save(
                    self.D.state_dict(),
                    '{0}/netD_epoch_{1}.pth'.format(self.opt.outf, epoch))
        return fs
if opt.cuda: netD.cuda() netG.cuda() criterion.cuda() input, label = input.cuda(), label.cuda() noise, fixed_noise = noise.cuda(), fixed_noise.cuda() fixed_noise = Variable(fixed_noise) # setup optimizer optimizerD = optim.Adam(netD.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999)) optimizerG = optim.Adam(netG.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999)) if opt.pred: print('Prediction of G is enabled (see https://openreview.net/forum?id=Skj8Kag0Z¬eId=rkLymJTSf)') optimizer_pred = PredOpt(netG.parameters()) for epoch in range(opt.niter): for i, data in enumerate(dataloader, 0): ############################ # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z))) ########################### # train with real netD.zero_grad() real_cpu, _ = data batch_size = real_cpu.size(0) if opt.cuda: real_cpu = real_cpu.cuda() input.resize_as_(real_cpu).copy_(real_cpu) label.resize_(batch_size).fill_(real_label)
def __init__(self, opt, verbose=False):
    """Build the DCGAN: networks, loss, optimizers and parallelism wrappers.

    Args:
        opt: parsed options namespace; fields read here: distributed, cuda,
            local_rank, nz, ngf, ndf, nc, netG, netD, outf, lr, beta1,
            ngpu, sync_every.
        verbose: when True, print the model summaries (caller is expected
            to pass rank-gated verbosity in distributed mode).
    """
    self.opt = opt
    self.distributed = opt.distributed
    self.verbose = verbose
    self.cuda = opt.cuda
    self.local_rank = opt.local_rank

    # One CUDA device per process, selected by local rank.
    device = 'cpu'
    if opt.cuda:
        device = 'cuda' + ':' + str(opt.local_rank)
    self.device = torch.device(device)

    ################################################################
    # Initializing Generator and Discriminator Networks
    ################################################################
    self.nz = int(opt.nz)
    ngf = int(opt.ngf)
    ndf = int(opt.ndf)
    nc = int(opt.nc)

    self.G = _netG(self.nz, ngf, nc).to(self.device)
    self.G.apply(weights_init)
    if opt.netG != '':
        # Resuming: restore weights and the previously recorded loss
        # history saved by checkpoint().
        self.G.load_state_dict(torch.load(opt.netG))
        self.G_losses = torch.load('{}/G_losses.pth'.format(self.opt.outf))

    self.D = _netD(ndf, nc).to(self.device)
    self.D.apply(weights_init)
    if opt.netD != '':
        self.D.load_state_dict(torch.load(opt.netD))
        self.D_losses = torch.load('{}/D_losses.pth'.format(self.opt.outf))

    if self.verbose:
        print(self.G)
        print(self.D)

    ################################################################
    # Initialize Loss Function
    ################################################################
    self.criterion = nn.BCELoss()
    if opt.cuda:
        # nn.Module.cuda() moves the module's buffers in place.
        self.criterion.cuda(opt.local_rank)

    ################################################################
    # Set Prediction Enabled Adam Optimizer settings
    ################################################################
    self.optimizerD = optim.Adam(self.D.parameters(),
                                 lr=opt.lr,
                                 betas=(opt.beta1, 0.999))
    self.optimizerG = optim.Adam(self.G.parameters(),
                                 lr=opt.lr,
                                 betas=(opt.beta1, 0.999))
    # Prediction (lookahead) wrappers over the same parameters.
    self.optimizer_predD = PredOpt(self.D.parameters())
    self.optimizer_predG = PredOpt(self.G.parameters())

    ################################################################
    # Handle special Distributed training modes
    ################################################################
    if opt.distributed:
        if opt.cuda:
            # GPUs [ngpu*rank, ngpu*(rank+1)) belong to this process.
            ids = [
                i for i in range(opt.ngpu * opt.local_rank,
                                 opt.ngpu + opt.local_rank * opt.ngpu)
            ]
            self.D = nn.parallel.DistributedDataParallel(
                self.D, device_ids=ids).to(self.device)
            self.G = nn.parallel.DistributedDataParallel(
                self.G, device_ids=ids).to(self.device)
        else:
            if opt.sync_every == 1:
                # Sync gradients every iteration: stock CPU DDP.
                self.D = nn.parallel.DistributedDataParallelCPU(self.D)
                self.G = nn.parallel.DistributedDataParallelCPU(self.G)
            else:
                # 'myd' variant — presumably supports deferred
                # sync_parameters() calls; TODO confirm against its source.
                self.D = myd.DistributedDataParallelCPU(self.D)
                self.G = myd.DistributedDataParallelCPU(self.G)
    else:
        if opt.cuda:
            # torch.cuda.set_device(opt.local_rank)
            if torch.cuda.device_count() > 1:
                # Single-process multi-GPU fallback.
                self.D = nn.parallel.DataParallel(self.D).to(self.device)
                self.G = nn.parallel.DataParallel(self.G).to(self.device)
class DCGAN():
    """DCGAN trainer with prediction (lookahead) optimizers and optional
    distributed / multi-GPU execution.

    Separate prediction steps for G (gpred_step) and D (dpred_step) let the
    caller enable lookahead on either network independently.
    """

    def __init__(self, opt, verbose=False):
        """Build networks, loss, optimizers and parallelism wrappers.

        Args:
            opt: parsed options namespace; fields read here: distributed,
                cuda, local_rank, nz, ngf, ndf, nc, netG, netD, outf, lr,
                beta1, ngpu, sync_every.
            verbose: when True, print model summaries and training logs.
        """
        self.opt = opt
        self.distributed = opt.distributed
        self.verbose = verbose
        self.cuda = opt.cuda
        self.local_rank = opt.local_rank

        # One CUDA device per process, selected by local rank.
        device = 'cpu'
        if opt.cuda:
            device = 'cuda' + ':' + str(opt.local_rank)
        self.device = torch.device(device)

        ################################################################
        # Initializing Generator and Discriminator Networks
        ################################################################
        self.nz = int(opt.nz)
        ngf = int(opt.ngf)
        ndf = int(opt.ndf)
        nc = int(opt.nc)

        self.G = _netG(self.nz, ngf, nc).to(self.device)
        self.G.apply(weights_init)
        if opt.netG != '':
            # Resuming: restore weights plus the loss history saved
            # by checkpoint().
            self.G.load_state_dict(torch.load(opt.netG))
            self.G_losses = torch.load(
                '{}/G_losses.pth'.format(self.opt.outf))

        self.D = _netD(ndf, nc).to(self.device)
        self.D.apply(weights_init)
        if opt.netD != '':
            self.D.load_state_dict(torch.load(opt.netD))
            self.D_losses = torch.load(
                '{}/D_losses.pth'.format(self.opt.outf))

        if self.verbose:
            print(self.G)
            print(self.D)

        ################################################################
        # Initialize Loss Function
        ################################################################
        self.criterion = nn.BCELoss()
        if opt.cuda:
            # Module.cuda() is in-place for modules (unlike Tensor.cuda()).
            self.criterion.cuda(opt.local_rank)

        ################################################################
        # Set Prediction Enabled Adam Optimizer settings
        ################################################################
        self.optimizerD = optim.Adam(self.D.parameters(),
                                     lr=opt.lr,
                                     betas=(opt.beta1, 0.999))
        self.optimizerG = optim.Adam(self.G.parameters(),
                                     lr=opt.lr,
                                     betas=(opt.beta1, 0.999))
        self.optimizer_predD = PredOpt(self.D.parameters())
        self.optimizer_predG = PredOpt(self.G.parameters())

        ################################################################
        # Handle special Distributed training modes
        ################################################################
        if opt.distributed:
            if opt.cuda:
                # GPUs [ngpu*rank, ngpu*(rank+1)) belong to this process.
                # (list(range(...)) replaces the identity comprehension.)
                ids = list(
                    range(opt.ngpu * opt.local_rank,
                          opt.ngpu + opt.local_rank * opt.ngpu))
                self.D = nn.parallel.DistributedDataParallel(
                    self.D, device_ids=ids).to(self.device)
                self.G = nn.parallel.DistributedDataParallel(
                    self.G, device_ids=ids).to(self.device)
            else:
                if opt.sync_every == 1:
                    self.D = nn.parallel.DistributedDataParallelCPU(self.D)
                    self.G = nn.parallel.DistributedDataParallelCPU(self.G)
                else:
                    # Custom CPU DDP supporting deferred sync_parameters().
                    self.D = myd.DistributedDataParallelCPU(self.D)
                    self.G = myd.DistributedDataParallelCPU(self.G)
        else:
            if opt.cuda:
                # torch.cuda.set_device(opt.local_rank)
                if torch.cuda.device_count() > 1:
                    # Single-process multi-GPU fallback.
                    self.D = nn.parallel.DataParallel(self.D).to(self.device)
                    self.G = nn.parallel.DataParallel(self.G).to(self.device)

    def checkpoint(self, epoch):
        """Save network weights and recorded training statistics to opt.outf.

        NOTE(review): the statistics attributes (G_losses, Dxs, ...) are
        created in train(); calling checkpoint() before train() raises
        AttributeError unless resuming populated the loss lists.
        """
        torch.save(self.G.state_dict(),
                   '{0}/netG_epoch_{1}.pth'.format(self.opt.outf, epoch))
        torch.save(self.D.state_dict(),
                   '{0}/netD_epoch_{1}.pth'.format(self.opt.outf, epoch))
        torch.save(self.G_losses, '{}/G_losses.pth'.format(self.opt.outf))
        torch.save(self.D_losses, '{}/D_losses.pth'.format(self.opt.outf))
        torch.save(self.Dxs, '{}/D_xs.pth'.format(self.opt.outf))
        torch.save(self.DGz1s, '{}/D_G_z1s.pth'.format(self.opt.outf))
        torch.save(self.DGz2s, '{}/D_G_z2s.pth'.format(self.opt.outf))

    def train(self,
              niter,
              dataset,
              gpred_step=1.0,
              dpred_step=0.0,
              sync_every=1,
              viz_every=10):
        # n_batches_viz=10):
        """Custom DCGAN training function using prediction steps.

        Args:
            niter: number of epochs.
            dataset: iterable yielding (images, labels) batches.
            gpred_step: lookahead step for G while computing D's gradient.
            dpred_step: lookahead step for D while computing G's gradient.
            sync_every: when != 1, parameters are synced manually every
                `sync_every` iterations (requires the myd DDP wrapper).
            viz_every: checkpoint frequency in iterations.

        Returns:
            Tuple (G_losses, D_losses, Dxs, DGz1s, DGz2s, img_list).
        """
        # Float labels: with integer fill values torch.full infers an
        # integer dtype on modern PyTorch, which BCELoss rejects.
        real_label = 1.
        fake_label = 0.
        self.D_losses = []
        self.G_losses = []
        self.Dxs = []
        self.DGz1s = []
        self.DGz2s = []
        img_list = []
        # fixed_noise = torch.randn(n_batches_viz, self.nz, 1, 1)
        # if self.cuda:
        #     fixed_noise.cuda(self.local_rank, non_blocking=True)
        itr = 0
        for epoch in range(niter):
            c0 = time.time()
            for i, data in enumerate(dataset):
                c1 = time.time()

                ############################
                # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
                ###########################
                self.D.zero_grad()

                # Train on a real batch first. ('real_input' avoids
                # shadowing the builtin 'input'.)
                real_cpu, _ = data
                b_size = real_cpu.size(0)
                real_input = real_cpu
                if self.cuda:
                    # BUG FIX: Tensor.cuda() is NOT in-place — the original
                    # discarded the result, so the batch stayed on the CPU.
                    real_input = real_input.cuda(self.local_rank,
                                                 non_blocking=True)
                label = torch.full((b_size, ),
                                   real_label,
                                   device=self.device)
                output = self.D(real_input)
                errD_real = self.criterion(output, label)
                errD_real.backward()
                D_x = output.data.mean()

                # Train with fake data.
                noise = torch.randn(b_size,
                                    self.nz,
                                    1,
                                    1,
                                    device=self.device)

                # Compute gradient of D w/ predicted (looked-ahead) G.
                with self.optimizer_predG.lookahead(step=gpred_step):
                    fake = self.G(noise)
                    label.fill_(fake_label)
                    output = self.D(fake.detach())
                    errD_fake = self.criterion(output, label)
                    errD_fake.backward()
                    D_G_z1 = output.data.mean()
                errD = errD_real + errD_fake
                self.optimizerD.step()
                self.optimizer_predD.step()

                ############################
                # (2) Update G network: maximize -log(1 - D(G(z)))
                ###########################
                self.G.zero_grad()
                label.fill_(real_label)

                # Compute gradient of G w/ predicted (looked-ahead) D.
                with self.optimizer_predD.lookahead(step=dpred_step):
                    fake = self.G(noise)
                    output = self.D(fake)
                    errG = self.criterion(output, label)
                    errG.backward()
                    D_G_z2 = output.data.mean()
                self.optimizerG.step()
                self.optimizer_predG.step()

                # Record per-batch statistics for checkpoint()/return.
                self.G_losses.append(errG.data)
                self.D_losses.append(errD.data)
                self.Dxs.append(D_x)
                self.DGz1s.append(D_G_z1)
                self.DGz2s.append(D_G_z2)

                sync_print = ''
                if sync_every != 1 and itr % sync_every == 0:
                    # Deferred-sync mode: pull parameters together manually.
                    sync_print = '\t -- synced at iteration ' + str(itr)
                    self.D.sync_parameters()
                    self.G.sync_parameters()

                if self.verbose:
                    # .item() replaces raw 0-dim tensors, which older
                    # str.format() float specs reject.
                    print('[{}/{}][{}/{}] {:0.2f} secs, Loss_D:{:0.4f} Loss_G:'
                          '{:0.4f} D(x): {:0.4f} D(G(z)): {:0.4f} / {:0.4f}{}'.
                          format(epoch, niter, i, len(dataset),
                                 time.time() - c1, errD.item(), errG.item(),
                                 D_x, D_G_z1, D_G_z2, sync_print))
                if itr % viz_every == 0:
                    self.checkpoint(epoch)
                itr += 1

            if sync_every != 1:
                # End-of-epoch sync in deferred-sync mode.
                if self.verbose:
                    print('Synchronizing Parameters at epoch:', i)
                self.D.sync_parameters()
                self.G.sync_parameters()
            if self.verbose:
                self.checkpoint(epoch)
            if self.verbose:
                print("Finished epoch in {:0.2f} seconds".format(time.time() -
                                                                 c0))
        return (self.G_losses, self.D_losses, self.Dxs, self.DGz1s,
                self.DGz2s, img_list)