bceloss = bceloss.cuda()
lib.init_params(gen)
lib.init_params(disc)

optim_g = torch.optim.Adam(gen.parameters(), lr=1e-3)
optim_d = torch.optim.Adam(disc.parameters(), lr=1e-3)

for epoch in range(epochs):
    gen.train()
    disc.train()
    for (x, _) in TrainDataLoader:
        #x = x.view(-1, 28*28)
        num_data = x.shape[0]
        noise = lib.sample_noise(num_data, num_noise)
        zeros = torch.zeros(num_data, 1)
        ones = torch.ones(num_data, 1)
        if torch.cuda.is_available():
            x = x.cuda()
            noise = noise.cuda()
            zeros = zeros.cuda()
            ones = ones.cuda()

        x_g = gen(noise)

        ### Discriminator train
        optim_d.zero_grad()
        disc.zero_grad()
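Every snippet in this section relies on a small `lib` helper module whose implementation is not shown here. A minimal sketch of the two helpers used most often, under the assumption that `sample_noise` draws standard Gaussian noise and `init_params` applies a DCGAN-style normal initialization, could look like this:

import torch
import torch.nn as nn

# Hypothetical sketch of the `lib` helpers used throughout these snippets.
# The real implementations are not shown, so the exact noise distribution
# and initialization scheme are assumptions.

def sample_noise(num_data, num_noise, device=None):
    # One standard-Gaussian noise vector per sample in the batch.
    return torch.randn(num_data, num_noise, device=device)

def init_params(model):
    # Re-initialize conv and linear layers before training on a new task.
    for m in model.modules():
        if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d, nn.Linear)):
            nn.init.normal_(m.weight, 0.0, 0.02)
            if m.bias is not None:
                nn.init.zeros_(m.bias)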
lib.init_params(gen)
lib.init_params(disc)

optim_g = torch.optim.Adam(gen.parameters(), lr=1e-3, betas=(0, 0.9))
optim_d = torch.optim.Adam(disc.parameters(), lr=1e-3, betas=(0, 0.9))

for epoch in range(epochs):
    gen.train()
    disc.train()
    for i in range(1):
        for _train_data in train_data:
            x = _train_data.view(-1, 1, 128, 128)
            num_data = x.shape[0]
            noise = lib.sample_noise(num_data, num_noise).to(device)

            x_g = gen(noise)

            ### Discriminator train
            optim_d.zero_grad()

            ## Regularization Term
            eps = torch.rand(1).item()
            x_hat = (x.detach().clone() * eps + x_g.detach().clone() * (1 - eps)).requires_grad_(True)
            loss_xhat = disc(x_hat)
            fake = torch.ones(loss_xhat.shape[0], 1).requires_grad_(False).to(device)
def train(self, data_loader, task, generator, classifier):
    self.cur_task = task
    running_loss = 0.0

    for i, data in enumerate(data_loader):
        x, y = data
        x = x.to(self.device)
        y = y.to(self.device)

        self.G.data.fill_(0.0)

        # Compute gradient w.r.t. past tasks with episodic memory
        if self.cur_task > 0:
            for k in range(0, self.cur_task):
                self.zero_grad()
                noise = lib.sample_noise(self.mem_size, self.num_noise).to(self.device)
                g_image = generator(noise).to(self.device)
                g_label = classifier(g_image).max(dim=1)[1]
                g_pred = self.net(g_image)
                loss = self.criterion(g_pred, g_label)
                loss.backward()

                # Copy parameters into Matrix "G"
                j = 0
                for params in self.parameters():
                    if params is not None:
                        if j == 0:
                            stpt = 0
                        else:
                            stpt = sum(self.grad_numels[:j])
                        endpt = sum(self.grad_numels[:j + 1])
                        self.G[stpt:endpt, k].data.copy_(params.grad.data.view(-1))
                        j += 1

        self.zero_grad()
        self.optim.zero_grad()

        # Compute gradient w.r.t. current continuum
        pred = self.net(x)
        # pred[:, : self.cur_task * 10].data.fill_(-10e10)
        # pred[:, (self.cur_task+1) * 10:].data.fill_(-10e10)
        # pred = pred[:, self.cur_task*10: (self.cur_task+1)*10]
        loss = self.criterion(pred, y)
        loss.backward()
        running_loss += loss.item()

        if i % 100 == 99:
            msg = '[%d\t%d] AVG. loss: %.3f\n' % (task + 1, i + 1, running_loss / 100)
            print(msg)
            #self.log_file.write(msg)
            running_loss = 0.0

        if self.cur_task > 0:
            grad = []
            j = 0
            for params in self.parameters():
                if params is not None:
                    if j == 0:
                        stpt = 0
                    else:
                        stpt = sum(self.grad_numels[:j])
                    endpt = sum(self.grad_numels[:j + 1])
                    self.G[stpt:endpt, self.cur_task].data.copy_(params.grad.view(-1))
                    j += 1

            # Solve Quadratic Problem
            dotprod = torch.mm(self.G[:, self.cur_task].unsqueeze(0),
                               self.G[:, :self.cur_task + 1])

            # projection
            if (dotprod < 0).sum() > 0:
                if i % 100 == 99:
                    print("projection")
                mem_grad_np = self.G[:, :self.cur_task + 1].cpu().t().double().numpy()
                curtask_grad_np = self.G[:, self.cur_task].unsqueeze(1).cpu().contiguous().view(-1).double().numpy()

                t = mem_grad_np.shape[0]
                P = np.dot(mem_grad_np, mem_grad_np.transpose())
                P = 0.5 * (P + P.transpose()) + np.eye(t) * self.eps
                q = np.dot(mem_grad_np, curtask_grad_np) * (-1)
                G = np.eye(t)
                h = np.zeros(t) + self.margin
                v = quadprog.solve_qp(P, q, G, h)[0]
                x = np.dot(v, mem_grad_np) + curtask_grad_np
                newgrad = torch.Tensor(x).view(-1, )

                # Copy gradients into params
                j = 0
                for params in self.parameters():
                    if params is not None:
                        if j == 0:
                            stpt = 0
                        else:
                            stpt = sum(self.grad_numels[:j])
                        endpt = sum(self.grad_numels[:j + 1])
                        params.grad.data.copy_(newgrad[stpt:endpt].contiguous().view(params.grad.data.size()))
                        j += 1

        self.optim.step()
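The quadprog call above appears to implement the dual form of the GEM (Gradient Episodic Memory) projection: minimize 0.5 * v^T (G G^T) v + (G g)^T v subject to v >= margin, then take the projected gradient g~ = G^T v* + g. A hedged, standalone sketch of that step, with `project_gradient` a hypothetical helper name and the same P/q/G/h construction as in the method above, is:

import numpy as np
import quadprog

def project_gradient(mem_grads, cur_grad, margin=0.5, eps=1e-3):
    # mem_grads: (num_tasks, num_params) float64 gradients from replayed tasks
    # cur_grad:  (num_params,) float64 gradient of the current task
    t = mem_grads.shape[0]
    P = mem_grads @ mem_grads.T
    P = 0.5 * (P + P.T) + np.eye(t) * eps      # symmetrize and regularize
    q = -(mem_grads @ cur_grad)
    G = np.eye(t)                              # constraint: v >= margin
    h = np.zeros(t) + margin
    v = quadprog.solve_qp(P, q, G, h)[0]       # dual variables v*
    return mem_grads.T @ v + cur_grad          # projected gradient g~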
def train(**kwargs):
    TrainDataLoaders = kwargs['TrainDataLoaders']
    TestDataLoaders = kwargs['TestDataLoaders']
    batch_size = kwargs['batch_size']
    num_noise = kwargs['num_noise']
    cur_task = kwargs['cur_task']
    gen = kwargs['gen']
    disc = kwargs['disc']
    solver = kwargs['solver']
    pre_gen = kwargs['pre_gen']
    pre_solver = kwargs['pre_solver']
    ratio = kwargs['ratio']
    epochs = kwargs['epochs']

    assert 0 <= ratio <= 1
    ld = 10

    celoss = torch.nn.CrossEntropyLoss()  # classification loss for the solver

    optim_g = torch.optim.Adam(gen.parameters(), lr=0.001, betas=(0, 0.9))
    optim_d = torch.optim.Adam(disc.parameters(), lr=0.001, betas=(0, 0.9))
    optim_s = torch.optim.Adam(solver.parameters(), lr=0.001)

    TrainDataLoader = TrainDataLoaders[cur_task]

    # Generator Training
    for epoch in range(epochs):
        gen.train()
        disc.train()
        for i, (x, y) in enumerate(TrainDataLoader):
            x = x.view(-1, 28 * 28)
            num_data = x.shape[0]
            noise = lib.sample_noise(num_data, num_noise)
            if torch.cuda.is_available():
                x = x.cuda()
                noise = noise.cuda()

            if pre_gen is not None:
                with torch.no_grad():
                    # append generated image & label from previous scholar
                    x_g = pre_gen(lib.sample_noise(batch_size, num_noise))
                    '''
                    gimg_min = gen_image.min(dim=1, keepdim=True)[0].min(dim=2, keepdim=True)[0]
                    gen_image = ((gen_image - gimg_min) * 256)
                    '''
                    x = torch.cat((x, x_g))
                    perm = torch.randperm(x.shape[0])[:num_data]
                    x = x[perm]
                    #x = x.unsqueeze(1)

            ### Discriminator train
            optim_d.zero_grad()
            x_g = gen(noise)

            ## Regularization term
            eps = torch.rand(1).item()
            x_hat = (x.detach().clone() * eps + x_g.detach().clone() * (1 - eps)).requires_grad_(True)
            loss_xhat = disc(x_hat)
            fake = torch.ones(loss_xhat.shape[0], 1).requires_grad_(False)
            if torch.cuda.is_available():
                fake = fake.cuda()

            gradients = torch.autograd.grad(outputs=loss_xhat,
                                            inputs=x_hat,
                                            grad_outputs=fake,
                                            create_graph=True,
                                            retain_graph=True,
                                            only_inputs=True)[0]
            gradients = gradients.view(gradients.shape[0], -1)
            gp = ((gradients.norm(2, dim=1) - 1) ** 2).mean() * ld

            p_real = disc(x)
            p_fake = disc(x_g.detach())

            loss_d = torch.mean(p_fake) - torch.mean(p_real) + gp
            loss_d.backward()
            optim_d.step()

            #if i % 5 == 4:
            ### Generator Training
            optim_g.zero_grad()
            p_fake = disc(x_g)
            loss_g = -torch.mean(p_fake)
            loss_g.backward()
            optim_g.step()

        print("[Epoch %d/%d] [D loss: %f] [G loss: %f]" % (epoch + 1, epochs, loss_d.item(), loss_g.item()))

        if epoch % 10 == 9:
            gen_image = gen(lib.sample_noise(24, num_noise)).view(24, 1, 28, 28)
            lib.imshow_grid(gen_image)

    # train solver
    for image, label in TrainDataLoader:
        if torch.cuda.is_available():
            image = image.cuda()
            label = label.cuda()

        optim_s.zero_grad()
        output = solver(image)
        loss = celoss(output, label) * ratio

        if pre_solver is not None:
            noise = lib.sample_noise(batch_size, num_noise)
            g_image = pre_gen(noise)
            g_label = pre_solver(g_image).max(dim=1)[1]
            g_output = solver(g_image)
            loss += celoss(g_output, g_label) * (1 - ratio)

        loss.backward()
        optim_s.step()
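For context, a hypothetical call site for this `train` function might look like the following; the network objects, the per-task data-loader split, and every hyperparameter value here are illustrative rather than taken from the source:

# Hypothetical invocation; the actual driver script is not shown in this section.
train(TrainDataLoaders=TrainDataLoaders,   # one DataLoader per task
      TestDataLoaders=TestDataLoaders,
      batch_size=64,
      num_noise=100,
      cur_task=0,
      gen=gen,
      disc=disc,
      solver=solver,
      pre_gen=None,        # no previous scholar for the first task
      pre_solver=None,
      ratio=0.5,           # weight between real and replayed data for the solver
      epochs=100)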
lib.init_params(disc)

TrainDataLoader = TrainDataLoaders[t]

optim_g = torch.optim.Adam(gen.parameters(), lr=0.001, betas=(0, 0.9))
optim_d = torch.optim.Adam(disc.parameters(), lr=0.001, betas=(0, 0.9))

# Generator Training
for epoch in range(gen_epochs):
    gen.train()
    disc.train()
    for i, (x, _) in enumerate(TrainDataLoader):
        x = x.to(device)
        num_data = x.shape[0]
        noise = lib.sample_noise(num_data, num_noise, device).to(device)

        if pre_gen is not None:
            with torch.no_grad():
                # append generated image & label from previous scholar
                datapart = int(num_data * ratio)
                perm = torch.randperm(num_data)[:datapart]
                x = x[perm]

                x_g = pre_gen(lib.sample_noise(num_data, num_noise, device))
                perm = torch.randperm(num_data)[:num_data - datapart]
                x_g = x_g[perm]

                x = torch.cat((x, x_g))
def train(**kwargs):
    TrainDataLoaders = kwargs['TrainDataLoaders']
    TestDataLoaders = kwargs['TestDataLoaders']
    batch_size = kwargs['batch_size']
    num_noise = kwargs['num_noise']
    cur_task = kwargs['cur_task']
    gen = kwargs['gen']
    disc = kwargs['disc']
    solver = kwargs['solver']
    pre_gen = kwargs['pre_gen']
    pre_solver = kwargs['pre_solver']
    ratio = kwargs['ratio']
    epochs = kwargs['epochs']

    assert 0 <= ratio <= 1

    bceloss = torch.nn.BCELoss()
    celoss = torch.nn.CrossEntropyLoss()

    gen_optim = torch.optim.Adam(gen.parameters(), lr=0.001)
    disc_optim = torch.optim.Adam(disc.parameters(), lr=0.001)
    solver_optim = torch.optim.Adam(solver.parameters(), lr=0.001)

    train_dataloader = TrainDataLoaders[cur_task]

    # GAN Training
    for epoch in range(epochs):
        for image, label in train_dataloader:
            num_images = image.shape[0]
            if torch.cuda.is_available():
                image = image.cuda()
                #label = label.cuda()

            if pre_gen is not None:
                with torch.no_grad():
                    # append generated image & label from previous scholar
                    gen_image = pre_gen(lib.sample_noise(batch_size, num_noise)).squeeze()
                    gimg_min = gen_image.min(dim=1, keepdim=True)[0].min(dim=2, keepdim=True)[0]
                    gen_image = ((gen_image - gimg_min) * 256)
                    gen_label = pre_solver(gen_image).max(dim=1)[1]

                    image = torch.cat((image, gen_image))
                    label = torch.cat((label, gen_label))
                    perm = torch.randperm(image.shape[0])[:num_images]
                    image = image[perm]
                    image = image.unsqueeze(1)

            ### Discriminator Training
            disc_optim.zero_grad()

            p_real = disc(image)
            p_fake = disc(gen(lib.sample_noise(image.shape[0], num_noise)))

            ones = torch.ones_like(p_real)
            zeros = torch.zeros_like(p_real)
            if torch.cuda.is_available():
                ones = ones.cuda()
                zeros = zeros.cuda()

            loss_d = bceloss(p_real, ones) + bceloss(p_fake, zeros)
            loss_d.backward()
            disc_optim.step()

            # Clipping weights (in place, so the constraint actually takes effect)
            for params in disc.parameters():
                params.data.clamp_(-0.01, 0.01)

            ### Generator Training
            gen_optim.zero_grad()
            p_fake = disc(gen(lib.sample_noise(batch_size, num_noise)))
            ones = torch.ones_like(p_fake)
            if torch.cuda.is_available():
                ones = ones.cuda()
            loss_g = bceloss(p_fake, ones)
            loss_g.backward()
            gen_optim.step()

        if epoch % 50 == 49:
            p_real, p_fake = lib.gan_evaluate(batch_size=batch_size,
                                              num_noise=num_noise,
                                              cur_task=cur_task,
                                              gen=gen,
                                              disc=disc,
                                              TestDataLoaders=TestDataLoaders)
            gen_image = gen(lib.sample_noise(batch_size, num_noise))
            print("(Epoch {}/{}) p_real: {} | p_fake: {}\n".format(epoch, epochs, p_real, p_fake))
            lib.imshow_grid(gen_image)

    # train solver
    for image, label in train_dataloader:
        if torch.cuda.is_available():
            image = image.cuda()
            label = label.cuda()

        solver_optim.zero_grad()
        output = solver(image)
        loss = celoss(output, label) * ratio

        if pre_solver is not None:
            noise = lib.sample_noise(batch_size, num_noise)
            g_image = pre_gen(noise)
            g_label = pre_solver(g_image).max(dim=1)[1]
            g_output = solver(g_image)
            loss += celoss(g_output, g_label) * (1 - ratio)

        loss.backward()
        solver_optim.step()
### optimizer
optim_g = torch.optim.Adam(gen.parameters(), lr=1e-3, betas=(0, 0.9))
optim_d = torch.optim.Adam(disc.parameters(), lr=1e-3, betas=(0, 0.9))
optim_s = torch.optim.Adam(solver.parameters(), lr=1e-3)

for epoch in range(epochs):
    gen.train()
    disc.train()

    ### WGAN_GP Learning
    for i in range(10):
        for _train_data in train_data:
            x = _train_data.view(-1, 1, 128, 128).to(device)
            num_data = x.shape[0]
            noise = lib.sample_noise(num_data, num_noise).to(device)

            x_g = gen(noise)

            ### Discriminator train
            optim_d.zero_grad()

            ## Regularization Term
            eps = torch.rand(1).item()
            x_hat = (x.detach().clone() * eps + x_g.detach().clone() * (1 - eps)).requires_grad_(True)
            loss_xhat = disc(x_hat)
            fake = torch.ones(loss_xhat.shape[0], 1).requires_grad_(False).to(device)

            gradients = torch.autograd.grad(
                outputs=loss_xhat,