def train(train_loader, model, reglog, criterion, optimizer, epoch):
    """Train the linear classifier `reglog` on top of a frozen backbone for one epoch.

    The backbone `model` is kept in eval mode (freezes batch-norm statistics);
    gradients flow only into `reglog` via `optimizer`.  Relies on module-level
    `args`, `AverageMeter`, `accuracy`, `forward` and `learning_rate_decay`.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # freeze also batch norm layers
    model.eval()

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # adjust learning rate
        learning_rate_decay(optimizer, len(train_loader) * epoch + i, args.lr)

        # FIX: `async` is a reserved keyword since Python 3.7 (SyntaxError);
        # PyTorch renamed the argument to `non_blocking` with identical semantics.
        target = target.cuda(non_blocking=True)
        input_var = torch.autograd.Variable(input.cuda())
        target_var = torch.autograd.Variable(target)

        # compute output
        output = forward(input_var, model, reglog.conv)
        output = reglog(output)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        # FIX: `loss.data[0]` indexes a 0-dim tensor (an error since torch 0.4);
        # `loss.item()` is the supported scalar accessor.
        losses.update(loss.item(), input.size(0))
        top1.update(prec1[0], input.size(0))
        top5.update(prec5[0], input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if args.verbose and i % 100 == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses, top1=top1, top5=top5))
def prepare_training(self):
    """Create the optimizer, global step and variable initializer on self.graph.

    Reads `self._config.train.optimizer` to select between adam, adam_decay
    (Adam with a decayed LR and transformer-style betas), sgd and mom.
    Raises ValueError for any other optimizer name (the original silently
    left `self._optimizer` unset, deferring the failure to first use).
    """
    with self.graph.as_default():
        # Optimizer
        self.global_step = tf.get_variable(
            name='global_step', dtype=tf.int64, shape=[],
            trainable=False, initializer=tf.zeros_initializer)

        self.learning_rate = tf.convert_to_tensor(
            self._config.train.learning_rate, dtype=tf.float32)

        if self._config.train.optimizer == 'adam':
            self._optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate)
        elif self._config.train.optimizer == 'adam_decay':
            # Scale the base LR by a step-dependent decay schedule.
            self.learning_rate *= learning_rate_decay(
                self._config, self.global_step)
            self._optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate,
                beta1=0.9, beta2=0.98, epsilon=1e-9)
        elif self._config.train.optimizer == 'sgd':
            self._optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=self.learning_rate)
        elif self._config.train.optimizer == 'mom':
            self._optimizer = tf.train.MomentumOptimizer(
                self.learning_rate, momentum=0.9)
        else:
            # FIX: fail fast on a typo'd optimizer name instead of leaving
            # self._optimizer undefined (AttributeError much later).
            raise ValueError(
                'Unknown optimizer: %s' % self._config.train.optimizer)

        # Uniform scaling initializer.
        self._initializer = init_ops.variance_scaling_initializer(
            scale=1.0, mode='fan_avg', distribution='uniform')
def train(model, train_loader, test_loader, features, cfg):
    """Train a capsule-style model with reconstruction loss; evaluate every 5 epochs.

    `features` supplies dataset metadata ('num_samples', 'num_classes',
    'num_test_samples'); `cfg` supplies batch_size, epoch, use_cuda and the
    learning-rate-decay parameters consumed by `learning_rate_decay`.
    """
    global_step = 0
    lr = 0
    # FIX: create the optimizer handle up front.  The original only built it
    # inside `if not lr_decay_finished`, so if decay reported finished on the
    # very first step, `optimizer.step()` raised NameError.
    optimizer = None
    num_batch = int(features['num_samples'] / cfg.batch_size)
    for epoch in range(cfg.epoch):
        losses = 0
        acces = 0
        for step, (batch_xs, batch_ys) in enumerate(
                tqdm(train_loader, total=num_batch, ncols=50, leave=False, unit='b')):
            if len(batch_ys.shape) <= 1:  # only assume shape is (bs,)
                # Convert integer labels to one-hot targets.
                one_hot = torch.FloatTensor(cfg.batch_size, features['num_classes']).zero_()
                batch_ys = batch_ys.unsqueeze_(1)
                one_hot.scatter_(1, batch_ys, 1.)
                batch_ys = one_hot
            batch_xs, batch_ys = Variable(batch_xs), Variable(batch_ys)
            if cfg.use_cuda:
                batch_xs, batch_ys = batch_xs.cuda(), batch_ys.cuda()

            lr, lr_decay_finished = learning_rate_decay(global_step, lr, cfg)
            # NOTE(review): rebuilding Adam every step resets its moment
            # estimates; kept as in the original, but consider updating
            # param_groups[...]['lr'] on a single optimizer instead.
            if optimizer is None or not lr_decay_finished:
                optimizer = optim.Adam(model.parameters(), lr=lr)

            out, reconstruction_2d = model(batch_xs, batch_ys)
            classification_loss, reconstruction_loss = model.loss(
                batch_xs, out, reconstruction_2d, batch_ys)
            loss = 0.5 * (classification_loss + reconstruction_loss)
            model.zero_grad()
            loss.backward()
            optimizer.step()
            losses = losses + loss.cpu().data.numpy()[0]
            global_step += 1

        if epoch % 5 == 0:
            for i, (batch_xs, batch_ys) in enumerate(test_loader):
                if len(batch_ys.shape) <= 1:  # only assume shape is (bs,)
                    one_hot = torch.FloatTensor(cfg.batch_size, features['num_classes']).zero_()
                    batch_ys = batch_ys.unsqueeze_(1)
                    one_hot.scatter_(1, batch_ys, 1.)
                    batch_ys = one_hot
                batch_xs, batch_ys = Variable(batch_xs), Variable(batch_ys)
                if cfg.use_cuda:
                    batch_xs, batch_ys = batch_xs.cuda(), batch_ys.cuda()
                out, _ = model(batch_xs, batch_ys)
                acc = model.classification_loss(out, batch_ys, 1)
                acces = acces + acc.cpu().data.numpy()[0]
            # Save a reconstruction from the last test batch for inspection.
            _, reconstruction_2d = model(batch_xs, batch_ys)
            save_image(cfg, epoch, global_step, reconstruction_2d, batch_xs,
                       features, idx=40)
            print('epoch is %d, training loss is %.4f, test acc is %.4f' %
                  (epoch, losses, acces / features['num_test_samples']))
def update(self): self.global_step = tf.get_variable('global_step', initializer=0, dtype=tf.int32, trainable=False) self.generator_lr = learning_rate_decay(hp.G_LR, global_step=self.global_step) # self.discriminator_lr = learning_rate_decay(hp.D_LR, global_step=self.global_step) # Generator loss # self.reconstruction_loss = tf.reduce_mean(tf.abs(self.ori_out - self.ori_feat)) # ori_out 生成器生成出来的sp # self.cycle_loss = tf.reduce_mean(tf.abs(self.cycle_ori_out - self.ori_feat)) # 以上两个是 GAN 才有的 self.construction_loss = tf.reduce_mean(tf.abs(self.aim_out - self.aim_mel)) # 另外加的,转换损失 # self.ori_kl_loss = - 0.5 * tf.reduce_sum(1 + self.ori_log_var - tf.pow(self.ori_mu, 2) - tf.exp(self.ori_log_var)) # 没有自己变自己的需求了 self.aim_kl_loss = - 0.5 * tf.reduce_sum(1 + self.aim_log_var - tf.pow(self.aim_mu, 2) - tf.exp(self.aim_log_var)) # self.cycle_kl_loss = - 0.5 * tf.reduce_sum(1 + self.cycle_log_var - tf.pow(self.cycle_mu, 2) - tf.exp(self.cycle_log_var)) self.kl_loss_weight = control_weight(self.global_step) # self.kl_loss = self.kl_loss_weight * (self.ori_kl_loss + self.aim_kl_loss + self.cycle_kl_loss) self.kl_loss = self.kl_loss_weight * (self.aim_kl_loss) # kl_loss 修改版本(去掉了另外两个没用上的 loss) # self.GAN_G_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.t_G, logits=self.predict_fake_P)) # self.G_loss = self.reconstruction_loss + self.cycle_loss + self.kl_loss + self.GAN_G_loss # 修改版,现在 G 只有一个损失,就是 自身 VAE 的 kl_loss 损失 self.G_loss = self.kl_loss + self.construction_loss # 修改,加上self.construction_loss # Variables trainable_variables = tf.trainable_variables() # 这个还真么看懂什么作用;再请教!!! 
self.G_vars = [var for var in trainable_variables if 'generator' in var.name] # self.D_vars = [var for var in trainable_variables if 'discriminator' in var.name] # Optimizer self.G_optimizer = tf.train.AdamOptimizer(self.generator_lr) # self.D_optimizer = tf.train.AdamOptimizer(self.discriminator_lr) # Generator Gradient Clipping And Update ; G 梯度裁剪 和 更新 self.G_clipped = [] self.G_gvs = self.G_optimizer.compute_gradients(self.G_loss, var_list=self.G_vars) """ computer_gradients(loss, val_list) val_list是进行求偏导的变量的列表,默认为graph中收集的变量列表 这里的操作是计算出各个变量的偏导数(梯度),是为了防止梯度爆炸和梯度消失。通过对gradient的修正,来进行避免。 """ for grad, var in self.G_gvs: grad = tf.clip_by_norm(grad, 5.) """ tf.clip_by_norm(t,clip_norm,axes=None,name=None) 指对梯度进行裁剪,通过控制梯度的最大范式,防止梯度爆炸的问题,是一种比较常用的梯度规约的方式。 """ self.G_clipped.append((grad, var)) self.G_train_op = self.G_optimizer.apply_gradients(self.G_clipped, global_step=self.global_step) """
def build_training_scheme(self):
    '''
    Build the Adam train op with optional LR decay, optional variable
    freezing, and per-gradient value clipping.

    hp.update_weights: list of strings of regular expressions used to match
    scope prefixes of variables with tf.get_collection. Only these will be
    updated by the graph's train_op: others will be frozen in training.
    TODO: this comment is now out of place...
    '''
    hp = self.hp
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    if hp.decay_lr:
        self.lr = learning_rate_decay(hp.lr, self.global_step)
    else:
        self.lr = hp.lr
    self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr,
                                            beta1=hp.beta1,
                                            beta2=hp.beta2,
                                            epsilon=hp.epsilon)
    tf.summary.scalar("lr", self.lr)

    if self.hp.update_weights:
        train_variables = filter_variables_for_update(self.hp.update_weights)
        print('Subset of trainable variables chosen for finetuning.')  ## TODO: add to logging!
        print('Variables not in this list will remain frozen:')
        for variable in train_variables:
            print(variable.name)
    else:
        train_variables = None  ## default value -- everything included in compute_gradients

    ## gradient clipping
    self.gvs = self.optimizer.compute_gradients(
        self.loss, var_list=train_variables
    )  ## var_list: Optional list or tuple of tf.Variable to update to minimize loss
    self.clipped = []
    for grad, var in self.gvs:
        # FIX: compute_gradients yields (None, var) for variables that do not
        # influence the loss; tf.clip_by_value(None, ...) raises, so pass
        # None gradients through unclipped.
        if grad is not None:
            grad = tf.clip_by_value(grad, -1., 1.)
        self.clipped.append((grad, var))
    self.train_op = self.optimizer.apply_gradients(
        self.clipped, global_step=self.global_step)

    # Summary
    self.merged = tf.summary.merge_all()
def optimize(self):
    """Set up the Adam training op with global-norm gradient clipping."""
    # Step counter, advanced by every apply_gradients call.
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    # Learning rate decays as a function of the global step.
    self.learning_rate = learning_rate_decay(global_step=self.global_step)
    self.optimizer = tf.train.AdamOptimizer(
        learning_rate=self.learning_rate)
    # Split (gradient, variable) pairs into parallel tuples.
    grads_and_vars = self.optimizer.compute_gradients(self.loss)
    grads = tuple(gv[0] for gv in grads_and_vars)
    tvars = tuple(gv[1] for gv in grads_and_vars)
    self.gradients = grads
    # Jointly rescale all gradients so their global norm is at most 1.0.
    capped, _ = tf.clip_by_global_norm(grads, 1.0)
    self.opt_train = self.optimizer.apply_gradients(
        zip(capped, tvars), global_step=self.global_step)
### create dataset train_dataset = datasets.MultiFramesDataset(opts, "train") ### start training while model.epoch < opts.epoch_max: model.epoch += 1 ### re-generate train data loader for every epoch data_loader = utils.create_data_loader(train_dataset, opts, "train") ### update learning rate current_lr = utils.learning_rate_decay(opts, model.epoch) for param_group in optimizer.param_groups: param_group['lr'] = current_lr ## submodule flow_warping = Resample2d().to(device) downsampler = nn.AvgPool2d((2, 2), stride=2).to(device) ### criterion and loss recorder if opts.loss == 'L2': criterion = nn.MSELoss(size_average=True) elif opts.loss == 'L1': criterion = nn.L1Loss(size_average=True) else:
# Resume from a checkpoint when --continual points at an existing file.
if os.path.exists(args.continual):
    state_dict = torch.load(args.continual)
    initial_iter = state_dict['iter']
    model.encoder.load_state_dict(state_dict['encoder'])
    model.decoder.load_state_dict(state_dict['decoder'])
    optimizer.load_state_dict(state_dict['optimizer'])
# NOTE(review): if the checkpoint does not exist, `initial_iter` must have
# been assigned earlier in the file — confirm a default exists.

# log writer
writer = SummaryWriter(log_dir=str(log_dir))

# for maximum iteration
model.to(device)
for i in tqdm(range(initial_iter, args.max_iter)):
    # adjust learning rate (per-iteration decay, applied to all param groups)
    lr = learning_rate_decay(args.learning_rate, args.learning_rate_decay, i)
    for group in optimizer.param_groups:
        group['lr'] = lr

    # get images (presumably endless iterators over the two datasets — verify)
    content_images = next(content_iter).to(device)
    style_images = next(style_iter).to(device)

    # calculate loss; model returns the stylized output plus both loss terms
    g, loss_content, loss_style = model(content_images, style_images)
    loss_content = args.content_weight * loss_content
    loss_style = args.style_weight * loss_style
    loss = loss_content + loss_style

    # optimize the network
    optimizer.zero_grad()
    # NOTE(review): chunk is truncated in this view; backward()/step() follow.
# Paired rain-removal datasets: one standard, one with haze.
train_dataset = datasets_multiple.MultiFramesDataset(
    opts, "rain_removal", "train")
train_haze_dataset = datasets_multiple_haze.MultiFramesHazeDataset(
    opts, "rain_removal_haze", "train")

### start training
while multi_model_res.epoch < opts.epoch_max:
    multi_model_res.epoch += 1

    ### re-generate train data loader for every epoch
    data_loader = utils.create_data_loader(train_dataset, opts, "train")
    data_haze_loader = utils.create_data_loader(train_haze_dataset, opts, "train")

    ### update learning rate (per-epoch decay applied to every param group)
    current_lr = utils.learning_rate_decay(opts, multi_model_res.epoch)
    for param_group in optimizer.param_groups:
        param_group['lr'] = current_lr

    # Select the reconstruction criterion; `size_average` is the deprecated
    # pre-1.0 torch spelling of reduction='mean'.
    if opts.loss == 'L2':
        criterion = nn.MSELoss(size_average=True)
    elif opts.loss == 'L1':
        criterion = nn.L1Loss(size_average=True)
    else:
        raise Exception("Unsupported criterion %s" % opts.loss)
    criterion_ssim = SSIM()
    criterion_mse = nn.MSELoss(size_average=True)
    criterion_l1 = nn.L1Loss(size_average=True)
    # NOTE(review): chunk is truncated here in this view; the per-batch
    # training loop follows.
def build(self):
    """Construct the training or inference graph for the few-shot
    Embedder/Generator/Discriminator model.

    When `self.training` is set, builds losses, summaries and
    gradient-clipped Adam update ops, with separate meta-learning
    (`fine_tune == False`) and fine-tuning (`fine_tune == True`) paths.
    Otherwise builds placeholders and the forward (inference) graph only.
    """
    if self.training:
        # Training Scheme
        self.learning_rate_EG = learning_rate_decay(
            self.learning_rate_EG, self.global_step)
        self.optimizer_EG = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate_EG)
        tf.summary.scalar("learning_rate_EG", self.learning_rate_EG)

        self.learning_rate_D = learning_rate_decay(self.learning_rate_D,
                                                   self.global_step)
        self.optimizer_D = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate_D)
        tf.summary.scalar("learning_rate_D", self.learning_rate_D)

        if not self.fine_tune:
            (self.idx, self.x, self.y, self.tx, self.ty) = data()
        else:
            # Fine-tuning feeds K frames/landmarks via placeholders instead
            # of the data pipeline.
            self.x = tf.placeholder(tf.float32,
                                    [self.K] + list(self.img_size), name='x')
            self.y = tf.placeholder(tf.float32,
                                    [self.K] + list(self.img_size), name='y')
            self.tx = tf.placeholder(tf.float32, list(self.img_size), name='tx')
            self.ty = tf.placeholder(tf.float32, list(self.img_size), name='ty')

        # Embedder
        # Calculate average encoding vector for video and AdaIn params input
        self.e_hat, self.psi_hat = self.Embedder(self.x, self.y,
                                                 sn=True, reuse=False)

        if not self.fine_tune:
            # Generator: generate frame using landmarks from frame t
            self.x_hat = self.Generator(self.ty, psi_Pe=self.psi_hat,
                                        sn=True, reuse=False)
            # Discriminator: realism score for the fake image
            self.r_x_hat, self.D_act_hat = self.Discriminator(
                self.x_hat, self.ty, i=self.idx, e_new=None,
                sn=True, reuse=False)
            # Discriminator: realism score for the real image
            self.r_x, self.D_act = self.Discriminator(
                self.tx, self.ty, i=self.idx, e_new=None,
                sn=True, reuse=True)
        else:
            x, y, _, _ = get_frame_data(self.frames)
            embedder_var_list = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, 'embedder')
            embedder_saver = tf.train.Saver(var_list=embedder_var_list)
            embedder_saver.restore(self.sess,
                                   tf.train.latest_checkpoint(self.logdir))
            # FIX: the initializer op must be CALLED — the original passed the
            # function object itself to sess.run, which raises in TF.
            self.sess.run(tf.global_variables_initializer())
            # FIX: the keyword is `feed_dict`, not `feeddict` (TypeError).
            e_hat, psi_hat = self.sess.run([self.e_hat, self.psi_hat],
                                           feed_dict={
                                               self.x: x,
                                               self.y: y
                                           })
            # Generator seeded directly with the pre-computed AdaIN params
            self.x_hat = self.Generator(self.ty, psi_Pe=None,
                                        psi_hat_init=psi_hat,
                                        sn=True, reuse=False)
            # Discriminator conditioned on the pre-computed embedding vector
            self.r_x_hat, self.D_act_hat = self.Discriminator(
                self.x_hat, self.ty, i=None, e_new=e_hat,
                sn=True, reuse=False)
            self.r_x, self.D_act = self.Discriminator(
                self.tx, self.ty, i=None, e_new=e_hat,
                sn=True, reuse=True)

        # Losses
        self.loss_CNT = self.loss_cnt(self.tx, self.x_hat)
        self.loss_ADV = self.loss_adv(self.r_x_hat, self.D_act,
                                      self.D_act_hat)
        if not self.fine_tune:
            # Matching loss against the per-video embedding W[idx].
            self.loss_MCH = self.loss_mch(
                self.e_hat,
                tf.squeeze(tf.nn.embedding_lookup(self.W, self.idx), axis=1))
            self.loss_EG = self.loss_CNT + self.loss_ADV + self.loss_MCH
        else:
            self.loss_EG = self.loss_CNT + self.loss_ADV
        self.loss_DSC = self.loss_dsc(self.r_x, self.r_x_hat)

        tf.summary.scalar("loss_CNT", self.loss_CNT)
        tf.summary.scalar("loss_ADV", self.loss_ADV)
        if not self.fine_tune:
            tf.summary.scalar("loss_MCH", self.loss_MCH)
        tf.summary.scalar("loss_EG", self.loss_EG)
        tf.summary.scalar("loss_DSC", self.loss_DSC)
        tf.summary.scalar("loss_r_x_hat", tf.reduce_mean(self.r_x_hat))
        tf.summary.scalar("loss_r_x", tf.reduce_mean(self.r_x))

        # Embedder & Generator Optimization
        EG_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                        'generator')
        if not self.fine_tune:
            EG_var_list += tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, 'embedder')
        self.grads_EG = self.optimizer_EG.compute_gradients(
            self.loss_EG, var_list=EG_var_list)
        ## gradient clipping
        # FIX: use `grad is not None` — `not grad == None` relies on Tensor
        # equality semantics; identity comparison is the documented idiom.
        self.clipped_EG = [
            (tf.clip_by_value(grad, -1., 1.) if grad is not None else grad, var)
            for grad, var in self.grads_EG
        ]
        self.train_EG = self.optimizer_EG.apply_gradients(
            self.clipped_EG)  #, global_step=self.global_step)

        # Discriminator Optimization
        D_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       'discriminator')
        self.grads_D = self.optimizer_D.compute_gradients(
            self.loss_DSC, var_list=D_var_list)
        ## gradient clipping (same None guard as above)
        self.clipped_D = [
            (tf.clip_by_value(grad, -1., 1.) if grad is not None else grad, var)
            for grad, var in self.grads_D
        ]
        # Neither apply_gradients advances global_step (two optimizations run
        # per training iteration, which would double-increment it); the step
        # is advanced exactly once per iteration via global_step_increment.
        self.train_D = self.optimizer_D.apply_gradients(
            self.clipped_D)  #, global_step=self.global_step)

        # Global step increment
        self.global_step_increment = tf.assign_add(
            self.global_step, 1, name="global_step_increment")

        tf.summary.image('Generator/1/x', self.tx)
        tf.summary.image('Generator/2/y', self.ty)
        tf.summary.image('Generator/3/x_hat', self.x_hat)
        # Summary
        self.merged = tf.summary.merge_all()
    else:
        # Inference graph only.
        self.ty = tf.placeholder(tf.float32, [None] + list(self.img_size),
                                 name='ty')
        if not self.fine_tune:
            self.x = tf.placeholder(tf.float32,
                                    [None] + list(self.img_size), name='x')
            self.y = tf.placeholder(tf.float32,
                                    [None] + list(self.img_size), name='y')
            # Embedder: average encoding vector and AdaIN params
            self.e_hat, self.psi_hat = self.Embedder(self.x, self.y,
                                                     sn=True, reuse=False)
            # Generator: generate frame using landmarks from frame t
            self.x_hat = self.Generator(self.ty, psi_Pe=self.psi_hat,
                                        sn=True, reuse=False)
        else:
            # Generator: AdaIN params come from the fine-tuned checkpoint.
            self.x_hat = self.Generator(self.ty, psi_Pe=None,
                                        psi_hat_init=None,
                                        sn=True, reuse=False)
# NOTE(review): `vgg = vgg(vgg_model)` rebinds the callable name `vgg` to its
# instance, so it cannot be constructed a second time — confirm this chunk
# runs only once.
vgg = vgg(vgg_model)
vgg.eval()
fusion_model.train()
three_dim_model.train()
FlowNet.train()
train_dataset = datasets_multiple.MultiFramesDataset(opts, "train")
# `reduce`/`size_average` are the deprecated pre-1.0 torch loss arguments
# (together equivalent to reduction='mean').
loss_fn = torch.nn.L1Loss(reduce=True, size_average=True)
while three_dim_model.epoch < opts.epoch_max:
    three_dim_model.epoch += 1
    data_loader = utils.create_data_loader(train_dataset, opts, "train")
    # Per-epoch LR decay; the flow network trains at 1/1000 of the base LR.
    current_lr = utils.learning_rate_decay(opts, three_dim_model.epoch)
    for param_group in optimizer.param_groups:
        param_group['lr'] = current_lr
    for param_group in optimizer_flow.param_groups:
        param_group['lr'] = current_lr * 0.001
    error_last = 1e8
    ts = datetime.now()
    for iteration, batch in enumerate(data_loader, 1):
        # Global iteration index across epochs.
        total_iter = (three_dim_model.epoch -
                      1) * opts.train_epoch_size + iteration
        cross_num = 1
        frame_i = []
        # NOTE(review): chunk is truncated here in this view.
def train(args):
    """Train a CycleGAN: two generators (A<->B) and two discriminators.

    Uses MSE GAN loss, L1 cyclic-reconstruction loss (weighted by
    CYCLIC_WEIGHT), an optional identity loss, and 50-element replay buffers
    for discriminator training.  Periodically writes sample translations and
    TensorBoard scalars via `logger`, and checkpoints all four networks
    after every epoch.  Relies on module-level constants INITIAL_LR,
    IMAGE_SIZE, BATCH_SIZE, NUM_TEST_SAMPLES, CYCLIC_WEIGHT, EPOCHS.
    """
    # set the logger
    logger = Logger('./logs')

    # GPU enabling
    if (args.gpu != None):
        use_cuda = True
        dtype = torch.cuda.FloatTensor
        torch.cuda.set_device(args.gpu)
        print("Current device: %s" % torch.cuda.get_device_name(args.gpu))
    else:
        # FIX: the original left `use_cuda`/`dtype` undefined when no GPU was
        # given, crashing with NameError below — fall back to CPU tensors.
        use_cuda = False
        dtype = torch.FloatTensor

    # define networks
    g_AtoB = Generator().type(dtype)
    g_BtoA = Generator().type(dtype)
    d_A = Discriminator().type(dtype)
    d_B = Discriminator().type(dtype)

    # optimizers
    optimizer_generators = Adam(
        list(g_AtoB.parameters()) + list(g_BtoA.parameters()), INITIAL_LR)
    optimizer_d_A = Adam(d_A.parameters(), INITIAL_LR)
    optimizer_d_B = Adam(d_B.parameters(), INITIAL_LR)

    # loss criterion
    criterion_mse = torch.nn.MSELoss()
    criterion_l1 = torch.nn.L1Loss()

    # get training data
    dataset_transform = transforms.Compose([
        transforms.Resize(int(IMAGE_SIZE * 1),
                          Image.BICUBIC),  # scale shortest side to image_size
        transforms.RandomCrop(
            (IMAGE_SIZE, IMAGE_SIZE)),  # random center image_size out
        transforms.ToTensor(),  # turn image from [0-255] to [0-1]
        transforms.Normalize(mean=(0.5, 0.5, 0.5),
                             std=(0.5, 0.5, 0.5))  # normalize
    ])
    dataloader = DataLoader(ImgPairDataset(args.dataroot, dataset_transform,
                                           'train'),
                            batch_size=BATCH_SIZE,
                            shuffle=True)

    # get some test data to display periodically
    test_data_A = torch.tensor([]).type(dtype)
    test_data_B = torch.tensor([]).type(dtype)
    for i in range(NUM_TEST_SAMPLES):
        imgA = ImgPairDataset(args.dataroot, dataset_transform,
                              'test')[i]['A'].type(dtype).unsqueeze(0)
        imgB = ImgPairDataset(args.dataroot, dataset_transform,
                              'test')[i]['B'].type(dtype).unsqueeze(0)
        test_data_A = torch.cat((test_data_A, imgA), dim=0)
        test_data_B = torch.cat((test_data_B, imgB), dim=0)

        # per-sample output directories for the periodic visualizations
        fileStrA = 'visualization/test_%d/%s/' % (i, 'B_inStyleofA')
        fileStrB = 'visualization/test_%d/%s/' % (i, 'A_inStyleofB')
        if not os.path.exists(fileStrA):
            os.makedirs(fileStrA)
        if not os.path.exists(fileStrB):
            os.makedirs(fileStrB)
        # save the untranslated originals once
        fileStrA = 'visualization/test_original_%s_%04d.png' % ('A', i)
        fileStrB = 'visualization/test_original_%s_%04d.png' % ('B', i)
        utils.save_image(
            fileStrA,
            ImgPairDataset(args.dataroot, dataset_transform, 'test')[i]['A'].data)
        utils.save_image(
            fileStrB,
            ImgPairDataset(args.dataroot, dataset_transform, 'test')[i]['B'].data)

    # replay buffers
    replayBufferA = utils.ReplayBuffer(50)
    replayBufferB = utils.ReplayBuffer(50)

    # training loop
    step = 0
    for e in range(EPOCHS):
        startTime = time.time()
        for idx, batch in enumerate(dataloader):
            real_A = batch['A'].type(dtype)
            real_B = batch['B'].type(dtype)

            # some examples seem to have only 1 color channel instead of 3
            if (real_A.shape[1] != 3):
                continue
            if (real_B.shape[1] != 3):
                continue

            # -----------------
            # train generators
            # -----------------
            optimizer_generators.zero_grad()
            utils.learning_rate_decay(INITIAL_LR, e, EPOCHS,
                                      optimizer_generators)

            # GAN loss
            fake_A = g_BtoA(real_B)
            disc_fake_A = d_A(fake_A)
            fake_B = g_AtoB(real_A)
            disc_fake_B = d_B(fake_B)
            replayBufferA.push(torch.tensor(fake_A.data))
            replayBufferB.push(torch.tensor(fake_B.data))
            target_real = Variable(torch.ones_like(disc_fake_A)).type(dtype)
            target_fake = Variable(torch.zeros_like(disc_fake_A)).type(dtype)
            loss_gan_AtoB = criterion_mse(disc_fake_B, target_real)
            loss_gan_BtoA = criterion_mse(disc_fake_A, target_real)
            loss_gan = loss_gan_AtoB + loss_gan_BtoA

            # cyclic reconstruction loss
            cyclic_A = g_BtoA(fake_B)
            cyclic_B = g_AtoB(fake_A)
            loss_cyclic_AtoBtoA = criterion_l1(cyclic_A, real_A) * CYCLIC_WEIGHT
            loss_cyclic_BtoAtoB = criterion_l1(cyclic_B, real_B) * CYCLIC_WEIGHT
            loss_cyclic = loss_cyclic_AtoBtoA + loss_cyclic_BtoAtoB

            # identity loss
            loss_identity = 0
            loss_identity_A = 0
            loss_identity_B = 0
            if (args.use_identity == True):
                identity_A = g_BtoA(real_A)
                identity_B = g_AtoB(real_B)
                loss_identity_A = criterion_l1(identity_A,
                                               real_A) * 0.5 * CYCLIC_WEIGHT
                loss_identity_B = criterion_l1(identity_B,
                                               real_B) * 0.5 * CYCLIC_WEIGHT
                loss_identity = loss_identity_A + loss_identity_B

            loss_generators = loss_gan + loss_cyclic + loss_identity
            loss_generators.backward()
            optimizer_generators.step()

            # -----------------
            # train discriminators
            # -----------------
            optimizer_d_A.zero_grad()
            utils.learning_rate_decay(INITIAL_LR, e, EPOCHS, optimizer_d_A)
            fake_A = replayBufferA.sample(1).detach()
            disc_fake_A = d_A(fake_A)
            disc_real_A = d_A(real_A)
            loss_d_A = 0.5 * (criterion_mse(disc_real_A, target_real) +
                              criterion_mse(disc_fake_A, target_fake))
            loss_d_A.backward()
            optimizer_d_A.step()

            optimizer_d_B.zero_grad()
            utils.learning_rate_decay(INITIAL_LR, e, EPOCHS, optimizer_d_B)
            fake_B = replayBufferB.sample(1).detach()
            disc_fake_B = d_B(fake_B)
            disc_real_B = d_B(real_B)
            loss_d_B = 0.5 * (criterion_mse(disc_real_B, target_real) +
                              criterion_mse(disc_fake_B, target_fake))
            loss_d_B.backward()
            optimizer_d_B.step()

            # log info and save sample images
            if ((idx % 250) == 0):
                # eval on some sample images
                g_AtoB.eval()
                g_BtoA.eval()
                test_B_hat = g_AtoB(test_data_A).cpu()
                test_A_hat = g_BtoA(test_data_B).cpu()
                fileBaseStr = 'test_%d_%d' % (e, idx)
                for i in range(NUM_TEST_SAMPLES):
                    fileStrA = 'visualization/test_%d/%s/%03d_%04d.png' % (
                        i, 'B_inStyleofA', e, idx)
                    fileStrB = 'visualization/test_%d/%s/%03d_%04d.png' % (
                        i, 'A_inStyleofB', e, idx)
                    utils.save_image(fileStrA, test_A_hat[i].data)
                    utils.save_image(fileStrB, test_B_hat[i].data)
                g_AtoB.train()
                g_BtoA.train()

                endTime = time.time()
                timeForIntervalIterations = endTime - startTime
                startTime = endTime
                print(
                    'Epoch [{:3d}/{:3d}], Training [{:4d}/{:4d}], Time Spent (s): [{:4.4f}], Losses: [G_GAN: {:4.4f}][G_CYC: {:4.4f}][G_IDT: {:4.4f}][D_A: {:4.4f}][D_B: {:4.4f}]'
                    .format(e, EPOCHS, idx, len(dataloader),
                            timeForIntervalIterations, loss_gan, loss_cyclic,
                            loss_identity, loss_d_A, loss_d_B))

                # tensorboard logging
                info = {
                    'loss_generators': loss_generators.item(),
                    'loss_gan_AtoB': loss_gan_AtoB.item(),
                    'loss_gan_BtoA': loss_gan_BtoA.item(),
                    'loss_cyclic_AtoBtoA': loss_cyclic_AtoBtoA.item(),
                    'loss_cyclic_BtoAtoB': loss_cyclic_BtoAtoB.item(),
                    'loss_cyclic': loss_cyclic.item(),
                    'loss_d_A': loss_d_A.item(),
                    'loss_d_B': loss_d_B.item(),
                    'lr_optimizer_generators':
                    optimizer_generators.param_groups[0]['lr'],
                    'lr_optimizer_d_A': optimizer_d_A.param_groups[0]['lr'],
                    'lr_optimizer_d_B': optimizer_d_B.param_groups[0]['lr'],
                }
                if (args.use_identity):
                    info['loss_identity_A'] = loss_identity_A.item()
                    info['loss_identity_B'] = loss_identity_B.item()
                for tag, value in info.items():
                    logger.scalar_summary(tag, value, step)
                info = {
                    'test_A_hat':
                    test_A_hat.data.numpy().transpose(0, 2, 3, 1),
                    'test_B_hat':
                    test_B_hat.data.numpy().transpose(0, 2, 3, 1),
                }
                for tag, images in info.items():
                    logger.image_summary(tag, images, step)
            step += 1

        # save after every epoch (move to CPU first so checkpoints are portable)
        g_AtoB.eval()
        g_BtoA.eval()
        d_A.eval()
        d_B.eval()
        if use_cuda:
            g_AtoB.cpu()
            g_BtoA.cpu()
            d_A.cpu()
            d_B.cpu()
        if not os.path.exists("models"):
            os.makedirs("models")
        filename_gAtoB = "models/" + str('g_AtoB') + "_epoch_" + str(e) + ".model"
        filename_gBtoA = "models/" + str('g_BtoA') + "_epoch_" + str(e) + ".model"
        filename_dA = "models/" + str('d_A') + "_epoch_" + str(e) + ".model"
        filename_dB = "models/" + str('d_B') + "_epoch_" + str(e) + ".model"
        torch.save(g_AtoB.state_dict(), filename_gAtoB)
        torch.save(g_BtoA.state_dict(), filename_gBtoA)
        torch.save(d_A.state_dict(), filename_dA)
        torch.save(d_B.state_dict(), filename_dB)
        if use_cuda:
            g_AtoB.cuda()
            g_BtoA.cuda()
            d_A.cuda()
            d_B.cuda()
def update(self):
    """Build training ops for the full VAE-GAN: generator and discriminator
    losses, per-gradient norm clipping, and separate Adam update ops.
    """
    self.global_step = tf.get_variable('global_step', initializer=0, dtype=tf.int32, trainable=False)
    # Separately decayed learning rates for G and D.
    self.generator_lr = learning_rate_decay(hp.G_LR, global_step=self.global_step)
    self.discriminator_lr = learning_rate_decay(
        hp.D_LR, global_step=self.global_step)

    # Generator loss
    self.reconstruction_loss = tf.reduce_mean(
        tf.abs(self.ori_out - self.ori_feat))
    self.cycle_loss = tf.reduce_mean(
        tf.abs(self.cycle_ori_out - self.ori_feat))
    # KL terms of the three VAE branches: -0.5 * sum(1 + log_var - mu^2 - exp(log_var))
    self.ori_kl_loss = -0.5 * tf.reduce_sum(1 + self.ori_log_var -
                                            tf.pow(self.ori_mu, 2) -
                                            tf.exp(self.ori_log_var))
    self.aim_kl_loss = -0.5 * tf.reduce_sum(1 + self.aim_log_var -
                                            tf.pow(self.aim_mu, 2) -
                                            tf.exp(self.aim_log_var))
    self.cycle_kl_loss = -0.5 * tf.reduce_sum(1 + self.cycle_log_var -
                                              tf.pow(self.cycle_mu, 2) -
                                              tf.exp(self.cycle_log_var))
    # Step-dependent KL annealing weight.
    self.kl_loss_weight = control_weight(self.global_step)
    self.kl_loss = self.kl_loss_weight * (
        self.ori_kl_loss + self.aim_kl_loss + self.cycle_kl_loss)
    self.GAN_G_loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=self.t_G, logits=self.predict_fake_P))
    self.G_loss = self.reconstruction_loss + self.cycle_loss + self.kl_loss + self.GAN_G_loss

    # Discriminator loss
    self.D_fake_loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=self.t_D_fake, logits=self.predict_fake_P))
    # NOTE(review): D_real_loss is also computed from predict_fake_P; a
    # "real" prediction tensor would normally be expected here — verify.
    self.D_real_loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=self.t_D_real, logits=self.predict_fake_P))
    self.GAN_D_loss = self.D_fake_loss + self.D_real_loss
    self.D_loss = self.GAN_D_loss

    # Variables: split by scope-name substring.
    trainable_variables = tf.trainable_variables()
    self.G_vars = [
        var for var in trainable_variables if 'generator' in var.name
    ]
    self.D_vars = [
        var for var in trainable_variables if 'discriminator' in var.name
    ]

    # Optimizer
    self.G_optimizer = tf.train.AdamOptimizer(self.generator_lr)
    self.D_optimizer = tf.train.AdamOptimizer(self.discriminator_lr)

    # Generator Gradient Clipping And Update
    self.G_clipped = []
    self.G_gvs = self.G_optimizer.compute_gradients(self.G_loss, var_list=self.G_vars)
    for grad, var in self.G_gvs:
        # Per-gradient norm clipping at 5.0 to guard against explosion.
        grad = tf.clip_by_norm(grad, 5.)
        self.G_clipped.append((grad, var))
    self.G_train_op = self.G_optimizer.apply_gradients(
        self.G_clipped, global_step=self.global_step)

    # Discriminator Gradient Clipping And Update
    self.D_clipped = []
    self.D_gvs = self.D_optimizer.compute_gradients(self.D_loss,
                                                    var_list=self.D_vars)
    for grad, var in self.D_gvs:
        grad = tf.clip_by_norm(grad, 5.)
        self.D_clipped.append((grad, var))
    # NOTE(review): both train ops pass global_step, so one G+D iteration
    # advances the step (and thus LR decay and the KL weight schedule)
    # twice — confirm this is intended.
    self.D_train_op = self.D_optimizer.apply_gradients(
        self.D_clipped, global_step=self.global_step)
def run():
    """Entry point: parse CLI args, build the data/model/optimizer state on
    `args`, and run DANA-DC training with per-epoch LR decay and evaluation.
    """
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch Environment")
    train_parser = parser.add_argument_group("Train Parameters")
    # NOTE(review): help text says default 10 but the actual default is 160.
    train_parser.add_argument("--epochs",
                              type=int,
                              default=160,
                              metavar="E",
                              help="number of epochs to train (default: 10)")
    train_parser.add_argument(
        "--batch-size",
        type=int,
        default=128,
        metavar="B",
        help="input batch size for training (default: 128)")
    train_parser.add_argument(
        "--test-batch-size",
        type=int,
        default=128,
        metavar="BT",
        help="input batch size for testing (default: 128)")
    train_parser.add_argument("--lr_decay",
                              type=float,
                              default=0.1,
                              metavar="LD",
                              help="learning rate decay rate")
    train_parser.add_argument("--schedule",
                              type=int,
                              nargs="*",
                              default=[80, 120],
                              help="learning rate is decayed at these epochs")
    train_parser.add_argument("--warmup-epochs",
                              type=int,
                              default=5,
                              metavar="WE",
                              help="number of warmup epochs")
    train_parser.add_argument("--no-cuda",
                              action="store_true",
                              default=False,
                              help="disables CUDA training")
    train_parser.add_argument(
        "--seed",
        type=int,
        default=7186021514134990023,
        metavar="S",
        help="random seed (default: 7186021514134990023)")

    simulator_parser = parser.add_argument_group("Simulator Parameters")
    simulator_parser.add_argument("--sim-size",
                                  type=int,
                                  default=16,
                                  metavar="N",
                                  help="size of simulator")
    simulator_parser.add_argument("--sim-gamma-shape",
                                  type=float,
                                  default=100,
                                  metavar="GSH",
                                  help="gamma shape parameter")
    simulator_parser.add_argument("--sim-gamma-scale",
                                  type=float,
                                  default=1.28,
                                  metavar="GSC",
                                  help="gamma scale parameter")

    optimizer_parser = parser.add_argument_group("Optimizer Parameters")
    optimizer_parser.add_argument("--lr",
                                  type=float,
                                  default=0.1,
                                  metavar="LR",
                                  help="learning rate (default: 0.1)")
    optimizer_parser.add_argument("--momentum",
                                  type=float,
                                  default=0.9,
                                  metavar="M",
                                  help="SGD momentum (default: 0.9)")
    # NOTE(review): "--dc" help says default 0 but the actual default is 2.
    optimizer_parser.add_argument("--dc",
                                  type=float,
                                  default=2,
                                  metavar="DC",
                                  help="Delay Compensation (default: 0)")
    # NOTE(review): help says default 0 but the actual default is 1e-4.
    optimizer_parser.add_argument("--weight-decay",
                                  type=float,
                                  default=1e-4,
                                  metavar="WD",
                                  help="SGD weight decay (default: 0)")
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    # Seed python/torch RNGs for reproducibility.
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
    random.seed(torch.initial_seed())

    print("*** Configuration ***")
    for k in vars(args):
        print(str(k), ":", str(getattr(args, k)))

    train_set, test_set = get_cifar_10_data_set(
    )  # get CIFAR-10 train and test set
    args.train_loader = data_loader(train_set, is_train=True, args=args)
    args.test_loader = data_loader(test_set, is_train=False, args=args)
    args.model = resnet20_cifar()  # get ResNet-20 Model
    if args.cuda:
        args.model = args.model.cuda()
    args.loss_fn = nn.CrossEntropyLoss()  # use cross-entropy loss

    # create optimizer
    args.optimizer = optim.SGD(args.model.parameters(),
                               lr=args.lr,
                               momentum=args.momentum,
                               weight_decay=args.weight_decay)
    assert len(args.optimizer.param_groups) == 1

    # initialize optimizer's momentum
    for p in args.model.parameters():
        args.optimizer.state[p]["momentum_buffer"] = torch.zeros_like(p.data)

    # clone weights for master
    args.master_weights = init_weights(args.model.parameters())
    # clone weights, one for each worker
    args.worker_weights = [
        init_weights(args.model.parameters()) for _ in range(args.sim_size)
    ]
    # clone optimizer, one for each worker
    args.worker_momentum = [
        init_momentum(args.model.parameters()) for _ in range(args.sim_size)
    ]
    # create the gamma distribution order
    args.worker_order = iter(GammaRandomWorkerSelection(args))
    # initialize dana: running per-parameter momentum sums keyed by id(param)
    args.momentum_sum = {
        id(p): torch.zeros_like(p)
        for p in args.model.parameters()
    }
    # initialize warmup: linear per-step LR ramp from lr/sim_size up to lr
    # over the first `warmup_epochs` epochs
    args.warmup_lr = np.linspace(args.lr / args.sim_size, args.lr,
                                 len(args.train_loader) *
                                 args.warmup_epochs).tolist()

    print("*** Training with DANA-DC ***")
    for epoch in range(args.epochs):
        learning_rate_decay(epoch, args)
        train(epoch, args)
        evaluate(epoch, args)