def train(x, y):
    """Run one optimizer step on ``D`` that maximises ``MI_P``.

    ``x`` and ``y`` are split into chunks of ``config['batch_size']`` and one
    chunk is consumed per gradient accumulation. For every chunk the labels
    are shuffled to form ``y_bar`` and ``D`` is evaluated; the statistics
    ``tP`` and ``tP_bar`` it returns are combined as

        MI_P = mean(tP) - log(mean(exp(tP_bar)))

    where the log-mean-exp is computed with the maximum subtracted first.
    ``-MI_P`` is back-propagated and the optimizer attached to ``D`` steps.

    Relies on ``D``, ``config`` and ``utils`` from the enclosing scope.

    Args:
        x: batch of inputs, split along dim 0.
        y: batch of labels, split along dim 0.

    Returns:
        dict: ``{'MI': utils.get_tensor_item(MI_P)}``.
    """
    chunk_size = config['batch_size']
    num_accumulations = config['num_D_accumulations']

    # How many chunks to split x and y into?
    x = torch.split(x, chunk_size)
    y = torch.split(y, chunk_size)

    # When D is wrapped in DataParallel the optimizer lives on the wrapped
    # module rather than on the wrapper.
    optim = D.module.optim if isinstance(D, nn.DataParallel) else D.optim
    optim.zero_grad()

    tP_mean = 0.
    tP_bar_list = []
    # If accumulating gradients, loop multiple times before an optimizer step
    for chunk_index in range(num_accumulations):
        labels = y[chunk_index]
        # Size the permutation from the chunk itself: torch.split may return
        # a final chunk shorter than config['batch_size'], and a permutation
        # of the configured size would then index out of range.
        y_bar = labels[torch.randperm(labels.shape[0]), ...]
        _, _, _, tP, tP_bar, _, _ = D(x[chunk_index], labels, y_bar,
                                      add_bias=True)
        tP_mean += torch.mean(tP) / float(num_accumulations)
        tP_bar_list.append(tP_bar)

    tP_bar = torch.cat(tP_bar_list)
    # Subtract the (detached) maximum before exponentiating so exp() cannot
    # overflow; it is added back outside the log.
    tP_bar_max = tP_bar.max().detach()
    log_mean_etP_bar = tP_bar_max + torch.log(
        torch.mean(torch.exp(tP_bar - tP_bar_max)))
    MI_P = tP_mean - log_mean_etP_bar
    (-MI_P).backward()

    # Optionally apply ortho reg in D
    if config['D_ortho'] > 0.0:
        # Debug print to indicate we're using ortho reg in D.
        print('using modified ortho reg in D')
        utils.ortho(D, config['D_ortho'])

    optim.step()

    out = {'MI': utils.get_tensor_item(MI_P)}
    return out
def OnPaintGL( self ):
    """Clear the GL buffers, draw the enabled layers and swap buffers.

    When ``RENDER_BACKGROUND`` is set, the background VBO is drawn as a
    triangle strip with ``self.bgshader`` and an orthographic ``MVP`` matrix
    built from the canvas extents. When ``RENDER_FOREGROUND`` is set, the
    foreground VBO is drawn as triangles with ``self.fgshader`` after all
    graph uniforms and custom vertex/fragment node uniforms are uploaded.

    NOTE: the custom-node expressions are run through ``eval`` in this
    function's local scope, so the local names used here (``self``,
    ``width``, ``height``, ``MVP``, ``name``, ``value``) are deliberately
    left unchanged. ``eval`` must only ever see trusted expressions.
    """
    glClear( GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT )
    width, height = self.GetGLExtents()

    if RENDER_BACKGROUND:
        MVP = ortho( 0, width, 0, height, -1, 1 )
        shaders.glUseProgram( self.bgshader )
        glUniformMatrix4fv(
            glGetUniformLocation( self.bgshader, 'MVP' ), 1, True, MVP )
        self.bgvbo.bind()
        glEnableClientState( GL_VERTEX_ARRAY )
        glVertexPointerf( self.bgvbo )
        glDrawArrays( GL_TRIANGLE_STRIP, 0, len( self.bgvbo ) )
        self.bgvbo.unbind()
        glDisableClientState( GL_VERTEX_ARRAY )

    if RENDER_FOREGROUND:
        shaders.glUseProgram( self.fgshader )
        self.fgvbo.bind()
        glEnableVertexAttribArray( 0 )
        glEnableVertexAttribArray( 1 )
        # Two 3-float attributes interleaved with a 24-byte stride; the
        # second attribute starts 12 bytes into each vertex.
        glVertexAttribPointer( 0, 3, GL_FLOAT, GL_FALSE, 24, self.fgvbo )
        glVertexAttribPointer( 1, 3, GL_FLOAT, GL_FALSE, 24, self.fgvbo+12 )

        for uname, ucount, ufuncs in self.graph.uniforms.values():
            UNIFORM_FUNCTION[ucount](
                glGetUniformLocation( self.fgshader, uname ), *ufuncs() )

        # Vertex-shader nodes first, then fragment-shader nodes; both are
        # uploaded the same way.
        for custom_nodes in ( custom_vs_nodes, custom_fs_nodes ):
            for name, value in custom_nodes.items():
                value[4](
                    glGetUniformLocation( self.fgshader, name ),
                    *eval( value[3] ) )

        glDrawArrays( GL_TRIANGLES, 0, len( self.fgvbo ) )
        self.fgvbo.unbind()
        glDisableVertexAttribArray( 0 )
        glDisableVertexAttribArray( 1 )

    # NOTE(review): nesting was reconstructed from a whitespace-collapsed
    # source; confirm that unbinding the program belongs at function level
    # rather than inside the foreground branch.
    shaders.glUseProgram( 0 )
    self.SwapBuffers()
def train(x, y):
    """Run one GAN training iteration: D step(s) followed by one G step.

    ``x`` and ``y`` are split into chunks of ``config['batch_size']``; each D
    gradient accumulation consumes the next chunk. Depending on
    ``config['mh_csc_loss']`` / ``config['mh_loss']`` the discriminator uses
    ``losses.crammer_singer_criterion`` (with the extra class index
    ``config['n_classes']`` as the target for fakes) or
    ``losses.discriminator_loss``. The generator uses a feature-matching
    loss, the Crammer-Singer complement criterion, a weighted
    feature-matching + ``losses.mh_loss`` mix, or ``losses.generator_loss``.

    Relies on ``G``, ``D``, ``GD``, ``z_``, ``y_``, ``config``, ``ema``,
    ``state_dict``, ``utils`` and ``losses`` from the enclosing scope.

    Args:
        x: batch of real samples, split along dim 0.
        y: batch of real labels, split along dim 0.

    Returns:
        dict: ``G_loss``, ``D_loss_real`` and ``D_loss_fake`` as floats,
        taken from the last accumulation of each phase.
    """
    batch_size = config['batch_size']

    G.optim.zero_grad()
    D.optim.zero_grad()
    # How many chunks to split x and y into?
    x = torch.split(x, batch_size)
    y = torch.split(y, batch_size)
    counter = 0

    # The fake class label: index used as the loss target for generated
    # samples. It never changes, so build it once instead of once per D step.
    lossy = torch.full((batch_size,), config['n_classes'],
                       dtype=torch.long).cuda()

    # Optionally toggle D and G's "require_grad"
    if config['toggle_grads']:
        utils.toggle_grad(D, True)
        utils.toggle_grad(G, False)

    for step_index in range(config['num_D_steps']):
        # If accumulating gradients, loop multiple times before an optimizer step
        D.optim.zero_grad()
        for accumulation_index in range(config['num_D_accumulations']):
            z_.sample_()
            y_.sample_()
            D_fake, D_real = GD(z_[:batch_size], y_[:batch_size],
                                x[counter], y[counter],
                                train_G=False, split_D=config['split_D'])

            # Compute components of D's loss, average them, and divide by
            # the number of gradient accumulations
            if config['mh_csc_loss'] or config['mh_loss']:
                D_loss_real = losses.crammer_singer_criterion(
                    D_real, y[counter])
                D_loss_fake = losses.crammer_singer_criterion(
                    D_fake, lossy[:batch_size])
            else:
                D_loss_real, D_loss_fake = losses.discriminator_loss(
                    D_fake, D_real)
            D_loss = (D_loss_real + D_loss_fake) / float(
                config['num_D_accumulations'])
            D_loss.backward()
            counter += 1

        # Optionally apply ortho reg in D
        if config['D_ortho'] > 0.0:
            # Debug print to indicate we're using ortho reg in D.
            print('using modified ortho reg in D')
            utils.ortho(D, config['D_ortho'])

        D.optim.step()

    # Optionally toggle "requires_grad"
    if config['toggle_grads']:
        utils.toggle_grad(D, False)
        utils.toggle_grad(G, True)

    # Zero G's gradients by default before training G, for safety
    G.optim.zero_grad()

    # If accumulating gradients, loop multiple times
    for accumulation_index in range(config['num_G_accumulations']):
        # Without resampling, the first G accumulation reuses the noise and
        # labels drawn in the last D accumulation.
        if config['resampling'] or (accumulation_index > 0):
            z_.sample_()
            y_.sample_()

        if config['fm_loss']:
            D_feat_fake, D_feat_real = GD(z_, y_, x[-1], None,
                                          train_G=True,
                                          split_D=config['split_D'],
                                          feat=True)
            fm_loss = torch.mean(
                torch.abs(
                    torch.mean(D_feat_fake, 0) - torch.mean(D_feat_real, 0)))
            # NOTE(review): unlike every other branch this loss is not
            # divided by num_G_accumulations - confirm whether intended.
            G_loss = fm_loss
        else:
            D_fake = GD(z_, y_, train_G=True, split_D=config['split_D'])
            if config['mh_csc_loss']:
                G_loss = losses.crammer_singer_complement_criterion(
                    D_fake, lossy[:batch_size]) / float(
                        config['num_G_accumulations'])
            elif config['mh_loss']:
                D_feat_fake, D_feat_real = GD(z_, y_, x[-1], None,
                                              train_G=True,
                                              split_D=config['split_D'],
                                              feat=True)
                fm_loss = torch.mean(
                    torch.abs(
                        torch.mean(D_feat_fake, 0)
                        - torch.mean(D_feat_real, 0)))
                oth_loss = losses.mh_loss(D_fake, y_[:batch_size])
                G_loss = (config['mh_fmloss_weight'] * fm_loss
                          + config['mh_loss_weight'] * oth_loss) / float(
                              config['num_G_accumulations'])
            else:
                G_loss = losses.generator_loss(D_fake) / float(
                    config['num_G_accumulations'])
        G_loss.backward()

    # Optionally apply modified ortho reg in G
    if config['G_ortho'] > 0.0:
        # Debug print to indicate we're using ortho reg in G
        print('using modified ortho reg in G')
        # Don't ortho reg shared, it makes no sense. Really we should
        # blacklist any embeddings for this
        utils.ortho(G, config['G_ortho'],
                    blacklist=[param for param in G.shared.parameters()])
    G.optim.step()

    # If we have an ema, update it, regardless of if we test with it or not
    if config['ema']:
        ema.update(state_dict['itr'])

    # Return G's loss and the components of D's loss.
    out = {
        'G_loss': float(G_loss.item()),
        'D_loss_real': float(D_loss_real.item()),
        'D_loss_fake': float(D_loss_fake.item())
    }
    return out
def train(x, y):
    """Run one training iteration using both ``GD`` and ``GD3``.

    D phase: for every accumulation a chunk of ``x``/``y`` is scored by
    ``GD`` together with freshly sampled fakes, a second batch of generated
    samples is obtained from ``GD3(..., return_G_z=True)``, and
    ``discriminator_loss`` of those quantities is back-propagated.

    G phase: generated samples from ``GD3`` and the next real chunk are fed
    to ``generator_loss`` and ``G.optim.step()`` is called.

    Relies on ``G``, ``D``, ``GD``, ``GD3``, ``z_``, ``y_``, ``config``,
    ``ema``, ``state_dict``, ``utils``, ``discriminator_loss`` and
    ``generator_loss`` from the enclosing scope.

    NOTE(review): the G phase never calls ``G_loss.backward()``, reuses
    ``D_fake`` from the last D accumulation and calls ``GD3`` with
    ``train_G=False``; as written ``G.optim.step()`` sees no new gradients.
    Confirm whether this is intended before changing it.

    Returns:
        dict: ``G_loss`` and ``D_loss`` as floats (last accumulation of each
        phase).
    """
    batch_size = config['batch_size']

    G.optim.zero_grad()
    D.optim.zero_grad()
    x = torch.split(x, batch_size)
    y = torch.split(y, batch_size)
    # Separate cursors into the real chunks for the D and G phases.
    d_cursor = 0
    g_cursor = 0

    if config['toggle_grads']:
        utils.toggle_grad(D, True)
        utils.toggle_grad(G, False)

    for _ in range(config['num_D_steps']):
        D.optim.zero_grad()
        for _ in range(config['num_D_accumulations']):
            z_.sample_()
            if config['conditional']:
                y_.sample_()
                real_labels = y[d_cursor]
            else:
                # Unconditional training: every label is forced to zero.
                y_.zero_()
                real_labels = torch.zeros_like(y[d_cursor]).to(
                    y_.device).long()
            real_samples = x[d_cursor]

            D_fake, D_real = GD(z_[:batch_size], y_[:batch_size],
                                real_samples, real_labels,
                                train_G=False, split_D=config['split_D'])
            _, generated = GD3(z_[:batch_size], y_[:batch_size],
                               train_G=False, return_G_z=True,
                               split_D=config['split_D'])
            D_loss = discriminator_loss(D_fake, D_real, generated, generated)
            D_loss.backward()
            d_cursor += 1

        if config['D_ortho'] > 0.0:
            print('using modified ortho reg in D')
            utils.ortho(D, config['D_ortho'])
        D.optim.step()

    if config['toggle_grads']:
        utils.toggle_grad(D, False)
        utils.toggle_grad(G, True)

    G.optim.zero_grad()
    for _ in range(config['num_G_accumulations']):
        z_.sample_()
        y_.sample_()
        if not config['conditional']:
            y_.zero_()
        _, generated_for_G = GD3(z_[:batch_size], y_[:batch_size],
                                 train_G=False, return_G_z=True,
                                 split_D=config['split_D'])
        real_for_G = x[g_cursor]
        G_loss = generator_loss(
            D_fake, generated_for_G, real_for_G, generated_for_G) / float(
                config['num_G_accumulations'])
        g_cursor += 1

    if config['G_ortho'] > 0.0:
        print('using modified ortho reg in G')
        utils.ortho(G, config['G_ortho'],
                    blacklist=[param for param in G.shared.parameters()])
    G.optim.step()

    if config['ema']:
        ema.update(state_dict['itr'])

    out = {'G_loss': float(G_loss.item()), 'D_loss': float(D_loss.item())}
    return out
def train(x, y, ratio):
    """Run one GAN iteration with a ratio-reweighted discriminator loss.

    ``x``, ``y`` and ``ratio`` are split into chunks of
    ``config['batch_size']``; each D accumulation consumes the next chunk
    and passes the matching ``ratio`` chunk plus ``config['alpha']`` to
    ``discriminator_loss``. When ``config['conditional']`` is false all
    labels (real and sampled) are replaced by zeros. The generator step uses
    the plain ``generator_loss``.

    Relies on ``G``, ``D``, ``GD``, ``z_``, ``y_``, ``config``, ``ema``,
    ``state_dict``, ``utils``, ``discriminator_loss`` and ``generator_loss``
    from the enclosing scope.

    Args:
        x: batch of real samples.
        y: batch of real labels.
        ratio: per-sample values forwarded to ``discriminator_loss``.

    Returns:
        dict: ``G_loss``, ``D_loss_real`` and ``D_loss_fake`` as floats.
    """
    batch_size = config['batch_size']

    G.optim.zero_grad()
    D.optim.zero_grad()

    # Chunk the incoming batch; one chunk per D accumulation.
    x = torch.split(x, batch_size)
    y = torch.split(y, batch_size)
    ratio = torch.split(ratio, batch_size)
    chunk = 0

    # Optionally make only D trainable during the D phase.
    if config['toggle_grads']:
        utils.toggle_grad(D, True)
        utils.toggle_grad(G, False)

    for _ in range(config['num_D_steps']):
        # Accumulate gradients over several chunks before stepping.
        D.optim.zero_grad()
        for _ in range(config['num_D_accumulations']):
            z_.sample_()
            if config['conditional']:
                y_.sample_()
                real_labels = y[chunk]
            else:
                # "Unconditional" mode: feed zeros for every label.
                y_.zero_()
                real_labels = torch.zeros_like(y[chunk]).to(
                    y_.device).long()

            D_fake, D_real = GD(z_[:batch_size], y_[:batch_size],
                                x[chunk], real_labels,
                                train_G=False, split_D=config['split_D'])

            # Discriminator loss reweighted by ratio, flattened by alpha.
            D_loss_real, D_loss_fake = discriminator_loss(
                D_fake, D_real, ratio[chunk], alpha=config['alpha'])
            D_loss = (D_loss_real + D_loss_fake) / float(
                config['num_D_accumulations'])
            D_loss.backward()
            chunk += 1

        # Optional orthogonal regularisation on D.
        if config['D_ortho'] > 0.0:
            print('using modified ortho reg in D')
            utils.ortho(D, config['D_ortho'])

        D.optim.step()

    # Optionally make only G trainable during the G phase.
    if config['toggle_grads']:
        utils.toggle_grad(D, False)
        utils.toggle_grad(G, True)

    # Start the G phase from clean gradients.
    G.optim.zero_grad()

    for _ in range(config['num_G_accumulations']):
        z_.sample_()
        y_.sample_()
        # Unconditional mode: overwrite the sampled labels with zeros.
        if not config['conditional']:
            y_.zero_()
        D_fake = GD(z_, y_, train_G=True, split_D=config['split_D'])
        # The generator loss is not reweighted.
        G_loss = generator_loss(D_fake) / float(
            config['num_G_accumulations'])
        G_loss.backward()

    # Optional orthogonal regularisation on G, skipping the shared embedding.
    if config['G_ortho'] > 0.0:
        print('using modified ortho reg in G')
        shared_params = [param for param in G.shared.parameters()]
        utils.ortho(G, config['G_ortho'], blacklist=shared_params)
    G.optim.step()

    # Keep the EMA copy current whether or not it is used for testing.
    if config['ema']:
        ema.update(state_dict['itr'])

    return {
        'G_loss': float(G_loss.item()),
        'D_loss_real': float(D_loss_real.item()),
        'D_loss_fake': float(D_loss_fake.item())
    }
def train(x, y):
    """Run one hinge-loss GAN iteration on latents from ``lat_opt_ngd``.

    The latent batch ``z_prime`` is produced once by
    ``lat_opt_ngd(G, D, z_, G_batch_size, y_)`` and reused for every D and G
    accumulation; the first ``config['batch_size']`` entries are used in the
    D phase and the whole batch in the G phase.

    Relies on ``G``, ``D``, ``GD``, ``z_``, ``y_``, ``config``, ``ema``,
    ``state_dict``, ``lat_opt_ngd``, ``toggle_grad``, ``ortho``,
    ``loss_hinge_dis`` and ``loss_hinge_gen`` from the enclosing scope.

    Args:
        x: batch of real samples, split along dim 0.
        y: batch of real labels, split along dim 0.

    Returns:
        dict: ``G_loss``, ``D_loss_real`` and ``D_loss_fake`` as floats.
    """
    batch_size = config['batch_size']
    G_batch_size = max(config['G_batch_size'], batch_size)

    G.optim.zero_grad()
    D.optim.zero_grad()

    # Latent optimisation: obtain the latents used for the whole iteration.
    z_prime = lat_opt_ngd(G, D, z_, G_batch_size, y_)

    # One chunk of real data per D accumulation.
    x = torch.split(x, batch_size)
    y = torch.split(y, batch_size)
    chunk = 0

    # Optionally make only D trainable during the D phase.
    if config['toggle_grads']:
        toggle_grad(D, True)
        toggle_grad(G, False)

    for _ in range(config['num_D_steps']):
        # Accumulate gradients over several chunks before stepping.
        D.optim.zero_grad()
        for _ in range(config['num_D_accumulations']):
            D_fake, D_real = GD(z_prime[:batch_size], y_[:batch_size],
                                x[chunk], y[chunk],
                                train_G=False, split_D=config['split_D'])
            # Average the two hinge terms over the accumulations.
            D_loss_real, D_loss_fake = loss_hinge_dis(D_fake, D_real)
            D_loss = (D_loss_real + D_loss_fake) / float(
                config['num_D_accumulations'])
            D_loss.backward()
            chunk += 1

        # Optional orthogonal regularisation on D.
        if config['D_ortho'] > 0.0:
            ortho(D, config['D_ortho'])

        D.optim.step()

    # Optionally make only G trainable during the G phase.
    if config['toggle_grads']:
        toggle_grad(D, False)
        toggle_grad(G, True)

    # Start the G phase from clean gradients.
    G.optim.zero_grad()

    for _ in range(config['num_G_accumulations']):
        D_fake = GD(z_prime, y_, train_G=True, split_D=config['split_D'])
        G_loss = loss_hinge_gen(D_fake) / float(
            config['num_G_accumulations'])
        G_loss.backward()

    # Optional orthogonal regularisation on G, skipping the shared embedding.
    if config['G_ortho'] > 0.0:
        shared_params = [param for param in G.shared.parameters()]
        ortho(G, config['G_ortho'], blacklist=shared_params)
    G.optim.step()

    # Keep the EMA copy current whether or not it is used for testing.
    if config['ema']:
        ema.update(state_dict['itr'])

    return {
        'G_loss': float(G_loss.item()),
        'D_loss_real': float(D_loss_real.item()),
        'D_loss_fake': float(D_loss_fake.item())
    }
def train(x, y):
    """Run one GAN iteration with an auxiliary loss chosen by ``loss_type``.

    ``config['loss_type']`` selects the extra terms added to the adversarial
    losses:

    * ``'fCGAN'``  - moving-average MINE terms on real (``MI_P``) and fake
      (``MI_Q``) statistics for D; ``mean(tQ - tP)`` for G.
    * ``'MINE'``   - auxiliary-classifier cross-entropy plus ``MI_Q`` for D;
      cross-entropy plus a plain MINE-Q term for G.
    * ``'Twin_AC'``- cross-entropy on ``c_cls`` and on ``mi`` for D;
      cross-entropy on ``c_cls`` minus cross-entropy on ``mi`` for G.
    * ``'AC'``     - cross-entropy on ``c_cls`` only.

    D outputs are sliced with ``[:half_size]`` for the fake part and
    ``[half_size:]`` for the real part, where ``half_size`` is
    ``config['batch_size']``.

    Relies on ``G``, ``D``, ``GD``, ``z_``, ``y_``, ``config``, ``ema``,
    ``state_dict``, ``utils``, ``F``, ``discriminator_loss`` and
    ``generator_loss`` from the enclosing scope. Updates the running
    averages ``D.ma_etP_bar`` / ``D.ma_etQ_bar`` in place.

    Returns:
        dict: ``G_loss``, ``D_loss_real``, ``D_loss_fake`` as floats and
        ``C_loss``, ``MI_loss``, ``f_div``, ``MI_P``, ``MI_Q`` converted with
        ``utils.get_tensor_item``.
    """
    G.optim.zero_grad()
    D.optim.zero_grad()

    # How many chunks to split x and y into?
    x = torch.split(x, config['batch_size'])
    y = torch.split(y, config['batch_size'])
    half_size = config['batch_size']
    counter = 0
    MINE_weight = (config['MINE_weight']
                   if config['weighted_MINE_loss'] else 1.0)
    EPSILON = config['magic_epsilon']
    loss_type = config['loss_type']

    def _moving_average_mine(t, t_bar, ma_attr):
        """Return mean(t) - log(E[exp(t_bar)]) rescaled by a running mean.

        The running mean is stored on ``D`` under ``ma_attr``; it is
        initialised on first use and then moved towards the current batch
        value at rate ``config['ma_rate']``.
        """
        et_bar = torch.mean(torch.exp(t_bar))
        if getattr(D, ma_attr) is None:
            setattr(D, ma_attr, et_bar.detach().item())
        running = getattr(D, ma_attr)
        running += config['ma_rate'] * (et_bar.detach().item() - running)
        setattr(D, ma_attr, running)
        return torch.mean(t) - torch.log(
            et_bar + EPSILON) * et_bar.detach() / getattr(D, ma_attr)

    # Optionally toggle D and G's "require_grad"
    if config['toggle_grads']:
        utils.toggle_grad(D, True)
        utils.toggle_grad(G, False)

    for _ in range(config['num_D_steps']):
        # If accumulating gradients, loop multiple times before an optimizer step
        D.optim.zero_grad()
        for _ in range(config['num_D_accumulations']):
            z_.sample_()
            y_.sample_()
            # Shuffled label batches, only built when D has a TP/TQ head.
            gy_bar = (y_[torch.randperm(half_size), ...]
                      if D.TQ or D.TP else None)
            dy_bar = (y[counter][torch.randperm(half_size), ...]
                      if D.TP or D.TQ else None)
            D_fake, D_real, mi, c_cls, tP, tP_bar, tQ, tQ_bar = GD(
                z_[:config['batch_size']], y_[:config['batch_size']],
                x[counter], y[counter], gy_bar, dy_bar,
                train_G=False, split_D=config['split_D'], add_bias=True)

            # Compute components of D's loss, average them, and divide by
            # the number of gradient accumulations
            D_loss_real, D_loss_fake = discriminator_loss(D_fake, D_real)
            C_loss = 0.
            MI_P = 0.
            MI_Q = 0.

            if loss_type == 'fCGAN':
                # MINE-P on real
                MI_P = _moving_average_mine(
                    tP[half_size:], tP_bar[half_size:], 'ma_etP_bar')
                # MINE-Q on fake
                MI_Q = _moving_average_mine(
                    tQ[:half_size], tQ_bar[:half_size], 'ma_etQ_bar')
            elif loss_type == 'MINE':
                # AC
                C_loss += F.cross_entropy(c_cls[half_size:], y[counter])
                if config['train_AC_on_fake']:
                    C_loss += F.cross_entropy(c_cls[:half_size], y_)
                # MINE-Q on fake
                MI_Q = _moving_average_mine(
                    tQ[:half_size], tQ_bar[:half_size], 'ma_etQ_bar')
            elif loss_type == 'Twin_AC':
                C_loss += (F.cross_entropy(c_cls[half_size:], y[counter])
                           + F.cross_entropy(mi[:half_size], y_))
                if config['train_AC_on_fake']:
                    C_loss += F.cross_entropy(c_cls[:half_size], y_)
            elif loss_type == 'AC':
                C_loss += F.cross_entropy(c_cls[half_size:], y[counter])
                # AC should be trained on fake also
                if config['train_AC_on_fake']:
                    C_loss += F.cross_entropy(c_cls[:half_size], y_)

            D_loss = (D_loss_real + D_loss_fake
                      + C_loss * config['AC_weight']
                      - (MI_P + MI_Q) * MINE_weight) / float(
                          config['num_D_accumulations'])
            D_loss.backward()
            counter += 1

        # Optionally apply ortho reg in D
        if config['D_ortho'] > 0.0:
            # Debug print to indicate we're using ortho reg in D.
            print('using modified ortho reg in D')
            utils.ortho(D, config['D_ortho'])

        D.optim.step()

    # Optionally toggle "requires_grad"
    if config['toggle_grads']:
        utils.toggle_grad(D, False)
        utils.toggle_grad(G, True)

    # Zero G's gradients by default before training G, for safety
    G.optim.zero_grad()
    num_G_acc = float(config['num_G_accumulations'])
    for _ in range(config['num_G_accumulations']):
        z_.sample_()
        y_.sample_()
        gy_bar = y_[torch.randperm(half_size), ...] if D.TQ else None
        D_fake, mi, c_cls, tP, tP_bar, tQ, tQ_bar = GD(
            z_, y_, gy_bar=gy_bar, train_G=True,
            split_D=config['split_D'], return_G_z=False,
            add_bias=config['loss_type'] != 'fCGAN')

        C_loss = 0.
        MI_loss = 0.
        MI_Q_loss = 0.
        f_div = 0.
        if loss_type == 'fCGAN':
            # f-div
            f_div += (tQ - tP).mean()  # rev-kl
        elif loss_type == 'MINE':
            # AC
            C_loss += F.cross_entropy(c_cls, y_)
            # MINE-Q
            MI_Q_loss = torch.mean(tQ) - torch.log(
                torch.mean(torch.exp(tQ_bar)) + EPSILON)
        elif loss_type in ('AC', 'Twin_AC'):
            C_loss += F.cross_entropy(c_cls, y_)
            if loss_type == 'Twin_AC':
                MI_loss = F.cross_entropy(mi, y_)

        G_loss = generator_loss(D_fake) / num_G_acc
        C_loss = C_loss / num_G_acc
        MI_loss = MI_loss / num_G_acc
        MI_Q_loss = MI_Q_loss / num_G_acc
        f_div = f_div / num_G_acc
        (G_loss + (C_loss - MI_loss) * config['AC_weight']
         + MI_Q_loss * config['MINE_weight']
         + f_div * config['fCGAN_weight']).backward()

    # Optionally apply modified ortho reg in G
    if config['G_ortho'] > 0.0:
        # Debug print to indicate we're using ortho reg in G
        print('using modified ortho reg in G')
        # Don't ortho reg shared, it makes no sense. Really we should
        # blacklist any embeddings for this
        utils.ortho(G, config['G_ortho'],
                    blacklist=[param for param in G.shared.parameters()])
    G.optim.step()

    # If we have an ema, update it, regardless of if we test with it or not
    if config['ema']:
        ema.update(state_dict['itr'])

    # Return G's loss and the components of D's loss.
    out = {
        'G_loss': float(G_loss.item()),
        'D_loss_real': float(D_loss_real.item()),
        'D_loss_fake': float(D_loss_fake.item()),
        'C_loss': utils.get_tensor_item(C_loss),
        'MI_loss': utils.get_tensor_item(MI_loss),
        'f_div': utils.get_tensor_item(f_div),
        'MI_P': utils.get_tensor_item(MI_P),
        'MI_Q': utils.get_tensor_item(MI_Q)
    }
    return out
def train(x, y):
    """Run one GAN iteration with per-augmentation adversarial losses.

    ``GD`` returns ten score groups: the plain scores followed by scores for
    three rotations (90/180/270), four crops (tl/tr/bl/br), translation and
    cutout. Each group gets its own adversarial loss; the groups are
    combined with weights ``lambda/4`` (rotations), ``lambda/5`` (crops) and
    ``lambda/2`` (translation, cutout), every group sum also including the
    plain loss. When ``config['CR'] > 0`` a consistency term
    ``mean((D_real_aug - D_real)**2) * CR`` is added to the plain D loss.
    The whole G phase is skipped when ``config['fix_G']`` is set.

    Relies on ``G``, ``D``, ``GD``, ``z_``, ``y_``, ``config``, ``ema``,
    ``state_dict``, ``utils`` and ``losses`` from the enclosing scope.

    Returns:
        dict: ``G_loss`` (0 when G is fixed), ``D_loss_real``,
        ``D_loss_fake`` and, when ``config['CR'] > 0``, ``D_loss_CR``.
    """
    G.optim.zero_grad()
    D.optim.zero_grad()
    # How many chunks to split x and y into?
    x = torch.split(x, config['batch_size'])
    y = torch.split(y, config['batch_size'])
    counter = 0
    lambda_D = config['lambda_D']
    lambda_G = config['lambda_G']

    def _real_plus_fake(score_pair):
        """Sum the real and fake D losses for one (fake, real) score pair."""
        fake_scores, real_scores = score_pair
        real_term, fake_term = losses.discriminator_loss(
            fake_scores, real_scores)
        return real_term + fake_term

    # Optionally toggle D and G's "require_grad"
    if config['toggle_grads']:
        utils.toggle_grad(D, True)
        utils.toggle_grad(G, False)

    for _ in range(config['num_D_steps']):
        # If accumulating gradients, loop multiple times before an optimizer step
        D.optim.zero_grad()
        for _ in range(config['num_D_accumulations']):
            z_.sample_()
            y_.sample_()
            (D_scores, rot90, rot180, rot270,
             croptl, croptr, cropbl, cropbr,
             translation, cutout) = GD(
                 z_[:config['batch_size']], y_[:config['batch_size']],
                 x[counter], y[counter], train_G=False,
                 policy=config['DiffAugment'], CR=config['CR'] > 0,
                 CR_augment=config['CR_augment'])

            D_loss_CR = 0
            if config['CR'] > 0:
                D_fake, D_real, D_real_aug = D_scores
                D_loss_CR = torch.mean(
                    (D_real_aug - D_real)**2) * config['CR']
            else:
                D_fake, D_real = D_scores

            # Compute components of D's loss, average them, and divide by
            # the number of gradient accumulations
            D_loss_real, D_loss_fake = losses.discriminator_loss(
                D_fake, D_real)
            # rotation
            L_rot90 = _real_plus_fake(rot90)
            L_rot180 = _real_plus_fake(rot180)
            L_rot270 = _real_plus_fake(rot270)
            # cropping
            L_croptl = _real_plus_fake(croptl)
            L_croptr = _real_plus_fake(croptr)
            L_cropbl = _real_plus_fake(cropbl)
            L_cropbr = _real_plus_fake(cropbr)
            # translation and cutout
            L_translation = _real_plus_fake(translation)
            L_cutout = _real_plus_fake(cutout)

            plain = D_loss_real + D_loss_fake + D_loss_CR
            D_loss = plain \
                + lambda_D/4*(plain + L_rot90 + L_rot180 + L_rot270) \
                + lambda_D/5*(plain + L_croptl + L_croptr
                              + L_cropbl + L_cropbr) \
                + lambda_D/2*(plain + L_translation) \
                + lambda_D/2*(plain + L_cutout)
            D_loss = D_loss / float(config['num_D_accumulations'])
            D_loss.backward(retain_graph=True)
            counter += 1

        # Optionally apply ortho reg in D
        if config['D_ortho'] > 0.0:
            # Debug print to indicate we're using ortho reg in D.
            print('using modified ortho reg in D')
            utils.ortho(D, config['D_ortho'])

        D.optim.step()

    # Optionally toggle "requires_grad"
    if config['toggle_grads']:
        utils.toggle_grad(D, False)
        utils.toggle_grad(G, True)

    # Zero G's gradients by default before training G, for safety
    G.optim.zero_grad()

    # NOTE(review): nesting was reconstructed from a whitespace-collapsed
    # source; confirm that the ortho reg and optimizer step belong inside
    # the fix_G guard.
    if not config['fix_G']:
        num_G_acc = float(config['num_G_accumulations'])
        # If accumulating gradients, loop multiple times
        for _ in range(config['num_G_accumulations']):
            z_.sample_()
            y_.sample_()
            (D_fake, D_fake_rot90, D_fake_rot180, D_fake_rot270,
             D_fake_croptl, D_fake_croptr, D_fake_cropbl, D_fake_cropbr,
             D_fake_translation, D_fake_cutout) = GD(
                 z_, y_, train_G=True, policy=config['DiffAugment'])

            G_plain = losses.generator_loss(D_fake) / num_G_acc
            # rotation
            G_rot90 = losses.generator_loss(D_fake_rot90) / num_G_acc
            G_rot180 = losses.generator_loss(D_fake_rot180) / num_G_acc
            G_rot270 = losses.generator_loss(D_fake_rot270) / num_G_acc
            # cropping
            G_croptl = losses.generator_loss(D_fake_croptl) / num_G_acc
            G_croptr = losses.generator_loss(D_fake_croptr) / num_G_acc
            G_cropbl = losses.generator_loss(D_fake_cropbl) / num_G_acc
            G_cropbr = losses.generator_loss(D_fake_cropbr) / num_G_acc
            # translation and cutout
            G_translation = losses.generator_loss(
                D_fake_translation) / num_G_acc
            G_cutout = losses.generator_loss(D_fake_cutout) / num_G_acc

            G_loss = G_plain \
                + lambda_G/4.*(G_plain + G_rot90 + G_rot180 + G_rot270) \
                + lambda_G/5.*(G_plain + G_croptl + G_croptr
                               + G_cropbl + G_cropbr) \
                + lambda_G/2.*(G_plain + G_translation) \
                + lambda_G/2.*(G_plain + G_cutout)
            G_loss.backward()

        # Optionally apply modified ortho reg in G
        if config['G_ortho'] > 0.0:
            # Debug print to indicate we're using ortho reg in G
            print('using modified ortho reg in G')
            # Don't ortho reg shared, it makes no sense. Really we should
            # blacklist any embeddings for this
            utils.ortho(
                G, config['G_ortho'],
                blacklist=[param for param in G.shared.parameters()])
        G.optim.step()

    # If we have an ema, update it, regardless of if we test with it or not
    if config['ema']:
        ema.update(state_dict['itr'])

    # Return G's loss and the components of D's loss.
    out = {
        'G_loss': float(G_loss.item()) if not config['fix_G'] else 0,
        'D_loss_real': float(D_loss_real.item()),
        'D_loss_fake': float(D_loss_fake.item()),
    }
    if config['CR'] > 0:
        out['D_loss_CR'] = float(D_loss_CR.item())
    return out
def train_mode_seeing(x, y):
    """Run one GAN iteration whose G loss adds a latent-distance ratio term.

    The D phase is a standard accumulate-then-step loop with optional
    gradient-norm clipping. In the G phase two latent batches ``z1`` and
    ``z2`` are drawn for the same labels; the adversarial losses of both are
    summed and the term ``1 / (lz + 1e-5)`` is added, where ``lz`` is the
    mean absolute image difference divided by the mean absolute latent
    difference.

    Relies on ``G``, ``D``, ``GD``, ``z_``, ``y_``, ``config``, ``ema``,
    ``state_dict``, ``utils``, ``discriminator_loss`` and ``generator_loss``
    from the enclosing scope.

    Returns:
        dict: ``G_loss``, ``D_loss_real`` and ``D_loss_fake`` as floats.
    """
    batch_size = config['batch_size']
    clip_norm = config['clip_norm']

    G.optim.zero_grad()
    D.optim.zero_grad()

    # One chunk of real data per D accumulation.
    x = torch.split(x, batch_size)
    y = torch.split(y, batch_size)
    chunk = 0

    # Optionally make only D trainable during the D phase.
    if config['toggle_grads']:
        utils.toggle_grad(D, True)
        utils.toggle_grad(G, False)

    for _ in range(config['num_D_steps']):
        # Accumulate gradients over several chunks before stepping.
        D.optim.zero_grad()
        for _ in range(config['num_D_accumulations']):
            z_.sample_()
            y_.sample_()
            D_fake, D_fake_features, D_real, D_real_features = GD(
                z_[:batch_size], y_[:batch_size], x[chunk], y[chunk],
                train_G=False, split_D=config['split_D'])
            # Average the two loss components over the accumulations.
            D_loss_real, D_loss_fake = discriminator_loss(D_fake, D_real)
            D_loss = (D_loss_real + D_loss_fake) / float(
                config['num_D_accumulations'])
            D_loss.backward()
            chunk += 1

        # Optional orthogonal regularisation on D.
        if config['D_ortho'] > 0.0:
            utils.ortho(D, config['D_ortho'])

        if clip_norm is not None:
            torch.nn.utils.clip_grad_norm_(D.parameters(), clip_norm)

        D.optim.step()

    # Optionally make only G trainable during the G phase.
    if config['toggle_grads']:
        utils.toggle_grad(D, False)
        utils.toggle_grad(G, True)

    # Start the G phase from clean gradients.
    G.optim.zero_grad()

    num_G_acc = float(config['num_G_accumulations'])
    for _ in range(config['num_G_accumulations']):
        z_.sample_()
        y_.sample_()

        # First latent batch.
        z1 = z_.data.clone().detach()
        D_fake1, _, fake_image1 = GD(z1, y_, train_G=True,
                                     split_D=config['split_D'],
                                     return_G_z=True)
        G_loss1 = generator_loss(D_fake1, D_real.detach()) / num_G_acc

        # Second latent batch, same labels.
        z_.sample_()
        z2 = z_.data.clone().detach()
        D_fake2, _, fake_image2 = GD(z2, y_, train_G=True,
                                     split_D=config['split_D'],
                                     return_G_z=True)
        G_loss2 = generator_loss(D_fake2, D_real.detach()) / num_G_acc

        G_loss_gan = G_loss1 + G_loss2

        # mode seeking loss: image distance relative to latent distance.
        image_distance = torch.mean(torch.abs(fake_image2 - fake_image1))
        latent_distance = torch.mean(torch.abs(z2 - z1))
        lz = image_distance / latent_distance
        eps = 1 * 1e-5
        loss_lz = 1 / (lz + eps)

        G_loss = G_loss_gan + loss_lz
        G_loss.backward()

    # Optional orthogonal regularisation on G, skipping the shared embedding.
    if config['G_ortho'] > 0.0:
        shared_params = [param for param in G.shared.parameters()]
        utils.ortho(G, config['G_ortho'], blacklist=shared_params)

    if clip_norm is not None:
        torch.nn.utils.clip_grad_norm_(G.parameters(), clip_norm)

    G.optim.step()

    # Keep the EMA copy current whether or not it is used for testing.
    if config['ema']:
        ema.update(state_dict['itr'])

    return {
        'G_loss': float(G_loss.item()),
        'D_loss_real': float(D_loss_real.item()),
        'D_loss_fake': float(D_loss_fake.item())
    }
def train(x, y):
    """Run one WGAN-style iteration with a gradient penalty on real data.

    For each D accumulation the Wasserstein distance estimate ``wd`` comes
    from ``losses.wgan_discriminator_loss``; ``gan_losses.compute_grad2`` is
    called on the real scores with ``backward=True`` and a penalty weight of
    ``10 / num_D_accumulations``. If ``train_fns_c.adv_train`` is set,
    ``losses.adv_loss`` is additionally evaluated on the real chunk. The D
    loss is ``-wd``, optionally with ``relu(wd - config.bound)`` added when
    ``train_fns_c.use_bound`` is set. The G step uses
    ``losses.wgan_generator_loss``.

    Relies on ``G``, ``D``, ``GD``, ``z_``, ``y_``, ``config``, ``ema``,
    ``state_dict``, ``utils``, ``losses``, ``gan_losses`` and ``myargs``
    from the enclosing scope. ``summary_D`` is written to
    ``myargs.textlogger``.

    Returns:
        dict: ``wd`` and ``G_loss`` as floats, ``gp`` as the result of
        ``gp.mean()`` (not converted with ``.item()``), and ``bound`` when
        the bound is active.
    """
    train_fns_c = config.train_fns_c
    summary = {}
    summary_D = {}
    batch_size = config['batch_size']
    num_D_acc = config['num_D_accumulations']

    G.optim.zero_grad()
    D.optim.zero_grad()

    # The gradient penalty differentiates w.r.t. the real inputs, so they
    # must track gradients before being split into chunks.
    x.requires_grad_()
    x = torch.split(x, batch_size)
    y = torch.split(y, batch_size)
    chunk = 0

    # Optionally make only D trainable during the D phase.
    if config['toggle_grads']:
        utils.toggle_grad(D, True)
        utils.toggle_grad(G, False)

    for _ in range(config['num_D_steps']):
        # Accumulate gradients over several chunks before stepping.
        D.optim.zero_grad()
        for _ in range(num_D_acc):
            z_.sample_()
            y_.sample_()
            D_fake, D_real, G_z = GD(z_[:batch_size], y_[:batch_size],
                                     x[chunk], y[chunk],
                                     train_G=False,
                                     split_D=config['split_D'],
                                     return_G_z=True)

            r_logit_mean, f_logit_mean, wd, _ = \
                losses.wgan_discriminator_loss(r_logit=D_real,
                                               f_logit=D_fake)

            # gpreal: penalty on the real scores, back-propagated by the
            # helper itself (backward=True).
            img_gp, gp = gan_losses.compute_grad2(
                d_out=D_real, x_in=x[chunk], backward=True,
                gp_lambda=10. / num_D_acc, return_grad=True)

            if train_fns_c.adv_train:
                r_logit_mean_adv = losses.adv_loss(
                    netD=GD, img=x[chunk], y=y[chunk], gp_img=img_gp,
                    adv_lr=0.01, retain_graph=True)
                summary_D['r_logit_mean_adv'] = r_logit_mean_adv

            if train_fns_c.use_bound:
                D_loss = (-wd + torch.relu(wd - float(config.bound))) / \
                    float(num_D_acc)
                summary['bound'] = config.bound
            else:
                D_loss = (-wd) / float(num_D_acc)

            D_loss.backward(retain_graph=True)
            chunk += 1

            summary_D['r_logit_mean'] = r_logit_mean.item()
            summary_D['f_logit_mean'] = f_logit_mean.item()
            summary['wd'] = wd.item()
            summary['gp'] = gp.mean()

        # Optional orthogonal regularisation on D.
        if config['D_ortho'] > 0.0:
            print('using modified ortho reg in D')
            utils.ortho(D, config['D_ortho'])

        D.optim.step()

    # Optionally make only G trainable during the G phase.
    if config['toggle_grads']:
        utils.toggle_grad(D, False)
        utils.toggle_grad(G, True)

    # Start the G phase from clean gradients.
    G.optim.zero_grad()

    for _ in range(config['num_G_accumulations']):
        z_.sample_()
        y_.sample_()
        D_fake = GD(z_, y_, train_G=True, split_D=config['split_D'])
        G_f_logit_mean, G_loss = losses.wgan_generator_loss(f_logit=D_fake)
        G_loss = G_loss / float(config['num_G_accumulations'])
        G_loss.backward()
        summary_D['G_f_logit_mean'] = G_f_logit_mean.item()
        summary['G_loss'] = G_loss.item()

    # Optional orthogonal regularisation on G, skipping the shared embedding.
    if config['G_ortho'] > 0.0:
        print('using modified ortho reg in G')
        shared_params = [param for param in G.shared.parameters()]
        utils.ortho(G, config['G_ortho'], blacklist=shared_params)
    G.optim.step()

    # Keep the EMA copy current whether or not it is used for testing.
    if config['ema']:
        ema.update(state_dict['itr'])

    myargs.textlogger.log(state_dict['itr'], **summary_D)
    return summary
def train(x, y):
    """Run one training iteration for D, then jointly for G and the encoder E.

    Uses names from the enclosing scope (``G``, ``D``, ``E``, ``GDE``,
    ``img_pool``, ``ema``, ``state_dict``, ``config``, ``losses``, ``utils``).

    Args:
        x: Real image batch; split into chunks of ``config['batch_size']``.
        y: Labels for ``x``; split the same way.

    Returns:
        dict: Floats under ``'G_loss'``, ``'D_loss_real'``, ``'D_loss_fake'``,
        ``'VAE_recon_loss'`` and ``'VAE_KLD_loss'``, taken from the last
        accumulation pass of each phase.
    """
    G.optim.zero_grad()
    D.optim.zero_grad()
    E.optim.zero_grad()
    # How many chunks to split x and y into?
    x = torch.split(x, config['batch_size'])
    y = torch.split(y, config['batch_size'])
    print("split - x {}".format(len(x)))
    counter = 0

    # Optionally toggle "requires_grad" so only D is trainable here.
    if config['toggle_grads']:
        utils.toggle_grad(D, True)
        utils.toggle_grad(G, False)
        utils.toggle_grad(E, False)

    for step_index in range(config['num_D_steps']):
        # If accumulating gradients, loop multiple times before an optimizer step.
        D.optim.zero_grad()
        for accumulation_index in range(config['num_D_accumulations']):
            # Corner case for the last batch: it may yield fewer chunks.
            if counter >= len(x):
                break
            D_fake, D_real = GDE(x[counter], y[counter], config,
                                 state_dict['itr'], img_pool, train_G=False,
                                 split_D=config['split_D'])

            # Average D's loss components over the gradient accumulations.
            D_loss_real, D_loss_fake = losses.discriminator_loss(
                D_fake, D_real, config['clip'])
            D_loss = (D_loss_real + D_loss_fake) / \
                float(config['num_D_accumulations'])
            print("D_loss: {}; D_fake {}, D_real {}".format(
                D_loss.item(), D_loss_fake.item(), D_loss_real.item()))
            D_loss.backward()
            counter += 1

        # Optionally apply ortho reg in D.
        if config['D_ortho'] > 0.0:
            print('using modified ortho reg in D')
            utils.ortho(D, config['D_ortho'])

        # `stop_gradient` is a testing switch that skips the optimizer step.
        if config['stop_gradient']:
            print("!!! D is not optimized since you turn on `stop_gradient`!!!!!!")
        else:
            D.optim.step()

    # Optionally toggle "requires_grad" so G and E are trainable.
    if config['toggle_grads']:
        utils.toggle_grad(D, False)
        utils.toggle_grad(G, True)
        utils.toggle_grad(E, True)

    # Zero G/E's gradients by default before training them, for safety.
    G.optim.zero_grad()
    E.optim.zero_grad()

    counter = 0  # reset the chunk index; G/E re-use the same real data
    for accumulation_index in range(config['num_G_accumulations']):
        if counter >= len(x):
            break
        output = GDE(x[counter], y[counter], config, state_dict['itr'],
                     img_pool, train_G=True, split_D=config['split_D'],
                     return_G_z=True)
        D_fake = output[0]
        G_z = output[2]
        mu, log_var = output[3], output[4]
        # A sixth output, when present, is an extra generator-side loss term.
        has_additional = len(output) == 6
        if has_additional:
            G_additional = output[5]

        G_loss = losses.generator_loss(D_fake) / \
            float(config['num_G_accumulations'])
        VAE_recon_loss = losses.vae_recon_loss(G_z, x[counter])
        VAE_kld_loss = losses.vae_kld_loss(mu, log_var, config['clip'])
        GE_loss = (G_loss
                   + VAE_recon_loss * config['lambda_vae_recon']
                   + VAE_kld_loss * config['lambda_vae_kld'])
        log_loss_str = f"GE_loss {GE_loss.item()}; VAE_recon_loss {VAE_recon_loss.item()}; VAE_kld_loss {VAE_kld_loss.item()} "
        if has_additional:
            G_additional_loss = config['lambda_g_additional'] * G_additional.sum()
            GE_loss += G_additional_loss
            log_loss_str += f"G_additional {G_additional_loss.item()}"
        print(log_loss_str)

        GE_loss.backward()
        counter += 1

    # Optionally apply modified ortho reg in G; the shared embedding is excluded.
    if config['G_ortho'] > 0.0:
        print('using modified ortho reg in G')
        utils.ortho(G, config['G_ortho'],
                    blacklist=[param for param in G.shared.parameters()])

    # `stop_gradient` is a testing switch that skips the optimizer steps.
    if config['stop_gradient']:
        print("!!! G and E is not optimized since you turn on `stop_gradient`!!!!!!")
    else:
        G.optim.step()
        E.optim.step()

    # If we have an ema, update it, regardless of if we test with it or not.
    if config['ema']:
        ema.update(state_dict['itr'])

    # Fix: 'VAE_KLD_loss' previously reported the reconstruction loss.
    out = {'G_loss': float(G_loss.item()),
           'D_loss_real': float(D_loss_real.item()),
           'D_loss_fake': float(D_loss_fake.item()),
           'VAE_recon_loss': float(VAE_recon_loss.item()),
           'VAE_KLD_loss': float(VAE_kld_loss.item())}
    # Return G's loss and the components of D's loss.
    return out
def train(x, y):
    """Run one GAN iteration that also supports an unconditional mode.

    When ``config['conditional']`` is false, the sampled labels ``y_`` are
    zeroed and the real labels are replaced by zeros before reaching ``GD``.

    Uses names from the enclosing scope (``G``, ``D``, ``GD``, ``z_``, ``y_``,
    ``ema``, ``state_dict``, ``config``, ``utils``, ``discriminator_loss``,
    ``generator_loss``).

    Args:
        x: Real image batch; split into chunks of ``config['batch_size']``.
        y: Labels for ``x``; split the same way.

    Returns:
        dict: Floats under ``'G_loss'``, ``'D_loss_real'``, ``'D_loss_fake'``.
    """
    G.optim.zero_grad()
    D.optim.zero_grad()

    x = torch.split(x, config['batch_size'])
    y = torch.split(y, config['batch_size'])
    counter = 0

    # Discriminator phase: only D is trainable when toggling is enabled.
    if config['toggle_grads']:
        utils.toggle_grad(D, True)
        utils.toggle_grad(G, False)

    for _ in range(config['num_D_steps']):
        D.optim.zero_grad()
        for _ in range(config['num_D_accumulations']):
            z_.sample_()
            chunk_available = counter < len(y)

            if config['conditional']:
                y_.sample_()
                y_counter = y[counter]
            else:
                # Unconditional: all labels collapse to class 0.
                y_.zero_()
                if chunk_available:
                    zero_labels = torch.zeros_like(y[counter])
                    y_counter = zero_labels.to(y_.device).long()

            # When the chunks run out, the previous real chunk is re-used.
            if chunk_available:
                real_samples = x[counter]

            D_fake, D_real = GD(z_[:config['batch_size']],
                                y_[:config['batch_size']],
                                real_samples, y_counter,
                                train_G=False,
                                split_D=config['split_D'])

            D_loss_real, D_loss_fake = discriminator_loss(D_fake, D_real)
            D_loss = D_loss_real + D_loss_fake
            D_loss.backward()
            counter += 1

        if config['D_ortho'] > 0.0:
            print('using modified ortho reg in D')
            utils.ortho(D, config['D_ortho'])

        D.optim.step()

    # Generator phase: only G is trainable when toggling is enabled.
    if config['toggle_grads']:
        utils.toggle_grad(D, False)
        utils.toggle_grad(G, True)

    G.optim.zero_grad()

    for _ in range(config['num_G_accumulations']):
        z_.sample_()
        y_.sample_()
        if not config['conditional']:
            y_.zero_()
        real_samples2 = x[0]
        D_fake = GD(z_, y_, train_G=True, split_D=config['split_D'])
        generated = G.forward(z_, y_)
        raw_G_loss = generator_loss(D_fake, real_samples2, z_, generated)
        G_loss = raw_G_loss / float(config['num_G_accumulations'])
        G_loss.backward()

    if config['G_ortho'] > 0.0:
        print('using modified ortho reg in G')
        shared_params = [param for param in G.shared.parameters()]
        utils.ortho(G, config['G_ortho'], blacklist=shared_params)

    G.optim.step()

    if config['ema']:
        ema.update(state_dict['itr'])

    return {
        'G_loss': float(G_loss.item()),
        'D_loss_real': float(D_loss_real.item()),
        'D_loss_fake': float(D_loss_fake.item())
    }
def train(x, y):
    """Run one GAN iteration with an auxiliary-classifier (AC / Twin-AC) loss.

    ``config['loss_type']`` selects the extra classification terms:
    ``'AC'``, ``'Twin_AC'`` (cross-entropy) or ``'Twin_AC_M'`` (``hinge_multi``).

    Uses names from the enclosing scope (``G``, ``D``, ``GD``, ``z_``, ``y_``,
    ``ema``, ``state_dict``, ``config``, ``losses``, ``utils``, ``F``,
    ``hinge_multi``).

    Args:
        x: Real image batch; split into chunks of ``config['batch_size']``.
        y: Labels for ``x``; split the same way.

    Returns:
        dict: Floats under ``'G_loss'``, ``'D_loss_real'``, ``'D_loss_fake'``,
        ``'C_loss'`` and ``'MI_loss'``; the last two come from the generator
        phase.
    """
    G.optim.zero_grad()
    D.optim.zero_grad()
    # How many chunks to split x and y into?
    x = torch.split(x, config['batch_size'])
    y = torch.split(y, config['batch_size'])
    counter = 0

    # Make only D trainable for the discriminator phase.
    utils.toggle_grad(D, True)
    utils.toggle_grad(G, False)

    for step_index in range(config['num_D_steps']):
        # Fix: clear gradients per step so that, with num_D_steps > 1, a step
        # does not re-apply the gradients of the previous steps.
        D.optim.zero_grad()
        # If accumulating gradients, loop multiple times before an optimizer step.
        for accumulation_index in range(config['num_D_accumulations']):
            z_.sample_()
            y_.sample_()
            # Fix: use the same label slice for GD and for the fake-side
            # classification targets, so they match when y_ holds more than
            # batch_size entries.
            y_fake = y_[:config['batch_size']]
            D_fake, D_real, mi, c_cls = GD(z_[:config['batch_size']], y_fake,
                                           x[counter], y[counter],
                                           train_G=False,
                                           split_D=config['split_D'])

            # Average D's loss components over the gradient accumulations.
            D_loss_real, D_loss_fake = losses.discriminator_loss(D_fake, D_real)
            # Rows up to n_fake are indexed as the fake part, the rest as real.
            n_fake = D_fake.shape[0]
            C_loss = 0
            if config['loss_type'] == 'Twin_AC':
                C_loss += (F.cross_entropy(c_cls[n_fake:], y[counter])
                           + F.cross_entropy(mi[:n_fake], y_fake))
            if config['loss_type'] == 'Twin_AC_M':
                C_loss += (hinge_multi(c_cls[n_fake:], y[counter])
                           + hinge_multi(mi[:n_fake], y_fake))
            if config['loss_type'] == 'AC':
                C_loss += F.cross_entropy(c_cls[n_fake:], y[counter])

            D_loss = (D_loss_real + D_loss_fake
                      + C_loss * config['AC_weight']) \
                / float(config['num_D_accumulations'])
            D_loss.backward()
            counter += 1

        # Optionally apply ortho reg in D.
        if config['D_ortho'] > 0.0:
            print('using modified ortho reg in D')
            utils.ortho(D, config['D_ortho'])

        D.optim.step()

    # Make only G trainable for the generator phase.
    utils.toggle_grad(D, False)
    utils.toggle_grad(G, True)

    # Zero G's gradients by default before training G, for safety.
    G.optim.zero_grad()
    for step_index in range(config['num_G_steps']):
        # Fix: same per-step reset as for D.
        G.optim.zero_grad()
        for accumulation_index in range(config['num_G_accumulations']):
            z_.sample_()
            y_.sample_()
            D_fake, G_z, mi, c_cls = GD(z_, y_, train_G=True,
                                        split_D=config['split_D'],
                                        return_G_z=True)
            C_loss = 0
            MI_loss = 0
            if config['loss_type'] == 'AC' or config['loss_type'] == 'Twin_AC':
                C_loss = F.cross_entropy(c_cls, y_)
                if config['loss_type'] == 'Twin_AC':
                    MI_loss = F.cross_entropy(mi, y_)
            if config['loss_type'] == 'Twin_AC_M':
                C_loss = hinge_multi(c_cls, y_, hinge=False)
                MI_loss = hinge_multi(mi, y_, hinge=False)

            n_accum = float(config['num_G_accumulations'])
            G_loss = losses.generator_loss(D_fake) / n_accum
            C_loss = C_loss / n_accum
            MI_loss = MI_loss / n_accum
            # G lowers the classifier loss and raises the `mi` head's loss.
            (G_loss + (C_loss - MI_loss) * config['AC_weight']).backward()

        # Optionally apply modified ortho reg in G; the shared embedding is
        # excluded.
        if config['G_ortho'] > 0.0:
            print('using modified ortho reg in G')
            utils.ortho(G, config['G_ortho'],
                        blacklist=[param for param in G.shared.parameters()])
        G.optim.step()

    # If we have an ema, update it, regardless of if we test with it or not.
    if config['ema']:
        ema.update(state_dict['itr'])

    # Fix: report C_loss / MI_loss as floats like the other entries. They were
    # returned as raw values: graph-attached tensors, or plain numbers when
    # the loss type adds no term.
    out = {'G_loss': float(G_loss.item()),
           'D_loss_real': float(D_loss_real.item()),
           'D_loss_fake': float(D_loss_fake.item()),
           'C_loss': float(C_loss),
           'MI_loss': float(MI_loss)}
    # Return G's loss and the components of D's loss.
    return out
def train(x, y):
    """Run one GAN iteration in which every loss term is an ``omni_loss``.

    ``global_cfg.omni_loss.mode`` (``'only_p'``, ``'p_and_n'`` or
    ``'one_side'``) selects the ``default_label`` and, for fake samples in
    ``'one_side'`` mode, a negative-only target.

    Uses names from the enclosing scope (``G``, ``D``, ``GD``, ``z_``, ``y_``,
    ``ema``, ``state_dict``, ``config``, ``global_cfg``, ``omni_loss``,
    ``utils``, ``val_loaders``, ``default_dict``).

    Args:
        x: Real image batch; split into chunks of ``config['batch_size']``.
        y: Labels for ``x``; split the same way.

    Returns:
        The enclosing-scope ``default_dict``, cleared and then updated so that
        ``default_dict['D_loss']`` holds ``'D_real_loss'``, ``'D_fake_loss'``,
        ``'G_loss'`` and, when ``val_loaders`` is not ``None``,
        ``'D_val_loss'``.
    """
    G.optim.zero_grad()
    D.optim.zero_grad()

    x = torch.split(x, config['batch_size'])
    y = torch.split(y, config['batch_size'])
    counter = 0

    # Discriminator phase: only D is trainable when toggling is enabled.
    if config['toggle_grads']:
        utils.toggle_grad(D, True)
        utils.toggle_grad(G, False)

    for _ in range(config['num_D_steps']):
        # Gradients are accumulated over several chunks before one D step.
        D.optim.zero_grad()
        for _ in range(config['num_D_accumulations']):
            z_.sample_()
            y_.sample_()
            D_fake, D_real = GD(z_[:config['batch_size']],
                                y_[:config['batch_size']],
                                x[counter], y[counter],
                                train_G=False,
                                split_D=config['split_D'])

            # Real samples: positives are the class label and index n_classes.
            D_real_positive = [y[counter], config['n_classes']]
            mode = global_cfg.omni_loss.mode
            if mode == 'only_p':
                assert 0, "deprecated"
                real_default = -1
            elif mode == 'p_and_n':
                real_default = 0
            elif mode == 'one_side':
                real_default = -1
            else:
                assert 0
            D_loss_real = omni_loss(pred=D_real, positive=D_real_positive,
                                    default_label=real_default)

            # Fake samples: the positive is index n_classes + 1, except in
            # 'one_side' mode where only negatives are supplied.
            D_fake_positive = (config['n_classes'] + 1,)
            mode = global_cfg.omni_loss.mode
            if mode == 'one_side':
                D_fake_negative = [y_[:config['batch_size']],
                                   config['n_classes']]
                D_loss_fake = omni_loss(pred=D_fake, positive=None,
                                        negative=D_fake_negative,
                                        default_label=-1)
            elif mode == 'only_p':
                D_loss_fake = omni_loss(pred=D_fake, positive=D_fake_positive,
                                        default_label=-1)
            elif mode == 'p_and_n':
                D_loss_fake = omni_loss(pred=D_fake, positive=D_fake_positive,
                                        default_label=0)
            else:
                assert 0

            total = D_loss_real + D_loss_fake
            D_loss = total / float(config['num_D_accumulations'])
            D_loss.backward()
            counter += 1

        # Optional orthogonal regularisation on D.
        if config['D_ortho'] > 0.0:
            print('using modified ortho reg in D')
            utils.ortho(D, config['D_ortho'])

        D.optim.step()

    out = {'D_real_loss': D_loss_real.item(),
           'D_fake_loss': D_loss_fake.item()}

    # Generator phase: only G is trainable when toggling is enabled.
    if config['toggle_grads']:
        utils.toggle_grad(D, False)
        utils.toggle_grad(G, True)

    G.optim.zero_grad()

    for _ in range(config['num_G_accumulations']):
        z_.sample_()
        y_.sample_()
        D_fake = GD(z_, y_, train_G=True, split_D=config['split_D'])

        # G uses the same positives as real data: class label and n_classes.
        G_fake_positive = (y_, config['n_classes'])
        mode = global_cfg.omni_loss.mode
        if mode == 'only_p' or mode == 'one_side':
            g_default = -1
        elif mode == 'p_and_n':
            g_default = 0
        else:
            assert 0
        G_loss = omni_loss(pred=D_fake, positive=G_fake_positive,
                           default_label=g_default)
        G_loss = G_loss / float(config['num_G_accumulations'])
        G_loss.backward()

    # Optional orthogonal regularisation on G; the shared embedding is excluded.
    if config['G_ortho'] > 0.0:
        print('using modified ortho reg in G')
        utils.ortho(G, config['G_ortho'],
                    blacklist=[param for param in G.shared.parameters()])
    G.optim.step()

    out.update({'G_loss': G_loss.item(), })

    # Keep the EMA copy current whether or not it is used for testing.
    if config['ema']:
        ema.update(state_dict['itr'])

    # Optional held-out monitoring of D, evaluated without gradients.
    if val_loaders is not None:
        val_x, val_y = next(val_loaders)
        val_x = val_x.cuda()
        val_y = val_y.cuda()
        with torch.no_grad():
            D_val = D(val_x, val_y)
        D_val_positive = (val_y, config['n_classes'])
        mode = global_cfg.omni_loss.mode
        if mode == 'only_p' or mode == 'one_side':
            val_default = -1
        elif mode == 'p_and_n':
            val_default = 0
        else:
            assert 0
        D_val_loss = omni_loss(pred=D_val, positive=D_val_positive,
                               default_label=val_default)
        out.update({'D_val_loss': D_val_loss.item(), })

    default_dict.clear()
    default_dict['D_loss'].update(out)
    return default_dict
def train(x, y, this_iter):
    """Run one iteration: D update, G (and optional prior) update, then encoder.

    A non-``'default'`` ``config['prior_type']`` marks a prior with its own
    optimizer; it is stepped only on iterations where
    ``this_iter % config['update_GMM_every_n'] == 0``. The encoder phase runs
    only when ``E`` is not ``None``.

    Uses names from the enclosing scope (``G``, ``D``, ``E``, ``GD``,
    ``Prior``, ``ema``, ``state_dict``, ``config``, ``my_loss``, ``utils``).

    Args:
        x: Real image batch; split into chunks of ``config['batch_size']``.
        y: Labels for ``x``; split the same way.
        this_iter: Current iteration index, used to schedule prior updates.

    Returns:
        dict: Floats under ``'G_loss'``, ``'D_loss_real'``, ``'D_loss_fake'``;
        plus ``'P_acc_samples'`` on prior-update iterations where it was
        computed, and ``'E_log_likelihood'`` / ``'E_MSE_loss'`` when ``E`` is
        not ``None``.
    """
    learned_prior = not (config['prior_type'] == 'default')
    # Short-circuits, so 'update_GMM_every_n' is only read for a learned prior.
    update_prior = learned_prior and not this_iter % config['update_GMM_every_n']
    # Fix: pre-bind so the final report cannot hit an unbound name when the
    # encoder phase (the only place this is computed) does not run.
    acc_samples = None

    G.optim.zero_grad()
    D.optim.zero_grad()
    if E is not None:
        E.optim.zero_grad()
    if learned_prior:
        Prior.optim.zero_grad()
    # How many chunks to split x and y into?
    x = torch.split(x, config['batch_size'])
    y = torch.split(y, config['batch_size'])
    counter = 0

    ######### Discriminator ##############
    # Optionally toggle "requires_grad" so only D is trainable.
    if config['toggle_grads']:
        if E is not None:
            utils.toggle_grad(E, False)
        if learned_prior:
            utils.toggle_grad(Prior, False)
        utils.toggle_grad(D, True)
        utils.toggle_grad(G, False)

    for step_index in range(config['num_D_steps']):
        # If accumulating gradients, loop multiple times before an optimizer step.
        D.optim.zero_grad()
        for accumulation_index in range(config['num_D_accumulations']):
            z_, y_ = Prior.sample_()
            D_fake, D_real = GD(z_[:config['batch_size']],
                                y_[:config['batch_size']],
                                x[counter], y[counter],
                                train_G=False,
                                split_D=config['split_D'],
                                is_Enc=False)

            # Average D's loss components over the gradient accumulations.
            D_loss_real, D_loss_fake = my_loss.discriminator_loss(
                D_fake, D_real)
            D_loss = (D_loss_real + D_loss_fake) / float(
                config['num_D_accumulations'])
            D_loss.backward()
            counter += 1

        # Optionally apply ortho reg in D.
        if config['D_ortho'] > 0.0:
            print('using modified ortho reg in D')
            utils.ortho(D, config['D_ortho'])

        D.optim.step()

    ########## Generator ################
    # Optionally toggle "requires_grad"; the prior trains only when scheduled.
    if config['toggle_grads']:
        if update_prior:
            utils.toggle_grad(Prior, True)
        utils.toggle_grad(D, False)
        utils.toggle_grad(G, True)

    # Zero G's gradients by default before training G, for safety.
    G.optim.zero_grad()
    if learned_prior:
        Prior.optim.zero_grad()

    # If accumulating gradients, loop multiple times.
    for accumulation_index in range(config['num_G_accumulations']):
        z_, y_ = Prior.sample_()
        D_fake = GD(z_, y_, train_G=True, split_D=config['split_D'])
        G_loss = my_loss.generator_loss(D_fake)
        (G_loss / float(config['num_G_accumulations'])).backward()

    # Optionally apply modified ortho reg in G; the shared embedding is excluded.
    if config['G_ortho'] > 0.0:
        print('using modified ortho reg in G')
        utils.ortho(G, config['G_ortho'],
                    blacklist=[param for param in G.shared.parameters()])

    G.optim.step()
    if update_prior:
        Prior.optim.step()

    ############# Encoder ##########
    if E is not None:
        # Optionally toggle "requires_grad" so only E (of D/G/E) is trainable.
        if config['toggle_grads']:
            utils.toggle_grad(D, False)
            utils.toggle_grad(G, False)
            utils.toggle_grad(E, True)

        counter = 0
        for step_index in range(config['num_E_steps']):
            # Zero E's gradients by default before training E, for safety.
            E.optim.zero_grad()
            if learned_prior:
                Prior.optim.zero_grad()
            # The prior is only trained during the first encoder step.
            train_prior_now = update_prior and step_index == 0

            # If accumulating gradients, loop multiple times.
            for accumulation_index in range(config['num_E_accumulations']):
                z_, y_ = Prior.sample_()
                z_mu, z_lv = GD(z_, y_, train_G=False,
                                split_D=config['split_D'], is_Enc=True)
                z_p = z_ if not config['is_latent_detach'] else z_.detach()
                E_loss = my_loss.log_likelihood(z_p, z_mu, z_lv) / float(
                    config['lambda_encoder'])

                # Fix: build total_loss out of place. The previous
                # `total_loss = E_loss; total_loss += ...` mutated E_loss
                # itself, so the reported 'E_log_likelihood' silently included
                # the prior and loss3 terms.
                total_loss = E_loss
                if train_prior_now:
                    log_y_pred = Prior.latent_classification(z_)
                    Prior_loss = my_loss.classification_loss(
                        log_y_pred, y_) / float(config['num_E_accumulations'])
                    total_loss = total_loss + Prior_loss
                    # NOTE(review): the flattened source does not show whether
                    # this extra term belongs under the prior condition; it is
                    # kept there because it only reads prior parameters.
                    # Confirm against the original layout.
                    if config['is_loss3'] != 0:
                        if config['is_loss3'] == -1:
                            loss3 = torch.sum(
                                (1 / float(config['lambda_encoder'])) *
                                my_loss.log_gaussian(Prior.lv_c) /
                                Prior.n_classes)
                        else:
                            loss3 = torch.sum(
                                config['is_loss3'] *
                                my_loss.log_gaussian(Prior.lv_c) /
                                (Prior.n_classes * Prior.dim_z))
                        total_loss = total_loss + loss3

                # Reported only; not part of the optimised objective.
                MSE_loss = torch.mean(torch.sum((z_ - z_mu).pow(2), dim=1))
                (total_loss / float(config['num_E_accumulations'])).backward()
                counter += 1

            # Optionally apply modified ortho reg in E.
            if config['E_ortho'] > 0.0:
                print('using modified ortho reg in E')
                utils.ortho(E, config['E_ortho'])

            E.optim.step()
            if train_prior_now:
                acc_samples = torch.mean(
                    (y_ == log_y_pred.argmax(1)).float())
                Prior.optim.step()

    # If we have an ema, update it, regardless of if we test with it or not.
    if config['ema']:
        ema.update(state_dict['itr'])

    # Return G's loss and the components of D's loss.
    out = {
        'G_loss': float(G_loss.item()),
        'D_loss_real': float(D_loss_real.item()),
        'D_loss_fake': float(D_loss_fake.item())
    }
    if update_prior and acc_samples is not None:
        out['P_acc_samples'] = float(acc_samples.item())
    if E is not None:
        out['E_log_likelihood'] = float(E_loss.item())
        out['E_MSE_loss'] = float(MSE_loss.item())

    return out
def train(x, y):
    """Run one feedback-GAN iteration: D update(s), then a G update.

    Unlike the sibling variants, ``x`` and ``y`` are used whole (not split),
    and the accumulation loops are replaced by "feedback" loops: after the
    first pass, the generator input is built from the previous pass's detached
    generator output (blended with the real image) and the previous detached
    discriminator output.

    Uses names from the enclosing scope (``G``, ``D``, ``GD``, ``zs_``,
    ``ys_``, ``ema``, ``state_dict``, ``config``, ``losses``, ``utils``,
    ``torchvision``, ``nn``, ``time``).

    NOTE(review): the source layout was flattened, so the nesting below is a
    reconstruction. In particular, whether the ortho-reg / optimizer-step
    lines sit inside or after the feedback loops should be confirmed against
    the original file.

    Args:
        x: Real image batch; its first three channels seed the feedback input.
        y: Labels for ``x``.

    Returns:
        tuple: ``(out, partial_test_input)``. ``out`` holds floats under
        ``'G_loss'``, ``'D_loss_real'``, ``'D_loss_fake'``.
        ``partial_test_input`` is the sum of the feedback inputs collected on
        save/test iterations, divided by their count plus ``1e-9``; it is
        ``0.0`` when none were collected.
    """
    G.optim.zero_grad()
    D.optim.zero_grad()
    inner_iter_count = 0
    partial_test_input = 0
    # x and y are deliberately not split into chunks in this variant.

    # Optionally toggle D and G's "require_grad"
    if config['toggle_grads']:
        utils.toggle_grad(D, True)
        utils.toggle_grad(G, False)

    for step_index in range(config['num_D_steps']):
        # Clear D's gradients before each step.
        D.optim.zero_grad()
        d_reals = None
        g_fakes = None

        # NOTE(review): presumably never taken if epochs are non-negative --
        # looks like a disabled "pre-training" path; confirm.
        if state_dict['epoch'] < 0:
            zs_.sample_()
            # NOTE(review): `[:,20]` selects a single index on dim 1, whereas
            # the other branches slice `[:, :20]` -- possibly a typo; confirm.
            z_ = zs_[:config['batch_size']].view(zs_.size(0), 24, 8, 8)[:,20]
            ys_.sample_()
            gy = ys_[:config['batch_size']]

            D_fake, D_real, G_fake = GD(z_, gy,
                                        x=x,
                                        dy=y,
                                        train_G=False,
                                        split_D=config['split_D'])

            # Every 1000 iterations, dump debug images to a hard-coded path.
            # NOTE(review): `time` is not defined in this block; presumably a
            # run-timestamp string from the enclosing scope -- confirm.
            if state_dict['itr'] % 1000 == 0:
                print('saving img')
                torchvision.utils.save_image(x.float().cpu(),
                                             '/ubc/cs/research/shield/projects/cshen001/BigGAN-original/BigGAN-PyTorch/samples_new/{}_it{}_pre_xreal.jpg'.format(
                                                 time, state_dict['itr']),
                                             nrow=int(D_fake.shape[0] ** 0.5),
                                             normalize=True)
                torchvision.utils.save_image(D_fake.float().cpu(),
                                             '/ubc/cs/research/shield/projects/cshen001/BigGAN-original/BigGAN-PyTorch/samples_new/{}_it{}_pre_dfake.jpg'.format(
                                                 time, state_dict['itr']),
                                             nrow=int(D_fake.shape[0] ** 0.5),
                                             normalize=True)
                torchvision.utils.save_image(D_real.float().cpu(),
                                             '/ubc/cs/research/shield/projects/cshen001/BigGAN-original/BigGAN-PyTorch/samples_new/{}_it{}_pre_dreal.jpg'.format(
                                                 time, state_dict['itr']),
                                             nrow=int(D_fake.shape[0] ** 0.5),
                                             normalize=True)

            # The loss is not divided by an accumulation count here.
            D_loss_real, D_loss_fake = losses.discriminator_loss(D_fake, D_real)
            D_loss = (D_loss_real + D_loss_fake)
            D_loss.backward()

            # Optionally apply ortho reg in D
            if config['D_ortho'] > 0.0:
                # Debug print to indicate we're using ortho reg in D.
                print('using modified ortho reg in D')
                utils.ortho(D, config['D_ortho'])

            D.optim.step()
        else:
            for fb_iter in range(config['num_feedback_iter_D']):
                # Fresh noise and labels for every feedback pass.
                zs_.sample_()
                z_ = zs_[:config['batch_size']].view(zs_.size(0), 24, 32, 32)[:, :20]
                ys_.sample_()
                gy = ys_[:config['batch_size']]

                if fb_iter == 0:
                    # First pass: 20 noise channels + first 3 channels of the
                    # real image + one all-ones channel.
                    init_x = x[:, :3]
                    z_ = torch.cat([z_, init_x, torch.ones(zs_.size(0), 1, 32, 32).cuda()], 1)
                else:
                    # Later passes: blend the previous generator output with
                    # the real image, and feed back the previous D_real
                    # resized to 32.
                    g_fake = 0.1 * g_fake + 0.9 * init_x
                    z_ = torch.cat([z_, g_fake, nn.functional.interpolate(d_reals, 32, mode='bilinear')
                                    ,], 1)

                D_fake, D_real, G_fake = GD(z_, gy,
                                            x=x,
                                            dy=y,
                                            train_G=False,
                                            split_D=config['split_D'])

                # Every 1000 iterations, dump debug images per feedback pass.
                if state_dict['itr'] % 1000 == 0:
                    print('saving img')
                    torchvision.utils.save_image(x.float().cpu(),
                                                 '/ubc/cs/research/shield/projects/cshen001/BigGAN-original/BigGAN-PyTorch/samples_new/{}_it{}_fb{}_xreal.jpg'.format(
                                                     time, state_dict['itr'], fb_iter),
                                                 nrow=int(D_fake.shape[0] ** 0.5),
                                                 normalize=True)
                    torchvision.utils.save_image(G_fake.float().cpu(),
                                                 '/ubc/cs/research/shield/projects/cshen001/BigGAN-original/BigGAN-PyTorch/samples_new/{}_it{}_fb{}_Gfake_d.jpg'.format(
                                                     time,state_dict['itr'],fb_iter),nrow=int(D_fake.shape[0] ** 0.5),normalize=True)
                    if fb_iter > 1:
                        torchvision.utils.save_image(g_fake.float().cpu(),
                                                     '/ubc/cs/research/shield/projects/cshen001/BigGAN-original/BigGAN-PyTorch/samples_new/{}_it{}_fb{}_gfake_d.jpg'.format(
                                                         time,state_dict['itr'],fb_iter),nrow=int(D_fake.shape[0] ** 0.5),normalize=True)

                D_loss_real, D_loss_fake = losses.discriminator_loss(D_fake, D_real)
                # Both branches compute the same loss; the extra enforcement
                # term for later passes is disabled in the source.
                if not fb_iter == 0:
                    D_loss = (D_loss_real + D_loss_fake)
                else:
                    D_loss = (D_loss_real + D_loss_fake)

                # Detached copies become the next pass's feedback input.
                d_reals = D_real.detach()
                g_fake = G_fake.detach()

                D_loss.backward()

            # Optionally apply ortho reg in D
            if config['D_ortho'] > 0.0:
                # The debug print is disabled in this branch.
                utils.ortho(D, config['D_ortho'])

            D.optim.step()

    # Optionally toggle "requires_grad"
    if config['toggle_grads']:
        utils.toggle_grad(D, False)
        utils.toggle_grad(G, True)

    # Zero G's gradients by default before training G, for safety
    G.optim.zero_grad()

    d_fakes = None
    g_fakes = None

    # NOTE(review): same presumably-disabled path as in the D phase -- confirm.
    if state_dict['epoch'] < 0:
        zs_.sample_()
        z_ = zs_[:config['batch_size']].view(zs_.size(0), 24, 32, 32)[:, :20]
        ys_.sample_()
        gy = ys_
        # Flatten the latent to (batch, -1) for this path only.
        z_ = z_.view(z_.size(0), -1)
        D_fake, G_z = GD(z=z_, gy=gy, train_G=True, split_D=config['split_D'], return_G_z=True)
        G_loss = losses.generator_loss(D_fake)
        G_loss.backward()

        if state_dict['itr'] % 1000 == 0:
            print('saving img')
            torchvision.utils.save_image(D_fake.float().cpu(),
                                         '/ubc/cs/research/shield/projects/cshen001/BigGAN-original/BigGAN-PyTorch/samples_new/{}_it{}_pre_dfake.jpg'.format(
                                             time, state_dict['itr'],),
                                         nrow=int(D_fake.shape[0] ** 0.5),
                                         normalize=True)
            torchvision.utils.save_image(G_z.float().cpu(),
                                         '/ubc/cs/research/shield/projects/cshen001/BigGAN-original/BigGAN-PyTorch/samples_new/{}_it{}_pre_G_z.jpg'.format(
                                             time, state_dict['itr'],),
                                         nrow=int(D_fake.shape[0] ** 0.5),
                                         normalize=True)

        # Optionally apply modified ortho reg in G
        if config['G_ortho'] > 0.0:
            print('using modified ortho reg in G')  # Debug print to indicate we're using ortho reg in G
            # Don't ortho reg shared, it makes no sense. Really we should blacklist any embeddings for this
            utils.ortho(G, config['G_ortho'],
                        blacklist=[param for param in G.shared.parameters()])
        G.optim.step()
    else:
        for fb_iter in range(config['num_feedback_iter']):
            # Fresh noise and labels for every feedback pass.
            zs_.sample_()
            z_ = zs_[:config['batch_size']].view(zs_.size(0), 24, 32, 32)[:, :20]
            ys_.sample_()
            gy = ys_

            # Unlike the D phase, the first TWO passes use the initial input.
            if fb_iter <= 1:
                init_x = x[:, :3]
                z_ = torch.cat([z_, init_x, torch.ones(zs_.size(0), 1, 32, 32).cuda()], 1)
            else:
                # Blend the previous generator output with the real image and
                # feed back the previous D_fake resized to 32.
                g_fake = 0.05 * g_fakes + 0.95 * init_x
                d_fakes = nn.functional.interpolate(d_fakes, 32, mode='bilinear')
                z_ = torch.cat([z_, g_fake, d_fakes
                                ,], 1)
                # On save/test iterations, accumulate the feedback input so
                # the caller receives its average.
                if ((not (state_dict['itr'] % config['save_every'])) or (not (state_dict['itr'] % config['test_every']))):
                    partial_test_input = partial_test_input + torch.cat([g_fake, d_fakes], 1)
                    inner_iter_count = inner_iter_count + 1

            D_fake, G_z = GD(z=z_, gy=gy, train_G=True, split_D=config['split_D'], return_G_z=True)

            # Both branches compute the same loss; the extra enforcement term
            # for later passes is disabled in the source.
            if not fb_iter == 0:
                G_loss = (losses.generator_loss(D_fake))
            else:
                G_loss = (losses.generator_loss(D_fake))
            G_loss.backward()

            if state_dict['itr'] % 1000 == 0:
                print('saving img')
                torchvision.utils.save_image(G_z.float().cpu(),
                                             '/ubc/cs/research/shield/projects/cshen001/BigGAN-original/BigGAN-PyTorch/samples_new/{}_it{}_fb{}_G_z.jpg'.format(time,
                                                 state_dict['itr'], fb_iter),
                                             nrow=int(D_fake.shape[0] ** 0.5),
                                             normalize=True)
                if fb_iter > 1:
                    torchvision.utils.save_image(g_fake.float().cpu(),
                                                 '/ubc/cs/research/shield/projects/cshen001/BigGAN-original/BigGAN-PyTorch/samples_new/{}_it{}_fb{}_G_z_input.jpg'.format(time,
                                                     state_dict['itr'], fb_iter),
                                                 nrow=int(D_fake.shape[0] ** 0.5),
                                                 normalize=True)

            # Detached copies become the next pass's feedback input.
            g_fakes = G_z.detach()
            d_fakes = D_fake.detach()

        # Optionally apply modified ortho reg in G
        if config['G_ortho'] > 0.0:
            print('using modified ortho reg in G')  # Debug print to indicate we're using ortho reg in G
            # Don't ortho reg shared, it makes no sense. Really we should blacklist any embeddings for this
            utils.ortho(G, config['G_ortho'],
                        blacklist=[param for param in G.shared.parameters()])
        G.optim.step()

    # If we have an ema, update it, regardless of if we test with it or not
    if config['ema']:
        ema.update(state_dict['itr'])

    out = {'G_loss': float(G_loss.item()),
           'D_loss_real': float(D_loss_real.item()),
           'D_loss_fake': float(D_loss_fake.item())}
    # Return G's loss and the components of D's loss.
    # The 1e-9 guards against division by zero when nothing was accumulated.
    partial_test_input = partial_test_input / (inner_iter_count + 1e-9)
    return out, partial_test_input
def train(x, y, epoch, batch_size, target_map = None, r_mixup = 0.0):
    """Run one discriminator/generator update with optional mixed-image losses.

    Relies on names from the enclosing scope: ``G``, ``D``, ``GD``, ``z_``,
    ``y_``, ``ema``, ``state_dict``, ``config``, ``BCEloss`` and
    ``BCEfakeloss``.

    Args:
        x: Batch of real inputs; split along dim 0 into
            ``config['num_D_accumulations']`` chunks.
        y: Labels for ``x``; split the same way.
        epoch: Current epoch, only used to ramp ``mixup_coeff`` when
            ``config['slow_mixup']`` is set.
        batch_size: Number of latent/label samples taken from ``z_``/``y_``
            for each discriminator pass.
        target_map: Forwarded to ``GD`` when mixing is used; ``GD`` returns
            the map that is then used as the per-pixel target.
        r_mixup: With ``config['full_batch_mixup']``, mixing is used for this
            call when a uniform random draw is below this value.

    Returns:
        dict mapping loss names to Python floats (values of the last
        accumulation step).
    """
    def _grid_image(images, **grid_kwargs):
        # Tile a batch into one grid and move the first axis last for imshow.
        grid = torchvision.utils.make_grid(images, nrow=5, padding=2,
                                           **grid_kwargs)
        return grid.permute(1, 2, 0).cpu().numpy()

    G.optim.zero_grad()
    D.optim.zero_grad()

    if config["unet_mixup"]:
        # Target used for the encoder ("middle") output of mixed images.
        fake_target = torch.tensor([0.0]).cuda()

    if config["unet_mixup"] and not config["full_batch_mixup"]:
        use_mixup_in_this_round = True
    elif config["unet_mixup"] and config["full_batch_mixup"]:
        use_mixup_in_this_round = torch.rand(1).detach().item() < r_mixup
    else:
        use_mixup_in_this_round = False

    out = {}

    # In full-batch mode a mixing round replaces the plain real/fake losses.
    skip_normal_real_fake_loss = (use_mixup_in_this_round
                                  and config["full_batch_mixup"])

    n_d_accu = config['num_D_accumulations']
    split_size = x.size(0) // n_d_accu

    x = torch.split(x, split_size)
    y = torch.split(y, split_size)

    d_real_target = torch.tensor([1.0]).cuda()
    d_fake_target = torch.tensor([0.0]).cuda()
    discriminator_loss = functools.partial(BCEloss,
                                           d_real_target=d_real_target,
                                           d_fake_target=d_fake_target)
    mix_fake_target = torch.tensor([1.0]).cuda()
    fake_loss = functools.partial(BCEfakeloss, target=mix_fake_target)

    # Optionally toggle D and G's "require_grad"
    if config['toggle_grads']:
        utils.toggle_grad(D, True)
        utils.toggle_grad(G, False)

    ######################################
    # D-step(s)
    ######################################
    for step_index in range(config['num_D_steps']):
        counter = 0
        # If accumulating gradients, loop multiple times before an
        # optimizer step
        D.optim.zero_grad()

        for accumulation_index in range(n_d_accu):
            z_.sample_()
            y_.sample_()

            if use_mixup_in_this_round:
                if (not config["full_batch_mixup"]) or (
                        config["full_batch_mixup"]
                        and (config["consistency_loss_and_augmentation"]
                             or config["consistency_loss"])):
                    (D_fake, D_real, D_mixed, G_z, mixed, D_middle_fake,
                     D_middle_real, D_middle_mixed, target_map) = GD(
                        z_[:batch_size], y_[:batch_size],
                        x[counter], y[counter],
                        train_G=False, split_D=config['split_D'],
                        mixup=True, target_map=target_map)
                else:
                    D_mixed, G_z, mixed, D_middle_mixed, target_map = GD(
                        z_[:batch_size], y_[:batch_size],
                        x[counter], y[counter],
                        train_G=False, return_G_z=True,
                        split_D=config['split_D'], mixup=True,
                        mixup_only=True, target_map=target_map)

                if config["slow_mixup"] and not config["full_batch_mixup"]:
                    # Linear warm-up of the mixed-image loss weight.
                    mixup_coeff = min(1.0, epoch / config["warmup_epochs"])
                else:
                    mixup_coeff = 1.0

                if config["display_mixed_batch"]:
                    # Debug visualisation; the figure() calls mirror the
                    # original sequence exactly.
                    plt.figure()
                    plt.imshow(_grid_image(mixed, normalize=True))
                    plt.figure()
                    plt.figure()
                    plt.imshow(_grid_image(G_z, normalize=True))
                    plt.figure()
                    plt.figure()
                    plt.imshow(_grid_image(x[counter], normalize=True))
                    plt.figure()
                    plt.imshow(_grid_image(target_map))
                    plt.title("mix")
                    plt.show()
                    plt.figure()
            else:
                D_fake, D_real, G_z, D_middle_fake, D_middle_real = GD(
                    z_[:batch_size], y_[:batch_size],
                    x[counter], y[counter],
                    train_G=False, split_D=config['split_D'])

            if not skip_normal_real_fake_loss:
                D_loss_real_2d, D_loss_fake_2d = discriminator_loss(
                    D_fake.view(-1), D_real.view(-1))
                D_loss_real_2d_item = D_loss_real_2d.detach().item()
                D_loss_fake_2d_item = D_loss_fake_2d.detach().item()

            if use_mixup_in_this_round and (
                    config["consistency_loss"]
                    or config["consistency_loss_and_augmentation"]):
                # Blend of the real/fake outputs according to the mask;
                # D's output on the mixed image is regressed onto this.
                mix = D_real * target_map + D_fake * (1 - target_map)

            if use_mixup_in_this_round:
                # One BCE value per sample; the mean is taken later.
                D_loss_mixed_2d = torch.stack([
                    F.binary_cross_entropy_with_logits(
                        D_mixed[i].view(-1), target_map[i].view(-1))
                    for i in range(D_mixed.size(0))
                ])
                D_loss_mixed_2d_item = D_loss_mixed_2d.mean().detach().item()

            if not skip_normal_real_fake_loss:
                D_loss_real_middle, D_loss_fake_middle = discriminator_loss(
                    D_middle_fake, D_middle_real)
                D_loss_real_middle_item = D_loss_real_middle.detach().item()
                D_loss_fake_middle_item = D_loss_fake_middle.detach().item()

            if use_mixup_in_this_round and not config["consistency_loss"]:
                # Consistency loss alone only concerns the 2D output, so the
                # "middle" loss for mixed images (target: fake) is skipped
                # in that mode.
                mix_bce = F.binary_cross_entropy_with_logits(
                    D_middle_mixed, fake_target.expand_as(D_middle_mixed),
                    reduction="none")
                mixed_middle_loss = mixup_coeff * mix_bce
                mixed_middle_loss_item = (
                    mixed_middle_loss.mean().detach().item())

            if skip_normal_real_fake_loss:
                D_loss_real = torch.tensor([0.0]).cuda()
                D_loss_fake = torch.tensor([0.0]).cuda()
            else:
                D_loss_real = D_loss_real_2d + D_loss_real_middle
                D_loss_fake = D_loss_fake_2d + D_loss_fake_middle

            D_loss_real_item = D_loss_real.detach().item()
            D_loss_fake_item = D_loss_fake.detach().item()

            D_loss = 0.5 * D_loss_real + 0.5 * D_loss_fake

            if use_mixup_in_this_round:
                if (config["consistency_loss"]
                        or config["consistency_loss_and_augmentation"]):
                    consistency_loss = (mixup_coeff * 1.0
                                        * F.mse_loss(D_mixed, mix))
                    consistency_loss_item = (
                        consistency_loss.float().detach().item())

                if not config["consistency_loss"]:
                    # GAN loss from the mixed images (distinct from the
                    # consistency loss).
                    mix_loss = D_loss_mixed_2d + mixed_middle_loss
                    mix_loss = mix_loss.mean()
                else:
                    mix_loss = 0.0

                if config["consistency_loss"]:
                    mix_loss = consistency_loss
                elif config["consistency_loss_and_augmentation"]:
                    mix_loss = mix_loss + consistency_loss

                D_loss = D_loss + mix_loss

            D_loss = D_loss / float(config['num_D_accumulations'])
            D_loss.backward()
            counter += 1

        # Optionally apply ortho reg in D
        if config['D_ortho'] > 0.0:
            # Debug print to indicate we're using ortho reg in D.
            print('using modified ortho reg in D')
            utils.ortho(D, config['D_ortho'])

        D.optim.step()
        del D_loss

    # Optionally toggle "requires_grad"
    if config['toggle_grads']:
        utils.toggle_grad(D, False)
        utils.toggle_grad(G, True)

    ######################################
    # G-step
    ######################################
    # Zero G's gradients by default before training G, for safety
    G.optim.zero_grad()
    counter = 0

    z_.sample_()
    y_.sample_()

    z__ = torch.split(z_, split_size)
    y__ = torch.split(y_, split_size)

    # If accumulating gradients, loop multiple times
    # NOTE(review): x[counter] is indexed with the G accumulation counter,
    # so num_G_accumulations is presumably <= num_D_accumulations - confirm.
    for accumulation_index in range(config['num_G_accumulations']):
        G_fake, G_fake_middle = GD(z__[counter], y__[counter],
                                   train_G=True, split_D=config['split_D'],
                                   reference_x=x[counter])

        G_loss_fake_2d = fake_loss(G_fake)
        G_loss_fake_middle = fake_loss(G_fake_middle)

        G_loss = 0.5 * G_loss_fake_middle + 0.5 * G_loss_fake_2d
        G_loss = G_loss / float(config['num_G_accumulations'])

        G_loss_fake_middle_item = G_loss_fake_middle.detach().item()
        G_loss_fake_2d_item = G_loss_fake_2d.detach().item()
        G_loss_item = G_loss.detach().item()

        G_loss.backward()
        counter += 1

    # Optionally apply modified ortho reg in G
    if config['G_ortho'] > 0.0:
        # Debug print to indicate we're using ortho reg in G
        print('using modified ortho reg in G')
        # Don't ortho reg shared, it makes no sense. Really we should
        # blacklist any embeddings for this
        utils.ortho(G, config['G_ortho'],
                    blacklist=[param for param in G.shared.parameters()])
    G.optim.step()
    del G_loss

    # If we have an ema, update it, regardless of if we test with it or not
    if config['ema']:
        ema.update(state_dict['itr'])

    # Save intermediate losses. Each entry is only reported when the branch
    # that computed it actually ran.
    if use_mixup_in_this_round and (
            config["consistency_loss"]
            or config["consistency_loss_and_augmentation"]
    ) and config["num_D_steps"] > 0:
        out["consistency"] = float(consistency_loss_item)

    out['G_loss'] = float(G_loss_item)
    if not (config["full_batch_mixup"] and use_mixup_in_this_round) \
            and config["num_D_steps"] > 0:
        out['D_loss_real'] = float(D_loss_real_item)
        out['D_loss_fake'] = float(D_loss_fake_item)

    if use_mixup_in_this_round and not config["consistency_loss"] \
            and config["num_D_steps"] > 0:
        out["mixed_middle_loss"] = float(mixed_middle_loss_item)
        out["D_loss_mixed_2d"] = float(D_loss_mixed_2d_item)

    if not (config["full_batch_mixup"] and use_mixup_in_this_round):
        if config["num_D_steps"] > 0:
            out["D_loss_real_middle"] = float(D_loss_real_middle_item)
            out["D_loss_fake_middle"] = float(D_loss_fake_middle_item)
            out["D_loss_real_2d"] = float(D_loss_real_2d_item)
            out["D_loss_fake_2d"] = float(D_loss_fake_2d_item)

    out["G_loss_fake_middle"] = float(G_loss_fake_middle_item)
    out["G_loss_fake_2d"] = float(G_loss_fake_2d_item)

    return out
def train(x_s, y, yd):
    """Run one D/G training iteration with auxiliary classifier losses.

    Relies on names from the enclosing scope: ``G``, ``D``, ``GD``, ``z_``,
    ``y_``, ``yd_``, ``ema``, ``state_dict``, ``config``, ``utils`` and
    ``losses``.

    Args:
        x_s: Batch of real inputs; split along dim 0 into chunks of
            ``config['batch_size']``.
        y: Labels for ``x_s`` (cast to long, split like ``x_s``).
        yd: Second label set for ``x_s`` (cast to long, split like ``x_s``).

    Returns:
        dict with ``G_loss``, ``D_loss_real``, ``D_loss_fake``, ``C_loss``,
        ``MI_loss``, ``CD_loss`` and ``MID_loss`` as Python floats (values
        from the last accumulation step).
    """
    G.optim.zero_grad()
    D.optim.zero_grad()
    # How many chunks to split x and y into?
    y = y.long()
    yd = yd.long()
    x_s = torch.split(x_s, config['batch_size'])
    y = torch.split(y, config['batch_size'])
    yd = torch.split(yd, config['batch_size'])
    counter = 0

    # Toggle D and G's "require_grad"
    utils.toggle_grad(D, True)
    utils.toggle_grad(G, False)

    for step_index in range(config['num_D_steps']):
        # Start every D step from clean gradients; without this, gradients
        # from the previous step leak into the next optimizer update when
        # num_D_steps > 1.
        D.optim.zero_grad()
        # If accumulating gradients, loop multiple times before an
        # optimizer step
        for accumulation_index in range(config['num_D_accumulations']):
            z_.sample_()
            y_.sample_()
            yd_.sample_()
            D_fake, D_real, mi, c_cls, mid, c_clsd, G_z = GD(
                z_, y_, yd_, x_s[counter], y[counter], yd[counter],
                train_G=False, split_D=config['split_D'], return_G_z=True)
            D_loss_real, D_loss_fake = losses.discriminator_loss(
                D_fake, D_real)
            C_loss = 0
            if config['AC']:
                # The classifier outputs are sliced at D_fake.shape[0]:
                # the leading rows are treated as the generated samples,
                # the remainder as the real ones.
                n_fake = D_fake.shape[0]
                fake_mi = mi[:n_fake]
                fake_cls = c_cls[:n_fake]
                c_cls_rs = c_cls[n_fake:]
                fake_mid = mid[:n_fake]
                c_clsd = c_clsd[n_fake:]
                if config['loss_type'] == 'Twin_AC':
                    # Real-sample class loss only uses rows whose yd != 0.
                    keep = yd[counter] != 0
                    C_loss += (
                        F.cross_entropy(c_clsd, yd[counter])
                        + F.cross_entropy(fake_mid, yd_)
                        + 0.5 * F.cross_entropy(c_cls_rs[keep],
                                                y[counter][keep])
                        + 0.5 * F.cross_entropy(fake_cls, y_)
                        + 1.0 * F.cross_entropy(fake_mi, y_))
                if config['loss_type'] == 'AC':
                    # NOTE(review): c_cls_fs and y_f_s are not defined
                    # anywhere in this function; unless they exist in the
                    # enclosing scope this branch raises NameError. The
                    # intended tensors need to be confirmed by the author.
                    C_loss += F.cross_entropy(
                        c_cls_fs, y_f_s) + F.cross_entropy(c_clsd,
                                                           yd[counter])
            # Compute components of D's loss, average them, and divide by
            # the number of gradient accumulations
            if config['Pac']:
                # NOTE(review): x_t is not defined in this function; the
                # packed branch only works if it exists in the enclosing
                # scope - confirm.
                x_pack = torch.cat([x_s[counter], x_t[counter]], dim=0)
                T_img = x_pack.view(-1, 4 * x_pack.size()[1],
                                    x_pack.size()[2], x_pack.size()[3])
                F_img = G_z.view(-1, 4 * G_z.size()[1], G_z.size()[2],
                                 G_z.size()[3])
                pack_img = torch.cat([T_img, F_img], dim=0)
                pack_out, _, _ = D(pack_img, pack=True)
                D_real_pac = pack_out[:T_img.size()[0]]
                D_fake_pac = pack_out[T_img.size()[0]:]
                D_loss_real_pac, D_loss_fake_pac = losses.discriminator_loss(
                    D_fake_pac, D_real_pac)
                D_loss_real += D_loss_real_pac
                D_loss_fake += D_loss_fake_pac
            D_loss = (D_loss_real + D_loss_fake
                      + C_loss * config['AC_weight']) / float(
                          config['num_D_accumulations'])
            D_loss.backward()
            counter += 1

        # Optionally apply ortho reg in D
        if config['D_ortho'] > 0.0:
            # Debug print to indicate we're using ortho reg in D.
            print('using modified ortho reg in D')
            utils.ortho(D, config['D_ortho'])

        D.optim.step()

    # Toggle "requires_grad"
    utils.toggle_grad(D, False)
    utils.toggle_grad(G, True)

    for step_index in range(config['num_G_steps']):
        # Zero G's gradients before every G step so each optimizer update
        # only sees the gradients accumulated for that step.
        G.optim.zero_grad()
        for accumulation_index in range(config['num_G_accumulations']):
            z_.sample_()
            y_.sample_()
            yd_.sample_()
            D_fake, mi, cls, mid, clsd, G_z = GD(
                z_, y_, yd_, train_G=True, split_D=config['split_D'],
                return_G_z=True)
            C_loss = 0
            MI_loss = 0
            CD_loss = 0
            MID_loss = 0
            G_loss = losses.generator_loss(D_fake)
            if config['loss_type'] == 'AC' or \
                    config['loss_type'] == 'Twin_AC':
                C_loss = 1.0 * F.cross_entropy(cls, y_)
                CD_loss = F.cross_entropy(clsd, yd_)
                if config['loss_type'] == 'Twin_AC':
                    MI_loss = 1.0 * F.cross_entropy(mi, y_)
                    MID_loss = F.cross_entropy(mid, yd_)

            if config['Pac']:
                F_img = G_z.view(-1, 4 * G_z.size()[1], G_z.size()[2],
                                 G_z.size()[3])
                D_fake_pac, _, _ = D(F_img, pack=True)
                G_loss_pac = losses.generator_loss(D_fake_pac)
                G_loss += G_loss_pac

            n_g_accu = float(config['num_G_accumulations'])
            G_loss = G_loss / n_g_accu
            C_loss = C_loss / n_g_accu
            MI_loss = MI_loss / n_g_accu
            CD_loss = CD_loss / n_g_accu
            MID_loss = MID_loss / n_g_accu
            # The MI/MID terms enter with a negative sign.
            (G_loss + (C_loss - MI_loss + CD_loss - MID_loss)
             * config['AC_weight']).backward()

        # Optionally apply modified ortho reg in G
        if config['G_ortho'] > 0.0:
            # Debug print to indicate we're using ortho reg in G
            print('using modified ortho reg in G')
            # Don't ortho reg shared, it makes no sense. Really we should
            # blacklist any embeddings for this
            utils.ortho(
                G, config['G_ortho'],
                blacklist=[param for param in G.shared.parameters()])
        G.optim.step()

    # If we have an ema, update it, regardless of if we test with it or not
    if config['ema']:
        ema.update(state_dict['itr'])

    # Report plain floats so the returned dict does not keep autograd graphs
    # alive and all entries have the same type.
    out = {
        'G_loss': float(G_loss.item()),
        'D_loss_real': float(D_loss_real.item()),
        'D_loss_fake': float(D_loss_fake.item()),
        'C_loss': float(C_loss),
        'MI_loss': float(MI_loss),
        'CD_loss': float(CD_loss),
        'MID_loss': float(MID_loss)
    }
    # Return G's loss and the components of D's loss.
    return out
def train(x, y):
    """Run one D/G training iteration, optionally with consistency reg.

    Relies on names from the enclosing scope: ``G``, ``D``, ``GD``, ``z_``,
    ``y_``, ``ema``, ``state_dict``, ``config``, ``utils`` and ``losses``.

    Args:
        x: Batch of real inputs; split along dim 0 into chunks of
            ``config['batch_size']``.
        y: Labels for ``x``; split the same way.

    Returns:
        dict with ``G_loss`` (0 when ``config['fix_G']`` is set),
        ``D_loss_real``, ``D_loss_fake`` and, when ``config['CR'] > 0``,
        ``D_loss_CR``, all as Python numbers from the last accumulation step.
    """
    G.optim.zero_grad()
    D.optim.zero_grad()
    # How many chunks to split x and y into?
    x = torch.split(x, config['batch_size'])
    y = torch.split(y, config['batch_size'])
    counter = 0

    # Optionally toggle D and G's "require_grad"
    if config['toggle_grads']:
        utils.toggle_grad(D, True)
        utils.toggle_grad(G, False)

    for step_index in range(config['num_D_steps']):
        # If accumulating gradients, loop multiple times before an
        # optimizer step
        D.optim.zero_grad()
        for accumulation_index in range(config['num_D_accumulations']):
            z_.sample_()
            y_.sample_()
            D_scores = GD(z_[:config['batch_size']],
                          y_[:config['batch_size']],
                          x[counter], y[counter], train_G=False,
                          policy=config['DiffAugment'],
                          CR=config['CR'] > 0,
                          CR_augment=config['CR_augment'])
            D_loss_CR = 0
            if config['CR'] > 0:
                # Previously this branch just did `continue`, which skipped
                # the backward pass and `counter += 1` and left D_loss_real
                # undefined for the returned dict.
                # NOTE(review): assumes GD returns a third tensor (D's score
                # on augmented reals) when CR=True - confirm against GD.
                D_fake, D_real, D_real_aug = D_scores
                # Penalise the squared change in D's output under
                # augmentation, weighted by config['CR'].
                D_loss_CR = torch.mean(
                    (D_real_aug - D_real) ** 2) * config['CR']
            else:
                D_fake, D_real = D_scores

            # Compute components of D's loss, average them, and divide by
            # the number of gradient accumulations
            D_loss_real, D_loss_fake = losses.discriminator_loss(
                D_fake, D_real)
            D_loss = D_loss_real + D_loss_fake + D_loss_CR
            D_loss = D_loss / float(config['num_D_accumulations'])
            D_loss.backward()
            counter += 1

        # Optionally apply ortho reg in D
        if config['D_ortho'] > 0.0:
            # Debug print to indicate we're using ortho reg in D.
            print('using modified ortho reg in D')
            utils.ortho(D, config['D_ortho'])

        D.optim.step()

    # Optionally toggle "requires_grad"
    if config['toggle_grads']:
        utils.toggle_grad(D, False)
        utils.toggle_grad(G, True)

    # Zero G's gradients by default before training G, for safety
    G.optim.zero_grad()

    if not config['fix_G']:
        # If accumulating gradients, loop multiple times
        for accumulation_index in range(config['num_G_accumulations']):
            z_.sample_()
            y_.sample_()
            D_fake = GD(z_, y_, train_G=True, policy=config['DiffAugment'])
            G_loss = losses.generator_loss(D_fake) / float(
                config['num_G_accumulations'])
            G_loss.backward()

        # Optionally apply modified ortho reg in G
        if config['G_ortho'] > 0.0:
            # Debug print to indicate we're using ortho reg in G
            print('using modified ortho reg in G')
            # Don't ortho reg shared, it makes no sense. Really we should
            # blacklist any embeddings for this
            utils.ortho(
                G, config['G_ortho'],
                blacklist=[param for param in G.shared.parameters()])

        G.optim.step()

    # If we have an ema, update it, regardless of if we test with it or not
    if config['ema']:
        ema.update(state_dict['itr'])

    out = {
        'G_loss': float(G_loss.item()) if not config['fix_G'] else 0,
        'D_loss_real': float(D_loss_real.item()),
        'D_loss_fake': float(D_loss_fake.item()),
    }
    if config['CR'] > 0:
        out['D_loss_CR'] = float(D_loss_CR.item())
    # Return G's loss and the components of D's loss.
    return out
def train(x, y, stage):
    """Run one training iteration for D, then for M (and G in stage 2).

    Relies on names from the enclosing scope: ``G``, ``D``, ``M``, ``GD``,
    ``z_``, ``y_``, ``ema``, ``state_dict``, ``config``, ``utils`` and
    ``losses``.

    Args:
        x: Batch of real inputs; split along dim 0 into chunks of
            ``config['batch_size']``.
        y: Labels for ``x``; split the same way.
        stage: When 1, G's ``requires_grad`` is switched off for the second
            phase; G's optimizer is stepped only when ``stage == 2``. M's
            optimizer is stepped in every case.

    Returns:
        dict with ``G_loss`` (the value of the M/G loss), ``D_loss_real``
        and ``D_loss_fake`` as Python floats from the last accumulation.
    """
    for net in (G, D, M):
        net.optim.zero_grad()

    chunk = config['batch_size']
    real_chunks = torch.split(x, chunk)
    label_chunks = torch.split(y, chunk)
    chunk_idx = 0

    # Phase 1: only D receives gradients (when toggling is enabled).
    if config['toggle_grads']:
        utils.toggle_grad(D, True)
        utils.toggle_grad(G, False)
        utils.toggle_grad(M, False)

    for _ in range(config['num_D_steps']):
        # Gradients are accumulated over several chunks before one D update.
        D.optim.zero_grad()
        for _ in range(config['num_D_accumulations']):
            z_.sample_()
            y_.sample_()
            D_fake, D_real = GD(z_[:config['batch_size']],
                                y_[:config['batch_size']],
                                real_chunks[chunk_idx],
                                label_chunks[chunk_idx],
                                train_G=False,
                                split_D=config['split_D'])
            # Sum both hinge terms and scale by the accumulation count.
            D_loss_real, D_loss_fake = losses.discriminator_loss(D_fake,
                                                                 D_real)
            D_loss = (D_loss_real + D_loss_fake) / float(
                config['num_D_accumulations'])
            D_loss.backward()
            chunk_idx += 1

        if config['D_ortho'] > 0.0:
            # Debug print to indicate we're using ortho reg in D.
            print('using modified ortho reg in D')
            utils.ortho(D, config['D_ortho'])

        D.optim.step()

    # Phase 2: D is frozen, M is trainable, G only outside stage 1.
    if config['toggle_grads']:
        utils.toggle_grad(D, False)
        utils.toggle_grad(G, stage != 1)
        utils.toggle_grad(M, True)

    # Start the second phase from clean gradients, for safety.
    G.optim.zero_grad()
    M.optim.zero_grad()

    for _ in range(config['num_G_accumulations']):
        z_.sample_()
        y_.sample_()
        D_fake, M_regu = GD(z_,
                            y_,
                            train_G=True,
                            split_D=config['split_D'],
                            train_M=True,
                            M_regu=True)
        M_loss = losses.generator_loss(D_fake, M_regu) / float(
            config['num_G_accumulations'])
        M_loss.backward()

    if config['G_ortho'] > 0.0:
        # Debug print to indicate we're using ortho reg in G
        print('using modified ortho reg in G')
        # The shared embedding parameters are excluded from the regulariser.
        utils.ortho(G,
                    config['G_ortho'],
                    blacklist=[param for param in G.shared.parameters()])

    if stage == 2:
        G.optim.step()
    M.optim.step()

    # The EMA is updated whether or not it is used for testing.
    if config['ema']:
        ema.update(state_dict['itr'])

    # The M loss is reported under the 'G_loss' key.
    out = dict()
    out['G_loss'] = float(M_loss.item())
    out['D_loss_real'] = float(D_loss_real.item())
    out['D_loss_fake'] = float(D_loss_fake.item())
    return out
def train(x, y):
    """Run one D/G training iteration using the XLA optimizer step.

    Relies on names from the enclosing scope: ``G``, ``D``, ``GD``,
    ``sample``, ``ema``, ``state_dict``, ``config``, ``utils``, ``losses``
    and ``xm``.

    Args:
        x: Batch of real inputs; split along dim 0 into chunks of
            ``config['batch_size']``.
        y: Labels for ``x``; split the same way.

    Returns:
        dict with ``G_loss``, ``D_loss_real`` and ``D_loss_fake`` exactly as
        produced by the loss functions (no ``.item()`` conversion is done
        here), from the last accumulation step.
    """
    G.optim.zero_grad()
    D.optim.zero_grad()
    # How many chunks to split x and y into?
    x = torch.split(x, config['batch_size'])
    y = torch.split(y, config['batch_size'])
    counter = 0

    # Optionally toggle D and G's "require_grad"
    if config['toggle_grads']:
        utils.toggle_grad(D, True)
        utils.toggle_grad(G, False)

    for step_index in range(config['num_D_steps']):
        # If accumulating gradients, loop multiple times before an
        # optimizer step
        D.optim.zero_grad()
        for accumulation_index in range(config['num_D_accumulations']):
            z_, y_ = sample()
            D_fake, D_real = GD(z_[:config['batch_size']],
                                y_[:config['batch_size']],
                                x[counter], y[counter], train_G=False,
                                split_D=config['split_D'])

            # Compute components of D's loss, average them, and divide by
            # the number of gradient accumulations
            D_loss_real, D_loss_fake = losses.discriminator_loss(
                D_fake, D_real)
            D_loss = (D_loss_real + D_loss_fake) / \
                float(config['num_D_accumulations'])
            D_loss.backward()
            counter += 1

        # Optionally apply ortho reg in D
        if config['D_ortho'] > 0.0:
            # Debug print to indicate we're using ortho reg in D.
            xm.master_print('using modified ortho reg in D')
            utils.ortho(D, config['D_ortho'])

        xm.optimizer_step(D.optim)

    # Optionally toggle "requires_grad"
    if config['toggle_grads']:
        utils.toggle_grad(D, False)
        utils.toggle_grad(G, True)

    # Zero G's gradients by default before training G, for safety
    G.optim.zero_grad()

    # If accumulating gradients, loop multiple times
    for accumulation_index in range(config['num_G_accumulations']):
        z_, y_ = sample()
        D_fake = GD(z_, y_, train_G=True, split_D=config['split_D'])
        G_loss = losses.generator_loss(
            D_fake) / float(config['num_G_accumulations'])
        G_loss.backward()

    # Optionally apply modified ortho reg in G
    if config['G_ortho'] > 0.0:
        # Debug print to indicate we're using ortho reg in G. Uses the same
        # xm.master_print as the D branch so both messages are emitted the
        # same way.
        xm.master_print('using modified ortho reg in G')
        # Don't ortho reg shared, it makes no sense. Really we should
        # blacklist any embeddings for this
        utils.ortho(G, config['G_ortho'],
                    blacklist=[param for param in G.shared.parameters()])

    xm.optimizer_step(G.optim)

    # If we have an ema, update it, regardless of if we test with it or not
    if config['ema']:
        ema.update(state_dict['itr'])

    out = {'G_loss': G_loss,
           'D_loss_real': D_loss_real,
           'D_loss_fake': D_loss_fake}
    # Return G's loss and the components of D's loss.
    return out
def train(x, y, tensor_writer=None, iteration=None):
    """Run one training iteration for D (and optionally Dv), then G.

    Relies on names from the enclosing scope: ``G``, ``D``, ``Dv``, ``GD``,
    ``z_``, ``y_``, ``ema``, ``state_dict``, ``config``, ``utils``,
    ``losses`` and ``idx_to_classes``.

    Args:
        x: Batch of real inputs in [-1, 1] (they are mapped with
            ``(x + 1) / 2`` for logging); split along dim 0 into chunks of
            ``config['batch_size']``.
        y: Labels for ``x``; split the same way.
        tensor_writer: Optional summary writer providing ``add_video``,
            ``add_scalar`` and ``add_text``. When ``None`` nothing is
            logged.
        iteration: Global step used for logging and for the
            ``config['pixel_loss_kicksin']`` threshold. Required when
            ``tensor_writer`` is given.

    Returns:
        dict with ``G_loss``, ``D_loss_real``, ``D_loss_fake`` and, when Dv
        is enabled, ``Dv_loss_real`` / ``Dv_loss_fake`` as Python floats
        from the last accumulation step.
    """
    # `== False` is kept on purpose so config values are interpreted exactly
    # as before.
    use_Dv = config['no_Dv'] == False  # noqa: E712
    use_pixel_loss = config['no_avg_pixel_loss'] == False  # noqa: E712
    use_convgru = config['no_convgru'] == False  # noqa: E712
    # Heavy logging (videos, labels) only happens every
    # `log_results_every` iterations.
    log_results = (tensor_writer is not None
                   and iteration % config['log_results_every'] == 0)

    def _log_channel_means(video, kind):
        # Log the mean of channels 0/1/2 (named Red/Green/Blue) after
        # reducing over dims 0, 1, 3, 4.
        mean_pixel_val = torch.mean((video + 1) / 2, dim=[0, 1, 3, 4])
        for channel, colour in enumerate(('Red', 'Green', 'Blue')):
            tensor_writer.add_scalar(
                'Pixel vals/Mean %s Pixel values, %s data' % (colour, kind),
                float(mean_pixel_val[channel].item()), iteration)

    print('Summation will be taken', config['D_hinge_loss_sum'],
          'D hinge loss')
    G.optim.zero_grad()
    D.optim.zero_grad()
    if use_Dv:
        Dv.optim.zero_grad()

    if log_results:
        tensor_writer.add_video('Loaded Data', (x + 1) / 2, iteration)
        _log_channel_means(x, 'real')
        y_text = [idx_to_classes[yi.item()] for yi in y]
        tensor_writer.add_text('Loaded Labels', ' | '.join(y_text),
                               iteration)

    # Prepare for the average pixel loss: mean of the real batch.
    if use_pixel_loss:
        mean_pixel_val_real = torch.mean((x + 1) / 2)

    # How many chunks to split x and y into?
    x = torch.split(x, config['batch_size'])
    y = torch.split(y, config['batch_size'])
    counter = 0

    # Optionally toggle D and G's "require_grad"
    if config['toggle_grads']:
        utils.toggle_grad(D, True)
        if use_Dv:
            utils.toggle_grad(Dv, True)
        utils.toggle_grad(G, False)

    for step_index in range(config['num_D_steps']):
        # If accumulating gradients, loop multiple times before an
        # optimizer step
        D.optim.zero_grad()
        if use_Dv:
            Dv.optim.zero_grad()
        for accumulation_index in range(config['num_D_accumulations']):
            z_.sample_()
            y_.sample_()
            if use_Dv:
                D_fake, D_real, Dv_fake, Dv_real, G_z = GD(
                    z_[:config['batch_size']],
                    y_[:config['batch_size']],
                    x[counter], y[counter],
                    train_G=False, split_D=config['split_D'],
                    tensor_writer=tensor_writer, iteration=iteration)
            else:
                D_fake, D_real, G_z = GD(
                    z_[:config['batch_size']],
                    y_[:config['batch_size']],
                    x[counter], y[counter],
                    train_G=False, split_D=config['split_D'],
                    tensor_writer=tensor_writer, iteration=iteration)

            # Regroup the scores as [-1, k, ...] with k = GD.module.k so
            # the k scores per sample can be summed.
            D_fake = D_fake.contiguous().view(-1, GD.module.k,
                                              *D_fake.shape[1:])
            D_real = D_real.contiguous().view(-1, GD.module.k,
                                              *D_real.shape[1:])
            if config['D_hinge_loss_sum'] == 'before':
                # Add the k scores before applying the hinge loss.
                D_fake = torch.sum(D_fake, 1)
                D_real = torch.sum(D_real, 1)
            # Compute components of D's loss, average them, and divide by
            # the number of gradient accumulations
            D_loss_real, D_loss_fake = losses.discriminator_loss(
                D_fake, D_real, config['D_hinge_loss_sum'])

            if use_Dv:
                if config['T_into_B'] == True:  # noqa: E712
                    # Regroup Dv scores per sample, using D_fake/D_real's
                    # leading dimension as the batch size.
                    Dv_fake = Dv_fake.contiguous().view(
                        D_fake.shape[0], -1, *Dv_fake.shape[1:])
                    Dv_real = Dv_real.contiguous().view(
                        D_real.shape[0], -1, *Dv_real.shape[1:])
                    if config['Dv_hinge_loss_sum'] == 'before':
                        Dv_fake = torch.sum(Dv_fake, 1)
                        Dv_real = torch.sum(Dv_real, 1)
                    Dv_loss_real, Dv_loss_fake = losses.discriminator_loss(
                        Dv_fake, Dv_real, config['Dv_hinge_loss_sum'])
                else:
                    # If T_into_B is False, "before" must be used for the
                    # hinge loss.
                    Dv_loss_real, Dv_loss_fake = losses.discriminator_loss(
                        Dv_fake, Dv_real, 'before')
                D_loss = (D_loss_real + D_loss_fake + Dv_loss_fake
                          + Dv_loss_real) / float(
                              config['num_D_accumulations'])
            else:
                D_loss = (D_loss_real + D_loss_fake) / float(
                    config['num_D_accumulations'])
            D_loss.backward()
            counter += 1

        # Optionally apply ortho reg in D (and Dv when it is enabled), so
        # the behaviour matches the printed message.
        if config['D_ortho'] > 0.0:
            if use_Dv:
                print('using modified ortho reg in D and Dv')
                utils.ortho(Dv, config['D_ortho'])
            else:
                print('using modified ortho reg in D')
            utils.ortho(D, config['D_ortho'])

        D.optim.step()
        if use_Dv:
            Dv.optim.step()

    # Optionally toggle "requires_grad"
    if config['toggle_grads']:
        utils.toggle_grad(D, False)
        if use_Dv:
            utils.toggle_grad(Dv, False)
        utils.toggle_grad(G, True)

    # Zero G's gradients by default before training G, for safety
    G.optim.zero_grad()

    # Default so the value is always defined for logging below.
    mean_pixel_loss = 0
    # If accumulating gradients, loop multiple times
    for accumulation_index in range(config['num_G_accumulations']):
        z_.sample_()
        y_.sample_()
        if use_Dv:
            D_fake, Dv_fake, G_z = GD(z_, y_, train_G=True,
                                      split_D=config['split_D'],
                                      tensor_writer=tensor_writer,
                                      iteration=iteration)
        else:
            D_fake, G_z = GD(z_, y_, train_G=True,
                             split_D=config['split_D'],
                             tensor_writer=tensor_writer,
                             iteration=iteration)
        D_fake = D_fake.contiguous().view(-1, GD.module.k,
                                          *D_fake.shape[1:])
        # Average the k scores before the generator loss.
        D_fake = torch.mean(D_fake, 1)
        G_loss = config['D_loss_weight'] * losses.generator_loss(
            D_fake) / float(config['num_G_accumulations'])
        if use_Dv:
            if config['T_into_B'] == True:  # noqa: E712
                Dv_fake = Dv_fake.contiguous().view(
                    D_fake.shape[0], -1, *Dv_fake.shape[1:])
                Dv_fake = torch.mean(Dv_fake, 1)
            G_loss += losses.generator_loss(Dv_fake) / float(
                config['num_G_accumulations'])

        # Additional loss on the difference of average pixel values.
        if use_pixel_loss:
            mean_pixel_val_fake = torch.mean((G_z + 1) / 2)
            # NOTE(review): the difference is built from Python floats
            # (.item()), so unless losses.avg_pixel_loss re-attaches a
            # graph this term adds a constant to G_loss and contributes no
            # gradient - confirm whether that is intended.
            mean_pixel_val_diff = abs(
                float(mean_pixel_val_fake.item())
                - float(mean_pixel_val_real.item()))
            mean_pixel_loss = losses.avg_pixel_loss(
                mean_pixel_val_diff,
                config['avg_pixel_loss_weight']) / float(
                    config['num_G_accumulations'])
            # With the default iteration=None the comparison used to raise
            # TypeError; the term is now simply not applied in that case.
            if (iteration is not None
                    and iteration >= config['pixel_loss_kicksin']):
                G_loss += mean_pixel_loss
            else:
                mean_pixel_loss = 0
        G_loss.backward()

    # Optionally apply modified ortho reg in G
    if config['G_ortho'] > 0.0:
        # Debug print to indicate we're using ortho reg in G
        print('using modified ortho reg in G')
        # Don't ortho reg shared, it makes no sense. Really we should
        # blacklist any embeddings for this
        utils.ortho(G, config['G_ortho'],
                    blacklist=[param for param in G.shared.parameters()])

    if use_convgru:
        # Snapshot gradient/weight magnitudes before the optimizer step so
        # they can be logged below.
        first_cell = G.convgru.convgru.cell_list[0]
        first_conv = G.blocks[0][0].conv1
        G_grad_gates = first_cell.conv_gates.weight.grad.abs().sum()
        G_grad_can = first_cell.conv_can.weight.grad.abs().sum()
        G_grad_first_layer = first_conv.weight.grad.abs().sum()
        G_weight_gates = first_cell.conv_gates.weight.abs().mean()
        G_weight_can = first_cell.conv_can.weight.abs().mean()
        G_weight_first_layer = first_conv.weight.abs().mean()
    G.optim.step()

    # If we have an ema, update it, regardless of if we test with it or not
    if config['ema']:
        ema.update(state_dict['itr'])

    out = {
        'G_loss': float(G_loss.item()),
        'D_loss_real': float(D_loss_real.item()),
        'D_loss_fake': float(D_loss_fake.item())
    }
    if use_Dv:
        out['Dv_loss_real'] = float(Dv_loss_real.item())
        out['Dv_loss_fake'] = float(Dv_loss_fake.item())

    if log_results:
        tensor_writer.add_video('Video Results', (G_z + 1) / 2, iteration)
        _log_channel_means(G_z, 'fake')
        y_Gz_text = [idx_to_classes[yi.item()] for yi in y_]
        tensor_writer.add_text('Generated Labels', ' | '.join(y_Gz_text),
                               iteration)

    # Scalar logging; skipped entirely when no writer was supplied (these
    # calls used to fail on the default tensor_writer=None).
    if tensor_writer is not None:
        if use_pixel_loss:
            tensor_writer.add_scalar('Loss/avg_pixel_loss', mean_pixel_loss,
                                     iteration)
        tensor_writer.add_scalar('Loss/G_loss', out['G_loss'], iteration)
        tensor_writer.add_scalar('Loss/D_loss_real', out['D_loss_real'],
                                 iteration)
        tensor_writer.add_scalar('Loss/D_loss_fake', out['D_loss_fake'],
                                 iteration)
        if use_Dv:
            tensor_writer.add_scalar('Loss/Dv_loss_fake',
                                     out['Dv_loss_fake'], iteration)
            tensor_writer.add_scalar('Loss/Dv_loss_real',
                                     out['Dv_loss_real'], iteration)
        if use_convgru:
            tensor_writer.add_scalar('Gradient/G_grad_gates', G_grad_gates,
                                     iteration)
            tensor_writer.add_scalar('Gradient/G_grad_can', G_grad_can,
                                     iteration)
            tensor_writer.add_scalar('Gradient/G_grad_first_layer',
                                     G_grad_first_layer, iteration)
            tensor_writer.add_scalar('Weight/G_weight_gates',
                                     G_weight_gates, iteration)
            tensor_writer.add_scalar('Weight/G_weight_can', G_weight_can,
                                     iteration)
            tensor_writer.add_scalar('Weight/G_weight_first_layer',
                                     G_weight_first_layer, iteration)
    # Return G's loss and the components of D's loss.
    return out