def log_gpu_memory_to_tensorboard():
    """Write each GPU's current free memory (in GiB) to its TensorBoard writer.

    Relies on module-level state: an initialized NVML session (`nvidia_smi`),
    per-device handles in `gpus`, and per-device summary writers in `loggers`.
    The wall-clock time in whole seconds is used as the summary step.
    """
    device_count = nvidia_smi.nvmlDeviceGetCount()
    for gpu_index in range(device_count):
        mem_info = nvidia_smi.nvmlDeviceGetMemoryInfo(gpus[gpu_index])
        free_gib = np.array(mem_info.free) / (1024 ** 3)  # bytes -> GiB
        with loggers[gpu_index].as_default():
            tl.summary({'free': free_gib}, step=int(time.time()), name='GPUs')
def main_loop(trainer, datasets, test_iterations, config, checkpoint, samples_dir):
    """Run the joint GAN/controller training loop.

    Parameters
    ----------
    trainer : object exposing `joint_train_step(images_a, actions_a, images_b)`,
        plus `model` and `controller` attributes passed to `test_model`.
    datasets : ((a_train, a_test), (b_train, b_test)) pairs of tf datasets.
    test_iterations : iteration budget forwarded to `test_model`.
    config : dict with 'hyperparameters'['iterations'] and the
        'log_iterations' / 'image_save_iterations' /
        'image_display_iterations' / 'test_every_iterations' keys.
    checkpoint : object with a `save(step)` method.
    samples_dir : directory where sample image grids are written.
    """
    (a_train_dataset, a_test_dataset), (b_train_dataset, b_test_dataset) = datasets
    optimizer_iterations = config['hyperparameters']['iterations']
    # Running means of controller losses, accumulated between log points.
    c_loss_mean_dict = {}
    a_dataset_iter = iter(a_train_dataset)
    b_dataset_iter = iter(b_train_dataset)
    for iterations in tqdm.tqdm(range(1, optimizer_iterations + 1)):
        images_a, actions_a = next(a_dataset_iter)
        images_b, _ = next(b_dataset_iter)
        # Training ops
        D_loss_dict, G_images, G_loss_dict, C_loss_dict = trainer.joint_train_step(
            images_a, actions_a, images_b)
        for c_loss_label, c_loss in C_loss_dict.items():
            if c_loss_label not in c_loss_mean_dict:
                c_loss_mean_dict[c_loss_label] = tf.keras.metrics.Mean()
            c_loss_mean_dict[c_loss_label].update_state(c_loss.numpy())
        # Logging ops: report the mean controller losses since the last log
        # point, then reset the accumulators for the next window.
        if iterations % config['log_iterations'] == 0:
            for c_loss_label, c_loss_mean in c_loss_mean_dict.items():
                C_loss_dict[c_loss_label] = c_loss_mean.result()
                c_loss_mean.reset_states()
            tf2lib.summary(D_loss_dict, step=iterations, name='discriminator')
            tf2lib.summary(G_loss_dict, step=iterations, name='generator')
            tf2lib.summary(C_loss_dict, step=iterations, name='controller')
        # Displaying ops: uniquely-named grid on save steps; a rolling,
        # fixed-name grid on display steps (overwritten each time).
        if iterations % config['image_save_iterations'] == 0:
            img_filename = os.path.join(samples_dir, f'train_{iterations}.jpg')
        elif iterations % config['image_display_iterations'] == 0:
            # Plain string: the previous f-string had no placeholders (F541).
            img_filename = os.path.join(samples_dir, 'train.jpg')
        else:
            img_filename = None
        if img_filename:
            # list(x) replaces identity comprehensions [row for row in x].
            img = imlib.immerge(
                np.concatenate(list(images_a) + list(images_b) + G_images,
                               axis=0),
                n_rows=8)
            imlib.imwrite(img, img_filename)
        # Testing and checkpointing ops (always run on the final iteration).
        if iterations % config['test_every_iterations'] == 0 \
                or iterations == optimizer_iterations:
            C_loss_dict = test_model(trainer.model, trainer.controller,
                                     a_test_dataset, b_test_dataset,
                                     test_iterations, samples_dir)
            tf2lib.summary(C_loss_dict, step=iterations, name='controller')
            checkpoint.save(iterations)
def train(self, debug):
    """Run the full training loop.

    Per epoch: train steps with TensorBoard summaries, periodic train/test
    snapshots collected into frame buffers, and a checkpoint save. After all
    epochs, the collected frames are turned into an animation.
    """
    train_frames = []
    test_frames = []
    for epoch in tqdm.trange(self.epochs, desc='Epoch Loop'):
        # Resume support: skip epochs already recorded in the checkpoint.
        if epoch < self.ep_cnt:
            continue
        self.ep_cnt.assign_add(1)  # advance the persistent epoch counter

        for real_a, real_b in tqdm.tqdm(self.A_B_dataset,
                                        desc='Inner Epoch Loop',
                                        total=self.len_dataset):
            G_loss_dict, D_loss_dict = self.train_step(real_a, real_b)

            # Scalar summaries keyed to the generator optimizer's step count.
            step = self.G_optimizer.iterations
            tl.summary(G_loss_dict, step=step, name='G_losses')
            tl.summary(D_loss_dict, step=step, name='D_losses')
            tl.summary(
                {'learning rate': self.G_lr_scheduler.current_learning_rate},
                step=step,
                name='learning rate')

            # Snapshot cadence: at most every 10 steps, more often for short runs.
            snapshot_period = min(10, (self.epochs // 70) + 1)
            if self.G_optimizer.iterations.numpy() % snapshot_period == 0:
                train_frames.append(
                    self.snapshot(real_a, real_b, 'train_iter-%09d.jpg',
                                  debug=debug))
                test_a, test_b = next(self.test_iter)
                test_frames.append(
                    self.snapshot(test_a, test_b, 'test_iter-%09d.jpg',
                                  debug=debug))

        # Persist model/optimizer state at the end of each epoch.
        self.checkpoint.save(epoch)

    if train_frames:
        # Train frames first, then test frames, in one animation.
        train_frames.extend(test_frames)
        make_animation(train_frames, 'animations/cycleGAN')
continue # update epoch counter ep_cnt.assign_add(1) # train for an epoch # Comment by K.C: # train the discriminator based on the real image for x_real in tqdm.tqdm(dataset, desc='Inner Epoch Loop', total=len_dataset): # Comment by K.C: # run train_D means to update D once, D_loss can be printed here. D_loss_dict = train_D(x_real) tl.summary(D_loss_dict, step=D_optimizer.iterations, name='D_losses') # Comment by K.C: # Update the Discriminator for every n_d run of the Generator if D_optimizer.iterations.numpy() % args.n_d == 0: G_loss_dict = train_G() tl.summary(G_loss_dict, step=G_optimizer.iterations, name='G_losses') # sample if G_optimizer.iterations.numpy() % 100 == 0: x_fake = sample(z) img = im.immerge(x_fake, n_rows=10) im.imwrite(img, py.join(sample_dir, 'iter-%09d.jpg' % G_optimizer.iterations.numpy())) # Added by K.C: update the mean loss functions every 100 iterations, and plot them out D_loss_summary.append(D_loss_dict.get('d_loss','').numpy()) D_GP_summary.append(D_loss_dict.get('gp', '').numpy())
# NOTE(review): fragment of a training script's main loop; the enclosing
# context (summary writer, dataset/optimizer construction) is outside this view.
for ep in tqdm.trange(args.epochs, desc='Epoch Loop'):
    # Resume support: skip epochs already completed before a restart.
    if ep < ep_cnt:
        continue
    # update epoch counter
    ep_cnt.assign_add(1)
    # train for an epoch
    for A, B in tqdm.tqdm(A_B_dataset, desc='Inner Epoch Loop',
                          total=len_dataset):
        G_loss_dict, D_loss_dict = train_step(A, B)
        # # summary
        tl.summary(G_loss_dict, step=G_optimizer.iterations, name='G_losses')
        tl.summary(D_loss_dict, step=G_optimizer.iterations, name='D_losses')
        tl.summary({'learning rate': G_lr_scheduler.current_learning_rate},
                   step=G_optimizer.iterations,
                   name='learning rate')
        # sample every 100 optimizer steps
        # NOTE(review): this rebinds the loop variables A/B to a test batch;
        # they are overwritten on the next iteration, so training is unaffected.
        if G_optimizer.iterations.numpy() % 100 == 0:
            A, B = next(test_iter)
            A2B, B2A, A2B2A, B2A2B = sample(A, B)
            # Merge originals, translations and cycle-reconstructions into one grid.
            img = im.immerge(np.concatenate([A, A2B, A2B2A, B, B2A, B2A2B],
                                            axis=0),
                             n_rows=2)
            # NOTE(review): snippet appears truncated here — `img` is never
            # written out in the visible code (an im.imwrite likely follows).
def train_CycleGAN():
    """Train the CycleGAN: epoch/step loop with TensorBoard summaries,
    GPU-memory logging, periodic sample images, and per-epoch checkpoints.

    Uses module-level state built elsewhere in this file: `args`, `ep_cnt`,
    `A_B_dataset`, `A_B_dataset_test`, `train_step`, `sample`, `G_optimizer`,
    `G_lr_scheduler`, `checkpoint`, `output_dir`, and `len_dataset`.
    """
    import logGPU_RAM

    # Summary writers: one for training scalars, one set for GPU memory.
    train_summary_writer = tf.summary.create_file_writer(
        py.join(output_dir, 'summaries', 'train'))
    logGPU_RAM.init_gpu_writers(py.join(output_dir, 'summaries', 'GPUs'))

    # Sampling setup: one fixed test batch plus a stream of random test batches.
    test_iter = iter(A_B_dataset_test)
    sample_dir = py.join(output_dir, 'samples_training')
    py.mkdir(sample_dir)
    test_sample = next(test_iter)

    # Wall-clock baseline for the seconds-per-iteration summary.
    import time
    start_time = time.time()

    with train_summary_writer.as_default():
        for epoch in tqdm.trange(args.epochs, desc='Epoch Loop'):
            # Resume support: skip epochs already recorded in the checkpoint.
            if epoch < ep_cnt:
                continue
            ep_cnt.assign_add(1)

            for batch_a, batch_b in tqdm.tqdm(A_B_dataset,
                                              desc='Inner Epoch Loop',
                                              total=len_dataset):
                G_loss_dict, D_loss_dict = train_step(batch_a, batch_b)
                iteration = G_optimizer.iterations.numpy()

                # Scalar summaries for this optimizer step.
                tl.summary(G_loss_dict, step=iteration, name='G_losses')
                tl.summary(D_loss_dict, step=iteration, name='D_losses')
                tl.summary(
                    {'learning rate': G_lr_scheduler.current_learning_rate},
                    step=iteration,
                    name='learning rate')
                tl.summary(
                    {'second since start': np.array(time.time() - start_time)},
                    step=iteration,
                    name='second_Per_Iteration')
                logGPU_RAM.log_gpu_memory_to_tensorboard()

                # Every 1000 steps: sample a fresh random test batch.
                if iteration % 1000 == 0:
                    rand_a, rand_b = next(test_iter)
                    A2B, B2A, A2B2A, B2A2B = sample(rand_a, rand_b)
                    grid = im.immerge(
                        np.concatenate(
                            [rand_a, A2B, A2B2A, rand_b, B2A, B2A2B], axis=0),
                        n_rows=2)
                    im.imwrite(
                        grid,
                        py.join(sample_dir,
                                'iter-%09d-sample-test-random.jpg' % iteration))

                # Every 100 steps: re-sample the same fixed test batch so
                # progress can be compared like-for-like across the run.
                if iteration % 100 == 0:
                    fixed_a, fixed_b = test_sample
                    A2B, B2A, A2B2A, B2A2B = sample(fixed_a, fixed_b)
                    grid = im.immerge(
                        np.concatenate(
                            [fixed_a, A2B, A2B2A, fixed_b, B2A, B2A2B], axis=0),
                        n_rows=2)
                    im.imwrite(
                        grid,
                        py.join(
                            sample_dir,
                            'iter-%09d-sample-test-specific.jpg' % iteration))

            # Persist model + optimizer state once per epoch.
            checkpoint.save(epoch)
# main loop with train_summary_writer.as_default(): for ep in range(start_epoch, opt.niter + opt.niter_decay + 1): print('Epoch: ', ep) for step, (label, real_img) in enumerate(dataset): input_label, inst_map, real_image, feat_map = model.encode_input(label) # inputs = (input_label, inst_map, real_image, feat_map) loss_D_dict, loss_G_dict, fake_img = train_step(input_label, real_img) if not opt.no_normalize_img: fake_img = fake_img * 0.5 + 0.5 fake_img = fake_img * 255 fake_img = tf.cast(fake_img, tf.uint8) if (step+1) % opt.display_freq == 0: tl.summary(loss_G_dict, step=optimizer_G.iterations, name='G_losses') tl.summary(loss_D_dict, step=optimizer_G.iterations, name='D_losses') tl.summary({'learning rate': lr_scheduler_G.current_learning_rate}, step=optimizer_G.iterations, name='learning rate') tl.summary({'gen_img': fake_img}, step=optimizer_G.iterations, types=['image'], name='image_generated') if (ep+1) % opt.save_epoch_freq == 0: checkpoint.save() if opt.savedModel_output: tf.saved_model.save(model.netG, os.path.join(opt.checkpoints_dir, opt.name, 'net_G_savedModel'))
def train():
    """End-to-end CycleGAN training entry point.

    Parses args, builds data pipelines, generator/discriminator models,
    optimizers with linearly-decaying learning rates, and the tf.function
    train/sample steps, then runs the epoch loop with TensorBoard summaries,
    periodic samples, and checkpointing.

    NOTE(review): this source was recovered from a collapsed one-line form;
    statement grouping/indentation reconstructed — confirm against original.
    """
    # ===================================== Args =====================================
    args = parse_args()
    output_dir = os.path.join('output', args.dataset)
    os.makedirs(output_dir, exist_ok=True)
    # Persist the run configuration next to its outputs for reproducibility.
    settings_path = os.path.join(output_dir, 'settings.json')
    pylib.args_to_json(settings_path, args)
    # ===================================== Data =====================================
    A_img_paths = pylib.glob(
        os.path.join(args.datasets_dir, args.dataset, 'trainA'), '*.png')
    B_img_paths = pylib.glob(
        os.path.join(args.datasets_dir, args.dataset, 'trainB'), '*.png')
    print(f'len(A_img_paths) = {len(A_img_paths)}')
    print(f'len(B_img_paths) = {len(B_img_paths)}')
    load_size = [args.load_size_height, args.load_size_width]
    crop_size = [args.crop_size_height, args.crop_size_width]
    A_B_dataset, len_dataset = data.make_zip_dataset(A_img_paths,
                                                     B_img_paths,
                                                     args.batch_size,
                                                     load_size,
                                                     crop_size,
                                                     training=True,
                                                     repeat=False)
    # History pools of previously generated images, fed to the discriminators.
    A2B_pool = data.ItemPool(args.pool_size)
    B2A_pool = data.ItemPool(args.pool_size)
    A_img_paths_test = pylib.glob(
        os.path.join(args.datasets_dir, args.dataset, 'testA'), '*.png')
    B_img_paths_test = pylib.glob(
        os.path.join(args.datasets_dir, args.dataset, 'testB'), '*.png')
    A_B_dataset_test, _ = data.make_zip_dataset(A_img_paths_test,
                                                B_img_paths_test,
                                                args.batch_size,
                                                load_size,
                                                crop_size,
                                                training=False,
                                                repeat=True)
    # ===================================== Models =====================================
    model_input_shape = crop_size + [
        3
    ]  # [args.crop_size_height, args.crop_size_width, 3]
    G_A2B = module.ResnetGenerator(input_shape=model_input_shape, n_blocks=6)
    G_B2A = module.ResnetGenerator(input_shape=model_input_shape, n_blocks=6)
    D_A = module.ConvDiscriminator(input_shape=model_input_shape)
    D_B = module.ConvDiscriminator(input_shape=model_input_shape)
    d_loss_fn, g_loss_fn = tf2gan.get_adversarial_losses_fn(
        args.adversarial_loss_mode)
    cycle_loss_fn = tf.losses.MeanAbsoluteError()
    identity_loss_fn = tf.losses.MeanAbsoluteError()
    # LR schedules measured in optimizer steps (epochs * steps-per-epoch).
    G_lr_scheduler = module.LinearDecay(args.lr, args.epochs * len_dataset,
                                        args.epoch_decay * len_dataset)
    D_lr_scheduler = module.LinearDecay(args.lr, args.epochs * len_dataset,
                                        args.epoch_decay * len_dataset)
    G_optimizer = tf.keras.optimizers.Adam(learning_rate=G_lr_scheduler,
                                           beta_1=args.beta_1)
    D_optimizer = tf.keras.optimizers.Adam(learning_rate=D_lr_scheduler,
                                           beta_1=args.beta_1)
    # ===================================== Training steps =====================================

    @tf.function
    def train_generators(A, B):
        # One generator update: translations, cycle reconstructions and
        # identity mappings; adversarial loss + weighted cycle/identity
        # losses. Returns the fakes so the discriminator step can reuse them.
        with tf.GradientTape() as t:
            A2B = G_A2B(A, training=True)
            B2A = G_B2A(B, training=True)
            A2B2A = G_B2A(A2B, training=True)
            B2A2B = G_A2B(B2A, training=True)
            A2A = G_B2A(A, training=True)
            B2B = G_A2B(B, training=True)
            A2B_d_logits = D_B(A2B, training=True)
            B2A_d_logits = D_A(B2A, training=True)
            A2B_g_loss = g_loss_fn(A2B_d_logits)
            B2A_g_loss = g_loss_fn(B2A_d_logits)
            A2B2A_cycle_loss = cycle_loss_fn(A, A2B2A)
            B2A2B_cycle_loss = cycle_loss_fn(B, B2A2B)
            A2A_id_loss = identity_loss_fn(A, A2A)
            B2B_id_loss = identity_loss_fn(B, B2B)
            G_loss = (A2B_g_loss + B2A_g_loss) + (
                A2B2A_cycle_loss +
                B2A2B_cycle_loss) * args.cycle_loss_weight + (
                    A2A_id_loss + B2B_id_loss) * args.identity_loss_weight
        # Both generators are updated jointly from the combined loss.
        G_grad = t.gradient(
            G_loss, G_A2B.trainable_variables + G_B2A.trainable_variables)
        G_optimizer.apply_gradients(
            zip(G_grad, G_A2B.trainable_variables + G_B2A.trainable_variables))
        return A2B, B2A, {
            'A2B_g_loss': A2B_g_loss,
            'B2A_g_loss': B2A_g_loss,
            'A2B2A_cycle_loss': A2B2A_cycle_loss,
            'B2A2B_cycle_loss': B2A2B_cycle_loss,
            'A2A_id_loss': A2A_id_loss,
            'B2B_id_loss': B2B_id_loss
        }

    @tf.function
    def train_discriminators(A, B, A2B, B2A):
        # One discriminator update on real batches and (pooled) fakes, with a
        # gradient penalty weighted by args.gradient_penalty_weight.
        with tf.GradientTape() as t:
            A_d_logits = D_A(A, training=True)
            B2A_d_logits = D_A(B2A, training=True)
            B_d_logits = D_B(B, training=True)
            A2B_d_logits = D_B(A2B, training=True)
            A_d_loss, B2A_d_loss = d_loss_fn(A_d_logits, B2A_d_logits)
            B_d_loss, A2B_d_loss = d_loss_fn(B_d_logits, A2B_d_logits)
            D_A_gp = tf2gan.gradient_penalty(functools.partial(D_A,
                                                               training=True),
                                             A, B2A,
                                             mode=args.gradient_penalty_mode)
            D_B_gp = tf2gan.gradient_penalty(functools.partial(D_B,
                                                               training=True),
                                             B, A2B,
                                             mode=args.gradient_penalty_mode)
            D_loss = (A_d_loss + B2A_d_loss) + (B_d_loss + A2B_d_loss) + (
                D_A_gp + D_B_gp) * args.gradient_penalty_weight
        D_grad = t.gradient(D_loss,
                            D_A.trainable_variables + D_B.trainable_variables)
        D_optimizer.apply_gradients(
            zip(D_grad, D_A.trainable_variables + D_B.trainable_variables))
        return {
            'A_d_loss': A_d_loss + B2A_d_loss,
            'B_d_loss': B_d_loss + A2B_d_loss,
            'D_A_gp': D_A_gp,
            'D_B_gp': D_B_gp
        }

    def train_step(A, B):
        # Full alternating step: generator update first, then discriminators
        # on pool-mixed fakes. Runs eagerly because the pools are stateful
        # Python objects that cannot be traced.
        A2B, B2A, G_loss_dict = train_generators(A, B)
        # cannot autograph `A2B_pool`
        A2B = A2B_pool(
            A2B)  # or A2B = A2B_pool(A2B.numpy()), but it is much slower
        B2A = B2A_pool(B2A)  # because of the communication between CPU and GPU
        D_loss_dict = train_discriminators(A, B, A2B, B2A)
        return G_loss_dict, D_loss_dict

    @tf.function
    def sample(A, B):
        # Inference-mode translations and reconstructions for visualization.
        A2B = G_A2B(A, training=False)
        B2A = G_B2A(B, training=False)
        A2B2A = G_B2A(A2B, training=False)
        B2A2B = G_A2B(B2A, training=False)
        return A2B, B2A, A2B2A, B2A2B

    # ===================================== Runner code =====================================
    # epoch counter (checkpointed so training resumes at the right epoch)
    ep_cnt = tf.Variable(initial_value=0, trainable=False, dtype=tf.int64)
    # checkpoint
    checkpoint = tf2lib.Checkpoint(dict(G_A2B=G_A2B,
                                        G_B2A=G_B2A,
                                        D_A=D_A,
                                        D_B=D_B,
                                        G_optimizer=G_optimizer,
                                        D_optimizer=D_optimizer,
                                        ep_cnt=ep_cnt),
                                   os.path.join(output_dir, 'checkpoints'),
                                   max_to_keep=5)
    try:  # restore checkpoint including the epoch counter
        checkpoint.restore().assert_existing_objects_matched()
    except Exception as e:
        # Best-effort restore: a fresh run has no checkpoint, so just report.
        print(e)
    # summary
    train_summary_writer = tf.summary.create_file_writer(
        os.path.join(output_dir, 'summaries', 'train'))
    # sample
    test_iter = iter(A_B_dataset_test)
    sample_dir = os.path.join(output_dir, 'samples_training')
    os.makedirs(sample_dir, exist_ok=True)
    # main loop
    with train_summary_writer.as_default():
        for ep in tqdm.trange(args.epochs, desc='Epoch Loop'):
            # Resume support: skip epochs already completed before a restart.
            if ep < ep_cnt:
                continue
            # update epoch counter
            ep_cnt.assign_add(1)
            # train for an epoch
            for A, B in tqdm.tqdm(A_B_dataset,
                                  desc='Inner Epoch Loop',
                                  total=len_dataset):
                G_loss_dict, D_loss_dict = train_step(A, B)
                # # summary
                tf2lib.summary(G_loss_dict,
                               step=G_optimizer.iterations,
                               name='G_losses')
                tf2lib.summary(D_loss_dict,
                               step=G_optimizer.iterations,
                               name='D_losses')
                tf2lib.summary(
                    {'learning rate': G_lr_scheduler.current_learning_rate},
                    step=G_optimizer.iterations,
                    name='learning rate')
                # sample every 100 optimizer steps
                # NOTE(review): rebinds loop vars A/B to the test batch;
                # harmless, as they are overwritten next iteration.
                if G_optimizer.iterations.numpy() % 100 == 0:
                    A, B = next(test_iter)
                    A2B, B2A, A2B2A, B2A2B = sample(A, B)
                    img = imlib.immerge(np.concatenate(
                        [A, A2B, A2B2A, B, B2A, B2A2B], axis=0),
                                        n_rows=6)
                    imlib.imwrite(
                        img,
                        os.path.join(
                            sample_dir,
                            'iter-%09d.jpg' % G_optimizer.iterations.numpy()))
            # save checkpoint
            checkpoint.save(ep)
if ep < ep_cnt: continue # update epoch counter ep_cnt.assign_add(1) # train for an epoch for A, B in tqdm.tqdm(A_B_dataset, desc="Inner Epoch Loop", total=len_dataset): G_loss_dict, D_loss_dict = train_step(A, B) # # summary tl.summary(G_loss_dict, step=G_optimizer.iterations, name="G_losses") tl.summary(D_loss_dict, step=G_optimizer.iterations, name="D_losses") tl.summary( {"learning rate": G_lr_scheduler.current_learning_rate}, step=G_optimizer.iterations, name="learning rate", ) # sample if G_optimizer.iterations.numpy() % 100 == 0: A, B = next(test_iter) if args.bidirectional: if args.DnCNN is not None: