def log_gpu_memory_to_tensorboard():
    """Write each visible GPU's current free memory (in GiB) to TensorBoard.

    One 'GPUs' summary is emitted per device into that device's own writer,
    with the current wall-clock second as the step so repeated calls line up
    on a shared time axis.
    """
    device_count = nvidia_smi.nvmlDeviceGetCount()
    for device_index in range(device_count):
        memory_info = nvidia_smi.nvmlDeviceGetMemoryInfo(gpus[device_index])
        free_gib = np.array(memory_info.free) / (1024 ** 3)
        with loggers[device_index].as_default():
            tl.summary({'free': free_gib},
                       step=int(time.time()),
                       name='GPUs')
Example #2
0
def main_loop(trainer, datasets, test_iterations, config, checkpoint,
              samples_dir):
    """Run the joint GAN/controller training loop.

    Args:
        trainer: object exposing `joint_train_step(images_a, actions_a,
            images_b)`, plus `model` and `controller` attributes passed on to
            `test_model`.
        datasets: pair of pairs `((a_train, a_test), (b_train, b_test))` of
            iterable datasets; the A datasets yield `(images, actions)`.
        test_iterations: number of batches used per evaluation pass.
        config: dict providing `hyperparameters['iterations']`,
            `log_iterations`, `image_save_iterations`,
            `image_display_iterations` and `test_every_iterations`.
        checkpoint: checkpoint manager with a `save(step)` method.
        samples_dir: directory where sample image grids are written.
    """
    (a_train_dataset, a_test_dataset), (b_train_dataset,
                                        b_test_dataset) = datasets
    optimizer_iterations = config['hyperparameters']['iterations']
    # Running means of the controller losses, accumulated between log points.
    c_loss_mean_dict = {}
    a_dataset_iter = iter(a_train_dataset)
    b_dataset_iter = iter(b_train_dataset)
    for iterations in tqdm.tqdm(range(1, optimizer_iterations + 1)):
        images_a, actions_a = next(a_dataset_iter)
        images_b, _ = next(b_dataset_iter)

        # Training ops
        D_loss_dict, G_images, G_loss_dict, C_loss_dict = trainer.joint_train_step(
            images_a, actions_a, images_b)
        for c_loss_label, c_loss in C_loss_dict.items():
            if c_loss_label not in c_loss_mean_dict:
                c_loss_mean_dict[c_loss_label] = tf.keras.metrics.Mean()
            c_loss_mean_dict[c_loss_label].update_state(c_loss.numpy())

        # Logging ops: log the accumulated controller-loss means (not the raw
        # per-step values), then reset the accumulators for the next window.
        if iterations % config['log_iterations'] == 0:
            for c_loss_label, c_loss_mean in c_loss_mean_dict.items():
                C_loss_dict[c_loss_label] = c_loss_mean.result()
                c_loss_mean.reset_states()
            tf2lib.summary(D_loss_dict, step=iterations, name='discriminator')
            tf2lib.summary(G_loss_dict, step=iterations, name='generator')
            tf2lib.summary(C_loss_dict, step=iterations, name='controller')
        # Displaying ops: a numbered snapshot takes precedence over the
        # continuously overwritten preview image.
        if iterations % config['image_save_iterations'] == 0:
            img_filename = os.path.join(samples_dir, f'train_{iterations}.jpg')
        elif iterations % config['image_display_iterations'] == 0:
            # Fixed filename (was an f-string with no placeholder).
            img_filename = os.path.join(samples_dir, 'train.jpg')
        else:
            img_filename = None
        if img_filename:
            img = imlib.immerge(np.concatenate(
                [row_a
                 for row_a in images_a] + [row_b
                                           for row_b in images_b] + G_images,
                axis=0),
                                n_rows=8)
            imlib.imwrite(img, img_filename)
        # Testing and checkpointing ops
        if iterations % config[
                'test_every_iterations'] == 0 or iterations == optimizer_iterations:
            C_loss_dict = test_model(trainer.model, trainer.controller,
                                     a_test_dataset, b_test_dataset,
                                     test_iterations, samples_dir)
            tf2lib.summary(C_loss_dict, step=iterations, name='controller')
            checkpoint.save(iterations)
Example #3
0
    def train(self, debug):
        """Run the full training schedule, resuming from `self.ep_cnt`.

        Each optimizer step performs one G/D update via `self.train_step`,
        writes scalar summaries, and periodically snapshots train and test
        batches; a checkpoint is saved after every epoch and, if any frames
        were collected, an animation is assembled at the end.

        Args:
            debug: forwarded to `self.snapshot` — presumably enables extra
                debug output; verify against the snapshot implementation.
        """
        # Snapshot frames collected over the run, later stitched together.
        image_buffers = []
        image_buffers_test = []
        for ep in tqdm.trange(self.epochs, desc='Epoch Loop'):
            # Skip epochs already completed by a restored checkpoint.
            if ep < self.ep_cnt:
                continue

            # update epoch counter
            self.ep_cnt.assign_add(1)

            # train for an epoch
            for A, B in tqdm.tqdm(self.A_B_dataset,
                                  desc='Inner Epoch Loop',
                                  total=self.len_dataset):
                G_loss_dict, D_loss_dict = self.train_step(A, B)

                # # summary
                tl.summary(G_loss_dict,
                           step=self.G_optimizer.iterations,
                           name='G_losses')
                tl.summary(D_loss_dict,
                           step=self.G_optimizer.iterations,
                           name='D_losses')
                tl.summary(
                    {
                        'learning rate':
                        self.G_lr_scheduler.current_learning_rate
                    },
                    step=self.G_optimizer.iterations,
                    name='learning rate')

                # sample
                # NOTE(review): `min` caps the period at 10 regardless of how
                # many epochs are scheduled — confirm `max` was not intended.
                snapshot_period = min(10, (self.epochs // 70) + 1)
                if self.G_optimizer.iterations.numpy() % snapshot_period == 0:
                    image_buffers.append(
                        self.snapshot(A, B, 'train_iter-%09d.jpg',
                                      debug=debug))
                    A_test, B_test = next(self.test_iter)
                    image_buffers_test.append(
                        self.snapshot(A_test,
                                      B_test,
                                      'test_iter-%09d.jpg',
                                      debug=debug))

            # save checkpoint
            self.checkpoint.save(ep)
        # Train frames first, then test frames, in one animation.
        if image_buffers:
            image_buffers.extend(image_buffers_test)
            make_animation(image_buffers, 'animations/cycleGAN')
            continue

        # update epoch counter
        ep_cnt.assign_add(1)

        # train for an epoch

        # Comment by K.C:
        # train the discriminator based on the real image


        for x_real in tqdm.tqdm(dataset, desc='Inner Epoch Loop', total=len_dataset):
            # Comment by K.C:
            # run train_D means to update D once, D_loss can be printed here.
            D_loss_dict = train_D(x_real)
            tl.summary(D_loss_dict, step=D_optimizer.iterations, name='D_losses')
            # Comment by K.C:
            # Update the Discriminator for every n_d run of the Generator
            if D_optimizer.iterations.numpy() % args.n_d == 0:
                G_loss_dict = train_G()
                tl.summary(G_loss_dict, step=G_optimizer.iterations, name='G_losses')

            # sample
            if G_optimizer.iterations.numpy() % 100 == 0:
                x_fake = sample(z)
                img = im.immerge(x_fake, n_rows=10)
                im.imwrite(img, py.join(sample_dir, 'iter-%09d.jpg' % G_optimizer.iterations.numpy()))

                # Added by K.C: update the mean loss functions every 100 iterations, and plot them out
                D_loss_summary.append(D_loss_dict.get('d_loss','').numpy())
                D_GP_summary.append(D_loss_dict.get('gp', '').numpy())
Example #5
0
    for ep in tqdm.trange(args.epochs, desc='Epoch Loop'):
        if ep < ep_cnt:
            continue

        # update epoch counter
        ep_cnt.assign_add(1)

        # train for an epoch
        for A, B in tqdm.tqdm(A_B_dataset,
                              desc='Inner Epoch Loop',
                              total=len_dataset):
            G_loss_dict, D_loss_dict = train_step(A, B)

            # # summary
            tl.summary(G_loss_dict,
                       step=G_optimizer.iterations,
                       name='G_losses')
            tl.summary(D_loss_dict,
                       step=G_optimizer.iterations,
                       name='D_losses')
            tl.summary({'learning rate': G_lr_scheduler.current_learning_rate},
                       step=G_optimizer.iterations,
                       name='learning rate')

            # sample
            if G_optimizer.iterations.numpy() % 100 == 0:
                A, B = next(test_iter)
                A2B, B2A, A2B2A, B2A2B = sample(A, B)
                img = im.immerge(np.concatenate([A, A2B, A2B2A, B, B2A, B2A2B],
                                                axis=0),
                                 n_rows=2)
def train_CycleGAN():
    """Train a CycleGAN, logging scalars, timing and GPU memory to TensorBoard.

    Relies on module-level state being prepared by the surrounding script:
    `args`, `output_dir`, `A_B_dataset`, `A_B_dataset_test`, `len_dataset`,
    `ep_cnt`, `train_step`, `sample`, `G_optimizer`, `G_lr_scheduler`,
    `checkpoint`, plus the `tf`, `tl`, `im`, `py`, `np`, `tqdm` imports.
    """

    import logGPU_RAM

    # summary
    train_summary_writer = tf.summary.create_file_writer(
        py.join(output_dir, 'summaries', 'train'))
    logGPU_RAM.init_gpu_writers(py.join(output_dir, 'summaries', 'GPUs'))

    # sample
    test_iter = iter(A_B_dataset_test)
    sample_dir = py.join(output_dir, 'samples_training')
    py.mkdir(sample_dir)

    # Fixed test batch reused every 100 iterations, so progress on the same
    # inputs can be compared across the whole run.
    test_sample = next(test_iter)

    # timing
    import time
    start_time = time.time()

    # main loop
    with train_summary_writer.as_default():
        for ep in tqdm.trange(args.epochs, desc='Epoch Loop'):
            # Skip epochs already completed by a restored checkpoint.
            if ep < ep_cnt:
                continue

            # update epoch counter
            ep_cnt.assign_add(1)

            # train for an epoch
            for A, B in tqdm.tqdm(A_B_dataset,
                                  desc='Inner Epoch Loop',
                                  total=len_dataset):
                G_loss_dict, D_loss_dict = train_step(A, B)

                iteration = G_optimizer.iterations.numpy()

                # # summary
                tl.summary(G_loss_dict, step=iteration, name='G_losses')
                tl.summary(D_loss_dict, step=iteration, name='D_losses')
                tl.summary(
                    {'learning rate': G_lr_scheduler.current_learning_rate},
                    step=iteration,
                    name='learning rate')
                tl.summary(
                    {'second since start': np.array(time.time() - start_time)},
                    step=iteration,
                    name='second_Per_Iteration')
                logGPU_RAM.log_gpu_memory_to_tensorboard()

                # sample
                # Every 1000 iterations: a fresh random test batch.
                if iteration % 1000 == 0:
                    A, B = next(test_iter)
                    A2B, B2A, A2B2A, B2A2B = sample(A, B)
                    img = im.immerge(np.concatenate(
                        [A, A2B, A2B2A, B, B2A, B2A2B], axis=0),
                                     n_rows=2)
                    im.imwrite(
                        img,
                        py.join(sample_dir,
                                'iter-%09d-sample-test-random.jpg' %
                                iteration))
                # Every 100 iterations: the fixed `test_sample` batch.
                if iteration % 100 == 0:
                    A, B = test_sample
                    A2B, B2A, A2B2A, B2A2B = sample(A, B)
                    img = im.immerge(np.concatenate(
                        [A, A2B, A2B2A, B, B2A, B2A2B], axis=0),
                                     n_rows=2)
                    im.imwrite(
                        img,
                        py.join(
                            sample_dir,
                            'iter-%09d-sample-test-specific.jpg' % iteration))
            # save checkpoint
            checkpoint.save(ep)
# main loop: per-epoch training with periodic summary display and
# checkpointing; expects `train_summary_writer`, `start_epoch`, `opt`,
# `dataset`, `model`, `train_step`, `optimizer_G`, `lr_scheduler_G` and
# `checkpoint` to be defined above.
with train_summary_writer.as_default():
    for ep in range(start_epoch, opt.niter + opt.niter_decay + 1):
        print('Epoch: ', ep)
        for step, (label, real_img) in enumerate(dataset):
            # NOTE(review): `train_step` consumes `real_img` from the dataset,
            # not the `real_image` returned by `encode_input` — confirm this
            # is intentional.
            input_label, inst_map, real_image, feat_map = model.encode_input(label)
            # inputs = (input_label, inst_map, real_image, feat_map)
            loss_D_dict, loss_G_dict, fake_img = train_step(input_label, real_img)
            # Undo the [-1, 1] normalisation so the fake can be logged as uint8.
            if not opt.no_normalize_img:
                fake_img = fake_img * 0.5 + 0.5
                fake_img = fake_img * 255
            fake_img = tf.cast(fake_img, tf.uint8)

            # Scalar and image summaries every `display_freq` steps.
            if (step+1) % opt.display_freq == 0:
                tl.summary(loss_G_dict, step=optimizer_G.iterations, name='G_losses')
                tl.summary(loss_D_dict, step=optimizer_G.iterations, name='D_losses')
                tl.summary({'learning rate': lr_scheduler_G.current_learning_rate},
                           step=optimizer_G.iterations,
                           name='learning rate')
                tl.summary({'gen_img': fake_img},
                           step=optimizer_G.iterations,
                           types=['image'],
                           name='image_generated')

        if (ep+1) % opt.save_epoch_freq == 0:
            checkpoint.save()

# Optionally export the generator as a TF SavedModel for serving/inference.
if opt.savedModel_output:
    tf.saved_model.save(model.netG, os.path.join(opt.checkpoints_dir,
                                                 opt.name, 'net_G_savedModel'))
Example #8
0
def train():
    """End-to-end CycleGAN training entry point.

    Parses CLI args, builds the paired A/B image datasets, constructs the two
    generators and discriminators with their optimizers and linear-decay LR
    schedules, then runs the epoch loop with scalar summaries, periodic image
    samples, and a per-epoch checkpoint (resumable via the saved `ep_cnt`).
    """
    # ===================================== Args =====================================
    args = parse_args()
    output_dir = os.path.join('output', args.dataset)
    os.makedirs(output_dir, exist_ok=True)
    # Persist this run's settings next to its outputs for reproducibility.
    settings_path = os.path.join(output_dir, 'settings.json')
    pylib.args_to_json(settings_path, args)

    # ===================================== Data =====================================
    A_img_paths = pylib.glob(
        os.path.join(args.datasets_dir, args.dataset, 'trainA'), '*.png')
    B_img_paths = pylib.glob(
        os.path.join(args.datasets_dir, args.dataset, 'trainB'), '*.png')
    print(f'len(A_img_paths) = {len(A_img_paths)}')
    print(f'len(B_img_paths) = {len(B_img_paths)}')
    load_size = [args.load_size_height, args.load_size_width]
    crop_size = [args.crop_size_height, args.crop_size_width]
    A_B_dataset, len_dataset = data.make_zip_dataset(A_img_paths,
                                                     B_img_paths,
                                                     args.batch_size,
                                                     load_size,
                                                     crop_size,
                                                     training=True,
                                                     repeat=False)

    # History pools of previously generated fakes fed to the discriminators.
    A2B_pool = data.ItemPool(args.pool_size)
    B2A_pool = data.ItemPool(args.pool_size)

    A_img_paths_test = pylib.glob(
        os.path.join(args.datasets_dir, args.dataset, 'testA'), '*.png')
    B_img_paths_test = pylib.glob(
        os.path.join(args.datasets_dir, args.dataset, 'testB'), '*.png')
    A_B_dataset_test, _ = data.make_zip_dataset(A_img_paths_test,
                                                B_img_paths_test,
                                                args.batch_size,
                                                load_size,
                                                crop_size,
                                                training=False,
                                                repeat=True)

    # ===================================== Models =====================================
    model_input_shape = crop_size + [
        3
    ]  # [args.crop_size_height, args.crop_size_width, 3]

    G_A2B = module.ResnetGenerator(input_shape=model_input_shape, n_blocks=6)
    G_B2A = module.ResnetGenerator(input_shape=model_input_shape, n_blocks=6)

    D_A = module.ConvDiscriminator(input_shape=model_input_shape)
    D_B = module.ConvDiscriminator(input_shape=model_input_shape)

    d_loss_fn, g_loss_fn = tf2gan.get_adversarial_losses_fn(
        args.adversarial_loss_mode)
    cycle_loss_fn = tf.losses.MeanAbsoluteError()
    identity_loss_fn = tf.losses.MeanAbsoluteError()

    # G and D share the same decay schedule but use separate optimizers.
    G_lr_scheduler = module.LinearDecay(args.lr, args.epochs * len_dataset,
                                        args.epoch_decay * len_dataset)
    D_lr_scheduler = module.LinearDecay(args.lr, args.epochs * len_dataset,
                                        args.epoch_decay * len_dataset)
    G_optimizer = tf.keras.optimizers.Adam(learning_rate=G_lr_scheduler,
                                           beta_1=args.beta_1)
    D_optimizer = tf.keras.optimizers.Adam(learning_rate=D_lr_scheduler,
                                           beta_1=args.beta_1)

    # ===================================== Training steps =====================================
    @tf.function
    def train_generators(A, B):
        """One optimizer step over both generators.

        Returns the fakes `A2B`, `B2A` and a dict of the individual
        adversarial, cycle-consistency and identity loss terms.
        """
        with tf.GradientTape() as t:
            A2B = G_A2B(A, training=True)
            B2A = G_B2A(B, training=True)
            A2B2A = G_B2A(A2B, training=True)
            B2A2B = G_A2B(B2A, training=True)
            A2A = G_B2A(A, training=True)
            B2B = G_A2B(B, training=True)

            A2B_d_logits = D_B(A2B, training=True)
            B2A_d_logits = D_A(B2A, training=True)

            A2B_g_loss = g_loss_fn(A2B_d_logits)
            B2A_g_loss = g_loss_fn(B2A_d_logits)
            A2B2A_cycle_loss = cycle_loss_fn(A, A2B2A)
            B2A2B_cycle_loss = cycle_loss_fn(B, B2A2B)
            A2A_id_loss = identity_loss_fn(A, A2A)
            B2B_id_loss = identity_loss_fn(B, B2B)

            # Weighted sum: adversarial + cycle + identity terms.
            G_loss = (A2B_g_loss + B2A_g_loss) + (
                A2B2A_cycle_loss +
                B2A2B_cycle_loss) * args.cycle_loss_weight + (
                    A2A_id_loss + B2B_id_loss) * args.identity_loss_weight

        G_grad = t.gradient(
            G_loss, G_A2B.trainable_variables + G_B2A.trainable_variables)
        G_optimizer.apply_gradients(
            zip(G_grad, G_A2B.trainable_variables + G_B2A.trainable_variables))

        return A2B, B2A, {
            'A2B_g_loss': A2B_g_loss,
            'B2A_g_loss': B2A_g_loss,
            'A2B2A_cycle_loss': A2B2A_cycle_loss,
            'B2A2B_cycle_loss': B2A2B_cycle_loss,
            'A2A_id_loss': A2A_id_loss,
            'B2B_id_loss': B2B_id_loss
        }

    @tf.function
    def train_discriminators(A, B, A2B, B2A):
        """One optimizer step over both discriminators.

        `A2B`/`B2A` are (possibly pool-replayed) fakes; returns a dict of
        real/fake losses plus the two gradient-penalty terms.
        """
        with tf.GradientTape() as t:
            A_d_logits = D_A(A, training=True)
            B2A_d_logits = D_A(B2A, training=True)
            B_d_logits = D_B(B, training=True)
            A2B_d_logits = D_B(A2B, training=True)

            A_d_loss, B2A_d_loss = d_loss_fn(A_d_logits, B2A_d_logits)
            B_d_loss, A2B_d_loss = d_loss_fn(B_d_logits, A2B_d_logits)
            D_A_gp = tf2gan.gradient_penalty(functools.partial(D_A,
                                                               training=True),
                                             A,
                                             B2A,
                                             mode=args.gradient_penalty_mode)
            D_B_gp = tf2gan.gradient_penalty(functools.partial(D_B,
                                                               training=True),
                                             B,
                                             A2B,
                                             mode=args.gradient_penalty_mode)

            D_loss = (A_d_loss + B2A_d_loss) + (B_d_loss + A2B_d_loss) + (
                D_A_gp + D_B_gp) * args.gradient_penalty_weight

        D_grad = t.gradient(D_loss,
                            D_A.trainable_variables + D_B.trainable_variables)
        D_optimizer.apply_gradients(
            zip(D_grad, D_A.trainable_variables + D_B.trainable_variables))

        return {
            'A_d_loss': A_d_loss + B2A_d_loss,
            'B_d_loss': B_d_loss + A2B_d_loss,
            'D_A_gp': D_A_gp,
            'D_B_gp': D_B_gp
        }

    def train_step(A, B):
        """Generator step, then discriminator step on pool-replayed fakes."""
        A2B, B2A, G_loss_dict = train_generators(A, B)

        # cannot autograph `A2B_pool`
        A2B = A2B_pool(
            A2B)  # or A2B = A2B_pool(A2B.numpy()), but it is much slower
        B2A = B2A_pool(B2A)  # because of the communication between CPU and GPU

        D_loss_dict = train_discriminators(A, B, A2B, B2A)

        return G_loss_dict, D_loss_dict

    @tf.function
    def sample(A, B):
        """Inference-mode translations and reconstructions for visualization."""
        A2B = G_A2B(A, training=False)
        B2A = G_B2A(B, training=False)
        A2B2A = G_B2A(A2B, training=False)
        B2A2B = G_A2B(B2A, training=False)
        return A2B, B2A, A2B2A, B2A2B

    # ===================================== Runner code =====================================
    # epoch counter (checkpointed, so runs can resume where they stopped)
    ep_cnt = tf.Variable(initial_value=0, trainable=False, dtype=tf.int64)

    # checkpoint
    checkpoint = tf2lib.Checkpoint(dict(G_A2B=G_A2B,
                                        G_B2A=G_B2A,
                                        D_A=D_A,
                                        D_B=D_B,
                                        G_optimizer=G_optimizer,
                                        D_optimizer=D_optimizer,
                                        ep_cnt=ep_cnt),
                                   os.path.join(output_dir, 'checkpoints'),
                                   max_to_keep=5)
    try:  # restore checkpoint including the epoch counter
        checkpoint.restore().assert_existing_objects_matched()
    except Exception as e:
        # Best-effort restore: a fresh run simply has nothing to restore.
        print(e)

    # summary
    train_summary_writer = tf.summary.create_file_writer(
        os.path.join(output_dir, 'summaries', 'train'))

    # sample
    test_iter = iter(A_B_dataset_test)
    sample_dir = os.path.join(output_dir, 'samples_training')
    os.makedirs(sample_dir, exist_ok=True)

    # main loop
    with train_summary_writer.as_default():
        for ep in tqdm.trange(args.epochs, desc='Epoch Loop'):
            # Skip epochs already completed by a restored checkpoint.
            if ep < ep_cnt:
                continue

            # update epoch counter
            ep_cnt.assign_add(1)

            # train for an epoch
            for A, B in tqdm.tqdm(A_B_dataset,
                                  desc='Inner Epoch Loop',
                                  total=len_dataset):
                G_loss_dict, D_loss_dict = train_step(A, B)

                # # summary
                tf2lib.summary(G_loss_dict,
                               step=G_optimizer.iterations,
                               name='G_losses')
                tf2lib.summary(D_loss_dict,
                               step=G_optimizer.iterations,
                               name='D_losses')
                tf2lib.summary(
                    {'learning rate': G_lr_scheduler.current_learning_rate},
                    step=G_optimizer.iterations,
                    name='learning rate')

                # sample test translations every 100 iterations
                # (rebinding A, B here is harmless: they are reassigned on the
                # next inner-loop iteration)
                if G_optimizer.iterations.numpy() % 100 == 0:
                    A, B = next(test_iter)
                    A2B, B2A, A2B2A, B2A2B = sample(A, B)
                    img = imlib.immerge(np.concatenate(
                        [A, A2B, A2B2A, B, B2A, B2A2B], axis=0),
                                        n_rows=6)
                    imlib.imwrite(
                        img,
                        os.path.join(
                            sample_dir,
                            'iter-%09d.jpg' % G_optimizer.iterations.numpy()))

            # save checkpoint
            checkpoint.save(ep)
Example #9
0
        if ep < ep_cnt:
            continue

        # update epoch counter
        ep_cnt.assign_add(1)

        # train for an epoch
        for A, B in tqdm.tqdm(A_B_dataset,
                              desc="Inner Epoch Loop",
                              total=len_dataset):

            G_loss_dict, D_loss_dict = train_step(A, B)

            # # summary
            tl.summary(G_loss_dict,
                       step=G_optimizer.iterations,
                       name="G_losses")
            tl.summary(D_loss_dict,
                       step=G_optimizer.iterations,
                       name="D_losses")
            tl.summary(
                {"learning rate": G_lr_scheduler.current_learning_rate},
                step=G_optimizer.iterations,
                name="learning rate",
            )

            # sample
            if G_optimizer.iterations.numpy() % 100 == 0:
                A, B = next(test_iter)
                if args.bidirectional:
                    if args.DnCNN is not None: