    def forward(ctx, x, z_tiled, gan, rec_iters, rec_rr, w_lpips, optimizer,
                scheduler):
        # we assume z is already tiled based on rec_rr
        assert x.size(0) * rec_rr == z_tiled.size(0)

        # important: work on a detached copy so the caller's input tensor
        # (and its gradient) is never modified in place
        x = x.detach().clone()
        batch_size = x.size(0)
        x_tiled = torch.repeat_interleave(x, rec_rr, dim=0)

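        # exploration noise on the latents is annealed: full strength at the
        # start, decaying quadratically to zero at 75% of the iterations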
        noise_ramp_length = 0.75 * rec_iters

        start_time = time.time()
        with torch.enable_grad():
            for i in range(rec_iters):
                optimizer.zero_grad()

                noise_ramp_factor = ((1 - i / noise_ramp_length)**2
                                     if i <= noise_ramp_length else 0)
                noise = torch.randn_like(z_tiled) * 0.05 * noise_ramp_factor

                # perturb the latents with the annealed noise before decoding
                gen = gan(z_tiled + noise)

                loss_tiled = (gen - x_tiled).pow(2).mean(dim=(1, 2, 3))
                perceptual_loss = lpips(gen, x_tiled)
                # loss_tiled = l1_loss(gan(z_tiled), x_tiled).mean(dim=(1, 2, 3))
                # --- IMPORTANT ---
                # unlike normal network training, where a fixed set of parameters
                # is updated, here every instance in the batch has its own
                # parameters (its latent codes), so the total loss is summed,
                # not averaged, over instances
                loss = loss_tiled.sum() + w_lpips * perceptual_loss.sum()
                loss_logging = per_pixel_l2_dist(gen, x_tiled)
                loss.backward()
                optimizer.step()

                if (i + 1) % 50 == 0:
                    print(f'===> Iter: {i+1} | '
                          f'PP-L2: {loss_logging.item():.6f} | '
                          f'Perceptual: {perceptual_loss.mean().item():.6f} | '
                          f'Time: {time.time()-start_time:.3f}')
                    start_time = time.time()

                scheduler.step()

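        # each input was reconstructed from rec_rr random restarts; keep, for
        # every input, the restart with the lowest combined L2 + LPIPS loss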
        gen = gan(z_tiled)
        loss_tiled = (gen - x_tiled).pow(2).mean(
            dim=(1, 2, 3)) + w_lpips * lpips(gen, x_tiled)
        loss_tiled = loss_tiled.view(-1, rec_rr)  # (B, r)
        indices = torch.argmin(loss_tiled, dim=1)
        offsets = torch.arange(batch_size, device=z_tiled.device) * rec_rr

        return gen[indices + offsets].detach().clone(), z_tiled[
            indices + offsets].detach().clone()
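
# --- minimal caller-side sketch (an assumption, not this repo's actual entry
# point): each input is paired with rec_rr independent latent restarts, and the
# latent codes themselves are the parameters being optimized. This is why the
# loss above is summed rather than averaged: averaging would scale every
# instance's gradient by 1/(B * rec_rr) even though the instances share no
# parameters. latent_dim and lr below are illustrative placeholders. ---
import torch


def make_tiled_latents(x, rec_rr, latent_dim=128, lr=0.1):
    # one latent row per (input, restart) pair; requires_grad makes the
    # latents the optimizer's parameters
    z_tiled = torch.randn(x.size(0) * rec_rr, latent_dim,
                          device=x.device).requires_grad_(True)
    # the optimizer's only parameters are the per-instance latents
    optimizer = torch.optim.Adam([z_tiled], lr=lr)
    return z_tiled, optimizer
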
    def forward(ctx, x, z_tiled, gan, rec_iters, rec_rr, w_lpips, optimizer, scheduler, latent_std):
        # we assume z is already tiled based on rec_rr
        assert x.size(0) * rec_rr == z_tiled.size(0)

        # important: work on a detached copy so the caller's input tensor
        # (and its gradient) is never modified in place
        x = x.detach().clone()
        batch_size = x.size(0)
        x_tiled = torch.repeat_interleave(x, rec_rr, dim=0)
        # noise = torch.randn_like(x_tiled) * 0.5
        # x_tiled = x_tiled + noise

        start_time = time.time()
        with torch.enable_grad():
            for i in range(rec_iters):
                t = i / rec_iters
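                # lr follows an explicit ramp schedule written straight into
                # the optimizer (see the get_lr sketch below)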
                lr = get_lr(t, 0.1)
                optimizer.param_groups[0]['lr'] = lr
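                # same annealing as the first variant: latent noise decays
                # quadratically and vanishes at 75% of the run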
                noise_strength = latent_std * 0.05 * max(0, 1 - t / 0.75) ** 2
                latent_n = latent_noise(z_tiled, noise_strength.item())
                img_gen = gan.synthesis(latent_n)

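                # lpips gives one distance per instance; norm(...) is assumed to
                # return an elementwise distance map that is reduced per instance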
                p_loss = lpips(img_gen, x_tiled).sum()
                norm_loss = norm(img_gen, x_tiled).mean(dim=(1, 2, 3)).sum()
                # --- IMPORTANT ---
                # unlike normal network training, where a fixed set of parameters
                # is updated, here every instance in the batch has its own
                # parameters (its latent codes), so the total loss is summed,
                # not averaged, over instances
                loss = w_lpips * p_loss + norm_loss
                loss_logging = per_pixel_l2_dist(img_gen, x_tiled)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if (i + 1) % 50 == 0:
                    print(f'===> Iter: {i+1} | '
                          f'PP-L2: {loss_logging.item():.6f} | '
                          f'Perceptual: {p_loss.item():.6f} | '
                          f'Time: {time.time()-start_time:.3f}')
                    start_time = time.time()

                # note: lr is reassigned from get_lr at the top of each
                # iteration, so this scheduler step has no lasting effect
                scheduler.step()

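        # as in the variant above, keep the best of the rec_rr restarts per input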
        gen = gan.synthesis(z_tiled)
        loss_tiled = (gen - x_tiled).pow(2).mean(dim=(1, 2, 3)) + w_lpips * lpips(gen, x_tiled)
        loss_tiled = loss_tiled.view(-1, rec_rr)  # (B, r)
        indices = torch.argmin(loss_tiled, dim=1)
        offsets = torch.arange(batch_size, device=z_tiled.device) * rec_rr

        return gen[indices + offsets].detach().clone(), z_tiled[indices + offsets].detach().clone()
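
# The helpers above (get_lr, latent_noise, per_pixel_l2_dist) are referenced
# but not defined in this excerpt. The sketch below gives one plausible set of
# definitions: get_lr/latent_noise follow the widely used StyleGAN2 projection
# schedule (cosine ramp-down, linear ramp-up), and per_pixel_l2_dist is read
# as a per-pixel RMSE; the repo's actual helpers may differ.
import math

import torch


def get_lr(t, initial_lr, rampdown=0.25, rampup=0.05):
    # cosine ramp-down over the last quarter, linear ramp-up over the first 5%
    lr_ramp = min(1, (1 - t) / rampdown)
    lr_ramp = 0.5 - 0.5 * math.cos(lr_ramp * math.pi)
    lr_ramp = lr_ramp * min(1, t / rampup)
    return initial_lr * lr_ramp


def latent_noise(latent, strength):
    # additive Gaussian exploration noise on the latent codes
    return latent + torch.randn_like(latent) * strength


def per_pixel_l2_dist(x, y):
    # scalar root-mean-square distance per pixel (assumed normalization)
    return (x - y).pow(2).mean().sqrt()
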
    if os.path.isfile(result_path):
        result_dict = torch.load(result_path)
        images_adv = result_dict['input'].cuda()
        images_def = result_dict['rec'].cuda()

    else:
        images_adv = attacker.perturb(images)
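        # project the adversarial images onto the generator's range
        # (purification); returns the reconstruction and its latent code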
        images_def, z_def = proj_fn(images_adv)
        # cache the attack and its purified reconstruction so the isfile
        # branch above can reuse them on later runs
        torch.save({'input': images_adv,
                    'rec': images_def,
                    'z_rec': z_def}, result_path)

    l2_dist = per_pixel_l2_dist(images, images_adv)

    # write visualization grids every batch (raise the modulus to subsample)
    if i % 1 == 0:
        clean_path = os.path.join(vis_dir, 'batch_{:04d}_clean.png'.format(i))
        adv_path = os.path.join(vis_dir, 'batch_{:04d}_adv.png'.format(i))
        def_path = os.path.join(vis_dir, 'batch_{:04d}_def.png'.format(i))
        save_image(images, clean_path, nrow=10, padding=2)
        save_image(images_adv, adv_path, nrow=10, padding=2)
        save_image(images_def, def_path, nrow=10, padding=2)

    with torch.no_grad():
        pred_clean = predict(images).argmax(dim=1)
        pred_adv = predict(images_adv).argmax(dim=1)
        pred_def = predict(images_def).argmax(dim=1)

        total += labels.size(0)