def process(txt, num):
    # init the FFT image parameterization; with --keep > 0, resume from the
    # blended latent 'init.pt' saved after the previous prompt
    sd = 0.01
    if a.keep > 0: sd = a.keep + (1 - a.keep) * sd
    params, image_f = fft_image([1, 3, *a.size], resume='init.pt', sd=sd, decay_power=a.decay)
    image_f = to_valid_rgb(image_f, colors=a.colors)

    if a.prog is True:
        lr1 = a.lrate * 2
        lr0 = a.lrate * 0.1
    else:
        lr0 = a.lrate
    optimizer = torch.optim.Adam(params, lr0)

    # encode the text prompt (optionally translated, optionally with the multilingual encoder)
    if a.verbose is True: print(' ref text: ', txt)
    if a.translate:
        translator = Translator()
        txt = translator.translate(txt, dest='en').text
        if a.verbose is True: print(' translated to:', txt)
    if a.multilang is True:
        model_lang = SentenceTransformer('clip-ViT-B-32-multilingual-v1').cuda()
        txt_enc = model_lang.encode([txt], convert_to_tensor=True, show_progress_bar=False).detach().clone()
        del model_lang
    else:
        txt_enc = model_clip.encode_text(clip.tokenize(txt).cuda()).detach().clone()
    if a.notext > 0:
        txt_plot = torch.from_numpy(plot_text(txt, a.modsize) / 255.).unsqueeze(0).permute(0, 3, 1, 2).cuda()
        txt_plot_enc = model_clip.encode_image(txt_plot).detach().clone()
    else:
        txt_plot_enc = None

    out_name = '%03d-%s' % (num + 1, txt_clean(txt))
    out_name += '-%s' % a.model if 'RN' in a.model.upper() else ''
    tempdir = os.path.join(workdir, out_name)
    os.makedirs(tempdir, exist_ok=True)

    # optimization loop
    pbar = ProgressBar(a.steps // a.fstep)
    for i in range(a.steps):
        loss = 0
        noise = a.noise * torch.randn(1, 1, *params[0].shape[2:4], 1).cuda() if a.noise > 0 else None
        img_out = image_f(noise)

        if a.sharp != 0:  # reward high-frequency detail
            lx = torch.mean(torch.abs(img_out[0, :, :, 1:] - img_out[0, :, :, :-1]))
            ly = torch.mean(torch.abs(img_out[0, :, 1:, :] - img_out[0, :, :-1, :]))
            loss -= a.sharp * (ly + lx)

        imgs_sliced = slice_imgs([img_out], a.samples, a.modsize, trform_f, a.align, micro=1.)
        out_enc = model_clip.encode_image(imgs_sliced[-1])
        loss -= torch.cosine_similarity(txt_enc, out_enc, dim=-1).mean()
        if a.notext > 0:  # penalize rendering the prompt as literal text
            loss += a.notext * torch.cosine_similarity(txt_plot_enc, out_enc, dim=-1).mean()
        if a.diverse != 0:  # push two augmented views of the same image apart
            imgs_sliced = slice_imgs([image_f(noise)], a.samples, a.modsize, trform_f, a.align, micro=1.)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += a.diverse * torch.cosine_similarity(out_enc, out_enc2, dim=-1).mean()
            del out_enc2; torch.cuda.empty_cache()
        if a.expand > 0:  # move away from the previous step's encoding
            global prev_enc
            if i > 0:
                loss += a.expand * torch.cosine_similarity(out_enc, prev_enc, dim=-1).mean()
            prev_enc = out_enc.detach().clone()
        if a.in_txt0 is not None:  # subtract text
            loss += torch.cosine_similarity(txt_enc0, out_enc, dim=-1).mean()
        del img_out, imgs_sliced, out_enc; torch.cuda.empty_cache()

        if a.prog is True:  # progressive learning rate
            lr_cur = lr0 + (i / a.steps) * (lr1 - lr0)
            for g in optimizer.param_groups:
                g['lr'] = lr_cur

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % a.fstep == 0:  # save an intermediate frame
            with torch.no_grad():
                img = image_f(contrast=a.contrast).cpu().numpy()[0]
            if a.sharp != 0:
                img = img ** 1.3  # empirical tone mapping
            checkout(img, os.path.join(tempdir, '%04d.jpg' % (i // a.fstep)), verbose=a.verbose)
            pbar.upd()
            del img

    # blend this prompt's latent into the running average for the next clip
    if a.keep > 0:
        global params_start, params_ema
        params_ema = ema(params_ema, params[0].detach().clone(), num + 1)
        torch.save((1 - a.keep) * params_start + a.keep * params_ema, 'init.pt')

    # export the final latent, the last frame, and a video of the whole run
    torch.save(params[0], '%s.pt' % os.path.join(workdir, out_name))
    shutil.copy(img_list(tempdir)[-1], os.path.join(workdir, '%s-%d.jpg' % (out_name, a.steps)))
    os.system('ffmpeg -v warning -y -i %s\%%04d.jpg "%s.mp4"' % (tempdir, os.path.join(workdir, out_name)))
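# Note: `ema` used above is defined elsewhere in the repo; it blends the FFT
# parameters of consecutive prompts so each clip resumes from a mix of the very
# first latent (params_start) and a running average (params_ema). A minimal
# sketch of such a running (cumulative) average, consistent with the call
# ema(params_ema, params[0].detach().clone(), num+1). This is an assumption
# about its behavior, not the repository's actual implementation.
def ema_sketch(prev, new, n):
    if prev is None or n <= 1:
        return new.clone()
    return prev * (n - 1) / n + new / n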
def main():
    a = get_args()
    prev_enc = 0

    def train(i):
        # one optimization step
        loss = 0
        noise = a.noise * torch.randn(1, 1, *params[0].shape[2:4], 1).cuda() if a.noise > 0 else None
        img_out = image_f(noise)

        if a.sharp != 0:
            lx = torch.mean(torch.abs(img_out[0, :, :, 1:] - img_out[0, :, :, :-1]))
            ly = torch.mean(torch.abs(img_out[0, :, 1:, :] - img_out[0, :, :-1, :]))
            loss -= a.sharp * (ly + lx)

        micro = 1 - a.macro if a.in_txt2 is None else False
        imgs_sliced = slice_imgs([img_out], a.samples, a.modsize, trform_f, a.align, micro=micro)
        out_enc = model_clip.encode_image(imgs_sliced[-1])

        if a.diverse != 0:
            imgs_sliced = slice_imgs([image_f(noise)], a.samples, a.modsize, trform_f, a.align, micro=micro)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += a.diverse * torch.cosine_similarity(out_enc, out_enc2, dim=-1).mean()
            del out_enc2
            torch.cuda.empty_cache()
        if a.in_img is not None and os.path.isfile(a.in_img):  # input image
            loss += sign * 0.5 * torch.cosine_similarity(img_enc, out_enc, dim=-1).mean()
        if a.in_txt is not None:  # input text
            loss += sign * torch.cosine_similarity(txt_enc, out_enc, dim=-1).mean()
            if a.notext > 0:
                loss -= sign * a.notext * torch.cosine_similarity(txt_plot_enc, out_enc, dim=-1).mean()
        if a.in_txt0 is not None:  # subtract text
            loss += -sign * torch.cosine_similarity(txt_enc0, out_enc, dim=-1).mean()
        if a.sync > 0 and a.in_img is not None and os.path.isfile(a.in_img):  # image composition
            prog_sync = (a.steps // a.fstep - i) / (a.steps // a.fstep)
            loss += prog_sync * a.sync * sim_loss(F.interpolate(img_out, sim_size).float(), img_in, normalize=True).squeeze()
        if a.in_txt2 is not None:  # input text for micro details
            imgs_sliced = slice_imgs([img_out], a.samples, a.modsize, trform_f, a.align, micro=True)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += sign * torch.cosine_similarity(txt_enc2, out_enc2, dim=-1).mean()
            del out_enc2
            torch.cuda.empty_cache()
        if a.expand > 0:
            global prev_enc
            if i > 0:
                loss += a.expand * torch.cosine_similarity(out_enc, prev_enc, dim=-1).mean()
            prev_enc = out_enc.detach()

        del img_out, imgs_sliced, out_enc
        torch.cuda.empty_cache()
        assert not isinstance(loss, int), ' Loss not defined, check the inputs'

        if a.prog is True:
            lr_cur = lr0 + (i / a.steps) * (lr1 - lr0)
            for g in optimizer.param_groups:
                g['lr'] = lr_cur

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % a.fstep == 0:
            with torch.no_grad():
                img = image_f(contrast=a.contrast).cpu().numpy()[0]
            if (a.sync > 0 and a.in_img is not None) or a.sharp != 0:
                img = img ** 1.3  # empirical tone mapping
            checkout(img, os.path.join(tempdir, '%04d.jpg' % (i // a.fstep)), verbose=a.verbose)
            pbar.upd()

    # Load CLIP models
    model_clip, _ = clip.load(a.model)
    if a.verbose is True: print(' using model', a.model)
    xmem = {'RN50': 0.5, 'RN50x4': 0.16, 'RN101': 0.33}
    if 'RN' in a.model:
        a.samples = int(a.samples * xmem[a.model])
    if a.multilang is True:
        model_lang = SentenceTransformer('clip-ViT-B-32-multilingual-v1').cuda()

    def enc_text(txt):
        if a.multilang is True:
            emb = model_lang.encode([txt], convert_to_tensor=True, show_progress_bar=False)
        else:
            emb = model_clip.encode_text(clip.tokenize(txt).cuda())
        return emb.detach().clone()

    if a.diverse != 0:
        a.samples = int(a.samples * 0.5)
    if a.sync > 0:
        a.samples = int(a.samples * 0.5)

    if a.transform is True:
        trform_f = transforms.transforms_custom
        a.samples = int(a.samples * 0.95)
    else:
        trform_f = transforms.normalize()

    # encode the inputs (main text, micro text, subtracted text, reference image)
    out_name = []
    if a.in_txt is not None:
        if a.verbose is True: print(' ref text: ', basename(a.in_txt))
        if a.translate:
            translator = Translator()
            a.in_txt = translator.translate(a.in_txt, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt)
        txt_enc = enc_text(a.in_txt)
        out_name.append(txt_clean(a.in_txt))
        if a.notext > 0:
            txt_plot = torch.from_numpy(plot_text(a.in_txt, a.modsize) / 255.).unsqueeze(0).permute(0, 3, 1, 2).cuda()
            txt_plot_enc = model_clip.encode_image(txt_plot).detach().clone()

    if a.in_txt2 is not None:
        if a.verbose is True: print(' micro text:', basename(a.in_txt2))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt2 = translator.translate(a.in_txt2, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt2)
        txt_enc2 = enc_text(a.in_txt2)
        out_name.append(txt_clean(a.in_txt2))

    if a.in_txt0 is not None:
        if a.verbose is True: print(' subtract text:', basename(a.in_txt0))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt0 = translator.translate(a.in_txt0, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt0)
        txt_enc0 = enc_text(a.in_txt0)
        out_name.append('off-' + txt_clean(a.in_txt0))

    if a.multilang is True: del model_lang

    if a.in_img is not None and os.path.isfile(a.in_img):
        if a.verbose is True: print(' ref image:', basename(a.in_img))
        img_in = torch.from_numpy(img_read(a.in_img) / 255.).unsqueeze(0).permute(0, 3, 1, 2).cuda()
        img_in = img_in[:, :3, :, :]  # fix rgb channels
        in_sliced = slice_imgs([img_in], a.samples, a.modsize, transforms.normalize(), a.align, micro=False)[0]
        img_enc = model_clip.encode_image(in_sliced).detach().clone()
        if a.sync > 0:
            sim_loss = lpips.LPIPS(net='vgg', verbose=False).cuda()
            sim_size = [s // 2 for s in a.size]
            img_in = F.interpolate(img_in, sim_size).float()
        else:
            del img_in
        del in_sliced
        torch.cuda.empty_cache()
        out_name.append(basename(a.in_img).replace(' ', '_'))

    params, image_f = fft_image([1, 3, *a.size], resume=a.resume, decay_power=a.decay)
    image_f = to_valid_rgb(image_f, colors=a.colors)

    if a.prog is True:
        lr1 = a.lrate * 2
        lr0 = lr1 * 0.01
    else:
        lr0 = a.lrate
    optimizer = torch.optim.Adam(params, lr0)
    sign = 1. if a.invert is True else -1.

    if a.verbose is True: print(' samples:', a.samples)
    out_name = '-'.join(out_name)
    out_name += '-%s' % a.model if 'RN' in a.model.upper() else ''
    tempdir = os.path.join(a.out_dir, out_name)
    os.makedirs(tempdir, exist_ok=True)

    pbar = ProgressBar(a.steps // a.fstep)
    for i in range(a.steps):
        train(i)

    os.system('ffmpeg -v warning -y -i %s\%%04d.jpg "%s.mp4"' % (tempdir, os.path.join(a.out_dir, out_name)))
    shutil.copy(img_list(tempdir)[-1], os.path.join(a.out_dir, '%s-%d.jpg' % (out_name, a.steps)))
    if a.save_pt is True:
        torch.save(params, '%s.pt' % os.path.join(a.out_dir, out_name))
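# The `a.sharp` term in process() / train() above subtracts the mean absolute
# difference between neighbouring pixels from the loss, i.e. it rewards
# high-frequency detail. Factored out here as a standalone helper for clarity;
# the function name is illustrative and not part of the repo.
import torch

def edge_energy(img):
    # img: [1, C, H, W] tensor in [0, 1]
    lx = torch.mean(torch.abs(img[0, :, :, 1:] - img[0, :, :, :-1]))
    ly = torch.mean(torch.abs(img[0, :, 1:, :] - img[0, :, :-1, :]))
    return lx + ly  # used as: loss -= a.sharp * edge_energy(img_out)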
def process(txt, num):
    params, image_f = fft_image([1, 3, *a.size], resume='init.pt')
    image_f = to_valid_rgb(image_f)

    if a.prog is True:
        lr1 = a.lrate * 2
        lr0 = a.lrate * 0.1
    else:
        lr0 = a.lrate
    optimizer = torch.optim.Adam(params, lr0)

    if a.verbose is True: print(' ref text: ', txt)
    if a.translate:
        translator = Translator()
        txt = translator.translate(txt, dest='en').text
        if a.verbose is True: print(' translated to:', txt)
    tx = clip.tokenize(txt).cuda()
    txt_enc = model_clip.encode_text(tx).detach().clone()

    out_name = '%03d-%s' % (num + 1, txt_clean(txt))
    out_name += '-%s' % a.model if 'RN' in a.model.upper() else ''
    tempdir = os.path.join(workdir, out_name)
    os.makedirs(tempdir, exist_ok=True)

    pbar = ProgressBar(a.steps // a.fstep)
    for i in range(a.steps):
        loss = 0
        noise = a.noise * torch.randn(1, 1, *params[0].shape[2:4], 1).cuda() if a.noise > 0 else None
        img_out = image_f(noise)

        imgs_sliced = slice_imgs([img_out], a.samples, a.modsize, norm_in, a.overscan, micro=None)
        out_enc = model_clip.encode_image(imgs_sliced[-1])
        loss -= torch.cosine_similarity(txt_enc, out_enc, dim=-1).mean()
        if a.diverse > 0:
            imgs_sliced = slice_imgs([image_f(noise)], a.samples, a.modsize, norm_in, a.overscan, micro=None)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += a.diverse * torch.cosine_similarity(out_enc, out_enc2, dim=-1).mean()
            del out_enc2; torch.cuda.empty_cache()
        if a.expand > 0:
            global prev_enc
            if i > 0:
                loss += a.expand * torch.cosine_similarity(out_enc, prev_enc, dim=-1).mean()
            prev_enc = out_enc.detach()
        if a.in_txt0 is not None:  # subtract text
            loss += torch.cosine_similarity(txt_enc0, out_enc, dim=-1).mean()
        del img_out, imgs_sliced, out_enc; torch.cuda.empty_cache()

        if a.prog is True:
            lr_cur = lr0 + (i / a.steps) * (lr1 - lr0)
            for g in optimizer.param_groups:
                g['lr'] = lr_cur

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % a.fstep == 0:
            with torch.no_grad():
                img = image_f(contrast=a.contrast).cpu().numpy()[0]
            checkout(img, os.path.join(tempdir, '%04d.jpg' % (i // a.fstep)), verbose=a.verbose)
            pbar.upd()
            del img

    if a.keep > 0:
        global params_start, params_ema
        params_ema = ema(params_ema, params[0].detach(), num + 1)
        torch.save((1 - a.keep) * params_start + a.keep * params_ema, 'init.pt')

    torch.save(params[0], '%s.pt' % os.path.join(workdir, out_name))
    shutil.copy(img_list(tempdir)[-1], os.path.join(workdir, '%s-%d.jpg' % (out_name, a.steps)))
    os.system('ffmpeg -v warning -y -i %s\%%04d.jpg "%s.mp4"' % (tempdir, os.path.join(workdir, out_name)))
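# `txt_clean` (used for output file names above) is defined elsewhere in the
# repo. A minimal sketch, assuming it only makes the prompt filesystem-safe:
# lowercase, punctuation dropped, whitespace collapsed to underscores. This is
# an assumption, not the actual implementation.
import re

def txt_clean_sketch(txt):
    txt = re.sub(r'[^\w\s\-]', '', txt.lower())  # drop punctuation
    return re.sub(r'\s+', '_', txt).strip('_')   # spaces -> underscores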
def main():
    a = get_args()
    prev_enc = 0

    def train(i):
        loss = 0
        noise = a.noise * torch.randn(1, 1, *params[0].shape[2:4], 1).cuda() if a.noise > 0 else None
        img_out = image_f(noise)

        micro = None if a.in_txt2 is None else False
        imgs_sliced = slice_imgs([img_out], a.samples, a.modsize, norm_in, a.overscan, micro=micro)
        out_enc = model_clip.encode_image(imgs_sliced[-1])

        if a.diverse > 0:
            imgs_sliced = slice_imgs([image_f(noise)], a.samples, a.modsize, norm_in, a.overscan, micro=micro)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += a.diverse * torch.cosine_similarity(out_enc, out_enc2, dim=-1).mean()
            del out_enc2; torch.cuda.empty_cache()
        if a.in_img is not None and os.path.isfile(a.in_img):  # input image
            loss += sign * torch.cosine_similarity(img_enc, out_enc, dim=-1).mean()
        if a.in_txt is not None:  # input text
            loss += sign * torch.cosine_similarity(txt_enc, out_enc, dim=-1).mean()
        if a.in_txt0 is not None:  # subtract text
            loss += -sign * torch.cosine_similarity(txt_enc0, out_enc, dim=-1).mean()
        if a.sync > 0 and a.in_img is not None and os.path.isfile(a.in_img):  # image composition
            loss *= 1. + a.sync * (a.steps / (i + 1) * ssim_loss(img_out, img_in) - 1)
        if a.in_txt2 is not None:  # input text for micro details
            imgs_sliced = slice_imgs([img_out], a.samples, a.modsize, norm_in, a.overscan, micro=True)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += sign * torch.cosine_similarity(txt_enc2, out_enc2, dim=-1).mean()
            del out_enc2; torch.cuda.empty_cache()
        if a.expand > 0:
            global prev_enc
            if i > 0:
                loss += a.expand * torch.cosine_similarity(out_enc, prev_enc, dim=-1).mean()
            prev_enc = out_enc.detach()
        del img_out, imgs_sliced, out_enc; torch.cuda.empty_cache()
        assert not isinstance(loss, int), ' Loss not defined, check the inputs'

        if a.prog is True:
            lr_cur = lr0 + (i / a.steps) * (lr1 - lr0)
            for g in optimizer.param_groups:
                g['lr'] = lr_cur

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % a.fstep == 0:
            with torch.no_grad():
                img = image_f(contrast=a.contrast).cpu().numpy()[0]
            checkout(img, os.path.join(tempdir, '%04d.jpg' % (i // a.fstep)), verbose=a.verbose)
            pbar.upd()

    # Load CLIP models
    model_clip, _ = clip.load(a.model)
    if a.verbose is True: print(' using model', a.model)
    xmem = {'RN50': 0.5, 'RN50x4': 0.16, 'RN101': 0.33}
    if 'RN' in a.model:
        a.samples = int(a.samples * xmem[a.model])
    if a.diverse > 0:
        a.samples = int(a.samples * 0.5)

    norm_in = torchvision.transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))

    out_name = []
    if a.in_img is not None and os.path.isfile(a.in_img):
        if a.verbose is True: print(' ref image:', basename(a.in_img))
        img_in = torch.from_numpy(img_read(a.in_img) / 255.).unsqueeze(0).permute(0, 3, 1, 2).cuda()
        img_in = img_in[:, :3, :, :]  # fix rgb channels
        in_sliced = slice_imgs([img_in], a.samples, a.modsize, transform=norm_in, overscan=a.overscan)[0]
        img_enc = model_clip.encode_image(in_sliced).detach().clone()
        if a.sync > 0:
            ssim_loss = ssim.SSIM(window_size=11)
            img_in = F.interpolate(img_in, a.size).float()
        else:
            del img_in
        del in_sliced; torch.cuda.empty_cache()
        out_name.append(basename(a.in_img).replace(' ', '_'))

    if a.in_txt is not None:
        if a.verbose is True: print(' ref text: ', basename(a.in_txt))
        if a.translate:
            translator = Translator()
            a.in_txt = translator.translate(a.in_txt, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt)
        tx = clip.tokenize(a.in_txt).cuda()
        txt_enc = model_clip.encode_text(tx).detach().clone()
        out_name.append(txt_clean(a.in_txt))

    if a.in_txt2 is not None:
        if a.verbose is True: print(' micro text:', basename(a.in_txt2))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt2 = translator.translate(a.in_txt2, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt2)
        tx2 = clip.tokenize(a.in_txt2).cuda()
        txt_enc2 = model_clip.encode_text(tx2).detach().clone()
        out_name.append(txt_clean(a.in_txt2))

    if a.in_txt0 is not None:
        if a.verbose is True: print(' subtract text:', basename(a.in_txt0))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt0 = translator.translate(a.in_txt0, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt0)
        tx0 = clip.tokenize(a.in_txt0).cuda()
        txt_enc0 = model_clip.encode_text(tx0).detach().clone()
        out_name.append('off-' + txt_clean(a.in_txt0))

    params, image_f = fft_image([1, 3, *a.size], resume=a.resume)
    image_f = to_valid_rgb(image_f)

    if a.prog is True:
        lr1 = a.lrate * 2
        lr0 = lr1 * 0.01
    else:
        lr0 = a.lrate
    optimizer = torch.optim.Adam(params, lr0)
    sign = 1. if a.invert is True else -1.

    if a.verbose is True: print(' samples:', a.samples)
    out_name = '-'.join(out_name)
    out_name += '-%s' % a.model if 'RN' in a.model.upper() else ''
    tempdir = os.path.join(a.out_dir, out_name)
    os.makedirs(tempdir, exist_ok=True)

    pbar = ProgressBar(a.steps // a.fstep)
    for i in range(a.steps):
        train(i)

    os.system('ffmpeg -v warning -y -i %s\%%04d.jpg "%s.mp4"' % (tempdir, os.path.join(a.out_dir, out_name)))
    shutil.copy(img_list(tempdir)[-1], os.path.join(a.out_dir, '%s-%d.jpg' % (out_name, a.steps)))
    if a.save_pt is True:
        torch.save(params, '%s.pt' % os.path.join(a.out_dir, out_name))
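# With --prog enabled, both versions of main() above ramp the learning rate
# linearly from lr0 to lr1 = 2 * a.lrate over the run (lr0 = 0.01 * lr1 in
# main(), 0.1 * a.lrate in process()). The same schedule as a standalone
# helper; the function name is illustrative.
def prog_lr(i, steps, lrate):
    lr1 = lrate * 2.
    lr0 = lr1 * 0.01
    return lr0 + (i / steps) * (lr1 - lr0)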
def main():
    args = get_args()
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # load the CLIP model
    clip_model, _ = clip.load(args.model)
    print(f"Using model {args.model}")

    input_text = args.input_text
    print(f"Generating from '{input_text}'")

    out_name_list = []
    out_name_list.append(txt_clean(input_text))
    out_name = '-'.join(out_name_list)
    out_name += '-%s' % args.model if 'RN' in args.model.upper() else ''
    tempdir = os.path.join(args.out_dir, out_name)
    os.makedirs(tempdir, exist_ok=True)

    # encode the text prompt once (it does not change during optimization)
    tokenized_text = clip.tokenize([input_text]).to(device)
    text_logits = clip_model.encode_text(tokenized_text).detach().clone()

    # initialize the image as a trainable FFT spectrum
    num_channels = 3
    spectrum_size = [args.batch_size, num_channels, *args.size]
    fft_img, img_freqs = get_fft_img(
        spectrum_size,
        std=0.01,
        return_img_freqs=True,
    )
    fft_img = fft_img.to(device)
    fft_img.requires_grad = True

    scale = get_scale_from_img_freqs(
        img_freqs=img_freqs,
        decay_power=args.decay,
    )
    scale = scale.to(device)

    shift = None
    if args.noise > 0:
        img_size = img_freqs.shape
        noise_size = (1, 1, *img_size, 1)
        shift = args.noise * torch.randn(noise_size).to(device)

    optimizer = torch.optim.Adam([fft_img], args.lrate)
    sign = -1

    pbar = ProgressBar(args.num_steps // args.save_freq)

    num_steps = args.num_steps
    num_crops = 200
    crop_size = 224
    for step in range(num_steps):
        loss = 0

        initial_img = fft_to_rgb(
            fft_img=fft_img,
            scale=scale,
            img_size=args.size,
            shift=shift,
            contrast=1.0,
            decorrelate=True,
            device=device,
        )
        # encode random crops of the current image and pull them towards the text
        crop_img_out = random_crop(
            initial_img,
            num_crops,
            crop_size,
            normalize=True,
        )
        img_logits = clip_model.encode_image(crop_img_out).to(device)
        loss += sign * torch.cosine_similarity(
            text_logits,
            img_logits,
            dim=-1,
        ).mean()

        torch.cuda.empty_cache()

        # if self.prog is True:
        #     lr_cur = lr + (step / self.steps) * (init_lr - lr)
        #     for g in self.optimizer.param_groups:
        #         g['lr'] = lr_cur

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % args.save_freq == 0:
            with torch.no_grad():
                img = fft_to_rgb(
                    fft_img=fft_img,
                    scale=scale,
                    img_size=args.size,
                    shift=shift,
                    contrast=1.0,
                    decorrelate=True,
                    device=device,
                )
                img = img.cpu().numpy()
            img_out_path = os.path.join(tempdir, '%04d.jpg' % (step // args.save_freq))
            checkout(img[0], img_out_path)
            if pbar is not None:
                pbar.upd()

    os.system('ffmpeg -v warning -y -i %s\%%04d.jpg "%s.mp4"' % (tempdir, os.path.join(args.out_dir, out_name)))
    shutil.copy(
        img_list(tempdir)[-1],
        os.path.join(args.out_dir, '%s-%d.jpg' % (out_name, num_steps)))
    if args.save_pt is True:
        torch.save(fft_img, '%s.pt' % os.path.join(args.out_dir, out_name))
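# `random_crop` used above is not defined in this file. A minimal sketch,
# assuming it takes `num_crops` random square crops of side `crop_size` from a
# [1, 3, H, W] tensor in [0, 1] and (optionally) applies CLIP's input
# normalization before stacking them into a batch for encode_image(). This is
# an assumption about its behavior, not the repository's implementation.
import torch

_CLIP_MEAN = torch.tensor([0.48145466, 0.4578275, 0.40821073]).view(1, 3, 1, 1)
_CLIP_STD = torch.tensor([0.26862954, 0.26130258, 0.27577711]).view(1, 3, 1, 1)

def random_crop_sketch(img, num_crops, crop_size, normalize=True):
    _, _, h, w = img.shape
    crops = []
    for _ in range(num_crops):
        y = torch.randint(0, max(h - crop_size, 1), (1,)).item()
        x = torch.randint(0, max(w - crop_size, 1), (1,)).item()
        crop = img[:, :, y:y + crop_size, x:x + crop_size]
        if normalize:
            crop = (crop - _CLIP_MEAN.to(crop.device)) / _CLIP_STD.to(crop.device)
        crops.append(crop)
    return torch.cat(crops, dim=0)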