import os

import numpy as np
from scipy import misc  # NOTE: misc.imread was removed in SciPy 1.2; imageio.imread is a drop-in replacement

# Project-local helpers assumed in scope: ProgressBar,
# get_bounding_boxes_for_single_image, get_image_positives, get_image_negatives.


def create_patches(img_basenames, annotation_dir, image_dir, size, step,
                   grayscale=True, progressbar=True, downsample=1,
                   objectclass=None, negative_discard_rate=.9):
    '''Extract a set of labelled image patches from the supplied list of
    annotated images. Positive-labelled patches are extracted centred on the
    annotated bounding boxes; negative-labelled patches are extracted at
    random from any part of the image which does not overlap an annotated
    bounding box.'''
    if progressbar:
        pb = ProgressBar(len(img_basenames))
    if not annotation_dir.endswith(os.path.sep):
        annotation_dir += os.path.sep
    if not image_dir.endswith(os.path.sep):
        image_dir += os.path.sep
    channels = 1 if grayscale else 3

    pos = []
    neg = []
    for s, img_filename in enumerate(img_basenames, start=1):
        if progressbar:
            pb.step(s)
        # Annotations share the image's basename, with an .xml extension.
        annotation_filename = annotation_dir + img_filename[:-3] + 'xml'
        boundingboxes = get_bounding_boxes_for_single_image(
            annotation_filename, objectclass)
        img = misc.imread(image_dir + img_filename)
        height, width, channels = img.shape
        img = np.rollaxis(img, 2)  # HWC -> CHW
        pos.append(get_image_positives(img, boundingboxes, size,
                                       downsample=downsample))
        neg.append(get_image_negatives(img, boundingboxes, size, step,
                                       downsample=downsample,
                                       discard_rate=negative_discard_rate))

    # Flatten the per-image lists into flat lists of patches.
    pos = [item for sublist in pos for item in sublist]
    neg = [item for sublist in neg for item in sublist]
    patches = pos + neg

    # Shuffle positives and negatives together, reproducibly.
    index = np.arange(len(patches))
    np.random.seed(0)
    np.random.shuffle(index)

    np_patches = np.empty((len(patches), channels, size // downsample,
                           size // downsample), dtype=np.uint8)
    np_labels = np.empty(len(patches), dtype=np.uint8)
    max_pos = len(pos)
    # Label 1 for positives, 0 for negatives.
    for j, i in enumerate(index):
        if i < max_pos:
            np_patches[j] = pos[i]
            np_labels[j] = 1
        else:
            np_patches[j] = neg[i - max_pos]
            np_labels[j] = 0
    return np_labels, np_patches
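
# Hypothetical usage sketch (file names, directories and patch geometry are
# illustrative only, not taken from the original project):
#
#   basenames = ['img_0001.jpg', 'img_0002.jpg']
#   labels, patches = create_patches(basenames, 'annotations', 'images',
#                                    size=64, step=16, grayscale=False)
#   # patches: uint8 array of shape (N, 3, 64, 64); labels: 1 = positive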
import time

import torch
import torch.optim as optim

# Project-local names assumed in scope: ProgressBar, load_data, LSTMTagger,
# modified_loss, evaluate_model, logger, device, and the config module cf.


def main():
    progress_bar = ProgressBar()
    data_iterator, glove_embeddings, word_to_ix, ix_to_word = load_data()
    logger.info("Building model...")
    model = LSTMTagger(cf.EMBEDDING_DIM, cf.HIDDEN_DIM, len(word_to_ix),
                       cf.BATCH_SIZE, cf.MAX_SENT_LENGTH, glove_embeddings)
    # Ensure the word embeddings aren't modified during training: only
    # parameters with requires_grad are handed to the optimizer.
    optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                 model.parameters()), lr=0.1)
    model.cuda()

    num_batches = len(data_iterator)
    loss_list = []  # loss history, one entry per epoch
    for epoch in range(1, cf.MAX_EPOCHS + 1):
        epoch_start_time = time.time()
        for i, (batch_x, batch_y) in enumerate(data_iterator):
            # Skip any batch that is not full-sized (can happen at the end).
            if len(batch_x) != cf.BATCH_SIZE:
                continue
            batch_x = batch_x.to(device)

            # Step 1. PyTorch accumulates gradients; clear them out before
            # each batch.
            model.zero_grad()

            # Also clear out the hidden state of the LSTM, detaching it from
            # its history on the previous batch.
            model.hidden = model.init_hidden()

            # Step 2. Get the inputs ready for the network: record the
            # unpadded length of each sequence in the batch.
            batch_x_lengths = [len(x) for x in batch_x]

            # Step 3. Run the forward pass.
            tag_scores = model(batch_x, batch_x_lengths)

            # Step 4. Compute the loss and update the parameters.
            loss = modified_loss(tag_scores, batch_y, batch_x_lengths,
                                 word_to_ix)
            loss.backward()
            optimizer.step()
            progress_bar.draw_bar(i, epoch, num_batches, cf.MAX_EPOCHS,
                                  epoch_start_time)

        progress_bar.draw_completed_epoch(loss, loss_list, epoch,
                                          cf.MAX_EPOCHS, epoch_start_time)
        loss_list.append(loss.item())  # store a float, not a CUDA tensor

        if epoch % 10 == 0:
            avg_loss = sum(loss_list[epoch - 10:]) / 10
            logger.info("Average loss over past 10 epochs: %.6f" % avg_loss)
            # Early stopping: compare with the preceding 10-epoch window.
            if epoch >= 20:
                prev_avg_loss = sum(loss_list[epoch - 20:epoch - 10]) / 10
                if avg_loss >= prev_avg_loss:
                    logger.info("Average loss has not improved over past 10 "
                                "epochs. Stopping early.")
                    evaluate_model(model, ix_to_word)
                    break
        if epoch == 1 or epoch % 10 == 0 or epoch == cf.MAX_EPOCHS:
            evaluate_model(model, ix_to_word)

    logger.info("Saving model...")
    torch.save(model.state_dict(), "asset/model_trained")
    logger.info("Model saved to %s." % "asset/model_trained")
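
# Assumed entry point (not part of the original snippet): run training when
# the module is executed directly.
if __name__ == '__main__':
    main()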
import os
import shutil
from os.path import basename

import torch
import torch.nn.functional as F
import torchvision
import clip
from googletrans import Translator
from sentence_transformers import SentenceTransformer

# Project-local helpers assumed in scope: get_args, slice_imgs, fft_image,
# to_valid_rgb, checkout, txt_clean, plot_text, img_read, img_list, ssim,
# ProgressBar.


def main():
    a = get_args()
    prev_enc = 0

    def train(i):
        nonlocal prev_enc  # updated across train() calls
        loss = 0
        noise = a.noise * torch.randn(1, 1, *params[0].shape[2:4], 1).cuda() \
            if a.noise > 0 else None
        img_out = image_f(noise)
        micro = None if a.in_txt2 is None else False
        imgs_sliced = slice_imgs([img_out], a.samples, a.modsize, norm_in,
                                 a.overscan, micro=micro)
        out_enc = model_clip.encode_image(imgs_sliced[-1])

        if a.diverse != 0:
            # Penalise similarity between two augmented views for diversity.
            imgs_sliced = slice_imgs([image_f(noise)], a.samples, a.modsize,
                                     norm_in, a.overscan, micro=micro)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += a.diverse * torch.cosine_similarity(out_enc, out_enc2,
                                                        dim=-1).mean()
            del out_enc2
            torch.cuda.empty_cache()
        if a.in_img is not None and os.path.isfile(a.in_img):  # input image
            loss += sign * 0.5 * torch.cosine_similarity(img_enc, out_enc,
                                                         dim=-1).mean()
        if a.in_txt is not None:  # input text
            loss += sign * torch.cosine_similarity(txt_enc, out_enc,
                                                   dim=-1).mean()
            if a.notext > 0:
                # Discourage literally rendering the prompt as text.
                loss -= sign * a.notext * torch.cosine_similarity(
                    txt_plot_enc, out_enc, dim=-1).mean()
        if a.in_txt0 is not None:  # subtract text
            loss += -sign * torch.cosine_similarity(txt_enc0, out_enc,
                                                    dim=-1).mean()
        if a.sync > 0 and a.in_img is not None and os.path.isfile(a.in_img):
            # image composition: keep structural similarity with the input
            loss -= a.sync * ssim_loss(
                F.interpolate(img_out, ssim_size).float(), img_in)
        if a.in_txt2 is not None:  # input text for micro details
            imgs_sliced = slice_imgs([img_out], a.samples, a.modsize, norm_in,
                                     a.overscan, micro=True)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += sign * torch.cosine_similarity(txt_enc2, out_enc2,
                                                   dim=-1).mean()
            del out_enc2
            torch.cuda.empty_cache()
        if a.expand > 0:
            if i > 0:
                loss += a.expand * torch.cosine_similarity(out_enc, prev_enc,
                                                           dim=-1).mean()
            prev_enc = out_enc.detach()

        del img_out, imgs_sliced, out_enc
        torch.cuda.empty_cache()
        assert not isinstance(loss, int), ' Loss not defined, check the inputs'

        if a.prog is True:
            # Linearly ramp the learning rate from lr0 to lr1.
            lr_cur = lr0 + (i / a.steps) * (lr1 - lr0)
            for g in optimizer.param_groups:
                g['lr'] = lr_cur

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % a.fstep == 0:
            with torch.no_grad():
                img = image_f(contrast=a.contrast).cpu().numpy()[0]
            checkout(img, os.path.join(tempdir, '%04d.jpg' % (i // a.fstep)),
                     verbose=a.verbose)
            pbar.upd()

    # Load CLIP model
    model_clip, _ = clip.load(a.model)
    if a.verbose is True:
        print(' using model', a.model)
    # ResNet CLIP variants need fewer samples to fit in memory.
    xmem = {'RN50': 0.5, 'RN50x4': 0.16, 'RN101': 0.33}
    if 'RN' in a.model:
        a.samples = int(a.samples * xmem[a.model])

    if a.multilang is True:
        model_lang = SentenceTransformer(
            'clip-ViT-B-32-multilingual-v1').cuda()

    def enc_text(txt):
        if a.multilang is True:
            emb = model_lang.encode([txt], convert_to_tensor=True,
                                    show_progress_bar=False)
        else:
            emb = model_clip.encode_text(clip.tokenize(txt).cuda())
        return emb.detach().clone()

    if a.diverse != 0:
        a.samples = int(a.samples * 0.5)

    norm_in = torchvision.transforms.Normalize(
        (0.48145466, 0.4578275, 0.40821073),
        (0.26862954, 0.26130258, 0.27577711))

    out_name = []
    if a.in_txt is not None:
        if a.verbose is True:
            print(' ref text: ', basename(a.in_txt))
        if a.translate:
            translator = Translator()
            a.in_txt = translator.translate(a.in_txt, dest='en').text
            if a.verbose is True:
                print(' translated to:', a.in_txt)
        txt_enc = enc_text(a.in_txt)
        out_name.append(txt_clean(a.in_txt))
        if a.notext > 0:
            # Encode a rendered image of the prompt text itself.
            txt_plot = torch.from_numpy(
                plot_text(a.in_txt, a.modsize) / 255.).unsqueeze(0).permute(
                    0, 3, 1, 2).cuda()
            txt_plot_enc = model_clip.encode_image(txt_plot).detach().clone()

    if a.in_txt2 is not None:
        if a.verbose is True:
            print(' micro text:', basename(a.in_txt2))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt2 = translator.translate(a.in_txt2, dest='en').text
            if a.verbose is True:
                print(' translated to:', a.in_txt2)
        txt_enc2 = enc_text(a.in_txt2)
        out_name.append(txt_clean(a.in_txt2))

    if a.in_txt0 is not None:
        if a.verbose is True:
            print(' subtract text:', basename(a.in_txt0))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt0 = translator.translate(a.in_txt0, dest='en').text
            if a.verbose is True:
                print(' translated to:', a.in_txt0)
        txt_enc0 = enc_text(a.in_txt0)
        out_name.append('off-' + txt_clean(a.in_txt0))

    if a.multilang is True:
        del model_lang

    if a.in_img is not None and os.path.isfile(a.in_img):
        if a.verbose is True:
            print(' ref image:', basename(a.in_img))
        img_in = torch.from_numpy(
            img_read(a.in_img) / 255.).unsqueeze(0).permute(0, 3, 1, 2).cuda()
        img_in = img_in[:, :3, :, :]  # drop alpha, keep RGB channels
        in_sliced = slice_imgs([img_in], a.samples, a.modsize,
                               transform=norm_in, overscan=a.overscan)[0]
        img_enc = model_clip.encode_image(in_sliced).detach().clone()
        if a.sync > 0:
            ssim_loss = ssim.SSIM(window_size=11)
            ssim_size = [s // 8 for s in a.size]
            img_in = F.interpolate(img_in, ssim_size).float()
        else:
            del img_in
        del in_sliced
        torch.cuda.empty_cache()
        out_name.append(basename(a.in_img).replace(' ', '_'))

    params, image_f = fft_image([1, 3, *a.size], resume=a.resume,
                                decay_power=a.decay)
    image_f = to_valid_rgb(image_f, colors=a.colors)

    if a.prog is True:
        lr1 = a.lrate * 2
        lr0 = lr1 * 0.01
    else:
        lr0 = a.lrate
    optimizer = torch.optim.Adam(params, lr0)
    # Maximise similarity by default (gradient descent on -similarity).
    sign = 1. if a.invert is True else -1.

    if a.verbose is True:
        print(' samples:', a.samples)
    out_name = '-'.join(out_name)
    out_name += '-%s' % a.model if 'RN' in a.model.upper() else ''
    tempdir = os.path.join(a.out_dir, out_name)
    os.makedirs(tempdir, exist_ok=True)

    pbar = ProgressBar(a.steps // a.fstep)
    for i in range(a.steps):
        train(i)

    # Assemble the saved frames into a video.
    os.system('ffmpeg -v warning -y -i "%s" "%s.mp4"' % (
        os.path.join(tempdir, '%04d.jpg'),
        os.path.join(a.out_dir, out_name)))
    shutil.copy(img_list(tempdir)[-1],
                os.path.join(a.out_dir, '%s-%d.jpg' % (out_name, a.steps)))
    if a.save_pt is True:
        torch.save(params, '%s.pt' % os.path.join(a.out_dir, out_name))
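
# Assumed entry point for this script:
if __name__ == '__main__':
    main()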
import os
import shutil

import torch
import clip

# Project-local helpers assumed in scope: get_args, txt_clean, get_fft_img,
# get_scale_from_img_freqs, fft_to_rgb, random_crop, checkout, img_list,
# ProgressBar.


def main():
    args = get_args()
    device = "cuda" if torch.cuda.is_available() else "cpu"

    clip_model, _ = clip.load(args.model)
    print(f"Using model {args.model}")

    input_text = args.input_text
    print(f"Generating from '{input_text}'")

    out_name_list = [txt_clean(input_text)]
    out_name = '-'.join(out_name_list)
    out_name += '-%s' % args.model if 'RN' in args.model.upper() else ''
    tempdir = os.path.join(args.out_dir, out_name)
    os.makedirs(tempdir, exist_ok=True)

    # Encode the prompt once; it is constant, so detach it from the graph.
    tokenized_text = clip.tokenize([input_text]).to(device)
    text_logits = clip_model.encode_text(tokenized_text).detach()

    num_channels = 3
    spectrum_size = [args.batch_size, num_channels, *args.size]
    fft_img, img_freqs = get_fft_img(
        spectrum_size,
        std=0.01,
        return_img_freqs=True,
    )
    fft_img = fft_img.to(device)
    fft_img.requires_grad = True

    scale = get_scale_from_img_freqs(
        img_freqs=img_freqs,
        decay_power=args.decay,
    )
    scale = scale.to(device)

    shift = None
    if args.noise > 0:
        noise_size = (1, 1, *img_freqs.shape, 1)
        shift = args.noise * torch.randn(noise_size).to(device)

    optimizer = torch.optim.Adam([fft_img], args.lrate)
    pbar = ProgressBar(args.num_steps // args.save_freq)

    num_steps = args.num_steps
    num_crops = 200
    crop_size = 224
    for step in range(num_steps):
        initial_img = fft_to_rgb(
            fft_img=fft_img,
            scale=scale,
            img_size=args.size,
            shift=shift,
            contrast=1.0,
            decorrelate=True,
            device=device,
        )
        crop_img_out = random_crop(
            initial_img,
            num_crops,
            crop_size,
            normalize=True,
        )
        img_logits = clip_model.encode_image(crop_img_out).to(device)
        # Gradient descent on the negative cosine similarity pushes the
        # image towards the text prompt.
        loss = -torch.cosine_similarity(text_logits, img_logits,
                                        dim=-1).mean()
        torch.cuda.empty_cache()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % args.save_freq == 0:
            with torch.no_grad():
                img = fft_to_rgb(
                    fft_img=fft_img,
                    scale=scale,
                    img_size=args.size,
                    shift=shift,
                    contrast=1.0,
                    decorrelate=True,
                    device=device,
                ).cpu().numpy()
            img_out_path = os.path.join(
                tempdir, '%04d.jpg' % (step // args.save_freq))
            checkout(img[0], img_out_path)
            pbar.upd()

    # Assemble the saved frames into a video.
    os.system('ffmpeg -v warning -y -i "%s" "%s.mp4"' % (
        os.path.join(tempdir, '%04d.jpg'),
        os.path.join(args.out_dir, out_name)))
    shutil.copy(
        img_list(tempdir)[-1],
        os.path.join(args.out_dir, '%s-%d.jpg' % (out_name, num_steps)))
    if args.save_pt is True:
        torch.save(fft_img, '%s.pt' % os.path.join(args.out_dir, out_name))
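
# Assumed entry point; a typical invocation might look like the following
# (flag names inferred from the args attributes used above, actual flags
# depend on get_args()):
#   python generate.py --input_text "a misty forest" --num_steps 300
if __name__ == '__main__':
    main()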
import matplotlib.pyplot as plt

# Project-local names assumed in scope: load_image, Boundary, ProgressBar,
# and the config values fig_height, video_height, frame_count_estimate.

# get image
img, aspect_ratio = load_image()

# set boundaries
boundary = Boundary(-1, img.size[0], -1, img.size[1])

# init figure and axes
fig = plt.figure(figsize=(fig_height * aspect_ratio, fig_height),
                 dpi=video_height / fig_height)
ax = fig.add_axes(rect=[0., 0., 1., 1.])

# hide axis
ax.set_axis_off()

# set axes limits
ax.set_xlim(boundary.l, boundary.r)
ax.set_ylim(boundary.b, boundary.t)

# init progress bar
progressbar = ProgressBar(0, frame_count_estimate, "Running animation")


# init_func called before the first frame
def init_animation():
    global balls, scat
    progressbar.start()
    # init balls: one per non-transparent pixel of the image
    balls = []
    for col in range(img.size[0]):
        for row in range(img.size[1]):
            if img.getpixel((col, row))[3] > 0:
                x = float(col)
                y = float(img.size[1] - row - 1)
                # rgba 0-255 to rgba 0-1
                color = [rgba / 255 for rgba in img.getpixel((col, row))]
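                # The snippet ends here; a hypothetical completion, assuming
                # a project-local Ball(x, y, color) value holder with x, y
                # and color attributes:
                balls.append(Ball(x, y, color))
    # Build the scatter artist that the animation updates each frame.
    scat = ax.scatter([b.x for b in balls], [b.y for b in balls],
                      c=[b.color for b in balls])
    return scat,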