Exemple #1
0
def create_patches(img_basenames,
                   annotation_dir,
                   image_dir,
                   size,
                   step,
                   grayscale=True,
                   progressbar=True,
                   downsample=1,
                   objectclass=None,
                   negative_discard_rate=.9):
    '''Extract a shuffled set of labelled image patches from annotated images.

    For every image, positive patches are produced by `get_image_positives`
    and negative patches by `get_image_negatives`, using the bounding boxes
    read from the matching XML annotation file. (Per the original
    description: positives are centered on the annotated bounding boxes and
    negatives are drawn at random from regions that do not overlap one.)

    Parameters:
        img_basenames: image file names, relative to `image_dir`. The
            annotation file is looked up in `annotation_dir` under the same
            name with the extension replaced by `.xml`.
        annotation_dir: directory holding the XML annotation files.
        image_dir: directory holding the images.
        size: patch size passed to the extraction helpers.
        step: step passed to `get_image_negatives`.
        grayscale: only selects the fallback channel count (1 or 3) used when
            no image is loaded; it does not change how images are read.
        progressbar: when true, report progress through a `ProgressBar`.
        downsample: downsampling factor; output patches have side
            `size // downsample`.
        objectclass: forwarded to `get_bounding_boxes_for_single_image`.
        negative_discard_rate: forwarded to `get_image_negatives` as
            `discard_rate`.

    Returns:
        (labels, patches): `labels` is a uint8 vector (1 = positive,
        0 = negative); `patches` is a uint8 array of shape
        (n_patches, channels, size // downsample, size // downsample).
        Both follow the same shuffled order.
    '''
    pb = ProgressBar(len(img_basenames)) if progressbar else None

    # Fallback only: overwritten by the real channel count of each image read.
    channels = 1 if grayscale else 3

    pos = []
    neg = []
    for count, img_filename in enumerate(img_basenames, 1):
        if pb is not None:
            pb.step(count)

        # splitext copes with extensions of any length (e.g. ".jpeg").
        annotation_filename = os.path.join(
            annotation_dir, os.path.splitext(img_filename)[0] + '.xml')
        boundingboxes = get_bounding_boxes_for_single_image(
            annotation_filename, objectclass)

        img = misc.imread(os.path.join(image_dir, img_filename))
        if img.ndim == 2:
            # Single-channel image: add an explicit channel axis.
            img = img[:, :, np.newaxis]
        channels = img.shape[2]
        # (height, width, channels) -> (channels, height, width)
        img = np.rollaxis(img, 2)

        pos.extend(get_image_positives(img,
                                       boundingboxes,
                                       size,
                                       downsample=downsample))
        neg.extend(get_image_negatives(img,
                                       boundingboxes,
                                       size,
                                       step,
                                       downsample=downsample,
                                       discard_rate=negative_discard_rate))

    patches = pos + neg

    # Fixed seed so the shuffle is reproducible. NOTE: this reseeds numpy's
    # global generator, exactly as the original code did.
    index = np.arange(len(patches))
    np.random.seed(0)
    np.random.shuffle(index)

    # Integer division: array dimensions must be ints.
    patch_side = size // downsample
    np_patches = np.empty(
        (len(patches), channels, patch_side, patch_side),
        dtype=np.uint8)
    for dst, src in enumerate(index):
        np_patches[dst] = patches[src]

    # Positives occupy the first len(pos) slots of `patches`.
    np_labels = (index < len(pos)).astype(np.uint8)
    return np_labels, np_patches
Exemple #2
0
def main():
    """Train the LSTM tagger, evaluating periodically and stopping early.

    Loads the data, builds an `LSTMTagger`, and trains it with SGD for up to
    `cf.MAX_EPOCHS` epochs. Every 10 epochs the mean of the last 10 recorded
    losses is logged; from epoch 20 on, training stops if that mean has not
    dropped below the mean of the 10 epochs before it. The model is
    evaluated after epoch 1, every 10th epoch and the final epoch, and its
    state dict is saved to "asset/model_trained" at the end.
    """
    progress_bar = ProgressBar()
    data_iterator, glove_embeddings, word_to_ix, ix_to_word = load_data()
    logger.info("Building model...")
    model = LSTMTagger(cf.EMBEDDING_DIM, cf.HIDDEN_DIM, len(word_to_ix),
                       cf.BATCH_SIZE, cf.MAX_SENT_LENGTH, glove_embeddings)
    # Move the model to the GPU *before* constructing the optimizer, as the
    # torch.optim documentation recommends.
    # NOTE(review): batches are moved with `.to(device)` while the model uses
    # `.cuda()` -- confirm `device` is always a CUDA device.
    model.cuda()
    # Only optimise parameters that require gradients (per the original
    # comment, this keeps the word embeddings fixed during training).
    optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                 model.parameters()),
                          lr=0.1)

    num_batches = len(data_iterator)
    loss_list = []  # Loss recorded at the end of each epoch
    for epoch in range(1, cf.MAX_EPOCHS + 1):
        epoch_start_time = time.time()
        for (i, (batch_x, batch_y)) in enumerate(data_iterator):
            # Ignore batch if it is not the same size as the others
            # (happens at the end sometimes)
            if len(batch_x) != cf.BATCH_SIZE:
                continue
            batch_x = batch_x.to(device)

            # PyTorch accumulates gradients, so clear them for each batch.
            model.zero_grad()

            # Reset the LSTM hidden state so it carries no history over
            # from the previous batch.
            model.hidden = model.init_hidden()

            batch_x_lengths = [len(x) for x in batch_x]

            # Forward pass, loss, backward pass, parameter update.
            tag_scores = model(batch_x, batch_x_lengths)
            loss = modified_loss(tag_scores, batch_y, batch_x_lengths,
                                 word_to_ix)
            loss.backward()
            optimizer.step()
            progress_bar.draw_bar(i, epoch, num_batches, cf.MAX_EPOCHS,
                                  epoch_start_time)

        progress_bar.draw_completed_epoch(loss, loss_list, epoch,
                                          cf.MAX_EPOCHS, epoch_start_time)

        # Store a detached copy: keeping the live tensor would keep every
        # epoch's autograd graph referenced for the whole run.
        loss_list.append(loss.detach())
        if epoch % 10 == 0:
            avg_loss = sum(loss_list[epoch - 10:]) / 10
            logger.info("Average loss over past 10 epochs: %.6f" % avg_loss)
            if epoch >= 20:
                prev_avg_loss = sum(loss_list[epoch - 20:epoch - 10]) / 10
                if avg_loss >= prev_avg_loss:
                    logger.info(
                        "Average loss has not improved over past 10 epochs. Stopping early."
                    )
                    evaluate_model(model, ix_to_word)
                    break
        if epoch == 1 or epoch % 10 == 0 or epoch == cf.MAX_EPOCHS:
            evaluate_model(model, ix_to_word)

    logger.info("Saving model...")
    torch.save(model.state_dict(), "asset/model_trained")
    logger.info("Model saved to %s." % "asset/model_trained")
Exemple #3
0
def main():
    """Optimise an FFT-parameterised image against CLIP text/image targets.

    Reads the run configuration from `get_args()`, encodes the requested
    text and/or image references with CLIP, then runs `a.steps` Adam steps
    on the FFT image parameters. Every `a.fstep` steps a frame is written to
    a temporary directory; at the end the frames are assembled into an mp4
    with ffmpeg, the last frame is copied next to it, and the parameters are
    optionally saved (`a.save_pt`).
    """
    a = get_args()

    # Whether a usable reference image was supplied. Checked once here rather
    # than hitting the filesystem on every training step.
    use_img = a.in_img is not None and os.path.isfile(a.in_img)

    # Encoding from the previous step, used by the `a.expand` loss term.
    prev_enc = 0

    def train(i):
        """Run optimisation step `i` and save a frame every `a.fstep` steps."""
        # `prev_enc` lives in main(), so it must be declared nonlocal; a
        # `global` declaration would create a separate module-level variable.
        nonlocal prev_enc
        loss = 0

        if a.noise > 0:
            noise = a.noise * torch.randn(1, 1, *params[0].shape[2:4],
                                          1).cuda()
        else:
            noise = None
        img_out = image_f(noise)

        micro = None if a.in_txt2 is None else False
        imgs_sliced = slice_imgs([img_out],
                                 a.samples,
                                 a.modsize,
                                 norm_in,
                                 a.overscan,
                                 micro=micro)
        out_enc = model_clip.encode_image(imgs_sliced[-1])
        if a.diverse != 0:
            # Compare against the encoding of a second, independently
            # sliced rendering of the image.
            imgs_sliced = slice_imgs([image_f(noise)],
                                     a.samples,
                                     a.modsize,
                                     norm_in,
                                     a.overscan,
                                     micro=micro)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += a.diverse * torch.cosine_similarity(
                out_enc, out_enc2, dim=-1).mean()
            del out_enc2
            torch.cuda.empty_cache()
        if use_img:  # input image
            loss += sign * 0.5 * torch.cosine_similarity(
                img_enc, out_enc, dim=-1).mean()
        if a.in_txt is not None:  # input text
            loss += sign * torch.cosine_similarity(txt_enc, out_enc,
                                                   dim=-1).mean()
            if a.notext > 0:
                loss -= sign * a.notext * torch.cosine_similarity(
                    txt_plot_enc, out_enc, dim=-1).mean()
        if a.in_txt0 is not None:  # subtract text
            loss += -sign * torch.cosine_similarity(txt_enc0, out_enc,
                                                    dim=-1).mean()
        if a.sync > 0 and use_img:  # image composition
            loss -= a.sync * ssim_loss(
                F.interpolate(img_out, ssim_size).float(), img_in)
        if a.in_txt2 is not None:  # input text for micro details
            imgs_sliced = slice_imgs([img_out],
                                     a.samples,
                                     a.modsize,
                                     norm_in,
                                     a.overscan,
                                     micro=True)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += sign * torch.cosine_similarity(txt_enc2, out_enc2,
                                                   dim=-1).mean()
            del out_enc2
            torch.cuda.empty_cache()
        if a.expand > 0:
            # Add the similarity to the previous step's encoding.
            if i > 0:
                loss += a.expand * torch.cosine_similarity(
                    out_enc, prev_enc, dim=-1).mean()
            prev_enc = out_enc.detach()

        del img_out, imgs_sliced, out_enc
        torch.cuda.empty_cache()
        # `loss` is still the int 0 only if no loss term was enabled.
        assert not isinstance(loss, int), ' Loss not defined, check the inputs'

        if a.prog is True:
            # Linear learning-rate ramp from lr0 to lr1 over the run.
            lr_cur = lr0 + (i / a.steps) * (lr1 - lr0)
            for g in optimizer.param_groups:
                g['lr'] = lr_cur

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % a.fstep == 0:
            with torch.no_grad():
                img = image_f(contrast=a.contrast).cpu().numpy()[0]
            checkout(img,
                     os.path.join(tempdir, '%04d.jpg' % (i // a.fstep)),
                     verbose=a.verbose)
            pbar.upd()

    # Load CLIP models
    model_clip, _ = clip.load(a.model)
    if a.verbose is True:
        print(' using model', a.model)
    # Per-model scaling of the sample count for the ResNet variants.
    xmem = {'RN50': 0.5, 'RN50x4': 0.16, 'RN101': 0.33}
    if 'RN' in a.model:
        a.samples = int(a.samples * xmem[a.model])

    if a.multilang is True:
        model_lang = SentenceTransformer(
            'clip-ViT-B-32-multilingual-v1').cuda()

    def enc_text(txt):
        """Encode `txt` with the multilingual model or with CLIP."""
        if a.multilang is True:
            emb = model_lang.encode([txt],
                                    convert_to_tensor=True,
                                    show_progress_bar=False)
        else:
            emb = model_clip.encode_text(clip.tokenize(txt).cuda())
        return emb.detach().clone()

    if a.diverse != 0:
        a.samples = int(a.samples * 0.5)

    norm_in = torchvision.transforms.Normalize(
        (0.48145466, 0.4578275, 0.40821073),
        (0.26862954, 0.26130258, 0.27577711))

    out_name = []
    if a.in_txt is not None:
        if a.verbose is True:
            print(' ref text: ', basename(a.in_txt))
        if a.translate:
            translator = Translator()
            a.in_txt = translator.translate(a.in_txt, dest='en').text
            if a.verbose is True:
                print(' translated to:', a.in_txt)
        txt_enc = enc_text(a.in_txt)
        out_name.append(txt_clean(a.in_txt))

        if a.notext > 0:
            # Encode an image produced by plot_text from the prompt, used by
            # the `notext` loss term in train().
            txt_plot = torch.from_numpy(plot_text(a.in_txt, a.modsize) /
                                        255.).unsqueeze(0).permute(0, 3, 1,
                                                                   2).cuda()
            txt_plot_enc = model_clip.encode_image(txt_plot).detach().clone()

    if a.in_txt2 is not None:
        if a.verbose is True:
            print(' micro text:', basename(a.in_txt2))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt2 = translator.translate(a.in_txt2, dest='en').text
            if a.verbose is True:
                print(' translated to:', a.in_txt2)
        txt_enc2 = enc_text(a.in_txt2)
        out_name.append(txt_clean(a.in_txt2))

    if a.in_txt0 is not None:
        if a.verbose is True:
            print(' subtract text:', basename(a.in_txt0))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt0 = translator.translate(a.in_txt0, dest='en').text
            if a.verbose is True:
                print(' translated to:', a.in_txt0)
        txt_enc0 = enc_text(a.in_txt0)
        out_name.append('off-' + txt_clean(a.in_txt0))

    if a.multilang is True:
        del model_lang

    if use_img:
        if a.verbose is True:
            print(' ref image:', basename(a.in_img))
        img_in = torch.from_numpy(
            img_read(a.in_img) / 255.).unsqueeze(0).permute(0, 3, 1, 2).cuda()
        img_in = img_in[:, :3, :, :]  # fix rgb channels
        in_sliced = slice_imgs([img_in],
                               a.samples,
                               a.modsize,
                               transform=norm_in,
                               overscan=a.overscan)[0]
        img_enc = model_clip.encode_image(in_sliced).detach().clone()
        if a.sync > 0:
            # Keep a 1/8-size copy of the reference for the SSIM term.
            ssim_loss = ssim.SSIM(window_size=11)
            ssim_size = [s // 8 for s in a.size]
            img_in = F.interpolate(img_in, ssim_size).float()
        else:
            del img_in
        del in_sliced
        torch.cuda.empty_cache()
        out_name.append(basename(a.in_img).replace(' ', '_'))

    params, image_f = fft_image([1, 3, *a.size],
                                resume=a.resume,
                                decay_power=a.decay)
    image_f = to_valid_rgb(image_f, colors=a.colors)

    if a.prog is True:
        lr1 = a.lrate * 2
        lr0 = lr1 * 0.01
    else:
        lr0 = a.lrate
    optimizer = torch.optim.Adam(params, lr0)
    sign = 1. if a.invert is True else -1.

    if a.verbose is True:
        print(' samples:', a.samples)
    out_name = '-'.join(out_name)
    out_name += ('-%s' % a.model) if 'RN' in a.model.upper() else ''
    tempdir = os.path.join(a.out_dir, out_name)
    os.makedirs(tempdir, exist_ok=True)

    pbar = ProgressBar(a.steps // a.fstep)
    for i in range(a.steps):
        train(i)

    # Build the frame pattern with os.path.join so the path separator is
    # right on every platform (a hard-coded backslash only works on Windows).
    frames_pattern = os.path.join(tempdir, '%04d.jpg')
    os.system('ffmpeg -v warning -y -i "%s" "%s.mp4"' %
              (frames_pattern, os.path.join(a.out_dir, out_name)))
    shutil.copy(
        img_list(tempdir)[-1],
        os.path.join(a.out_dir, '%s-%d.jpg' % (out_name, a.steps)))
    if a.save_pt is True:
        torch.save(params, '%s.pt' % os.path.join(a.out_dir, out_name))
Exemple #4
0
def main():
    """Generate an image from a text prompt by optimising an FFT spectrum.

    Reads the configuration from `get_args()`, encodes the prompt with CLIP,
    and runs `args.num_steps` Adam steps on the FFT image parameters to
    maximise the cosine similarity between the prompt encoding and the
    encodings of random crops of the rendered image. A frame is saved every
    `args.save_freq` steps; at the end the frames are assembled into an mp4
    with ffmpeg, the last frame is copied to `args.out_dir`, and the spectrum
    is optionally saved (`args.save_pt`).
    """
    args = get_args()
    device = "cuda" if torch.cuda.is_available() else "cpu"

    clip_model, _ = clip.load(args.model)
    print(f"Using model {args.model}")

    input_text = args.input_text
    print(f"Generating from '{input_text}'")

    out_name = txt_clean(input_text)
    out_name += ('-%s' % args.model) if 'RN' in args.model.upper() else ''

    tempdir = os.path.join(args.out_dir, out_name)
    os.makedirs(tempdir, exist_ok=True)

    # The prompt never changes, so encode it once. No gradient is needed
    # through the text encoder: only `fft_img` is optimised.
    tokenized_text = clip.tokenize([input_text]).to(device)
    with torch.no_grad():
        text_logits = clip_model.encode_text(tokenized_text)

    num_channels = 3
    spectrum_size = [args.batch_size, num_channels, *args.size]
    fft_img, img_freqs = get_fft_img(
        spectrum_size,
        std=0.01,
        return_img_freqs=True,
    )

    fft_img = fft_img.to(device)
    fft_img.requires_grad = True

    scale = get_scale_from_img_freqs(
        img_freqs=img_freqs,
        decay_power=args.decay,
    )
    scale = scale.to(device)

    shift = None
    if args.noise > 0:
        # This is a plain function, so the noise level and device come from
        # `args` / `device`; there is no `self` here.
        noise_size = (1, 1, *img_freqs.shape, 1)
        shift = args.noise * torch.randn(noise_size).to(device)

    optimizer = torch.optim.Adam(
        [fft_img],
        args.lrate,
    )

    num_steps = args.num_steps
    num_crops = 200
    crop_size = 224

    pbar = ProgressBar(num_steps // args.save_freq)

    for step in range(num_steps):
        initial_img = fft_to_rgb(
            fft_img=fft_img,
            scale=scale,
            img_size=args.size,
            shift=shift,
            contrast=1.0,
            decorrelate=True,
            device=device,
        )

        crop_img_out = random_crop(
            initial_img,
            num_crops,
            crop_size,
            normalize=True,
        )
        img_logits = clip_model.encode_image(crop_img_out).to(device)

        # Minimising the negative similarity maximises the match between
        # the image crops and the prompt.
        loss = -torch.cosine_similarity(
            text_logits,
            img_logits,
            dim=-1,
        ).mean()

        torch.cuda.empty_cache()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % args.save_freq == 0:
            with torch.no_grad():
                img = fft_to_rgb(
                    fft_img=fft_img,
                    scale=scale,
                    img_size=args.size,
                    shift=shift,
                    contrast=1.0,
                    decorrelate=True,
                    device=device,
                )
                img = img.cpu().numpy()

            img_out_path = os.path.join(tempdir,
                                        '%04d.jpg' % (step // args.save_freq))
            checkout(
                img[0],
                img_out_path,
            )

            if pbar is not None:
                pbar.upd()

    # Build the frame pattern with os.path.join so the path separator is
    # right on every platform (a hard-coded backslash only works on Windows).
    frames_pattern = os.path.join(tempdir, '%04d.jpg')
    os.system('ffmpeg -v warning -y -i "%s" "%s.mp4"' %
              (frames_pattern, os.path.join(args.out_dir, out_name)))
    shutil.copy(
        img_list(tempdir)[-1],
        os.path.join(args.out_dir, '%s-%d.jpg' % (out_name, num_steps)))

    if args.save_pt is True:
        torch.save(fft_img, '%s.pt' % os.path.join(args.out_dir, out_name))
Exemple #5
0
# Load the source picture together with its aspect ratio.
img, aspect_ratio = load_image()
# Plot region derived from the image size, starting at -1 on both axes
# (presumably left/right/bottom/top, judging by the .l/.r/.b/.t attributes).
boundary = Boundary(-1, img.size[0], -1, img.size[1])
# Figure sized from the aspect ratio; dpi chosen so that
# fig_height * dpi equals video_height.
fig = plt.figure(
    dpi=video_height / fig_height,
    figsize=(fig_height * aspect_ratio, fig_height),
)
# A single axes object covering the whole figure, with the axis hidden.
ax = fig.add_axes(rect=[0.0, 0.0, 1.0, 1.0])
ax.set_axis_off()
# Axis limits follow the boundary.
ax.set_xlim(boundary.l, boundary.r)
ax.set_ylim(boundary.b, boundary.t)
# Progress reporting for the animation run.
progressbar = ProgressBar(0, frame_count_estimate, "Running animation")


# init_func called before the first frame
def init_animation():
    global balls, scat
    progressbar.start()
    # init balls
    balls = []
    for col in range(img.size[0]):
        for row in range(img.size[1]):
            if img.getpixel((col, row))[3] > 0:
                x = float(col)
                y = float(img.size[1] - row - 1)
                # rgba 0-255 to rgba 0-1
                color = [rgba / 255 for rgba in img.getpixel((col, row))]