Python load Examples

Programming Language: Python

Namespace/Package Name: clip.clip

Method/Function: load

Examples at hotexamples.com: 6

Python load - 6 examples found. These are the top-rated real-world Python examples of clip.clip.load extracted from open-source projects. You can rate examples to help us improve their quality.

Example #1

Show file

File: 08_blindfold_attentions.py Project: johntd54/investigate_clip_attention

def predict(query, image_path, n_off):
    """Score ``query`` against one image with CLIP while attention is hooked.

    A forward hook built by ``get_blindfold_hook(n_off)`` is registered on
    every visual-transformer attention module, then the model is run 50 times
    on the same image/text pair.

    Args:
        query: Text handed to ``tokenizer.encode``.
        image_path: Path of the image file to score.
        n_off: Forwarded unchanged to ``get_blindfold_hook``.

    Returns:
        A list with 50 entries, one per forward pass: the first element of
        the softmax over ``logits_per_image``.
    """
    VOCAB_PATH = 'bpe_simple_vocab_16e6.txt.gz'

    model, transform = load('ViT-B/32', jit=False)
    # Raw string with escaped dots: the previous non-raw f-string relied on the
    # invalid escape "\d" and let every "." match an arbitrary character.
    attention_name = r'visual\.transformer\.resblocks\.\d+\.attn$'
    for name, module in model.named_modules():
        if regex.match(attention_name, name):
            module.register_forward_hook(get_blindfold_hook(n_off))

    tokenizer = SimpleTokenizer(bpe_path=VOCAB_PATH,
                                context_length=model.context_length)
    view_transform = transforms.Compose([
        transforms.Resize(224, interpolation=Image.BICUBIC),
        transforms.CenterCrop(224), lambda image: image.convert('RGB')
    ])
    is_fp16 = False

    # device = "cuda" if torch.cuda.is_available() else "cpu"
    device = 'cpu'
    if is_fp16:
        model.to(device=device).eval().half()
    else:
        model.to(device=device).eval().float()

    model.eval()
    with torch.no_grad():
        text = tokenizer.encode(query).to(device)
        text_features = model.encode_text(text)  # N_queries x 512

        # image_path = "/home/john/datasets/imagenet/object_localization/val/n01440764/ILSVRC2012_val_00002138.JPEG"
        image_name = Path(image_path).stem
        # Open the file once and close it as soon as both views are built.
        with Image.open(image_path) as raw_image:
            image_vis = np.asarray(view_transform(raw_image))
            image = transform(raw_image).unsqueeze(0).to(device)
        image_features = model.encode_image(image)  # 1 x 512

        visual_attention = get_attention_maps(model,
                                              visual=True)  #[<n_heads, t, t>]

        # Disabled per-head visualisation; it is the only consumer of
        # image_name, image_vis and visual_attention.
        # for layer_n, each_attention_layer in enumerate(visual_attention):
        #     for idx in range(each_attention_layer.size(0)):
        #         vis = each_attention_layer[idx, 0, 1:].reshape(7,7).detach().numpy()
        #         vis -= vis.min()
        #         vis /= vis.max()
        #         vis = cv2.resize(vis, (224, 224))[...,np.newaxis]
        #         result = (vis * image_vis).astype(np.uint8)
        #         output_file = Path(f'logs/{image_name}/layer_{layer_n:02d}/head_{idx:02d}.png')
        #         output_file.parent.mkdir(parents=True, exist_ok=True)
        #         Image.fromarray(result).save(str(output_file))

        tries = []
        for _ in range(50):
            logits_per_image, logits_per_text = model(image, text)
            probs = logits_per_image.softmax(dim=-1).cpu().numpy().squeeze()
            tries.append(probs[0])

    return tries

Example #2

Show file

File: get_data_official.py Project: johntd54/investigate_clip_attention

def get_imagenet():
    """Classify the ImageNet 'val' split with CLIP and collect the results.

    Every class is turned into a text query of the form ``'a <names>'`` and
    each batch is assigned the query with the highest image logit.

    Returns:
        ``(predictions, ground_truths)``: two lists of class indices in
        loader order (the loader is created with ``shuffle=False``).
    """
    # Configuration.
    vocab_path = 'bpe_simple_vocab_16e6.txt.gz'
    imagenet_root = '/home/john/john/data/imagenet'
    use_half = False
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Model in inference mode, cast to the requested precision.
    model, transform = load('ViT-B/32', jit=False)
    model.to(device=device)
    model.eval()
    if use_half:
        model.half()
    else:
        model.float()

    # Tokenizer; the image transform is the one returned by load().
    tokenizer = SimpleTokenizer(
        bpe_path=vocab_path,
        context_length=model.context_length)

    # Data; order must stay fixed, hence shuffle=False.
    data = datasets.ImageNet(imagenet_root, 'val', transform=transform)
    loader = DataLoader(data, batch_size=256, shuffle=False, num_workers=16)

    predicted = []
    expected = []
    with torch.no_grad():
        prompts = [f'a {", ".join(names)}' for names in data.classes]
        text = tokenizer.encode(prompts).to(device)

        for batch_no, (images, labels) in enumerate(loader):
            image_logits, _text_logits = model(images.to(device), text)
            best = image_logits.argmax(dim=-1)
            predicted.extend(best.cpu().data.numpy().tolist())
            expected.extend(labels.data.numpy().tolist())

            # Progress marker every 100 batches.
            if batch_no % 100 == 1:
                print(batch_no)

    return predicted, expected

Example #3

Show file

    def __init__(self, config):
        """Load frozen CLIP and the configured model, then embed the target.

        Args:
            config: Object providing ``device``, ``model`` (a callable that
                takes ``config``), ``task`` and ``target``. For the
                ``"txt2img"`` task ``target`` is tokenized as text; for
                ``"img2txt"`` it is opened as an image file.
        """
        self.config = config
        self.augmentation = None

        self.CLIP, clip_preprocess = clip.load("ViT-B/32",
                                               device=self.config.device)
        self.CLIP = self.CLIP.eval()
        freeze_model(self.CLIP)
        self.model = self.config.model(config).to(self.config.device).eval()
        freeze_model(self.model)

        if config.task == "txt2img":
            self.tokens = clip.tokenize([self.config.target
                                         ]).to(self.config.device)
            self.text_features = self.CLIP.encode_text(self.tokens).detach()
        elif config.task == "img2txt":
            # Close the file as soon as the preprocessed tensor exists.
            with Image.open(self.config.target) as target_image:
                image = clip_preprocess(target_image).unsqueeze(0).to(
                    self.config.device)
            # Detached like text_features: the target embedding is a constant.
            self.image_features = self.CLIP.encode_image(image).detach()

Example #4

Show file

File: visualize_attention_official.py Project: johntd54/investigate_clip_attention

        to_log = output.cpu().data.numpy()
    log = {'name': name, 'output': to_log}
    intermediate[idx] = log
    return output


# torch.nn.modules.module.register_module_forward_hook(debug_hook)

if __name__ == '__main__':

    MODEL_PATH = 'clip.pth'
    VOCAB_PATH = 'bpe_simple_vocab_16e6.txt.gz'

    # model = CLIP(attention_probs_dropout_prob=0, hidden_dropout_prob=0)
    # model.load_state_dict(state_dict = torch.load(MODEL_PATH))
    model, transform = load('ViT-B/32', jit=False)

    tokenizer = SimpleTokenizer(bpe_path=VOCAB_PATH,
                                context_length=model.context_length)
    # transform = build_transform(model.input_resolution.item())
    view_transform = transforms.Compose([
        transforms.Resize(224, interpolation=Image.BICUBIC),
        transforms.CenterCrop(224), lambda image: image.convert('RGB')
    ])
    is_fp16 = False

    # device = "cuda" if torch.cuda.is_available() else "cpu"
    device = 'cpu'
    if is_fp16:
        model.to(device=device).eval().half()
    else:

Example #5

Show file

def evaluate(beam_size):
    """
    Evaluation

    Decodes every image of the 'TEST' split with beam search and scores the
    generated captions against the reference captions. When
    ``clip_beam_search`` is set, candidate extensions are re-ranked by the
    similarity between their CLIP text features and the image features.

    :param beam_size: beam size at which to generate captions for evaluation
    :return: BLEU-4 score
    """
    # DataLoader
    _transforms = [normalize]
    if use_clip:
        # Keep only the first two steps of CLIP's preprocessing pipeline and
        # append the project's own normalisation after them.
        _, preprocess = clip.load('ViT-B/32')
        preprocess.transforms = preprocess.transforms[:2]
        _transforms = preprocess.transforms + _transforms
    _transforms = transforms.Compose(_transforms)
    loader = torch.utils.data.DataLoader(CaptionDataset(data_folder,
                                                        data_name,
                                                        'TEST',
                                                        transform=_transforms),
                                         batch_size=1,
                                         shuffle=True,
                                         num_workers=1,
                                         pin_memory=True)

    # TODO: Batched Beam Search
    # Therefore, do not use a batch_size greater than 1 - IMPORTANT!

    # Lists to store references (true captions), and hypothesis (prediction) for each image
    # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
    # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]
    references = list()
    hypotheses = list()

    # Loop-invariant lookups: index -> word, and the tokens stripped from
    # references and hypotheses before scoring.
    rev_word_map = {v: k for k, v in word_map.items()}
    special_tokens = {
        word_map['<start>'], word_map['<end>'], word_map['<pad>']
    }

    # For each image
    for i, (image, caps, caplens, allcaps) in enumerate(
            tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))):

        k = beam_size

        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)

        # Encode
        encoder_out = encoder(
            image)  # (1, enc_image_size, enc_image_size, encoder_dim)
        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)

        # Flatten encoding
        encoder_out = encoder_out.view(
            1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
        num_pixels = encoder_out.size(1)

        # We'll treat the problem as having a batch size of k
        encoder_out = encoder_out.expand(
            k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(
            device)  # (k, 1)

        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        h, c = decoder.init_hidden_state(encoder_out)

        if clip_beam_search:
            with torch.no_grad():
                image_features = encoder.clip_model.encode_image(image)
                image_features /= image_features.norm(dim=-1, keepdim=True)

        def get_clip_scores(seqs, scores):
            """Pick the next beam by CLIP similarity instead of decoder score.

            Reads ``k``, ``step`` and ``image_features`` from the enclosing
            scope and overwrites ``top_k_scores`` there.

            :return: (indices of the beams to extend, next word indices)
            """
            nonlocal top_k_scores
            special_words = ['<start>', '<end>']
            replace_words = {
                '<unk>': '<averyunpleasantword>',
                '<pad>': '<anotherveryunpleasantword>'
            }
            special_words_enc = [word_map[w] for w in special_words]
            if step == 1:
                top_k_scores, next_word_inds = scores[0].topk(
                    k, 0, True, True)  # (s)
                return torch.zeros(k, device=device).long(), next_word_inds
            next_word_inds = scores.topk(k)[1]
            inds = []

            text = []
            weights = torch.ones(k**2).to(device)
            count = 0
            for idx, (prev_seq, next_words) in enumerate(
                    zip(seqs.tolist(), next_word_inds.tolist())):
                prev_words = [
                    rev_word_map[w] for w in prev_seq
                    if w not in special_words_enc
                ]
                for word in next_words:
                    cap_words = copy.copy(prev_words)
                    # ``word`` is a vocabulary index, so it must be compared
                    # with the encoded special tokens; comparing it with the
                    # string list was always true and leaked the literal
                    # <start>/<end> markers into the CLIP text.
                    if word not in special_words_enc:
                        word_char = rev_word_map[word]
                        word_char = replace_words.get(word_char, word_char)
                        cap_words.append(word_char)
                    text.append(' '.join(cap_words))
                    inds.append([idx, word])
                    # Candidates that finish the caption get a larger weight.
                    if rev_word_map[word] == '<end>':
                        weights[count] = 1.5
                    count += 1
            inds = np.array(inds)
            text = clip.tokenize(text).to(device)
            with torch.no_grad():
                text_features = encoder.clip_model.encode_text(text)

            # Pick the top k most similar captions for the image
            text_features /= text_features.norm(dim=-1, keepdim=True)
            similarity = (image_features @ text_features.T *
                          weights).log_softmax(dim=-1)
            top_k_scores, indices = similarity.view(-1).topk(k, 0, True, True)
            prev_inds = torch.tensor([inds[idx][0] for idx in indices],
                                     device=device)
            next_inds = torch.tensor([inds[idx][1] for idx in indices],
                                     device=device)

            return prev_inds, next_inds

        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        while True:

            embeddings = decoder.embedding(k_prev_words).squeeze(
                1)  # (s, embed_dim)

            awe, _ = decoder.attention(encoder_out,
                                       h)  # (s, encoder_dim), (s, num_pixels)

            gate = decoder.sigmoid(
                decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
            awe = gate * awe

            h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1),
                                       (h, c))  # (s, decoder_dim)

            scores = decoder.fc(h)  # (s, vocab_size)
            scores = F.log_softmax(scores, dim=1)

            # Add
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            if clip_beam_search:
                prev_word_inds, next_word_inds = get_clip_scores(seqs, scores)
            else:

                # For the first step, all k points will have the same scores (since same k previous words, h, c)
                if step == 1:
                    top_k_scores, top_k_words = scores[0].topk(
                        k, 0, True, True)  # (s)
                else:
                    # Unroll and find top scores, and their unrolled indices
                    top_k_scores, top_k_words = scores.view(-1).topk(
                        k, 0, True, True)  # (s)

                # Convert unrolled indices to actual indices of scores
                prev_word_inds = (top_k_words / vocab_size).long()  # (s)
                next_word_inds = (top_k_words % vocab_size).long()  # (s)

            # Add new words to sequences
            seqs = torch.cat(
                [seqs[prev_word_inds],
                 next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [
                ind for ind, next_word in enumerate(next_word_inds)
                if next_word != word_map['<end>']
            ]
            complete_inds = list(
                set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly
            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h = h[prev_word_inds[incomplete_inds]]
            c = c[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1

        # Prefer the best finished caption from ANY step. Testing only the
        # last step's complete_inds discarded captions finished earlier
        # whenever the loop stopped on the step limit.
        if complete_seqs_scores:
            best = complete_seqs_scores.index(max(complete_seqs_scores))
            seq = complete_seqs[best]
        else:
            # Nothing reached <end>: fall back to the best unfinished beam.
            best = top_k_scores.argmax().item()
            seq = seqs[best].tolist()

        # References
        img_caps = allcaps[0].tolist()
        img_captions = [
            [rev_word_map[w] for w in cap if w not in special_tokens]
            for cap in img_caps
        ]  # remove <start> and pads
        references.append(img_captions)

        # Hypotheses
        hypotheses.append(
            [rev_word_map[w] for w in seq if w not in special_tokens])

        assert len(references) == len(hypotheses)

    bleu4 = corpus_bleu(references,
                        hypotheses,
                        smoothing_function=SmoothingFunction().method1)

    return bleu4

Example #6

Show file

def main():
	"""Generate an image with a VQGAN model steered by CLIP prompts.

	Loads a VQGAN model and a CLIP perceptor, optimises a latent ``z`` with
	Adam against the configured prompts, writes ``progress.png`` plus one PNG
	per step under ``./steps/``, and finally exits the process.
	"""
	# Model selection and download.
	# NOTE(review): these flags are only referenced inside the string-literal
	# (i.e. disabled) download block that follows them.
	imagenet_1024 = False
	imagenet_16384 = True
	coco = False
	faceshq = False
	wikiart_1024 = False
	wikiart_16384 = False
	sflckr = False
	openimages_8192 = False

	'''
	if imagenet_1024:
		# !curl -L -o vqgan_imagenet_f16_1024.yaml -C - 'http://mirror.io.community/blob/vqgan/vqgan_imagenet_f16_1024.yaml' #ImageNet 1024
		# !curl -L -o vqgan_imagenet_f16_1024.ckpt -C - 'http://mirror.io.community/blob/vqgan/vqgan_imagenet_f16_1024.ckpt'  #ImageNet 1024
		Popen("curl -L -o vqgan_imagenet_f16_1024.yaml -C - 'http://mirror.io.community/blob/vqgan/vqgan_imagenet_f16_1024.yaml'")
		Popen("curl -L -o vqgan_imagenet_f16_1024.ckpt -C - 'http://mirror.io.community/blob/vqgan/vqgan_imagenet_f16_1024.ckpt'")
	if imagenet_16384:
		# !curl -L -o vqgan_imagenet_f16_16384.yaml -C - 'https://heibox.uni-heidelberg.de/d/a7530b09fed84f80a887/files/?p=%2Fconfigs%2Fmodel.yaml&dl=1' #ImageNet 16384
		# !curl -L -o vqgan_imagenet_f16_16384.ckpt -C - 'https://heibox.uni-heidelberg.de/d/a7530b09fed84f80a887/files/?p=%2Fckpts%2Flast.ckpt&dl=1' #ImageNet 16384
		Popen("curl -L -o vqgan_imagenet_f16_16384.yaml -C - 'https://heibox.uni-heidelberg.de/d/a7530b09fed84f80a887/files/?p=%2Fconfigs%2Fmodel.yaml&dl=1'")
		Popen("curl -L -o vqgan_imagenet_f16_16384.ckpt -C - 'https://heibox.uni-heidelberg.de/d/a7530b09fed84f80a887/files/?p=%2Fckpts%2Flast.ckpt&dl=1'")
	if openimages_8192:
		# !curl -L -o vqgan_openimages_f16_8192.yaml -C - 'https://heibox.uni-heidelberg.de/d/2e5662443a6b4307b470/files/?p=%2Fconfigs%2Fmodel.yaml&dl=1' #ImageNet 16384
		# !curl -L -o vqgan_openimages_f16_8192.ckpt -C - 'https://heibox.uni-heidelberg.de/d/2e5662443a6b4307b470/files/?p=%2Fckpts%2Flast.ckpt&dl=1' #ImageNet 16384
		Popen("curl -L -o vqgan_openimages_f16_8192.yaml -C - 'https://heibox.uni-heidelberg.de/d/2e5662443a6b4307b470/files/?p=%2Fconfigs%2Fmodel.yaml&dl=1'")
		Popen("curl -L -o vqgan_openimages_f16_8192.ckpt -C - 'https://heibox.uni-heidelberg.de/d/2e5662443a6b4307b470/files/?p=%2Fckpts%2Flast.ckpt&dl=1'")
	if coco:
		# !curl -L -o coco.yaml -C - 'https://dl.nmkd.de/ai/clip/coco/coco.yaml' #COCO
		# !curl -L -o coco.ckpt -C - 'https://dl.nmkd.de/ai/clip/coco/coco.ckpt' #COCO
		Popen("curl -L -o coco.yaml -C - 'https://dl.nmkd.de/ai/clip/coco/coco.yaml'")
		Popen("curl -L -o coco.ckpt -C - 'https://dl.nmkd.de/ai/clip/coco/coco.ckpt'")
	if faceshq:
		# !curl -L -o faceshq.yaml -C - 'https://drive.google.com/uc?export=download&id=1fHwGx_hnBtC8nsq7hesJvs-Klv-P0gzT' #FacesHQ
		# !curl -L -o faceshq.ckpt -C - 'https://app.koofr.net/content/links/a04deec9-0c59-4673-8b37-3d696fe63a5d/files/get/last.ckpt?path=%2F2020-11-13T21-41-45_faceshq_transformer%2Fcheckpoints%2Flast.ckpt' #FacesHQ
		Popen("curl -L -o vqgan_imagenet_f16_16384.yaml -C - 'https://heibox.uni-heidelberg.de/d/a7530b09fed84f80a887/files/?p=%2Fconfigs%2Fmodel.yaml&dl=1'")
		Popen("curl -L -o vqgan_imagenet_f16_16384.ckpt -C - 'https://heibox.uni-heidelberg.de/d/a7530b09fed84f80a887/files/?p=%2Fckpts%2Flast.ckpt&dl=1")
	if wikiart_1024: 
		# !curl -L -o wikiart_1024.yaml -C - 'http://mirror.io.community/blob/vqgan/wikiart.yaml' #WikiArt 1024
		# !curl -L -o wikiart_1024.ckpt -C - 'http://mirror.io.community/blob/vqgan/wikiart.ckpt' #WikiArt 1024
		Popen("curl -L -o wikiart_1024.yaml -C - 'http://mirror.io.community/blob/vqgan/wikiart.yaml'")
		Popen("curl -L -o wikiart_1024.ckpt -C - 'http://mirror.io.community/blob/vqgan/wikiart.ckpt'")
	if wikiart_16384: 
		# !curl -L -o wikiart_16384.yaml -C - 'http://mirror.io.community/blob/vqgan/wikiart_16384.yaml' #WikiArt 16384
		# !curl -L -o wikiart_16384.ckpt -C - 'http://mirror.io.community/blob/vqgan/wikiart_16384.ckpt' #WikiArt 16384
		Popen("curl -L -o wikiart_16384.yaml -C - 'http://mirror.io.community/blob/vqgan/wikiart_16384.yaml'")
		Popen("curl -L -o wikiart_16384.ckpt -C - 'http://mirror.io.community/blob/vqgan/wikiart_16384.ckpt'")
	if sflckr:
		# !curl -L -o sflckr.yaml -C - 'https://heibox.uni-heidelberg.de/d/73487ab6e5314cb5adba/files/?p=%2Fconfigs%2F2020-11-09T13-31-51-project.yaml&dl=1' #S-FLCKR
		# !curl -L -o sflckr.ckpt -C - 'https://heibox.uni-heidelberg.de/d/73487ab6e5314cb5adba/files/?p=%2Fcheckpoints%2Flast.ckpt&dl=1' #S-FLCKR
		Popen("curl -L -o sflckr.yaml -C - 'https://heibox.uni-heidelberg.de/d/73487ab6e5314cb5adba/files/?p=%2Fconfigs%2F2020-11-09T13-31-51-project.yaml&dl=1'")
		Popen("curl -L -o sflckr.ckpt -C - 'https://heibox.uni-heidelberg.de/d/73487ab6e5314cb5adba/files/?p=%2Fcheckpoints%2Flast.ckpt&dl=1'")
	'''

	ImageFile.LOAD_TRUNCATED_IMAGES = True

	def sinc(x):
		"""Return sin(pi*x) / (pi*x) element-wise, with the value 1 where x == 0."""
		scaled = math.pi * x
		ratio = torch.sin(scaled) / scaled
		# The division yields NaN at x == 0; torch.where swaps in 1 there.
		return torch.where(x != 0, ratio, x.new_ones([]))


	def lanczos(x, a):
		"""Return sinc(x) * sinc(x / a) inside (-a, a), zero outside, scaled to sum to 1."""
		inside_window = torch.logical_and(-a < x, x < a)
		windowed = sinc(x) * sinc(x / a)
		kernel = torch.where(inside_window, windowed, x.new_zeros([]))
		return kernel / kernel.sum()


	def ramp(ratio, width):
		"""Return symmetric sample positions spaced ``ratio`` apart around 0.

		Builds ``ceil(width / ratio + 1)`` non-negative multiples of ``ratio``,
		mirrors them onto the negative side and drops the two outermost values.
		"""
		count = math.ceil(width / ratio + 1)
		steps = torch.empty([count])
		position = 0
		for slot in range(count):
			steps[slot] = position
			position += ratio
		mirrored = -steps[1:].flip([0])
		return torch.cat([mirrored, steps])[1:-1]


	def resample(input, size, align_corners=True):
		"""Resize a 4-D tensor to ``size``, low-pass filtering axes that shrink.

		Each axis whose target length is smaller than its current length is
		first convolved with a Lanczos kernel (reflect-padded so the length is
		kept); the result is then resized with bicubic interpolation.

		Args:
			input: Tensor unpacked as ``(n, c, h, w)``.
			size: Target ``(dh, dw)``.
			align_corners: Forwarded to ``F.interpolate``.
		"""
		n, c, h, w = input.shape
		dh, dw = size

		# One single-channel plane per (sample, channel) pair.
		planes = input.view([n * c, 1, h, w])

		if dh < h:
			kernel_h = lanczos(ramp(dh / h, 2), 2).to(planes.device, planes.dtype)
			pad_h = (kernel_h.shape[0] - 1) // 2
			padded = F.pad(planes, (0, 0, pad_h, pad_h), "reflect")
			planes = F.conv2d(padded, kernel_h[None, None, :, None])

		if dw < w:
			kernel_w = lanczos(ramp(dw / w, 2), 2).to(planes.device, planes.dtype)
			pad_w = (kernel_w.shape[0] - 1) // 2
			padded = F.pad(planes, (pad_w, pad_w, 0, 0), "reflect")
			planes = F.conv2d(padded, kernel_w[None, None, None, :])

		restored = planes.view([n, c, h, w])
		return F.interpolate(
			restored, size, mode="bicubic", align_corners=align_corners
		)


	class ReplaceGrad(torch.autograd.Function):
		"""Return the first argument in forward; send the gradient to the second.

		Backward gives no gradient to ``x_forward`` and hands the incoming
		gradient, summed down to ``x_backward``'s shape, to ``x_backward``.
		"""

		@staticmethod
		def forward(ctx, x_forward, x_backward):
			# Only the shape is needed later, so no tensor is saved.
			ctx.shape = x_backward.shape
			return x_forward

		@staticmethod
		def backward(ctx, grad_in):
			routed = grad_in.sum_to_size(ctx.shape)
			return None, routed


	replace_grad = ReplaceGrad.apply


	class ClampWithGrad(torch.autograd.Function):
		"""Clamp in forward; in backward keep only selected gradient entries.

		A gradient entry is kept where ``grad * (input - clamped input) >= 0``
		and zeroed elsewhere. ``min`` and ``max`` receive no gradient.
		"""

		@staticmethod
		def forward(ctx, input, min, max):
			ctx.min, ctx.max = min, max
			ctx.save_for_backward(input)
			return input.clamp(min, max)

		@staticmethod
		def backward(ctx, grad_in):
			(saved,) = ctx.saved_tensors
			overshoot = saved - saved.clamp(ctx.min, ctx.max)
			keep = grad_in * overshoot >= 0
			return grad_in * keep, None, None


	clamp_with_grad = ClampWithGrad.apply


	def vector_quantize(x, codebook):
		"""Replace each vector of ``x`` by the codebook row with the smallest ``d``.

		``d`` is ``|x|^2 + |code|^2 - 2 * x @ codebook.T``. The result is passed
		through ``replace_grad`` together with ``x``.
		"""
		x_sq = x.pow(2).sum(dim=-1, keepdim=True)
		code_sq = codebook.pow(2).sum(dim=1)
		d = x_sq + code_sq - 2 * x @ codebook.T
		nearest = d.argmin(-1)
		selector = F.one_hot(nearest, codebook.shape[0]).to(d.dtype)
		x_q = selector @ codebook
		return replace_grad(x_q, x)


	class Prompt(nn.Module):
		"""Loss term measuring how far input embeddings are from a target embedding.

		``embed``, ``weight`` and ``stop`` are stored as buffers.
		"""

		def __init__(self, embed, weight=1., stop=float('-inf')):
			super().__init__()
			buffers = (
				("embed", embed),
				("weight", torch.as_tensor(weight)),
				("stop", torch.as_tensor(stop)),
			)
			for buffer_name, value in buffers:
				self.register_buffer(buffer_name, value)

		def forward(self, input):
			"""Return ``|weight|`` times the mean signed distance to ``embed``."""
			input_normed = F.normalize(input.unsqueeze(1), dim=2)
			embed_normed = F.normalize(self.embed.unsqueeze(0), dim=2)
			# Half the norm of the difference of the normalised vectors,
			# then arcsin, squared, doubled.
			half_gap = (input_normed - embed_normed).norm(dim=2) / 2
			dists = half_gap.arcsin().pow(2) * 2
			dists = dists * self.weight.sign()
			# Value comes from dists; the gradient is taken through
			# maximum(dists, stop) instead.
			floored = torch.maximum(dists, self.stop)
			return self.weight.abs() * replace_grad(dists, floored).mean()


	def parse_prompt(prompt):
		"""Split ``"text[:weight[:stop]]"`` into ``(text, weight, stop)``.

		Missing fields default to weight ``1`` and stop ``-inf``. Trailing
		fields are only treated as numbers when they parse as floats, so a
		colon inside the text itself (``"a cat: sitting"``, ``"C:\\img.png"``,
		``"http://host/x.png:0.5"``) no longer raises ``ValueError``.

		Returns:
			A ``(str, float, float)`` tuple.
		"""
		defaults = ["", "1", "-inf"]
		# Try the most specific layout first, then fall back to fewer fields.
		for max_splits in (2, 1):
			vals = prompt.rsplit(":", max_splits)
			vals = vals + defaults[len(vals):]
			try:
				return vals[0], float(vals[1]), float(vals[2])
			except ValueError:
				continue
		# No numeric suffix at all: the whole string is the prompt text.
		return prompt, 1.0, float("-inf")


	class MakeCutouts(nn.Module):
		"""Produce ``cutn`` augmented, pooled copies of an image batch.

		The input is pooled down to ``cut_size`` x ``cut_size`` (mean of adaptive
		average and adaptive max pooling), repeated ``cutn`` times, run through
		the augmentation pipeline and optionally perturbed with Gaussian noise.

		Args:
			cut_size: Side length of every cutout.
			cutn: Number of copies to produce per call.
			cut_pow: Stored on the instance; not used by ``forward``.
		"""

		def __init__(self, cut_size, cutn, cut_pow=1.):
			super().__init__()
			self.cut_size = cut_size
			self.cutn = cutn
			self.cut_pow = cut_pow

			self.augs = nn.Sequential(
				# K.RandomHorizontalFlip(p=0.5),
				# K.RandomVerticalFlip(p=0.5),
				# K.RandomSolarize(0.01, 0.01, p=0.7),
				# K.RandomSharpness(0.3, p=0.4),
				# K.RandomResizedCrop(
				#	size=(self.cut_size, self.cut_size), 
				#	scale=(0.1, 1), ratio=(0.75, 1.333), 
				#	cropping_mode="resample", p=0.5
				# ),
				# K.RandomCrop(
				#	size=(self.cut_size, self.cut_size), p=0.5
				# ),
				K.RandomAffine(
					degrees=15, translate=0.1, p=0.7, 
					padding_mode="border"
				),
				K.RandomPerspective(0.7, p=0.7),
				K.ColorJitter(hue=0.1, saturation=0.1, p=0.7),
				K.RandomErasing(
					(.1, .4), (.3, 1/.3), same_on_batch=True, p=0.7
				),
			)

			# Upper bound of the per-sample noise scale; falsy disables noise.
			self.noise_fac = 0.1
			self.av_pool = nn.AdaptiveAvgPool2d(
				(self.cut_size, self.cut_size)
			)
			self.max_pool = nn.AdaptiveMaxPool2d(
				(self.cut_size, self.cut_size)
			)

		def forward(self, input):
			"""Return the augmented cutout batch, ``cutn`` copies per input sample."""
			# Every cutout is the same pooled image, so pool once and repeat
			# instead of recomputing it ``cutn`` times.
			cutout = (self.av_pool(input) + self.max_pool(input)) / 2
			batch = self.augs(torch.cat([cutout] * self.cutn, dim=0))
			if self.noise_fac:
				# One noise scale per row of the batch. Sizing this by cutn
				# alone only broadcast correctly for a single input image.
				facs = batch.new_empty([batch.shape[0], 1, 1, 1])\
					.uniform_(0, self.noise_fac)
				batch = batch + facs * torch.randn_like(batch)
			return batch


	def load_vqgan_model(config_path, checkpoint_path):
		"""Build the VQGAN described by a config file and load its checkpoint.

		Args:
			config_path: Path given to ``OmegaConf.load``.
			checkpoint_path: Path given to the model's ``init_from_ckpt``.

		Returns:
			The model in eval mode with gradients disabled and its ``loss``
			attribute removed. For a ``Net2NetTransformer`` config this is the
			transformer's ``first_stage_model``.

		Raises:
			ValueError: If ``config.model.target`` is not a supported type.
		"""
		config = OmegaConf.load(config_path)
		target = config.model.target

		if target == "taming.models.cond_transformer.Net2NetTransformer":
			parent_model = cond_transformer.Net2NetTransformer(**config.model.params)
			parent_model.eval().requires_grad_(False)
			parent_model.init_from_ckpt(checkpoint_path)
			model = parent_model.first_stage_model
		elif target == "taming.models.vqgan.VQModel":
			model = vqgan.VQModel(**config.model.params)
			model.eval().requires_grad_(False)
			model.init_from_ckpt(checkpoint_path)
		elif target == "taming.models.vqgan.GumbelVQ":
			model = vqgan.GumbelVQ(**config.model.params)
			model.eval().requires_grad_(False)
			model.init_from_ckpt(checkpoint_path)
		else:
			raise ValueError(f"unknown model type: {config.model.target}")

		del model.loss
		return model


	def resize_image(image, out_size):
		"""Resize ``image`` keeping its aspect ratio, never exceeding its own area.

		The target area is the smaller of the image's area and
		``out_size[0] * out_size[1]``.
		"""
		src_w = image.size[0]
		src_h = image.size[1]
		ratio = src_w / src_h
		area = min(src_w * src_h, out_size[0] * out_size[1])
		new_w = round((area * ratio)**0.5)
		new_h = round((area / ratio)**0.5)
		return image.resize((new_w, new_h), Image.LANCZOS)


	# Run settings.
	texts = "Soon I’ll be on the largest screen" # "|"-separated text prompts.
	width = 600
	height = 600
	model = "vqgan_imagenet_f16_16384" # Must match a downloaded model.
	images_interval = 50
	init_image = "" # Image path or url here.
	target_images = "" # Image path here; several may be joined with "|".
	seed = 42
	max_iterations = 200

	model_names = {
		"vqgan_imagenet_f16_16384": 'ImageNet 16384',
		"vqgan_imagenet_f16_1024": "ImageNet 1024", 
		"vqgan_openimages_f16_8192": "OpenImages 8192",
		"wikiart_1024": "WikiArt 1024", 
		"wikiart_16384": "WikiArt 16384", 
		"coco": "COCO-Stuff", 
		"faceshq": "FacesHQ", 
		"sflckr": "S-FLCKR"
	}
	# The lookup also rejects an unknown ``model`` early with a KeyError.
	name_model = model_names[model]

	# Normalise the "unset" spellings of each setting.
	if seed == -1:
		seed = None
	# This used to compare against None and assign None back (a no-op);
	# the string "None" is the unset marker, as for target_images below.
	if init_image == "None":
		init_image = None
	if target_images == "None" or not target_images:
		target_images = []
	else:
		target_images = target_images.split("|")
		target_images = [image.strip() for image in target_images]

	texts = [phrase.strip() for phrase in texts.split("|")]
	if texts == [""]:
		texts = []

	# Bundle everything the rest of main() reads through ``args``.
	args = argparse.Namespace(
		prompts=texts,
		image_prompts=target_images,
		noise_prompt_seeds=[],
		noise_prompt_weights=[],
		size=[width, height],
		init_image=init_image,
		init_weight=0.,
		clip_model="ViT-B/32",
		vqgan_config=f"{model}.yaml",
		vqgan_checkpoint=f"{model}.ckpt",
		step_size=0.1,
		cutn=32,
		cut_pow=1.,
		display_freq=images_interval,
		seed=seed,
	)

	# Do the run.
	device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
	print("Using device:", device)
	if texts:
		print("Using texts:", texts)
	if target_images:
		print("Using image prompts:", target_images)
	# Draw a fresh seed only when none was configured.
	seed = torch.seed() if args.seed is None else args.seed
	torch.manual_seed(seed)
	print("Using seed:", seed)

	# Generator and frozen CLIP perceptor, both on the chosen device.
	model = load_vqgan_model(args.vqgan_config, args.vqgan_checkpoint).to(device)
	perceptor = clip.load(args.clip_model, jit=False)[0]
	perceptor = perceptor.eval().requires_grad_(False).to(device)

	cut_size = perceptor.visual.input_resolution

	f = 2**(model.decoder.num_resolutions - 1)
	make_cutouts = MakeCutouts(cut_size, args.cutn, cut_pow=args.cut_pow)

	# Latent grid size, and the image size it decodes to (a multiple of f).
	toksX = args.size[0] // f
	toksY = args.size[1] // f
	sideX = toksX * f
	sideY = toksY * f

	# This one checkpoint exposes its codebook under different attribute names.
	if args.vqgan_checkpoint == "vqgan_openimages_f16_8192.ckpt":
		e_dim = 256
		n_toks = model.quantize.n_embed
		codebook_weight = model.quantize.embed.weight
	else: 
		e_dim = model.quantize.e_dim
		n_toks = model.quantize.n_e
		codebook_weight = model.quantize.embedding.weight
	# Per-channel codebook extremes, shaped to broadcast against z.
	z_min = codebook_weight.min(dim=0).values[None, :, None, None]
	z_max = codebook_weight.max(dim=0).values[None, :, None, None]

	# Starting latent: encode the init image when given, otherwise random.
	if args.init_image:
		if "http" in args.init_image:
			init_source = urlopen(args.init_image)
		else:
			init_source = args.init_image
		pil_image = Image.open(init_source).convert("RGB")
		pil_image = pil_image.resize((sideX, sideY), Image.LANCZOS)
		pil_tensor = TF.to_tensor(pil_image)
		# Pixel values are rescaled from [0, 1] to [-1, 1] before encoding.
		z, *_ = model.encode(pil_tensor.to(device).unsqueeze(0) * 2 - 1)
	else:
		token_ids = torch.randint(n_toks, [toksY * toksX], device=device)
		one_hot = F.one_hot(token_ids, n_toks).float()
		if args.vqgan_checkpoint == "vqgan_openimages_f16_8192.ckpt":
			codebook_weight = model.quantize.embed.weight
		else:
			codebook_weight = model.quantize.embedding.weight
		z = one_hot @ codebook_weight
		z = z.view([-1, toksY, toksX, e_dim]).permute(0, 3, 1, 2)
		# The codebook sample above only fixes the shape; the values used
		# are uniform noise in [0, 2).
		z = torch.rand_like(z) * 2
	z_orig = z.clone()
	z.requires_grad_(True)
	opt = optim.Adam([z], lr=args.step_size)

	# Channel statistics applied to every image before it is given to CLIP.
	normalize = transforms.Normalize(
		mean=[0.48145466, 0.4578275, 0.40821073],
		std=[0.26862954, 0.26130258, 0.27577711],
	)

	# One Prompt loss module per text, image and noise prompt.
	pMs = []

	for prompt in args.prompts:
		txt, weight, stop = parse_prompt(prompt)
		embed = perceptor.encode_text(clip.tokenize(txt).to(device)).float()
		pMs.append(Prompt(embed, weight, stop).to(device))

	for prompt in args.image_prompts:
		path, weight, stop = parse_prompt(prompt)
		# Close the file as soon as the pixels have been converted.
		with Image.open(path) as source_image:
			pil_image = source_image.convert("RGB")
		resized = resize_image(pil_image, (sideX, sideY))
		batch = make_cutouts(TF.to_tensor(resized).unsqueeze(0).to(device))
		embed = perceptor.encode_image(normalize(batch)).float()
		pMs.append(Prompt(embed, weight, stop).to(device))

	# A separate loop name keeps the run's ``seed`` from being overwritten.
	for noise_seed, weight in zip(args.noise_prompt_seeds, args.noise_prompt_weights):
		gen = torch.Generator().manual_seed(noise_seed)
		embed = torch.empty(
			[1, perceptor.visual.output_dim]
		).normal_(generator=gen)
		pMs.append(Prompt(embed, weight).to(device))


	def synth(z):
		"""Quantize ``z`` against the model codebook and decode it, clamped to [0, 1]."""
		if args.vqgan_checkpoint == "vqgan_openimages_f16_8192.ckpt":
			codebook = model.quantize.embed.weight
		else:
			codebook = model.quantize.embedding.weight
		# vector_quantize gets dim 1 moved to the end; move it back afterwards.
		z_q = vector_quantize(z.movedim(1, 3), codebook).movedim(3, 1)
		decoded = model.decode(z_q)
		return clamp_with_grad(decoded.add(1).div(2), 0, 1)


	@torch.no_grad()
	def checkin(i, losses):
		"""Log the losses of step ``i`` and save the current image as progress.png."""
		total = sum(losses).item()
		per_prompt = ", ".join(f"{loss.item():g}" for loss in losses)
		tqdm.write(f"i: {i}, loss: {total:g}, losses: {per_prompt}")
		snapshot = synth(z)[0].cpu()
		TF.to_pil_image(snapshot).save("progress.png")
		# MAY UNCOMMENT WHEN NOT RUNNING ON DOCKER.
		#display.display(display.Image("progress.png"))


	def ascend_txt(i):
		"""Compute the loss terms of step ``i`` and save the frame to ./steps/<i>.png.

		Returns:
			A list of loss tensors: an optional init-weight term followed by
			one entry per prompt in ``pMs``.
		"""
		out = synth(z)
		iii = perceptor.encode_image(normalize(make_cutouts(out))).float()

		result = []
		if args.init_weight:
			# Pull z towards zero with a weight that decays as 1 / (2i + 1).
			decay = (1 / torch.tensor(i * 2 + 1)) * args.init_weight
			result.append(F.mse_loss(z, torch.zeros_like(z_orig)) * decay)
		result.extend(prompt(iii) for prompt in pMs)

		# First image of the batch as uint8, reordered from (C, H, W) to (H, W, C).
		frame = out.mul(255).clamp(0, 255)[0].cpu().detach().numpy()\
			.astype(np.uint8)
		frame = np.transpose(frame, (1, 2, 0))
		imageio.imwrite(f"./steps/{i}.png", np.array(frame))
		return result


	def train(i):
		"""Run one optimisation step on ``z`` and clamp it to the codebook range."""
		opt.zero_grad()
		step_losses = ascend_txt(i)
		# Report every ``display_freq`` steps, before the update is applied.
		if not i % args.display_freq:
			checkin(i, step_losses)

		sum(step_losses).backward()
		opt.step()
		with torch.no_grad():
			bounded = torch.minimum(torch.maximum(z, z_min), z_max)
			z.copy_(bounded)


	# Every training step writes a frame into ./steps; create the directory
	# up front so the first write cannot fail on a fresh checkout.
	import os
	os.makedirs("./steps", exist_ok=True)

	# Optimise until max_iterations is reached or the user interrupts.
	i = 0
	try:
		with tqdm() as pbar:
			while True:
				train(i)
				if i == max_iterations:
					break
				i += 1
				pbar.update()
	except KeyboardInterrupt:
		pass

	# Generate a video with the result.
	init_frame = 1 # This is the frame where the video will start.
	last_frame = i # Can change i to the number of the last frame wanted
	# to generate. Will raise an error if it does not exist.

	min_fps = 10
	max_fps = 60

	total_frames = last_frame - init_frame

	length = 15 # Desired time of the video in seconds.

	frames = []
	tqdm.write("Generating video...")
	for i in range(init_frame, last_frame):
		frames.append(Image.open("./steps/" + str(i) + ".png"))

	# fps = last_frame / 10
	fps = np.clip(total_frames / length, min_fps, max_fps)

	# MAY UNCOMMENT WHEN NOT RUNNING ON DOCKER.
	#p = Popen(
	#	["ffmpeg", "-y", "-f", "image2pipe", "-vcodec", "png", "-r", 
	#	str(fps), "-i", "-", "-vcodec", "libx264", "-r", str(fps),
	#	"-pix_fmt", "yuv420p", "-crf", "17", "-preset", "veryslow",
	#	"video.mp4"],
	#	stdin=PIPE
	#)
	#for im in tqdm(frames):
	#	im.save(p.stdin, "PNG")
	#p.stdin.close()
	#p.wait()
	#mp4 = open("video.mp4", "rb").read()
	#data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
	#display.HTML('''
	#	<video width=400 controls>
	#		<source src="%s" type="video/mp4">
	#	</video>
	#''' % data_url)

	# Exit the program. The ``exit`` builtin is installed by the ``site``
	# module for interactive use and may be absent; SystemExit always works.
	raise SystemExit(0)