Example #1
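Optimizes an FFT-parameterized image so that CLIP embeddings of random crops match the embedding of a text prompt, saving intermediate frames and assembling them into an mp4 with ffmpeg.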
import os
import shutil

import torch
import clip

# get_args, txt_clean, get_fft_img, get_scale_from_img_freqs, fft_to_rgb,
# random_crop, checkout, img_list and ProgressBar are helpers from the
# surrounding project.

def main():
    args = get_args()
    device = "cuda" if torch.cuda.is_available() else "cpu"

    clip_model, _ = clip.load(args.model)
    print(f"Using model {args.model}")

    input_text = args.input_text
    print(f"Generating from '{input_text}'")

    out_name_list = []
    out_name_list.append(txt_clean(input_text))
    out_name = '-'.join(out_name_list)
    out_name += '-%s' % args.model if 'RN' in args.model.upper() else ''

    tempdir = os.path.join(args.out_dir, out_name)
    os.makedirs(tempdir, exist_ok=True)

    tokenized_text = clip.tokenize([input_text]).to(device)
    text_logits = clip_model.encode_text(tokenized_text).detach()

    num_channels = 3
    spectrum_size = [args.batch_size, num_channels, *args.size]
    fft_img, img_freqs = get_fft_img(
        spectrum_size,
        std=0.01,
        return_img_freqs=True,
    )

    fft_img = fft_img.to(device)
    fft_img.requires_grad = True

    scale = get_scale_from_img_freqs(
        img_freqs=img_freqs,
        decay_power=args.decay,
    )

    scale = scale.to(device)

    shift = None
    if args.noise > 0:
        img_size = img_freqs.shape
        noise_size = (1, 1, *img_size, 1)
        shift = args.noise * torch.randn(noise_size).to(device)

    optimizer = torch.optim.Adam(
        [fft_img],
        args.lrate,
    )

    sign = -1  # maximize similarity by minimizing its negative

    pbar = ProgressBar(args.num_steps // args.save_freq)

    num_steps = args.num_steps
    num_crops = 200
    crop_size = 224

    for step in range(num_steps):
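        # Render the FFT spectrum to RGB, take many random crops, and pull
        # their CLIP embeddings toward the text embedding.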
        loss = 0

        initial_img = fft_to_rgb(
            fft_img=fft_img,
            scale=scale,
            img_size=args.size,
            shift=shift,
            contrast=1.0,
            decorrelate=True,
            device=device,
        )

        crop_img_out = random_crop(
            initial_img,
            num_crops,
            crop_size,
            normalize=True,
        )
        img_logits = clip_model.encode_image(crop_img_out)
        # the prompt is fixed, so reuse the text embedding computed before the loop

        loss += sign * torch.cosine_similarity(
            text_logits,
            img_logits,
            dim=-1,
        ).mean()

        torch.cuda.empty_cache()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % args.save_freq == 0:
            with torch.no_grad():
                img = fft_to_rgb(
                    fft_img=fft_img,
                    scale=scale,
                    img_size=args.size,
                    shift=shift,
                    contrast=1.0,
                    decorrelate=True,
                    device=device,
                )
                img = img.cpu().numpy()

            img_out_path = os.path.join(tempdir,
                                        '%04d.jpg' % (step // args.save_freq))
            checkout(
                img[0],
                img_out_path,
            )

            if pbar is not None:
                pbar.upd()

    os.system('ffmpeg -v warning -y -i "%s" "%s.mp4"' %
              (os.path.join(tempdir, '%04d.jpg'),
               os.path.join(args.out_dir, out_name)))
    shutil.copy(
        img_list(tempdir)[-1],
        os.path.join(args.out_dir, '%s-%d.jpg' % (out_name, num_steps)))

    if args.save_pt is True:
        torch.save(fft_img, '%s.pt' % os.path.join(args.out_dir, out_name))
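The example assumes a get_args() helper. A minimal sketch covering the fields used above (argument names are taken from the code; defaults are assumptions):

import argparse

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_text', '-t', required=True)
    parser.add_argument('--model', '-m', default='ViT-B/32')
    parser.add_argument('--out_dir', default='_out')
    parser.add_argument('--size', type=int, nargs=2, default=[512, 512])
    parser.add_argument('--batch_size', type=int, default=1)
    parser.add_argument('--lrate', type=float, default=0.05)
    parser.add_argument('--num_steps', type=int, default=300)
    parser.add_argument('--save_freq', type=int, default=1)
    parser.add_argument('--decay', type=float, default=1.5)
    parser.add_argument('--noise', type=float, default=0.)
    parser.add_argument('--save_pt', action='store_true')
    return parser.parse_args()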
Example #2
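Zero-shot ImageNet setup: loads the RN50 CLIP model, prints its basic statistics, and defines the ImageNet class names and prompt templates from which per-class text classifiers are built.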
import argparse

import numpy as np
import torch
import torch.nn as nn
import clip
from tqdm.notebook import tqdm

print("Torch version:", torch.__version__)
#assert torch.__version__.split(".") >= ["1", "7", "1"], "PyTorch 1.7.1 or later is required"

parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('--attack', default=None,
                    help='attack type')


model, preprocess = clip.load("RN50")
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size
args = parser.parse_args()
print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)
imagenet_classes = ["tench", "goldfish", "great white shark", "tiger shark", "hammerhead shark", "electric ray", "stingray", "rooster", "hen", "ostrich", "brambling", "goldfinch", "house finch", "junco", "indigo bunting", "American robin", "bulbul", "jay", "magpie", "chickadee", "American dipper", "kite (bird of prey)", "bald eagle", "vulture", "great grey owl", "fire salamander", "smooth newt", "newt", "spotted salamander", "axolotl", "American bullfrog", "tree frog", "tailed frog", "loggerhead sea turtle", "leatherback sea turtle", "mud turtle", "terrapin", "box turtle", "banded gecko", "green iguana", "Carolina anole", "desert grassland whiptail lizard", "agama", "frilled-necked lizard", "alligator lizard", "Gila monster", "European green lizard", "chameleon", "Komodo dragon", "Nile crocodile", "American alligator", "triceratops", "worm snake", "ring-necked snake", "eastern hog-nosed snake", "smooth green snake", "kingsnake", "garter snake", "water snake", "vine snake", "night snake", "boa constrictor", "African rock python", "Indian cobra", "green mamba", "sea snake", "Saharan horned viper", "eastern diamondback rattlesnake", "sidewinder rattlesnake", "trilobite", "harvestman", "scorpion", "yellow garden spider", "barn spider", "European garden spider", "southern black widow", "tarantula", "wolf spider", "tick", "centipede", "black grouse", "ptarmigan", "ruffed grouse", "prairie grouse", "peafowl", "quail", "partridge", "african grey parrot", "macaw", "sulphur-crested cockatoo", "lorikeet", "coucal", "bee eater", "hornbill", "hummingbird", "jacamar", "toucan", "duck", "red-breasted merganser", "goose", "black swan", "tusker", "echidna", "platypus", "wallaby", "koala", "wombat", "jellyfish", "sea anemone", "brain coral", "flatworm", "nematode", "conch", "snail", "slug", "sea slug", "chiton", "chambered nautilus", "Dungeness crab", "rock crab", "fiddler crab", "red king crab", "American lobster", "spiny lobster", "crayfish", "hermit crab", "isopod", "white stork", "black stork", "spoonbill", "flamingo", "little blue heron", "great egret", "bittern bird", "crane bird", "limpkin", "common gallinule", "American coot", "bustard", "ruddy turnstone", "dunlin", "common redshank", "dowitcher", "oystercatcher", "pelican", "king penguin", "albatross", "grey whale", "killer whale", "dugong", "sea lion", "Chihuahua", "Japanese Chin", "Maltese", "Pekingese", "Shih Tzu", "King Charles Spaniel", "Papillon", "toy terrier", "Rhodesian Ridgeback", "Afghan Hound", "Basset Hound", "Beagle", "Bloodhound", "Bluetick Coonhound", "Black and Tan Coonhound", "Treeing Walker Coonhound", "English foxhound", "Redbone Coonhound", "borzoi", "Irish Wolfhound", "Italian Greyhound", "Whippet", "Ibizan Hound", "Norwegian Elkhound", "Otterhound", "Saluki", "Scottish Deerhound", "Weimaraner", "Staffordshire Bull Terrier", "American Staffordshire Terrier", "Bedlington Terrier", "Border Terrier", "Kerry Blue Terrier", "Irish Terrier", "Norfolk Terrier", "Norwich Terrier", "Yorkshire Terrier", "Wire Fox Terrier", "Lakeland Terrier", "Sealyham Terrier", "Airedale Terrier", "Cairn Terrier", "Australian Terrier", "Dandie Dinmont Terrier", "Boston Terrier", "Miniature Schnauzer", "Giant Schnauzer", "Standard Schnauzer", "Scottish Terrier", "Tibetan Terrier", "Australian Silky Terrier", "Soft-coated Wheaten Terrier", "West Highland White Terrier", "Lhasa Apso", "Flat-Coated Retriever", "Curly-coated Retriever", "Golden Retriever", "Labrador Retriever", "Chesapeake Bay Retriever", "German Shorthaired Pointer", "Vizsla", "English 
Setter", "Irish Setter", "Gordon Setter", "Brittany dog", "Clumber Spaniel", "English Springer Spaniel", "Welsh Springer Spaniel", "Cocker Spaniel", "Sussex Spaniel", "Irish Water Spaniel", "Kuvasz", "Schipperke", "Groenendael dog", "Malinois", "Briard", "Australian Kelpie", "Komondor", "Old English Sheepdog", "Shetland Sheepdog", "collie", "Border Collie", "Bouvier des Flandres dog", "Rottweiler", "German Shepherd Dog", "Dobermann", "Miniature Pinscher", "Greater Swiss Mountain Dog", "Bernese Mountain Dog", "Appenzeller Sennenhund", "Entlebucher Sennenhund", "Boxer", "Bullmastiff", "Tibetan Mastiff", "French Bulldog", "Great Dane", "St. Bernard", "husky", "Alaskan Malamute", "Siberian Husky", "Dalmatian", "Affenpinscher", "Basenji", "pug", "Leonberger", "Newfoundland dog", "Great Pyrenees dog", "Samoyed", "Pomeranian", "Chow Chow", "Keeshond", "brussels griffon", "Pembroke Welsh Corgi", "Cardigan Welsh Corgi", "Toy Poodle", "Miniature Poodle", "Standard Poodle", "Mexican hairless dog (xoloitzcuintli)", "grey wolf", "Alaskan tundra wolf", "red wolf or maned wolf", "coyote", "dingo", "dhole", "African wild dog", "hyena", "red fox", "kit fox", "Arctic fox", "grey fox", "tabby cat", "tiger cat", "Persian cat", "Siamese cat", "Egyptian Mau", "cougar", "lynx", "leopard", "snow leopard", "jaguar", "lion", "tiger", "cheetah", "brown bear", "American black bear", "polar bear", "sloth bear", "mongoose", "meerkat", "tiger beetle", "ladybug", "ground beetle", "longhorn beetle", "leaf beetle", "dung beetle", "rhinoceros beetle", "weevil", "fly", "bee", "ant", "grasshopper", "cricket insect", "stick insect", "cockroach", "praying mantis", "cicada", "leafhopper", "lacewing", "dragonfly", "damselfly", "red admiral butterfly", "ringlet butterfly", "monarch butterfly", "small white butterfly", "sulphur butterfly", "gossamer-winged butterfly", "starfish", "sea urchin", "sea cucumber", "cottontail rabbit", "hare", "Angora rabbit", "hamster", "porcupine", "fox squirrel", "marmot", "beaver", "guinea pig", "common sorrel horse", "zebra", "pig", "wild boar", "warthog", "hippopotamus", "ox", "water buffalo", "bison", "ram (adult male sheep)", "bighorn sheep", "Alpine ibex", "hartebeest", "impala (antelope)", "gazelle", "arabian camel", "llama", "weasel", "mink", "European polecat", "black-footed ferret", "otter", "skunk", "badger", "armadillo", "three-toed sloth", "orangutan", "gorilla", "chimpanzee", "gibbon", "siamang", "guenon", "patas monkey", "baboon", "macaque", "langur", "black-and-white colobus", "proboscis monkey", "marmoset", "white-headed capuchin", "howler monkey", "titi monkey", "Geoffroy's spider monkey", "common squirrel monkey", "ring-tailed lemur", "indri", "Asian elephant", "African bush elephant", "red panda", "giant panda", "snoek fish", "eel", "silver salmon", "rock beauty fish", "clownfish", "sturgeon", "gar fish", "lionfish", "pufferfish", "abacus", "abaya", "academic gown", "accordion", "acoustic guitar", "aircraft carrier", "airliner", "airship", "altar", "ambulance", "amphibious vehicle", "analog clock", "apiary", "apron", "trash can", "assault rifle", "backpack", "bakery", "balance beam", "balloon", "ballpoint pen", "Band-Aid", "banjo", "baluster / handrail", "barbell", "barber chair", "barbershop", "barn", "barometer", "barrel", "wheelbarrow", "baseball", "basketball", "bassinet", "bassoon", "swimming cap", "bath towel", "bathtub", "station wagon", "lighthouse", "beaker", "military hat (bearskin or shako)", "beer bottle", "beer glass", "bell tower", "baby bib", "tandem bicycle", 
"bikini", "ring binder", "binoculars", "birdhouse", "boathouse", "bobsleigh", "bolo tie", "poke bonnet", "bookcase", "bookstore", "bottle cap", "hunting bow", "bow tie", "brass memorial plaque", "bra", "breakwater", "breastplate", "broom", "bucket", "buckle", "bulletproof vest", "high-speed train", "butcher shop", "taxicab", "cauldron", "candle", "cannon", "canoe", "can opener", "cardigan", "car mirror", "carousel", "tool kit", "cardboard box / carton", "car wheel", "automated teller machine", "cassette", "cassette player", "castle", "catamaran", "CD player", "cello", "mobile phone", "chain", "chain-link fence", "chain mail", "chainsaw", "storage chest", "chiffonier", "bell or wind chime", "china cabinet", "Christmas stocking", "church", "movie theater", "cleaver", "cliff dwelling", "cloak", "clogs", "cocktail shaker", "coffee mug", "coffeemaker", "spiral or coil", "combination lock", "computer keyboard", "candy store", "container ship", "convertible", "corkscrew", "cornet", "cowboy boot", "cowboy hat", "cradle", "construction crane", "crash helmet", "crate", "infant bed", "Crock Pot", "croquet ball", "crutch", "cuirass", "dam", "desk", "desktop computer", "rotary dial telephone", "diaper", "digital clock", "digital watch", "dining table", "dishcloth", "dishwasher", "disc brake", "dock", "dog sled", "dome", "doormat", "drilling rig", "drum", "drumstick", "dumbbell", "Dutch oven", "electric fan", "electric guitar", "electric locomotive", "entertainment center", "envelope", "espresso machine", "face powder", "feather boa", "filing cabinet", "fireboat", "fire truck", "fire screen", "flagpole", "flute", "folding chair", "football helmet", "forklift", "fountain", "fountain pen", "four-poster bed", "freight car", "French horn", "frying pan", "fur coat", "garbage truck", "gas mask or respirator", "gas pump", "goblet", "go-kart", "golf ball", "golf cart", "gondola", "gong", "gown", "grand piano", "greenhouse", "radiator grille", "grocery store", "guillotine", "hair clip", "hair spray", "half-track", "hammer", "hamper", "hair dryer", "hand-held computer", "handkerchief", "hard disk drive", "harmonica", "harp", "combine harvester", "hatchet", "holster", "home theater", "honeycomb", "hook", "hoop skirt", "gymnastic horizontal bar", "horse-drawn vehicle", "hourglass", "iPod", "clothes iron", "carved pumpkin", "jeans", "jeep", "T-shirt", "jigsaw puzzle", "rickshaw", "joystick", "kimono", "knee pad", "knot", "lab coat", "ladle", "lampshade", "laptop computer", "lawn mower", "lens cap", "letter opener", "library", "lifeboat", "lighter", "limousine", "ocean liner", "lipstick", "slip-on shoe", "lotion", "music speaker", "loupe magnifying glass", "sawmill", "magnetic compass", "messenger bag", "mailbox", "tights", "one-piece bathing suit", "manhole cover", "maraca", "marimba", "mask", "matchstick", "maypole", "maze", "measuring cup", "medicine cabinet", "megalith", "microphone", "microwave oven", "military uniform", "milk can", "minibus", "miniskirt", "minivan", "missile", "mitten", "mixing bowl", "mobile home", "ford model t", "modem", "monastery", "monitor", "moped", "mortar and pestle", "graduation cap", "mosque", "mosquito net", "vespa", "mountain bike", "tent", "computer mouse", "mousetrap", "moving van", "muzzle", "metal nail", "neck brace", "necklace", "baby pacifier", "notebook computer", "obelisk", "oboe", "ocarina", "odometer", "oil filter", "pipe organ", "oscilloscope", "overskirt", "bullock cart", "oxygen mask", "product packet / packaging", "paddle", "paddle wheel", "padlock", "paintbrush", 
"pajamas", "palace", "pan flute", "paper towel", "parachute", "parallel bars", "park bench", "parking meter", "railroad car", "patio", "payphone", "pedestal", "pencil case", "pencil sharpener", "perfume", "Petri dish", "photocopier", "plectrum", "Pickelhaube", "picket fence", "pickup truck", "pier", "piggy bank", "pill bottle", "pillow", "ping-pong ball", "pinwheel", "pirate ship", "drink pitcher", "block plane", "planetarium", "plastic bag", "plate rack", "farm plow", "plunger", "Polaroid camera", "pole", "police van", "poncho", "pool table", "soda bottle", "plant pot", "potter's wheel", "power drill", "prayer rug", "printer", "prison", "missile", "projector", "hockey puck", "punching bag", "purse", "quill", "quilt", "race car", "racket", "radiator", "radio", "radio telescope", "rain barrel", "recreational vehicle", "fishing casting reel", "reflex camera", "refrigerator", "remote control", "restaurant", "revolver", "rifle", "rocking chair", "rotisserie", "eraser", "rugby ball", "ruler measuring stick", "sneaker", "safe", "safety pin", "salt shaker", "sandal", "sarong", "saxophone", "scabbard", "weighing scale", "school bus", "schooner", "scoreboard", "CRT monitor", "screw", "screwdriver", "seat belt", "sewing machine", "shield", "shoe store", "shoji screen / room divider", "shopping basket", "shopping cart", "shovel", "shower cap", "shower curtain", "ski", "balaclava ski mask", "sleeping bag", "slide rule", "sliding door", "slot machine", "snorkel", "snowmobile", "snowplow", "soap dispenser", "soccer ball", "sock", "solar thermal collector", "sombrero", "soup bowl", "keyboard space bar", "space heater", "space shuttle", "spatula", "motorboat", "spider web", "spindle", "sports car", "spotlight", "stage", "steam locomotive", "through arch bridge", "steel drum", "stethoscope", "scarf", "stone wall", "stopwatch", "stove", "strainer", "tram", "stretcher", "couch", "stupa", "submarine", "suit", "sundial", "sunglasses", "sunglasses", "sunscreen", "suspension bridge", "mop", "sweatshirt", "swim trunks / shorts", "swing", "electrical switch", "syringe", "table lamp", "tank", "tape player", "teapot", "teddy bear", "television", "tennis ball", "thatched roof", "front curtain", "thimble", "threshing machine", "throne", "tile roof", "toaster", "tobacco shop", "toilet seat", "torch", "totem pole", "tow truck", "toy store", "tractor", "semi-trailer truck", "tray", "trench coat", "tricycle", "trimaran", "tripod", "triumphal arch", "trolleybus", "trombone", "hot tub", "turnstile", "typewriter keyboard", "umbrella", "unicycle", "upright piano", "vacuum cleaner", "vase", "vaulted or arched ceiling", "velvet fabric", "vending machine", "vestment", "viaduct", "violin", "volleyball", "waffle iron", "wall clock", "wallet", "wardrobe", "military aircraft", "sink", "washing machine", "water bottle", "water jug", "water tower", "whiskey jug", "whistle", "hair wig", "window screen", "window shade", "Windsor tie", "wine bottle", "airplane wing", "wok", "wooden spoon", "wool", "split-rail fence", "shipwreck", "sailboat", "yurt", "website", "comic book", "crossword", "traffic or street sign", "traffic light", "dust jacket", "menu", "plate", "guacamole", "consomme", "hot pot", "trifle", "ice cream", "popsicle", "baguette", "bagel", "pretzel", "cheeseburger", "hot dog", "mashed potatoes", "cabbage", "broccoli", "cauliflower", "zucchini", "spaghetti squash", "acorn squash", "butternut squash", "cucumber", "artichoke", "bell pepper", "cardoon", "mushroom", "Granny Smith apple", "strawberry", "orange", "lemon", "fig", 
"pineapple", "banana", "jackfruit", "cherimoya (custard apple)", "pomegranate", "hay", "carbonara", "chocolate syrup", "dough", "meatloaf", "pizza", "pot pie", "burrito", "red wine", "espresso", "tea cup", "eggnog", "mountain", "bubble", "cliff", "coral reef", "geyser", "lakeshore", "promontory", "sandbar", "beach", "valley", "volcano", "baseball player", "bridegroom", "scuba diver", "rapeseed", "daisy", "yellow lady's slipper", "corn", "acorn", "rose hip", "horse chestnut seed", "coral fungus", "agaric", "gyromitra", "stinkhorn mushroom", "earth star fungus", "hen of the woods mushroom", "bolete", "corn cob", "toilet paper"]

imagenet_templates = [
    'a bad photo of a {}.',
    'a photo of many {}.',
    'a sculpture of a {}.',
    'a photo of the hard to see {}.',
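The template list is truncated in the source. Given such classes and templates, the standard zero-shot classifier from OpenAI's CLIP notebook is built roughly like this (a sketch of that pattern, not part of the snippet above):

def zeroshot_classifier(classnames, templates):
    with torch.no_grad():
        zeroshot_weights = []
        for classname in tqdm(classnames):
            texts = [template.format(classname) for template in templates]
            texts = clip.tokenize(texts).cuda()
            class_embeddings = model.encode_text(texts)
            class_embeddings /= class_embeddings.norm(dim=-1, keepdim=True)
            class_embedding = class_embeddings.mean(dim=0)  # average over templates
            class_embedding /= class_embedding.norm()
            zeroshot_weights.append(class_embedding)
    return torch.stack(zeroshot_weights, dim=1).cuda()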
Example #3
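A richer text-to-image optimization loop in the same style as Example #1: it adds a second prompt for micro details (in_txt2), a prompt to subtract (in_txt0), an optional reference image with SSIM-based composition (sync), diversity and novelty (expand) terms, optional translation and multilingual text encoding, and a progressive learning-rate schedule. Besides the project helpers it relies on torchvision, torch.nn.functional as F, sentence_transformers, an ssim module, and a Translator (e.g. from googletrans).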
def main():
    a = get_args()

    prev_enc = 0

    def train(i):
        loss = 0

        noise = a.noise * torch.randn(1, 1, *params[0].shape[2:4],
                                      1).cuda() if a.noise > 0 else None
        img_out = image_f(noise)

        micro = None if a.in_txt2 is None else False
        imgs_sliced = slice_imgs([img_out],
                                 a.samples,
                                 a.modsize,
                                 norm_in,
                                 a.overscan,
                                 micro=micro)
        out_enc = model_clip.encode_image(imgs_sliced[-1])
        if a.diverse != 0:
            imgs_sliced = slice_imgs([image_f(noise)],
                                     a.samples,
                                     a.modsize,
                                     norm_in,
                                     a.overscan,
                                     micro=micro)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += a.diverse * torch.cosine_similarity(
                out_enc, out_enc2, dim=-1).mean()
            del out_enc2
            torch.cuda.empty_cache()
        if a.in_img is not None and os.path.isfile(a.in_img):  # input image
            loss += sign * 0.5 * torch.cosine_similarity(
                img_enc, out_enc, dim=-1).mean()
        if a.in_txt is not None:  # input text
            loss += sign * torch.cosine_similarity(txt_enc, out_enc,
                                                   dim=-1).mean()
            if a.notext > 0:
                loss -= sign * a.notext * torch.cosine_similarity(
                    txt_plot_enc, out_enc, dim=-1).mean()
        if a.in_txt0 is not None:  # subtract text
            loss += -sign * torch.cosine_similarity(txt_enc0, out_enc,
                                                    dim=-1).mean()
        if a.sync > 0 and a.in_img is not None and os.path.isfile(
                a.in_img):  # image composition
            loss -= a.sync * ssim_loss(
                F.interpolate(img_out, ssim_size).float(), img_in)
        if a.in_txt2 is not None:  # input text for micro details
            imgs_sliced = slice_imgs([img_out],
                                     a.samples,
                                     a.modsize,
                                     norm_in,
                                     a.overscan,
                                     micro=True)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += sign * torch.cosine_similarity(txt_enc2, out_enc2,
                                                   dim=-1).mean()
            del out_enc2
            torch.cuda.empty_cache()
        if a.expand > 0:
            nonlocal prev_enc  # prev_enc is defined in main(), not at module level
            if i > 0:
                loss += a.expand * torch.cosine_similarity(
                    out_enc, prev_enc, dim=-1).mean()
            prev_enc = out_enc.detach()

        del img_out, imgs_sliced, out_enc
        torch.cuda.empty_cache()
        assert not isinstance(loss, int), ' Loss not defined, check the inputs'

        if a.prog is True:
            lr_cur = lr0 + (i / a.steps) * (lr1 - lr0)
            for g in optimizer.param_groups:
                g['lr'] = lr_cur

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % a.fstep == 0:
            with torch.no_grad():
                img = image_f(contrast=a.contrast).cpu().numpy()[0]
            checkout(img,
                     os.path.join(tempdir, '%04d.jpg' % (i // a.fstep)),
                     verbose=a.verbose)
            pbar.upd()

    # Load CLIP models
    model_clip, _ = clip.load(a.model)
    if a.verbose is True: print(' using model', a.model)
    xmem = {'RN50': 0.5, 'RN50x4': 0.16, 'RN101': 0.33}
    if 'RN' in a.model:
        a.samples = int(a.samples * xmem[a.model])

    if a.multilang is True:
        model_lang = SentenceTransformer(
            'clip-ViT-B-32-multilingual-v1').cuda()

    def enc_text(txt):
        if a.multilang is True:
            emb = model_lang.encode([txt],
                                    convert_to_tensor=True,
                                    show_progress_bar=False)
        else:
            emb = model_clip.encode_text(clip.tokenize(txt).cuda())
        return emb.detach().clone()

    if a.diverse != 0:
        a.samples = int(a.samples * 0.5)

    norm_in = torchvision.transforms.Normalize(
        (0.48145466, 0.4578275, 0.40821073),
        (0.26862954, 0.26130258, 0.27577711))

    out_name = []
    if a.in_txt is not None:
        if a.verbose is True: print(' ref text: ', basename(a.in_txt))
        if a.translate:
            translator = Translator()
            a.in_txt = translator.translate(a.in_txt, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt)
        txt_enc = enc_text(a.in_txt)
        out_name.append(txt_clean(a.in_txt))

        if a.notext > 0:
            txt_plot = torch.from_numpy(plot_text(a.in_txt, a.modsize) /
                                        255.).unsqueeze(0).permute(0, 3, 1,
                                                                   2).cuda()
            txt_plot_enc = model_clip.encode_image(txt_plot).detach().clone()

    if a.in_txt2 is not None:
        if a.verbose is True: print(' micro text:', basename(a.in_txt2))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt2 = translator.translate(a.in_txt2, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt2)
        txt_enc2 = enc_text(a.in_txt2)
        out_name.append(txt_clean(a.in_txt2))

    if a.in_txt0 is not None:
        if a.verbose is True: print(' subtract text:', basename(a.in_txt0))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt0 = translator.translate(a.in_txt0, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt0)
        txt_enc0 = enc_text(a.in_txt0)
        out_name.append('off-' + txt_clean(a.in_txt0))

    if a.multilang is True: del model_lang

    if a.in_img is not None and os.path.isfile(a.in_img):
        if a.verbose is True: print(' ref image:', basename(a.in_img))
        img_in = torch.from_numpy(
            img_read(a.in_img) / 255.).unsqueeze(0).permute(0, 3, 1, 2).cuda()
        img_in = img_in[:, :3, :, :]  # fix rgb channels
        in_sliced = slice_imgs([img_in],
                               a.samples,
                               a.modsize,
                               transform=norm_in,
                               overscan=a.overscan)[0]
        img_enc = model_clip.encode_image(in_sliced).detach().clone()
        if a.sync > 0:
            ssim_loss = ssim.SSIM(window_size=11)
            ssim_size = [s // 8 for s in a.size]
            img_in = F.interpolate(img_in, ssim_size).float()
        else:
            del img_in
        del in_sliced
        torch.cuda.empty_cache()
        out_name.append(basename(a.in_img).replace(' ', '_'))

    params, image_f = fft_image([1, 3, *a.size],
                                resume=a.resume,
                                decay_power=a.decay)
    image_f = to_valid_rgb(image_f, colors=a.colors)

    if a.prog is True:
        lr1 = a.lrate * 2
        lr0 = lr1 * 0.01
    else:
        lr0 = a.lrate
    optimizer = torch.optim.Adam(params, lr0)
    sign = 1. if a.invert is True else -1.

    if a.verbose is True: print(' samples:', a.samples)
    out_name = '-'.join(out_name)
    out_name += '-%s' % a.model if 'RN' in a.model.upper() else ''
    tempdir = os.path.join(a.out_dir, out_name)
    os.makedirs(tempdir, exist_ok=True)

    pbar = ProgressBar(a.steps // a.fstep)
    for i in range(a.steps):
        train(i)

    os.system('ffmpeg -v warning -y -i "%s" "%s.mp4"' %
              (os.path.join(tempdir, '%04d.jpg'),
               os.path.join(a.out_dir, out_name)))
    shutil.copy(
        img_list(tempdir)[-1],
        os.path.join(a.out_dir, '%s-%d.jpg' % (out_name, a.steps)))
    if a.save_pt is True:
        torch.save(params, '%s.pt' % os.path.join(a.out_dir, out_name))
Example #4
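Builds an image index for similarity search: walks directories given on the command line, encodes each image with CLIP ViT-B/32, and stores vectors and filenames in LMDB; faiss is imported for the nearest-neighbour index. The snippet is truncated mid-loop.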
import os
import sys
import numpy as np
import lmdb
import torch
import clip
import faiss
from PIL import Image

# Maximum size of (sparse) LMDB file
map_size = 1024*1024*1024*20

# If you run out of RAM building a (num_images x 512)-float numpy array, set this
# to build sequential 20k x 512 tables instead (may degrade search quality)
split_table = False

device = "cuda" if torch.cuda.is_available() else "cpu"
model, transform = clip.load("ViT-B/32", device=device, jit=False)

model.eval()

env = lmdb.open('vectors.lmdb', map_size=map_size, max_dbs=4)
fn_db = env.open_db("fn_db".encode())
skip_db = env.open_db("skip_db".encode())

try:
    with torch.no_grad():
        for base_path in sys.argv[1:]:
            print(f"CLIPing {base_path}...")
            for fn in os.listdir(base_path):
                tfn = os.path.join(base_path, fn)
                ext = os.path.splitext(fn)
                if len(ext) < 2 or ext[1].lower() not in ['.jpg', '.jpeg', '.png']:
                    continue  # skip non-image files
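The snippet breaks off here. A typical continuation encodes each image and adds the normalized vectors to a FAISS index; a minimal sketch of that step (image_paths and everything below it are assumptions, not recovered source):

features = []
with torch.no_grad():
    for fn in image_paths:  # hypothetical list of image files collected above
        image = transform(Image.open(fn).convert('RGB')).unsqueeze(0).to(device)
        emb = model.encode_image(image)
        emb /= emb.norm(dim=-1, keepdim=True)  # normalize for cosine similarity
        features.append(emb.cpu().numpy().astype('float32'))

index = faiss.IndexFlatIP(512)  # inner product == cosine on unit vectors
index.add(np.concatenate(features, axis=0))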
Example #5
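The canonical CLIP classification example, followed by a CLIP-guided beam-search captioner that rescores candidate captions by their similarity to the image.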
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("RN50", device=device, jit=False)

image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)  # prints: [[0.9927937  0.00421068 0.00299572]]
def caption_image_beam_search(encoder,
                              decoder,
                              image_path,
                              word_map,
                              beam_size=3,
                              use_clip=True,
                              clip_beam_search=True):
    """
    Reads an image and captions it with beam search.

    :param encoder: encoder model
    :param decoder: decoder model
    :param image_path: path to image
    :param word_map: word map
    :param beam_size: number of sequences to consider at each decode-step
    :param use_clip: if True, prepend CLIP's resize/crop transforms to the preprocessing
    :param clip_beam_search: if True, rescore beam candidates with CLIP similarity
    :return: caption, weights for visualization
    """

    k = beam_size
    vocab_size = len(word_map)

    # Read image and process
    img = imread(image_path)
    if len(img.shape) == 2:
        img = img[:, :, np.newaxis]
        img = np.concatenate([img, img, img], axis=2)
    img = img_as_ubyte(imresize(img, (256, 256)))
    img = img.transpose(2, 0, 1)
    img = img / 255.
    img = torch.FloatTensor(img).to(device)
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    _transforms = [normalize]
    if use_clip:
        _, preprocess = clip.load('ViT-B/32')
        preprocess.transforms = preprocess.transforms[:2]
        _transforms = preprocess.transforms + _transforms
    _transforms = transforms.Compose(_transforms)
    image = _transforms(img)  # (3, 256, 256)

    # Encode
    image = image.unsqueeze(0)  # (1, 3, 256, 256)
    encoder_out = encoder(
        image)  # (1, enc_image_size, enc_image_size, encoder_dim)
    enc_image_size = encoder_out.size(1)
    encoder_dim = encoder_out.size(3)

    # Flatten encoding
    encoder_out = encoder_out.view(1, -1,
                                   encoder_dim)  # (1, num_pixels, encoder_dim)
    num_pixels = encoder_out.size(1)

    # We'll treat the problem as having a batch size of k
    encoder_out = encoder_out.expand(
        k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

    # Tensor to store top k previous words at each step; now they're just <start>
    k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(
        device)  # (k, 1)

    # Tensor to store top k sequences; now they're just <start>
    seqs = k_prev_words  # (k, 1)

    # Tensor to store top k sequences' scores; now they're just 0
    top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

    # Tensor to store top k sequences' alphas; now they're just 1s
    seqs_alpha = torch.ones(k, 1, enc_image_size, enc_image_size).to(
        device)  # (k, 1, enc_image_size, enc_image_size)

    # Lists to store completed sequences, their alphas and scores
    complete_seqs = list()
    complete_seqs_alpha = list()
    complete_seqs_scores = list()

    # Start decoding
    step = 1
    h, c = decoder.init_hidden_state(encoder_out)

    rev_word_map = {v: k for k, v in word_map.items()}
    if clip_beam_search:
        with torch.no_grad():
            image_features = encoder.clip_model.encode_image(image)
            image_features /= image_features.norm(dim=-1, keepdim=True)

    def get_clip_scores(seqs, scores):
        nonlocal top_k_scores
        special_words = ['<start>', '<end>']
        replace_words = {
            '<unk>': '<averyunpleasantword>',
            '<pad>': '<anotherveryunpleasantword>'
        }
        special_words_enc = [word_map[w] for w in special_words]
        if step == 1:
            top_k_scores, next_word_inds = scores[0].topk(k, 0, True,
                                                          True)  # (s)
            return torch.zeros(k, device=device).long(), next_word_inds
        next_word_inds = scores.topk(k)[1]
        inds = []

        text = []
        weights = torch.ones(k**2).to(device)
        count = 0
        for idx, (prev_seq, next_words) in enumerate(
                zip(seqs.tolist(), next_word_inds.tolist())):
            prev_words = [
                rev_word_map[w] for w in prev_seq if w not in special_words_enc
            ]
            for word in next_words:
                cap_words = copy.copy(prev_words)
                if word not in special_words_enc:  # compare token ids, not strings
                    word_char = rev_word_map[word]
                    word_char = replace_words.get(word_char) or word_char
                    cap_words.append(word_char)
                text.append(' '.join(cap_words))
                inds.append([idx, word])
                if rev_word_map[word] == '<end>':
                    weights[count] = 1.5
                count += 1
        inds = np.array(inds)
        text = clip.tokenize(text).to(device)
        with torch.no_grad():
            text_features = encoder.clip_model.encode_text(text)

        # Pick the top k most similar captions for the image
        text_features /= text_features.norm(dim=-1, keepdim=True)
        similarity = (image_features @ text_features.T *
                      weights).log_softmax(dim=-1)
        top_k_scores, indices = similarity.view(-1).topk(k, 0, True, True)
        prev_inds = torch.tensor([inds[idx][0] for idx in indices],
                                 device=device)
        next_inds = torch.tensor([inds[idx][1] for idx in indices],
                                 device=device)

        return prev_inds, next_inds

    # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
    while True:

        embeddings = decoder.embedding(k_prev_words).squeeze(
            1)  # (s, embed_dim)

        awe, alpha = decoder.attention(encoder_out,
                                       h)  # (s, encoder_dim), (s, num_pixels)

        alpha = alpha.view(
            -1, enc_image_size,
            enc_image_size)  # (s, enc_image_size, enc_image_size)

        gate = decoder.sigmoid(
            decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
        awe = gate * awe

        h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1),
                                   (h, c))  # (s, decoder_dim)

        scores = decoder.fc(h)  # (s, vocab_size)
        scores = F.log_softmax(scores, dim=1)

        # Add
        scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)
        if clip_beam_search:
            prev_word_inds, next_word_inds = get_clip_scores(seqs, scores)
        else:
            # For the first step, all k points will have the same scores (since same k previous words, h, c)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0, True,
                                                           True)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                top_k_scores, top_k_words = scores.view(-1).topk(
                    k, 0, True, True)  # (s)

            # Convert unrolled indices to actual indices of scores
            prev_word_inds = torch.div(top_k_words, vocab_size,
                                       rounding_mode='floor')  # (s)
            next_word_inds = (top_k_words % vocab_size).long()  # (s)

        # Add new words to sequences, alphas
        seqs = torch.cat([seqs[prev_word_inds],
                          next_word_inds.unsqueeze(1)],
                         dim=1)  # (s, step+1)
        seqs_alpha = torch.cat(
            [seqs_alpha[prev_word_inds], alpha[prev_word_inds].unsqueeze(1)],
            dim=1)  # (s, step+1, enc_image_size, enc_image_size)

        # Which sequences are incomplete (didn't reach <end>)?
        incomplete_inds = [
            ind for ind, next_word in enumerate(next_word_inds)
            if next_word != word_map['<end>']
        ]
        complete_inds = list(
            set(range(len(next_word_inds))) - set(incomplete_inds))

        # Set aside complete sequences
        if len(complete_inds) > 0:
            complete_seqs.extend(seqs[complete_inds].tolist())
            complete_seqs_alpha.extend(seqs_alpha[complete_inds].tolist())
            complete_seqs_scores.extend(top_k_scores[complete_inds])
        k -= len(complete_inds)  # reduce beam length accordingly

        # Proceed with incomplete sequences
        if k == 0:
            break
        seqs = seqs[incomplete_inds]
        seqs_alpha = seqs_alpha[incomplete_inds]
        h = h[prev_word_inds[incomplete_inds]]
        c = c[prev_word_inds[incomplete_inds]]
        encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
        top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
        k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

        # Break if things have been going on too long
        if step > 50:
            break
        step += 1

    if len(complete_seqs_scores) > 0:
        i = complete_seqs_scores.index(max(complete_seqs_scores))
        seq = complete_seqs[i]
        alphas = complete_seqs_alpha[i]
    else:
        i = top_k_scores.argmax().item()
        seq = seqs[i].tolist()
        alphas = seqs_alpha[i].tolist()

    return seq, alphas
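A hypothetical invocation (encoder, decoder and word_map would come from a trained captioning checkpoint; all names here are assumptions):

seq, alphas = caption_image_beam_search(encoder, decoder, 'test.jpg', word_map,
                                        beam_size=5, clip_beam_search=True)
rev_word_map = {v: k for k, v in word_map.items()}
print(' '.join(rev_word_map[ind] for ind in seq
               if rev_word_map[ind] not in {'<start>', '<end>', '<pad>'}))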