Ejemplo n.º 1
0
    text_seq_len=TEXTSEQLEN,  # text sequence length
    depth=12,  # should aim to be 64
    heads=16,  # attention heads
    dim_head=64,  # attention head dimension
    attn_dropout=0.1,  # attention dropout
    ff_dropout=0.1  # feedforward dropout
).cuda()

# Load the state dict stored in the local file "dalle-small.pth" into `dalle`.
dalle.load_state_dict(torch.load("dalle-small.pth"))
# NOTE: the bare string below is disabled sample code kept for reference. It is
# evaluated as a no-op expression, so none of the names inside it are created.
"""
text = torch.randint(0, NUM_TOKENS, (BATCH_SIZE, TEXTSEQLEN))
images = torch.randn(BATCH_SIZE, 3, IMAGE_SIZE, IMAGE_SIZE)
mask = torch.ones_like(text).bool()
"""

# Dataset object built from the small merged COCO sample text file.
# NOTE(review): token_dataset is defined outside this view — confirm what it
# returns and how it tokenizes before relying on it.
tokenDset = token_dataset('./coco/merged-smallsample.txt')

# do the above for a long time with a lot of data ... then

# Presumably the number of pictures to generate.
# NOTE(review): the loop further down iterates over a literal range(30) rather
# than this name — confirm whether the two are meant to stay in sync.
num_pics = 30


def denorm(img: torch.Tensor) -> torch.Tensor:
    """Rescale a tensor to a 255-wide value range centred on its mean.

    The tensor is shifted by its global mean, divided by its global
    ``max - min`` span, offset by 0.5 and multiplied by 255. Because the
    shift uses the mean rather than the minimum, the result always spans
    255 units but only lies inside ``[0, 255]`` when the mean sits midway
    between the minimum and the maximum.

    Args:
        img: Non-empty floating-point tensor of any shape (``torch.mean``
            rejects integer dtypes).

    Returns:
        A tensor with the same shape as ``img``. A constant input maps to
        127.5 everywhere instead of NaN.
    """
    mean = torch.mean(img)
    span = torch.max(img) - torch.min(img)
    # A constant tensor has zero span; dividing by it would yield 0/0 = NaN.
    # Substitute 1 only in that exact case so every other input is unchanged.
    safe_span = torch.where(span == 0, torch.ones_like(span), span)
    return ((img - mean) / safe_span + 0.5) * 255


for i in range(30):

    test_text = "犬が地面に寝そべっている写真"
Ejemplo n.º 2
0
images = torch.randn(BATCH_SIZE, 3, IMAGE_SIZE, IMAGE_SIZE)
mask = torch.ones_like(text).bool()
"""

# Caption dataset read from ./coco/images with the val2014 caption annotations.
# Every image goes through the transform pipeline below before it is returned.
# NOTE(review): `dset` is presumably torchvision.datasets — confirm against the
# file's imports, which are outside this view.
cap = dset.CocoCaptions(
    root='./coco/images',
    annFile='./coco/annotations/captions_val2014.json',
    transform=transforms.Compose([
        # Disabled alternatives kept for reference: random crop / grayscale.
        #transforms.RandomCrop((IMAGE_SIZE,IMAGE_SIZE),pad_if_needed=True),
        #transforms.Grayscale(),
        # Resize to a fixed IMAGE_SIZE x IMAGE_SIZE with bilinear interpolation.
        transforms.Resize((IMAGE_SIZE, IMAGE_SIZE), Image.BILINEAR),
        transforms.ToTensor(),
        # Per-channel normalization with mean 0.5 and std 0.5 (three channels).
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ]))

# Dataset object built from the 1000-entry merged COCO text file.
# NOTE(review): token_dataset is defined outside this view — confirm what it
# returns before relying on it.
tokenDset = token_dataset('./coco/merged-1000.txt')

# Running history of VAE loss values; the training loop below appends one
# NumPy value per processed image.
VAEloss = []

for epoch in range(EPOCHS):
    for i in range(DATASET_SIZE):
        #print(i,":",tokenDset.getRand(i),img.size())
        optimizerVAE.zero_grad()
        img, _ = cap[i]
        img = img.unsqueeze(0).cuda()
        #print(img.size())
        if i % 10 == 0:
            print("VAE epoch {} / {}".format(i + epoch * DATASET_SIZE,
                                             EPOCHS * DATASET_SIZE))
        loss = vae(img, return_recon_loss=True)
        VAEloss.append(loss.cpu().detach().numpy())