    text_seq_len=TEXTSEQLEN,  # text sequence length
    depth=12,                 # should aim to be 64
    heads=16,                 # attention heads
    dim_head=64,              # attention head dimension
    attn_dropout=0.1,         # attention dropout
    ff_dropout=0.1            # feedforward dropout
).cuda()

dalle.load_state_dict(torch.load("dalle-small.pth"))

"""
text = torch.randint(0, NUM_TOKENS, (BATCH_SIZE, TEXTSEQLEN))
images = torch.randn(BATCH_SIZE, 3, IMAGE_SIZE, IMAGE_SIZE)
mask = torch.ones_like(text).bool()
"""

tokenDset = token_dataset('./coco/merged-smallsample.txt')

# do the above for a long time with a lot of data ... then

num_pics = 30

def denorm(img: torch.Tensor):
    # Rescale an arbitrary tensor into the 0..255 range for saving as an image.
    mean = torch.mean(img)
    min_maxrange = torch.max(img) - torch.min(img)
    return ((img - mean) / min_maxrange + 0.5) * 255

for i in range(num_pics):
    test_text = "犬が地面に寝そべっている写真"  # "a photo of a dog lying on the ground"
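    # A minimal sketch (not from the original) of how sampling could finish this
    # loop. tokenize() on token_dataset is a hypothetical helper assumed to map a
    # caption to a padded tensor of token ids; dalle.generate_images() is the
    # dalle-pytorch sampling entry point.
    tokens = tokenDset.tokenize(test_text).unsqueeze(0).cuda()  # hypothetical helper
    mask = tokens != 0                                          # mask out padding ids
    gen = dalle.generate_images(tokens, mask=mask)              # (1, 3, IMAGE_SIZE, IMAGE_SIZE)
    out = denorm(gen.squeeze(0).cpu()).clamp(0, 255).byte()
    transforms.ToPILImage()(out).save('sample-{}.png'.format(i))  # assumed filename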
cap = dset.CocoCaptions(
    root='./coco/images',
    annFile='./coco/annotations/captions_val2014.json',
    transform=transforms.Compose([
        # transforms.RandomCrop((IMAGE_SIZE, IMAGE_SIZE), pad_if_needed=True),
        # transforms.Grayscale(),
        transforms.Resize((IMAGE_SIZE, IMAGE_SIZE), Image.BILINEAR),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ]))

tokenDset = token_dataset('./coco/merged-1000.txt')

VAEloss = []
for epoch in range(EPOCHS):
    for i in range(DATASET_SIZE):
        # print(i, ":", tokenDset.getRand(i), img.size())
        optimizerVAE.zero_grad()
        img, _ = cap[i]
        img = img.unsqueeze(0).cuda()
        # print(img.size())
        if i % 10 == 0:
            print("VAE step {} / {}".format(i + epoch * DATASET_SIZE, EPOCHS * DATASET_SIZE))
        loss = vae(img, return_recon_loss=True)
        VAEloss.append(loss.cpu().detach().numpy())
        loss.backward()      # missing in the original: propagate gradients
        optimizerVAE.step()  # missing in the original: update the VAE weights
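# A short follow-up sketch (not from the original): VAEloss is collected above
# but never used, so plot the recorded reconstruction losses and checkpoint the
# VAE so the DALLE stage can build on it. Both filenames are assumptions.
import matplotlib.pyplot as plt

plt.plot(VAEloss)
plt.xlabel('step')
plt.ylabel('reconstruction loss')
plt.savefig('vae-loss.png')           # assumed filename

torch.save(vae.state_dict(), 'vae-small.pth')  # assumed checkpoint name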