def build_flow(args, device):
    # Model
    print('Building flow model..')
    flow_net = flow.Glow(num_channels=args.num_channels,
                         num_levels=args.num_levels,
                         num_steps=args.num_steps)
    flow_net = flow_net.to(device)
    if device == 'cuda':
        flow_net = torch.nn.DataParallel(flow_net, args.gpu_ids)
        cudnn.benchmark = args.benchmark

    start_epoch = 0
    flow_best_loss = 0
    if args.resume_flow:
        # Load checkpoint.
        print('Resuming from checkpoint at ckpts/best.pth.tar...')
        assert os.path.isdir('ckpts'), 'Error: no checkpoint directory found!'
        checkpoint = torch.load('ckpts/best.pth.tar')
        flow_net.load_state_dict(checkpoint['net'])
        flow_best_loss = checkpoint['test_loss']
        start_epoch = checkpoint['epoch']

    loss_fn = util.NLLLoss().to(device)
    optimizer = optim.Adam(flow_net.parameters(), lr=args.lr)
    scheduler = sched.LambdaLR(optimizer, lambda s: min(1., s / args.warm_up))

    return flow_net, loss_fn, optimizer, scheduler, start_epoch, flow_best_loss
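# Example invocation of build_flow (an illustrative sketch, not part of the
# original script): the argparse fields mirror the attributes build_flow
# reads; the concrete values here are placeholders, not the project's
# settings.
import argparse
import torch

if __name__ == '__main__':
    args = argparse.Namespace(
        num_channels=512, num_levels=3, num_steps=32,
        gpu_ids=[0], benchmark=True, resume_flow=False,
        lr=1e-3, warm_up=500000)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    flow_net, loss_fn, optimizer, scheduler, start_epoch, flow_best_loss = \
        build_flow(args, device)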
def main(args):
    # Set up main device and scale batch size
    device = 'cuda' if torch.cuda.is_available() and args.gpu_ids else 'cpu'
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seeds
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    trainset = ImgDatasets(root_dir='data/celeba_sample', files='train_files.txt', mode=args.mode)
    trainloader = data.DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers)

    testset = ImgDatasets(root_dir='data/celeba_sample', files='test_files.txt', mode=args.mode)
    testloader = data.DataLoader(testset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)

    # Model
    print('Building model..')
    net = Glow(num_channels=args.num_channels,
               num_levels=args.num_levels,
               num_steps=args.num_steps,
               mode=args.mode)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net, args.gpu_ids)
        cudnn.benchmark = args.benchmark

    start_epoch = 0
    if args.resume:
        # Load checkpoint.
        print('Resuming from checkpoint at ckpts/best.pth.tar...')
        assert os.path.isdir('ckpts'), 'Error: no checkpoint directory found!'
        checkpoint = torch.load('ckpts/best.pth.tar')
        net.load_state_dict(checkpoint['net'])
        global best_loss
        global global_step
        best_loss = checkpoint['test_loss']
        start_epoch = checkpoint['epoch']
        global_step = start_epoch * len(trainset)

    loss_fn = util.NLLLoss().to(device)
    optimizer = optim.Adam(net.parameters(), lr=args.lr)
    scheduler = sched.LambdaLR(optimizer, lambda s: min(1., s / args.warm_up))

    for epoch in range(start_epoch, start_epoch + args.num_epochs):
        train(epoch, net, trainloader, device, optimizer, scheduler, loss_fn, args.max_grad_norm)
        test(epoch, net, testloader, device, loss_fn, args.mode)
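# Sketch of the checkpoint layout the resume branch above expects: a dict with
# 'net' (a state_dict), 'test_loss', and 'epoch'. The helper below is
# illustrative only; where the original training loop actually writes
# ckpts/best.pth.tar is not shown in this snippet.
import os
import torch

def save_checkpoint(net, test_loss, epoch, path='ckpts/best.pth.tar'):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    torch.save({'net': net.state_dict(),
                'test_loss': test_loss,
                'epoch': epoch}, path)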
def main(args):
    # Set up main device and scale batch size
    device = 'cuda' if torch.cuda.is_available() and args.gpu_ids else 'cpu'
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seeds
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # No normalization applied, since Glow expects inputs in (0, 1)
    transform_train = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor()
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor()
    ])

    trainset = torchvision.datasets.CIFAR10(root='data', train=True, download=True, transform=transform_train)
    trainloader = data.DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers)

    testset = torchvision.datasets.CIFAR10(root='data', train=False, download=True, transform=transform_test)
    testloader = data.DataLoader(testset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)

    # Model
    print('Building model..')
    net = Glow(num_channels=args.num_channels,
               num_levels=args.num_levels,
               num_steps=args.num_steps)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net, args.gpu_ids)
        cudnn.benchmark = args.benchmark

    start_epoch = 0
    if args.resume:
        # Load checkpoint.
        print('Resuming from checkpoint at ckpts/best.pth.tar...')
        assert os.path.isdir('ckpts'), 'Error: no checkpoint directory found!'
        checkpoint = torch.load('ckpts/best.pth.tar')
        net.load_state_dict(checkpoint['net'])
        global best_loss
        global global_step
        best_loss = checkpoint['test_loss']
        start_epoch = checkpoint['epoch']
        global_step = start_epoch * len(trainset)

    loss_fn = util.NLLLoss().to(device)
    optimizer = optim.Adam(net.parameters(), lr=args.lr)
    scheduler = sched.LambdaLR(optimizer, lambda s: min(1., s / args.warm_up))

    for epoch in range(start_epoch, start_epoch + args.num_epochs):
        train(epoch, net, trainloader, device, optimizer, scheduler, loss_fn, args.max_grad_norm)
        test(epoch, net, testloader, device, loss_fn, args.num_samples)
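# Illustration (not from the original script) of the warm-up schedule built
# above: LambdaLR multiplies the base learning rate by min(1, step / warm_up),
# so the effective rate ramps linearly from 0 to args.lr over the first
# `warm_up` scheduler steps and stays constant afterwards. Toy values below.
import torch
import torch.optim as optim
import torch.optim.lr_scheduler as sched

def warmup_demo(warm_up=4, base_lr=1e-3, num_steps=6):
    params = [torch.nn.Parameter(torch.zeros(1))]
    optimizer = optim.Adam(params, lr=base_lr)
    scheduler = sched.LambdaLR(optimizer, lambda s: min(1., s / warm_up))
    for step in range(num_steps):
        # with warm_up=4: 0.0, 2.5e-4, 5.0e-4, 7.5e-4, 1.0e-3, 1.0e-3
        print(step, optimizer.param_groups[0]['lr'])
        optimizer.step()
        scheduler.step()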
def main(args):
    # Set up experiment tracking and main device
    wandb.init(project='dlp-lab7-task1-nf')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Set random seeds
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    trainset = ICLEVRLoader(mode="train")
    print('trainset: ', trainset)
    datasetDir_path = '/home/arg/courses/machine_learning/homework/deep_learning_and_practice/Lab7/dataset/task_1'
    datasetImgDir_path = '/home/arg/courses/machine_learning/homework/deep_learning_and_practice/Lab7/dataset/task_1/images'
    testset = Lab7_Dataset(img_path=datasetImgDir_path, json_path=os.path.join(datasetDir_path, 'test.json'))
    print('testset: ', testset)
    trainloader = data.DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers)

    # Model
    print('Building model..')
    net = Glow(num_channels=args.num_channels,
               num_levels=args.num_levels,
               num_steps=args.num_steps)
    net = net.to(device)
    wandb.watch(net)
    # if device == 'cuda':
    #     net = torch.nn.DataParallel(net, args.gpu_ids)
    #     cudnn.benchmark = args.benchmark

    start_epoch = 1
    # if args.resume:
    #     # Load checkpoint.
    #     print('Resuming from checkpoint at ckpts/best.pth.tar...')
    #     assert os.path.isdir('ckpts'), 'Error: no checkpoint directory found!'
    #     checkpoint = torch.load('ckpts/best.pth.tar')
    #     net.load_state_dict(checkpoint['net'])
    #     global best_loss
    #     global global_step
    #     best_loss = checkpoint['test_loss']
    #     start_epoch = checkpoint['epoch']
    #     global_step = start_epoch * len(trainset)

    loss_fn = util.NLLLoss().to(device)
    optimizer = optim.Adam(net.parameters(), lr=args.lr)
    scheduler = sched.LambdaLR(optimizer, lambda s: min(1., s / args.warm_up))

    train(args.num_epochs, net, trainloader, device, optimizer, scheduler, loss_fn, args.max_grad_norm)
def main(args):
    # Set up main device and scale batch size
    device = 'cuda' if torch.cuda.is_available() and args.gpu_ids else 'cpu'
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seeds
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    trainloader = data.DataLoader(ICLEVRLoader('./'), batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers)
    test_condition = get_iCLEVR_data('./', 'test')[1]
    test_condition = torch.Tensor(test_condition).float()
    test_condition = test_condition.to(device)

    # Model
    print('Building model..')
    net = Glow(num_channels=args.num_channels,
               num_levels=args.num_levels,
               num_steps=args.num_steps,
               img_shape=(3, 64, 64),
               mode=args.mode)
    net = net.to(device)

    evaluator = evaluation_model()
    loss_fn = util.NLLLoss().to(device)
    optimizer = optim.Adam(net.parameters(), lr=args.lr)
    scheduler = sched.LambdaLR(optimizer, lambda s: min(1., s / args.warm_up))

    start_epoch = 0
    if args.resume:
        # Load checkpoint.
        print('Resuming from checkpoint')
        checkpoint = torch.load('savemodel/cINN/checkpoint_18.tar')
        net.load_state_dict(checkpoint['net'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        global best_loss
        global global_step
        # best_loss = checkpoint['test_loss']
        start_epoch = checkpoint['epoch']
        global_step = start_epoch * len(trainloader.dataset)

    score_list = []
    for epoch in range(start_epoch, start_epoch + args.num_epochs):
        train(epoch, net, trainloader, device, optimizer, scheduler, loss_fn, args.max_grad_norm)
        # test(epoch, net, test_condition, device, loss_fn, args.mode)
        score = test(epoch, net, test_condition, device, evaluator)
        score_list.append(score)

    score_list = np.asarray(score_list)
    print('Best epoch: %d\nBest score: %f' % (np.argmax(score_list), np.max(score_list)))
def eval(model, embedder, test_loader, opt, writer, device=None):
    print("EVALUATING ON VAL")
    model = model.eval()
    bpd = 0.0
    loss_fn = util.NLLLoss().to(device)
    for i, (imgs, labels, captions) in tqdm(enumerate(test_loader)):
        imgs = imgs.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            if opt.conditioning == 'unconditional':
                condition_embd = None
            else:
                condition_embd = embedder(labels, captions)
            # outputs = model.forward(imgs, condition_embd)
            # loss = outputs['loss'].mean()
            z, sldj = model.forward(imgs, condition_embd, reverse=False)
            loss = loss_fn(z, sldj) / np.prod(imgs.size()[1:])
            bpd += loss / np.log(2)
    bpd /= len(test_loader)
    print("VAL bpd : {}".format(bpd))
    return bpd
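# Bookkeeping behind the bpd numbers above (an illustration of the arithmetic,
# not a project utility): loss_fn(z, sldj) is treated as negative
# log-likelihood in nats per image, so dividing by the number of input
# dimensions gives nats/dim, and a further division by ln(2) gives bits/dim,
# matching the `/ np.prod(imgs.size()[1:])` and `/ np.log(2)` steps in eval().
import numpy as np

def nll_to_bits_per_dim(nll_nats_per_image, img_shape=(3, 32, 32)):
    num_dims = float(np.prod(img_shape))
    return nll_nats_per_image / num_dims / np.log(2)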
def main(args, train):
    # Set up main device and scale batch size
    device = 'cuda' if torch.cuda.is_available() and args.gpu_ids else 'cpu'
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seeds
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # No normalization applied, since model expects inputs in (0, 1)
    transform_train = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor()
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor()
    ])

    trainset = torchvision.datasets.CIFAR10(root='data', train=True, download=True, transform=transform_train)
    trainloader = data.DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers)

    testset = torchvision.datasets.CIFAR10(root='data', train=False, download=True, transform=transform_test)
    testloader = data.DataLoader(testset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)

    # Model
    print('Building model..')
    net = FlowPlusPlus(scales=[(0, 4), (2, 3)],
                       in_shape=(3, 32, 32),
                       mid_channels=args.num_channels,
                       num_blocks=args.num_blocks,
                       num_dequant_blocks=args.num_dequant_blocks,
                       num_components=args.num_components,
                       use_attn=args.use_attn,
                       drop_prob=args.drop_prob)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net, args.gpu_ids)
        cudnn.benchmark = args.benchmark

    start_epoch = 0
    if args.resume:
        # Load checkpoint.
        print('Resuming from checkpoint at save/best.pth.tar...')
        assert os.path.isdir('save'), 'Error: no checkpoint directory found!'
        checkpoint = torch.load('save/best.pth.tar')
        net.load_state_dict(checkpoint['net'])
        global best_loss
        global global_step
        best_loss = checkpoint['test_loss']
        start_epoch = checkpoint['epoch']
        global_step = start_epoch * len(trainset)

    loss_fn = util.NLLLoss().to(device)
    param_groups = util.get_param_groups(net, args.weight_decay, norm_suffix='weight_g')
    optimizer = optim.Adam(param_groups, lr=args.lr)
    warm_up = args.warm_up * args.batch_size
    scheduler = sched.LambdaLR(optimizer, lambda s: min(1., s / warm_up))

    for epoch in range(start_epoch, start_epoch + args.num_epochs):
        # train(epoch, net, trainloader, device, optimizer, scheduler, loss_fn, args.max_grad_norm)
        train(epoch, net, trainloader, device, optimizer, loss_fn, args.max_grad_norm, args, scheduler)
def main(args):
    # Set up main device and scale batch size
    device = 'cuda' if torch.cuda.is_available() and args.gpu_ids else 'cpu'
    args.batch_size *= max(1, len(args.gpu_ids))
    torch.autograd.set_detect_anomaly(True)

    # Set random seeds
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # No normalization applied, since model expects inputs in (0, 1)
    transform_train = transforms.Compose([
        # transforms.RandomHorizontalFlip(),
        transforms.ToTensor()
    ])
    transform_test = transforms.Compose([transforms.ToTensor()])

    trainset = torchvision.datasets.CIFAR10(root='data', train=True, download=True, transform=transform_train)
    trainloader = data.DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers)
    # testset must stay defined: testloader below and test() in the epoch loop use it
    testset = torchvision.datasets.CIFAR10(root='data', train=False, download=True, transform=transform_test)
    testloader = data.DataLoader(testset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)

    ################################################################
    # Load CelebA instead of CIFAR10:
    image_size = 32
    batch_size = 16
    workers = 4
    # transforms_celeb = transforms.Compose([
    #     transforms.Resize(image_size),
    #     transforms.CenterCrop(image_size),
    #     transforms.ToTensor()
    # ])
    # dataroot_train = r"./data/train"
    # dataroot_test = r"./data/validation"
    # trainset = torchvision.datasets.ImageFolder(root=dataroot_train, transform=transforms_celeb)
    # trainloader = data.DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers)
    # testset = torchvision.datasets.ImageFolder(root=dataroot_test, transform=transforms_celeb)
    # testloader = data.DataLoader(testset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)

    # trainset = datasets.MNIST(root='./data', train=True, download=True, transform=transforms_celeb)
    # trainloader = data.DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers)
    # testset = datasets.MNIST(root='./data', train=False, download=True, transform=transforms_celeb)
    # testloader = data.DataLoader(testset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)

    import matplotlib.pyplot as plt

    # def imshow(img):
    #     img = img / 2 + 0.5
    #     npimg = img.numpy()
    #     plt.imshow(np.transpose(npimg, (1, 2, 0)))
    #     plt.show()
    # dataiter = iter(trainloader)
    # images = dataiter.next()
    # show images
    # print(images[0])
    # imshow(torchvision.utils.make_grid(images[0]))

    # Model
    print('Building model..')
    net = FlowPlusPlus(scales=[(0, 4), (2, 3)],
                       in_shape=(3, 32, 32),  # 3 channels to match the active CIFAR-10 loaders above
                       mid_channels=args.num_channels,
                       num_blocks=args.num_blocks,
                       num_dequant_blocks=args.num_dequant_blocks,
                       num_components=args.num_components,
                       use_attn=args.use_attn,
                       drop_prob=args.drop_prob)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net, args.gpu_ids)
        cudnn.benchmark = args.benchmark

    start_epoch = 0
    if args.resume:
        # Load checkpoint.
        print('Resuming from checkpoint at save/best.pth.tar...')
        assert os.path.isdir('save'), 'Error: no checkpoint directory found!'
        checkpoint = torch.load('save/best.pth.tar')
        net.load_state_dict(checkpoint['net'])
        global best_loss
        global global_step
        best_loss = checkpoint['test_loss']
        start_epoch = checkpoint['epoch']
        global_step = start_epoch * len(trainset)

    loss_fn = util.NLLLoss().to(device)
    param_groups = util.get_param_groups(net, args.weight_decay, norm_suffix='weight_g')
    optimizer = optim.Adam(param_groups, lr=args.lr)
    warm_up = args.warm_up * args.batch_size
    scheduler = sched.LambdaLR(optimizer, lambda s: min(1., s / warm_up))

    for epoch in range(start_epoch, start_epoch + args.num_epochs):
        train(epoch, net, trainloader, device, optimizer, scheduler, loss_fn, args.max_grad_norm)
        test(epoch, net, testloader, device, loss_fn, args.num_samples, args.save_dir)
def train(model, embedder, optimizer, scheduler, train_loader, val_loader, opt, writer, device=None):
    print("TRAINING STARTS")
    global global_step
    for epoch in range(opt.n_epochs):
        print("[Epoch %d/%d]" % (epoch + 1, opt.n_epochs))
        model = model.train()
        loss_to_log = 0.0
        loss_fn = util.NLLLoss().to(device)
        with tqdm(total=len(train_loader.dataset)) as progress_bar:
            for i, (imgs, labels, captions) in enumerate(train_loader):
                start_batch = time.time()
                imgs = imgs.to(device)
                labels = labels.to(device)
                with torch.no_grad():
                    if opt.conditioning == 'unconditional':
                        condition_embd = None
                    else:
                        condition_embd = embedder(labels, captions)

                optimizer.zero_grad()
                # outputs = model.forward(imgs, condition_embd)
                # loss = outputs['loss'].mean()
                # loss.backward()
                # optimizer.step()
                z, sldj = model.forward(imgs, condition_embd, reverse=False)
                loss = loss_fn(z, sldj) / np.prod(imgs.size()[1:])
                loss.backward()
                if opt.max_grad_norm > 0:
                    util.clip_grad_norm(optimizer, opt.max_grad_norm)
                optimizer.step()
                scheduler.step(global_step)

                batches_done = epoch * len(train_loader) + i
                writer.add_scalar('train/bpd', loss / np.log(2), batches_done)
                loss_to_log += loss.item()
                # if (i + 1) % opt.print_every == 0:
                #     loss_to_log = loss_to_log / (np.log(2) * opt.print_every)
                #     print(
                #         "[Epoch %d/%d] [Batch %d/%d] [bpd: %f] [Time/batch %.3f]"
                #         % (epoch + 1, opt.n_epochs, i + 1, len(train_loader), loss_to_log, time.time() - start_batch)
                #     )
                progress_bar.set_postfix(bpd=(loss_to_log / np.log(2)), lr=optimizer.param_groups[0]['lr'])
                progress_bar.update(imgs.size(0))
                global_step += imgs.size(0)
                loss_to_log = 0.0

                if (batches_done + 1) % opt.sample_interval == 0:
                    print("sampling_images")
                    model = model.eval()
                    sample_image(model, embedder, opt.output_dir, n_row=4,
                                 batches_done=batches_done, dataloader=val_loader, device=device)

        val_bpd = eval(model, embedder, val_loader, opt, writer, device=device)
        writer.add_scalar("val/bpd", val_bpd, (epoch + 1) * len(train_loader))
        torch.save(model.state_dict(),
                   os.path.join(opt.output_dir, 'models', 'epoch_{}.pt'.format(epoch)))
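# util.clip_grad_norm is not shown in this file; a plausible equivalent (an
# assumption, not the project's actual implementation) clips the global
# gradient norm of every parameter the optimizer manages:
import torch

def clip_grad_norm(optimizer, max_norm, norm_type=2.0):
    parameters = [p for group in optimizer.param_groups for p in group['params']]
    torch.nn.utils.clip_grad_norm_(parameters, max_norm, norm_type=norm_type)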
# Strip the 'module.' prefix that DataParallel adds to parameter names, so the
# checkpoint loads into a non-DataParallel model.
net.load_state_dict({
    k.replace('module.', ''): v
    for k, v in torch.load("ckpts/-1.pth.tar")['net'].items()
})
net.eval()

# testset = dataset(-2, transform, test=True, rotation_data=True)
testset = torchvision.datasets.CIFAR10(root='dataset/cifar10-torchvision', train=False, download=True, transform=transform)
# testset = imagenet_val(transform)
testloader = data.DataLoader(testset, batch_size=64, shuffle=False, num_workers=8)

loss_fn = util.NLLLoss().to(device)
loss_meter = util.AverageMeter()
bpd_sum = 0
n = 0
for x, _ in testloader:
    x = x.to(device)  # move inputs to the same device as the model
    z, sldj = net(x, reverse=False)
    loss = loss_fn(z, sldj)
    loss_meter.update(loss.item(), x.size(0))
    n += 1
    bpd_sum += util.bits_per_dim(x, loss_meter.avg)
    # print(util.bits_per_dim(x, loss_meter.avg))
    # print(bpd_sum / n)
print(bpd_sum / n)

for i in range(3):