Example #1
import json

# set_seed, train_val_split_dataset, get_model_vocab, get_train_dataset and
# get_trainer are project-local helpers from the surrounding module.


def main(args):
    with open(args.config_path, 'r') as json_file:
        config = json.load(json_file)

    seed = config['seed']
    validation_size = config['validation_size']
    n_epochs = config['n_epochs']
    linear_scheduler = config['linear_scheduler']
    train_batch_size = config['train_batch_size']
    train_batch_split = config['train_batch_split']
    test_batch_size = config['test_batch_size']
    save_last = config['save_last']
    save_best = config['save_best']
    neg_sample = config['neg_sample']
    train_data_path = config['train_data_path']
    validation_data_path = config['validation_data_path']

    set_seed(seed)
    train_val_split_dataset(args.data_path, train_data_path,
                            validation_data_path, validation_size, seed)
    model, vocab = get_model_vocab(config)
    train_dataset = get_train_dataset(train_data_path, vocab,
                                      model.n_pos_embeddings, neg_sample)
    val_dataset = get_train_dataset(validation_data_path, vocab,
                                    model.n_pos_embeddings)

    if linear_scheduler:
        # decay the learning rate linearly to zero over the total number of training steps
        config['lr_decay'] = 1 / (
            n_epochs *
            (len(train_dataset) + train_batch_size - 1) // train_batch_size)
    trainer = get_trainer(config, model)

    trainer.train(train_data=train_dataset,
                  n_epochs=n_epochs,
                  train_batch_size=train_batch_size,
                  train_batch_split=train_batch_split,
                  test_data=val_dataset,
                  test_batch_size=test_batch_size,
                  save_last=save_last,
                  save_best=save_best)
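
The snippet reads every hyperparameter from a JSON file at args.config_path. For reference, a minimal config covering the keys read above might look like the following sketch; the values are illustrative only, and get_model_vocab and get_trainer may consume additional keys not shown here.

import json

config = {
    "seed": 42,
    "validation_size": 0.1,
    "n_epochs": 10,
    "linear_scheduler": True,
    "train_batch_size": 64,
    "train_batch_split": 1,
    "test_batch_size": 64,
    "save_last": True,
    "save_best": True,
    "neg_sample": 5,
    "train_data_path": "data/train.txt",
    "validation_data_path": "data/valid.txt",
}
with open("config.json", "w") as f:
    json.dump(config, f, indent=2)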
Example #2

import os

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

BATCH_SIZE = 64  # not defined in the source snippet; an assumed value

CUDA = torch.cuda.is_available()

data_dir = "E:/data/dog-breed-identification/"
train_dir = os.path.join(data_dir, 'train')
test_dir = os.path.join(data_dir, 'test')
labels_csv_path = os.path.join(data_dir, 'labels.csv')
data_train_csv = pd.read_csv(labels_csv_path)
filenames = data_train_csv.id.values
le = LabelEncoder()
labels = le.fit_transform(data_train_csv.breed)

filenames_train, filenames_val, labels_train, labels_val = \
    train_test_split(filenames, labels, test_size=0.1, stratify=labels)

dog_train = get_train_dataset(filenames_train,
                              labels_train,
                              BATCH_SIZE,
                              rootdir=train_dir)
dog_val = get_train_dataset(filenames_val,
                            labels_val,
                            BATCH_SIZE,
                            rootdir=train_dir)

net = get_resnet50(n_class=len(le.classes_))
criterion_train = nn.CrossEntropyLoss()
criterion_val = nn.CrossEntropyLoss()

optimizer = optim.Adam(net.fc.parameters(),
                       lr=0.0001)  # train only the new classifier head
state = {'val_acc': [], 'lives': 4, 'best_val_acc': 0}

if CUDA:  # the source snippet is truncated here; moving the model to the GPU is the natural next step
    net = net.cuda()
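
get_train_dataset itself is not shown in this snippet. A minimal factory consistent with the calls above (filenames, labels, batch size, rootdir keyword) might look like the sketch below; the 224x224 resize and the .jpg extension are assumptions, not taken from the source.

import os

from PIL import Image
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms


class DogBreedDataset(Dataset):
    def __init__(self, filenames, labels, rootdir):
        self.filenames, self.labels, self.rootdir = filenames, labels, rootdir
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),  # assumed input size for ResNet-50
            transforms.ToTensor(),
        ])

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, idx):
        path = os.path.join(self.rootdir, self.filenames[idx] + '.jpg')
        image = self.transform(Image.open(path).convert('RGB'))
        return image, int(self.labels[idx])


def get_train_dataset(filenames, labels, batch_size, rootdir):
    return DataLoader(DogBreedDataset(filenames, labels, rootdir),
                      batch_size=batch_size, shuffle=True)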
Example #3
import os

import tensorflow as tf

from utils import make_train_patches, get_train_dataset
# get_unet is assumed to come from a local module alongside utils

PATCH_SIZE = 128
DROPOUT_RATE = 0.2
STEP = 16
ALPHA = 0.001
BATCH_SIZE = 32
LEARNING_RATE = 1e-4
LOG_NUM = 1
EPOCHS = 10

# set paths
train_data_dir = '../data/train_data'
validation_data_dir = '../data/validation_data'

train_dataset = get_train_dataset(train_data_dir, PATCH_SIZE, STEP, BATCH_SIZE,
                                  ALPHA)
# validation uses a stride equal to PATCH_SIZE, i.e. non-overlapping patches
validation_dataset = get_train_dataset(validation_data_dir, PATCH_SIZE,
                                       PATCH_SIZE, BATCH_SIZE, ALPHA)

# get unet model
model = get_unet(PATCH_SIZE, DROPOUT_RATE)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# set checkpoint
ckpt_dir = 'ckpt_{}'.format(LOG_NUM)
os.makedirs(ckpt_dir, exist_ok=True)
ckpt_file = os.path.join(ckpt_dir, 'cp.ckpt')

# the source is truncated mid-call here; save_weights_only and verbose are assumed completions
ckpt_callback = tf.keras.callbacks.ModelCheckpoint(filepath=ckpt_file,
                                                   save_weights_only=True,
                                                   verbose=1)
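
Wiring the callback into training would typically look like this (a sketch, assuming get_train_dataset returns a tf.data pipeline or another Keras-compatible input):

model.fit(train_dataset,
          validation_data=validation_dataset,
          epochs=EPOCHS,
          callbacks=[ckpt_callback])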
Example #4
	def __init__(self, CONFIG):

		if CONFIG.mode==1:	## Training
		
			gen_model = get_model('G', CONFIG.gen_model)
			dis_model = get_model('D', CONFIG.dis_model)
			VGG = tl.models.vgg19(pretrained=True, end_with='pool4', mode='static')

			lr_init = 1e-4
			lr_v = tf1.Variable(lr_init)
			beta1 = 0.9
			n_epoch_init = CONFIG.init_epoch  # e.g., 20
			n_epoch = CONFIG.total_epoch  # e.g., 100
			batch_size = CONFIG.batch_size  # e.g., 8
			decay_every = int(n_epoch / 2)
			lr_decay = 0.1
			resume_epoch = 0
			 
			if CONFIG.load_weights:
			
				resume_epoch = CONFIG.model_epoch
			
				if CONFIG.gan_init:
					gen_model.load_weights('Checkpoints/GAN_INIT_{}_EPID_{}.h5'.format(CONFIG.gen_model, CONFIG.model_epoch))
					resume_epoch = 0
				else:	
					gen_model.load_weights('Checkpoints/GAN_{}_EPID_{}.h5'.format(CONFIG.gen_model, CONFIG.model_epoch))
					dis_model.load_weights('Checkpoints/DIS_{}_GAN_{}_EPID_{}.h5'.format(CONFIG.dis_model, CONFIG.gen_model, CONFIG.model_epoch))
				
		
			g_optimizer_init = tf2.optimizers.Adam(lr_v, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
			g_optimizer = tf2.optimizers.Adam(lr_v, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
			d_optimizer = tf2.optimizers.Adam(lr_v, beta_1=0.9, beta_2=0.999, epsilon=1e-07)

			gen_model.train()
			dis_model.train()
			VGG.train()

			train_ds = get_train_dataset(CONFIG)

			if not CONFIG.load_weights or CONFIG.gan_init:
			
				print('##	initial learning (G)')
				
				for epoch in range(n_epoch_init):
					for step, (lr_patchs, hr_patchs) in enumerate(train_ds):
						if lr_patchs.shape[0] != batch_size:  
							break
						step_time = time.time()
						with tf1.GradientTape() as tape:
							out_bicu = generate_bicubic_samples(lr_patchs.numpy(), CONFIG)
							#print( lr_patchs.shape, out_bicu.shape)
							fake_patchs = gen_model([lr_patchs, out_bicu])
							
							mse_loss = tl.cost.mean_squared_error(fake_patchs, hr_patchs, is_mean=True)
						grad = tape.gradient(mse_loss, gen_model.trainable_weights)
						g_optimizer_init.apply_gradients(zip(grad, gen_model.trainable_weights))
						print("Epoch: [{}/{}] step: [{}/{}] time: {:.3f}s, mse: {:.6f} ".format(
						epoch+1+resume_epoch, resume_epoch+n_epoch_init, step+1, CONFIG.no_of_batches, time.time() - step_time, mse_loss))
					
					path = 'Training_outputs/gan_init_{}_train_{}.png'.format(CONFIG.gen_model, epoch+1+resume_epoch)
					tl.vis.save_images(fake_patchs.numpy(), [2, CONFIG.batch_size//2], path)
					
					if ((epoch+1+resume_epoch) % CONFIG.save_interval) == 0:
						gen_model.save_weights('Checkpoints/GAN_INIT_{}_EPID_{}.h5'.format(CONFIG.gen_model, epoch+1+resume_epoch))
				
				gen_model.save_weights('Checkpoints/GAN_INIT_{}_EPID_{}.h5'.format(CONFIG.gen_model, n_epoch_init + resume_epoch))
			
			
			
			print('##	adversarial learning (G, D)')
				
			for epoch in range(n_epoch):

				for step, (lr_patchs, hr_patchs) in enumerate(train_ds):
					if lr_patchs.shape[0] != batch_size: # if the remaining data in this epoch < batch_size
						break
					step_time = time.time()
					with tf1.GradientTape(persistent=True) as tape:
		      
						#out_bicu = generate_bicubic_samples(np.squeeze(lr_patchs,axis=3), CONFIG.scale)
						out_bicu = generate_bicubic_samples(lr_patchs.numpy(), CONFIG)
						#print( lr_patchs.shape, out_bicu.shape)
						fake_patchs = gen_model([lr_patchs, out_bicu])

						logits_fake = dis_model(fake_patchs)
						logits_real = dis_model(hr_patchs)

						feature_fake = VGG((fake_patchs+1)/2.) # the pre-trained VGG uses the input range of [0, 1]
						feature_real = VGG((hr_patchs+1)/2.)
		      
						d_loss1 = tl.cost.sigmoid_cross_entropy(logits_real, tf1.ones_like(logits_real))
						d_loss2 = tl.cost.sigmoid_cross_entropy(logits_fake, tf1.zeros_like(logits_fake))
						d_loss = d_loss1 + d_loss2
			      
						g_gan_loss = 1e-3 * tl.cost.sigmoid_cross_entropy(logits_fake, tf1.ones_like(logits_fake))
						mse_loss = tl.cost.mean_squared_error(fake_patchs, hr_patchs, is_mean=True)

						vgg_loss = 2e-6 * tl.cost.mean_squared_error(feature_fake, feature_real, is_mean=True)
						g_loss = mse_loss + vgg_loss + g_gan_loss

					grad = tape.gradient(g_loss, gen_model.trainable_weights)
					g_optimizer.apply_gradients(zip(grad, gen_model.trainable_weights))
					grad = tape.gradient(d_loss, dis_model.trainable_weights)
					d_optimizer.apply_gradients(zip(grad, dis_model.trainable_weights))
					print("Epoch: [{}/{}] step: [{}/{}] time: {:.3f}s, g_loss(mse:{:.6f}, vgg:{:.6f}, adv:{:.6f}) d_loss: {:.6f}".format(
					epoch+1+resume_epoch, resume_epoch + n_epoch, step+1, CONFIG.no_of_batches, time.time() - step_time, mse_loss, vgg_loss, g_gan_loss, d_loss))
					
				# update the learning rate
				'''if (epoch+resume_epoch) % decay_every == 0:
					new_lr_decay = lr_decay**((epoch+resume_epoch)// decay_every)
					lr_v.assign(lr_init * new_lr_decay)
					log = " ** new learning rate: %f (for GAN)" % (lr_init * new_lr_decay)
					print(log)
				'''
				if (epoch+1+resume_epoch)%  CONFIG.save_interval == 0:
					gen_model.save_weights('Checkpoints/GAN_{}_EPID_{}.h5'.format(CONFIG.gen_model, epoch+1+resume_epoch))
					dis_model.save_weights('Checkpoints/DIS_{}_GAN_{}_EPID_{}.h5'.format(CONFIG.dis_model, CONFIG.gen_model, epoch+1+resume_epoch))
					print("Save time: {}content".format(time.asctime( time.localtime(time.time()))))
					for i in range(CONFIG.batch_size):
						if CONFIG.gen_model==1:
							lrimg = np.squeeze(lr_patchs[i], axis =-1)
							lrimg = np.pad(lrimg, ((64, 64), (64, 64)), constant_values=(255.0))
							#opimg = cast_uint8(fake_patchs[i].numpy())
							opimg = fake_patchs[i].numpy()
							combine_imgs = np.concatenate((lrimg[:,:,np.newaxis], out_bicu[i], opimg, hr_patchs[i]), axis = 1)
						else:
							lrimg = np.pad(lr_patchs[i], ((192, 192), (192, 192), (0, 0)), constant_values=(255.0))
							#opimg = cast_uint8(fake_patchs[i].numpy())
							opimg = fake_patchs[i].numpy()
							combine_imgs = np.concatenate((lrimg, out_bicu[i], opimg, hr_patchs[i]), axis = 1)
						path = 'Training_outputs/id_{}_gan_{}_train_{}.png'.format(i+1, CONFIG.gen_model, epoch+1+resume_epoch)
						tl.vis.save_image(combine_imgs,path)

			gen_model.save_weights('Checkpoints/GAN_{}_FINAL.h5'.format(CONFIG.gen_model))
			dis_model.save_weights('Checkpoints/DIS_{}_GAN_{}_FINAL.h5'.format(CONFIG.dis_model, CONFIG.gen_model))
					

				  

		elif CONFIG.mode==2:	## Validation

			model = get_model('G', CONFIG.gen_model)
			model.load_weights('Checkpoints/GAN_{}_EPID_{}.h5'.format(CONFIG.gen_model, CONFIG.model_epoch))
			model.eval()  ## disable dropout, batch norm moving avg ...

			save_time = time.time()
			
			## Reading Validation dataset
			lrimg_file_list = tl.files.load_file_list(path=CONFIG.dir_val_in, regx='.*.png', printable=False)
			hrimg_file_list = tl.files.load_file_list(path=CONFIG.dir_val_target, regx='.*.png', printable=False)
			lrimg_file_list.sort(key=tl.files.natural_keys)
			hrimg_file_list.sort(key=tl.files.natural_keys)
			lrimg_list = np.array(tl.vis.read_images(lrimg_file_list, path=CONFIG.dir_val_in, n_threads=32))
			hrimg_list = np.array(tl.vis.read_images(hrimg_file_list, path=CONFIG.dir_val_target, n_threads=32)) 
			
			if CONFIG.gen_model==1:
				lrimg_list = lrimg_list[:,:,:,np.newaxis] 
				hrimg_list = hrimg_list[:,:,:,np.newaxis]

			bcimg_list = generate_bicubic_samples(lrimg_list,CONFIG)
			opimg_list = model([tf1.cast(lrimg_list,tf1.float32), tf1.cast(bcimg_list,tf1.float32)]) 
			opimg_list = opimg_list.numpy()

			bicubic_psnr, model_psnr = PSNR(hrimg_list, bcimg_list, opimg_list)
			bicubic_ssim, model_ssim = SSIM(hrimg_list, bcimg_list, opimg_list)
			
			for i in range(lrimg_list.shape[0]):
				name = lrimg_file_list[i].split('/')[-1].split('.')[0]
				if CONFIG.gen_model==1:
					lrimg = np.pad(lrimg_list[i], ((64, 64), (64, 64),(0, 0)), constant_values=(255.0))
				else:
					lrimg = np.pad(lrimg_list[i], ((192, 192), (192, 192), (0, 0)), constant_values=(255.0))
				combine_imgs = np.concatenate((lrimg, bcimg_list[i], opimg_list[i], hrimg_list[i]), axis = 1)

				path = 'Validation_outputs/{}_gan_{}_val_{}.png'.format(name, CONFIG.gen_model, CONFIG.model_epoch)
				tl.vis.save_image(combine_imgs, path)
            
			print(np.stack((model_psnr, bicubic_psnr), axis=-1))
			print(np.stack((model_ssim, bicubic_ssim), axis=-1))
			print(np.subtract(model_psnr, bicubic_psnr))
			print('SUM(PSNR DIFF): {}'.format(np.sum(np.subtract(model_psnr, bicubic_psnr))))
			print('AVG MODEL PSNR: {}, AVG BICUBIC PSNR: {}'.format(np.sum(model_psnr)/lrimg_list.shape[0], np.sum(bicubic_psnr)/lrimg_list.shape[0]))
			print('SUM(SSIM DIFF): {}'.format(np.sum(np.subtract(model_ssim, bicubic_ssim))))
			print('AVG MODEL SSIM: {}, AVG BICUBIC SSIM: {}'.format(np.sum(model_ssim)/lrimg_list.shape[0], np.sum(bicubic_ssim)/lrimg_list.shape[0]))
			print((time.time()-save_time)/10)
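
PSNR and SSIM here are project helpers that score both the bicubic baseline and the model output against the ground truth and return one value per image. A PSNR helper with that call signature might be sketched as follows, assuming 8-bit image data; this is illustrative, not the author's implementation:

import numpy as np

def PSNR(hr_list, bc_list, op_list, max_val=255.0):
    # per-image PSNR = 10 * log10(MAX^2 / MSE), averaged over H, W, C
    def psnr(ref, est):
        mse = np.mean((ref.astype(np.float64) - est.astype(np.float64)) ** 2,
                      axis=(1, 2, 3))
        return 10.0 * np.log10(max_val ** 2 / mse)
    return psnr(hr_list, bc_list), psnr(hr_list, op_list)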
Example #5
def train(args):
    if not os.path.exists(args.out):
        os.makedirs(args.out)

    _iter = 0
    domA_train, domB_train = get_train_dataset(args)

    size = args.resize // 64
    dim = 512

    e_common = E_common(args.sep, size, dim=dim)
    e_separate_A = E_separate_A(args.sep, size)
    e_separate_B = E_separate_B(args.sep, size)
    decoder = Decoder(size, dim=dim)
    disc = Disc(args.sep, size, dim=dim)

    # float fill values: BCELoss targets and the L1 reference encodings must be floating point
    A_label = torch.full((args.bs, ), 1.0)
    B_label = torch.full((args.bs, ), 0.0)
    zero_encoding = torch.full((args.bs, args.sep * size * size), 0.0)
    one_encoding = torch.full((args.bs, args.sep * size * size), 1.0)
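    # Shape sketch (illustrative figures, not from the source): with resize=128,
    # size = 128 // 64 = 2, so each flag encoding holds args.sep * 2 * 2 values.
    # In the default branches below, zero_encoding fills the slot of the other
    # domain's separate code, acting as an explicit domain flag for the decoder.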

    l1 = nn.L1Loss()
    bce = nn.BCELoss()

    if torch.cuda.is_available():
        e_common = e_common.cuda()
        e_separate_A = e_separate_A.cuda()
        e_separate_B = e_separate_B.cuda()
        decoder = decoder.cuda()
        disc = disc.cuda()

        A_label = A_label.cuda()
        B_label = B_label.cuda()
        zero_encoding = zero_encoding.cuda()
        one_encoding = one_encoding.cuda()

        l1 = l1.cuda()
        bce = bce.cuda()

    ae_params = list(e_common.parameters()) + list(
        e_separate_A.parameters()) + list(e_separate_B.parameters()) + list(
            decoder.parameters())
    ae_optimizer = optim.Adam(ae_params, lr=args.lr, betas=(0.5, 0.999))

    disc_params = disc.parameters()
    disc_optimizer = optim.Adam(disc_params,
                                lr=args.disclr,
                                betas=(0.5, 0.999))

    if args.load != '':
        save_file = os.path.join(args.load, 'checkpoint')
        _iter = load_model(save_file, e_common, e_separate_A, e_separate_B,
                           decoder, ae_optimizer, disc, disc_optimizer)

    e_common = e_common.train()
    e_separate_A = e_separate_A.train()
    e_separate_B = e_separate_B.train()
    decoder = decoder.train()
    disc = disc.train()

    logger = Logger(args.out)

    print('Started training...')
    while True:
        domA_loader = torch.utils.data.DataLoader(domA_train,
                                                  batch_size=args.bs,
                                                  shuffle=True,
                                                  num_workers=2)
        domB_loader = torch.utils.data.DataLoader(domB_train,
                                                  batch_size=args.bs,
                                                  shuffle=True,
                                                  num_workers=2)
        if _iter >= args.iters:
            break

        for domA_img, domB_img in zip(domA_loader, domB_loader):

            if domA_img.size(0) != args.bs or domB_img.size(0) != args.bs:
                break

            # torch.autograd.Variable is a no-op since PyTorch 0.4; tensors are used directly

            if torch.cuda.is_available():
                domA_img = domA_img.cuda()
                domB_img = domB_img.cuda()
            domA_img = domA_img.view((-1, 3, args.resize, args.resize))
            domB_img = domB_img.view((-1, 3, args.resize, args.resize))

            ae_optimizer.zero_grad()

            A_common = e_common(domA_img)
            A_separate_A = e_separate_A(domA_img)
            A_separate_B = e_separate_B(domA_img)
            if args.no_flag:
                A_encoding = torch.cat([A_common, A_separate_A, A_separate_A],
                                       dim=1)
            else:
                A_encoding = torch.cat([A_common, A_separate_A, zero_encoding],
                                       dim=1)
            B_common = e_common(domB_img)
            B_separate_A = e_separate_A(domB_img)
            B_separate_B = e_separate_B(domB_img)

            if args.one_encoding:
                B_encoding = torch.cat([B_common, B_separate_B, one_encoding],
                                       dim=1)
            elif args.no_flag:
                B_encoding = torch.cat([B_common, B_separate_B, B_separate_B],
                                       dim=1)
            else:
                B_encoding = torch.cat([B_common, zero_encoding, B_separate_B],
                                       dim=1)

            A_decoding = decoder(A_encoding)
            B_decoding = decoder(B_encoding)

            A_reconstruction_loss = l1(A_decoding, domA_img)
            B_reconstruction_loss = l1(B_decoding, domB_img)

            A_separate_B_loss = l1(A_separate_B, zero_encoding)
            B_separate_A_loss = l1(B_separate_A, zero_encoding)

            logger.add_value('A_recon', A_reconstruction_loss)
            logger.add_value('B_recon', B_reconstruction_loss)
            logger.add_value('A_sep_B', A_separate_B_loss)
            logger.add_value('B_sep_A', B_separate_A_loss)

            loss = 0

            if args.reconweight > 0:
                loss += args.reconweight * (A_reconstruction_loss +
                                            B_reconstruction_loss)

            if args.zeroweight > 0:
                loss += args.zeroweight * (A_separate_B_loss +
                                           B_separate_A_loss)

            if args.discweight > 0:
                preds_A = disc(A_common)
                preds_B = disc(B_common)
                distribution_adverserial_loss = args.discweight * \
                                                (bce(preds_A, B_label) + bce(preds_B, B_label))
                logger.add_value('distribution_adverserial',
                                 distribution_adverserial_loss)
                loss += distribution_adverserial_loss

            loss.backward()
            torch.nn.utils.clip_grad_norm_(ae_params, 5)
            ae_optimizer.step()

            if args.discweight > 0:
                disc_optimizer.zero_grad()

                A_common = e_common(domA_img)
                B_common = e_common(domB_img)

                disc_A = disc(A_common)
                disc_B = disc(B_common)

                loss = bce(disc_A, A_label) + bce(disc_B, B_label)
                logger.add_value('dist_disc', loss)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(disc_params, 5)
                disc_optimizer.step()

            if _iter % args.progress_iter == 0:
                print('Outfile: %s <<>> Iteration %d' % (args.out, _iter))

            if _iter % args.log_iter == 0:
                logger.log(_iter)

            logger.reset()

            if _iter % args.display_iter == 0:
                e_common = e_common.eval()
                e_separate_A = e_separate_A.eval()
                e_separate_B = e_separate_B.eval()
                decoder = decoder.eval()
                save_imgs(args,
                          e_common,
                          e_separate_A,
                          e_separate_B,
                          decoder,
                          _iter,
                          size=size,
                          BtoA=True)
                save_imgs(args,
                          e_common,
                          e_separate_A,
                          e_separate_B,
                          decoder,
                          _iter,
                          size=size,
                          BtoA=False)
                save_stripped_imgs(args,
                                   e_common,
                                   e_separate_A,
                                   e_separate_B,
                                   decoder,
                                   _iter,
                                   size=size,
                                   A=True)
                save_stripped_imgs(args,
                                   e_common,
                                   e_separate_A,
                                   e_separate_B,
                                   decoder,
                                   _iter,
                                   size=size,
                                   A=False)

                e_common = e_common.train()
                e_separate_A = e_separate_A.train()
                e_separate_B = e_separate_B.train()
                decoder = decoder.train()

            if _iter % args.save_iter == 0:
                save_file = os.path.join(args.out, 'checkpoint')
                save_model(save_file, e_common, e_separate_A, e_separate_B,
                           decoder, ae_optimizer, disc, disc_optimizer, _iter)

            _iter += 1
Example #6
def main():

    print(colored('Configuration', 'blue'))
    args = parser.parse_args()
    p = create_config(args.config)
    global log_wandb
    log_wandb = args.wandb

    if not os.path.exists(args.checkpoint_dir):
        os.makedirs(args.checkpoint_dir)

    seed_init(args.seed)

    if log_wandb:
        print(colored('Using Wandb', 'blue'))
        now = datetime.now().strftime("%d-%b %H:%M")
        wandb.init(project="Face-Unlock", name=f"Run_{now}")
        config = wandb.config
        config.batch_size = p.batch_size
        config.epochs = p.epochs
        config.learning_rate = p.optimizer_kwargs.lr
        config.scheduler = p.scheduler
        config.fc_layer_size = p.fc_layer_size
        config.train_dataset = "LFW"
        config.architecture = p.backbone

    # dataset
    print(colored('Get dataset and dataloaders', 'blue'))
    train_dataset = get_train_dataset(p, args.data_dir)
    print(train_dataset)
    val_dataset = get_val_dataset(p, args.data_dir)
    val_loader = get_val_loader(p, val_dataset)

    # model
    print(colored('Get model', 'blue'))
    model = get_model(p)
    model.to(DEVICE)
    print(model)

    # Optimizer
    print(colored('Get optimizer', 'blue'))
    optimizer = get_optimizer(p, model)
    print(optimizer)

    # scheduler
    print(colored('Get scheduler', 'blue'))
    scheduler = get_scheduler(p, optimizer)
    print(scheduler)

    # Loss function
    criterion = triplet_loss.batch_hard_triplet_loss

    # checkpoint
    if args.resume is not None and os.path.exists(args.resume):
        print(colored('Loading checkpoint {} ...'.format(args.resume), 'blue'))
        checkpoint = torch.load(args.resume)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        print(
            colored(
                'Resuming from epoch {} with lr: {}'.format(
                    start_epoch,
                    optimizer.state_dict()["param_groups"][0]['lr']), 'blue'))

    else:
        print(
            colored(
                'No checkpoint at {}. Training from scratch.'.format(
                    args.resume), 'blue'))
        start_epoch = 0

    for epoch in range(start_epoch, p.epochs):

        epoch_loss = train(p, train_dataset, model, criterion, optimizer)
        scheduler.step()
        print(f"Epoch: {epoch} Loss: {epoch_loss:.3f}")
        if log_wandb:
            wandb.log(
                {
                    "epoch_loss": epoch_loss,
                    "lr": optimizer.state_dict()["param_groups"][0]['lr']
                },
                commit=True)

        if epoch % 5 == 0:
            tar, precision, accuracy, far, best_threshold = validate(
                model, val_loader)
            print(
                "Epoch: {}\nBest Threshold: {}\nTrue Acceptance: {:.3f}\nFalse Acceptance: {:.3f}\nPrecision: {:.3f}\nAccuracy: {:.3f}"
                .format(epoch, best_threshold, tar, far, precision, accuracy))

            # Save model checkpoint
            state = {
                'epoch': epoch + 1,
                'embedding_dimension': p.fc_layer_size,
                'batch_size_training': p.batch_size,
                'model_state_dict': model.state_dict(),
                'model_architecture': p.backbone,
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'best_distance_threshold': best_threshold,
                'accuracy': accuracy
            }
            now = datetime.now().strftime("%d-%b %H:%M")
            path = os.path.join(
                args.checkpoint_dir, 'model_{}_triplet_epoch_{}_{}.pt'.format(
                    p.backbone, epoch, now))
            print(colored(f'Saving checkpoint at {path}', 'blue'))
            torch.save(state, path)
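
batch_hard_triplet_loss comes from a local triplet_loss module and is not shown. For reference, a minimal batch-hard formulation (hardest positive and hardest negative per anchor, after Hermans et al. 2017) could be sketched in PyTorch as follows; the margin value and the argument order are assumptions:

import torch

def batch_hard_triplet_loss(labels, embeddings, margin=0.2):
    dist = torch.cdist(embeddings, embeddings, p=2)    # (N, N) pairwise distances
    same = labels.unsqueeze(0) == labels.unsqueeze(1)  # (N, N) same-identity mask
    not_self = ~torch.eye(len(labels), dtype=torch.bool, device=labels.device)
    hardest_pos = (dist * (same & not_self)).max(dim=1).values  # farthest positive
    hardest_neg = (dist + same * 1e9).min(dim=1).values         # closest negative
    return torch.relu(hardest_pos - hardest_neg + margin).mean()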
Example #7
def main(argv):
    del argv

    utils.make_output_dir(FLAGS.output_dir)
    data_processor = utils.DataProcessor()
    images = utils.get_train_dataset(data_processor, FLAGS.dataset,
                                     FLAGS.batch_size)

    logging.info('Learning rate: %f', FLAGS.learning_rate)

    # Construct optimizers.
    optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)

    # Create the networks and models.
    generator = utils.get_generator(FLAGS.dataset)
    metric_net = utils.get_metric_net(FLAGS.dataset, FLAGS.num_measurements)

    model = cs.CS(metric_net, generator, FLAGS.num_z_iters, FLAGS.z_step_size,
                  FLAGS.z_project_method)
    prior = utils.make_prior(FLAGS.num_latents)
    generator_inputs = prior.sample(FLAGS.batch_size)

    model_output = model.connect(images, generator_inputs)
    optimization_components = model_output.optimization_components
    debug_ops = model_output.debug_ops
    reconstructions, _ = utils.optimise_and_sample(generator_inputs,
                                                   model,
                                                   images,
                                                   is_training=False)

    global_step = tf.train.get_or_create_global_step()
    update_op = optimizer.minimize(optimization_components.loss,
                                   var_list=optimization_components.vars,
                                   global_step=global_step)

    sample_exporter = file_utils.FileExporter(
        os.path.join(FLAGS.output_dir, 'reconstructions'))

    # Hooks.
    debug_ops['it'] = global_step
    # Abort training on NaNs.
    nan_hook = tf.train.NanTensorHook(optimization_components.loss)
    # Step counter.
    step_counter_hook = tf.train.StepCounterHook()

    checkpoint_saver_hook = tf.train.CheckpointSaverHook(
        checkpoint_dir=utils.get_ckpt_dir(FLAGS.output_dir), save_secs=10 * 60)

    loss_summary_saver_hook = tf.train.SummarySaverHook(
        save_steps=FLAGS.summary_every_step,
        output_dir=os.path.join(FLAGS.output_dir, 'summaries'),
        summary_op=utils.get_summaries(debug_ops))

    hooks = [
        checkpoint_saver_hook, nan_hook, step_counter_hook,
        loss_summary_saver_hook
    ]

    if FLAGS.phase == 'train':
        # Start training.
        with tf.train.MonitoredSession(hooks=hooks) as sess:
            logging.info('starting training')

            for i in range(FLAGS.num_training_iterations):
                sess.run(update_op)

                if i % FLAGS.export_every == 0:
                    reconstructions_np, data_np = sess.run(
                        [reconstructions, images])
                    # Create an object which gets data and does the processing.
                    data_np = data_processor.postprocess(data_np)
                    reconstructions_np = data_processor.postprocess(
                        reconstructions_np)
                    sample_exporter.save(reconstructions_np, 'reconstructions')
                    sample_exporter.save(data_np, 'data')
    else:
        saver = tf.train.Saver()
        # Start testing
        with tf.Session() as sess:

            init_op = tf.global_variables_initializer()
            sess.run(init_op)

            print(" [*] Reading checkpoint...")
            checkpoint_dir = utils.get_ckpt_dir(FLAGS.output_dir)

            ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
            if ckpt and ckpt.model_checkpoint_path:
                ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
                saver.restore(sess, os.path.join(checkpoint_dir, ckpt_name))

            reconstructions_np, data_np = sess.run([reconstructions, images])
            # Create an object which gets data and does the processing.
            data_np = data_processor.postprocess(data_np)
            reconstructions_np = data_processor.postprocess(reconstructions_np)
            sample_exporter.save(reconstructions_np, 'reconstructions')
            sample_exporter.save(data_np, 'data')
Example #8
def main(cfg):

    os.makedirs(f"{cfg.output_dir}/fold{cfg.fold}/", exist_ok=True)

    # set random seed; a negative seed draws a fresh random one (useful when training on all data)
    if cfg.seed < 0:
        cfg.seed = np.random.randint(1_000_000)
    set_seed(cfg.seed)

    # set dataset, dataloader
    train = pd.read_csv(cfg.train_df)

    if cfg.fold == -1:
        val_df = train[train["fold"] == 0]
    else:
        val_df = train[train["fold"] == cfg.fold]
    train_df = train[train["fold"] != cfg.fold]

    train_dataset = get_train_dataset(train_df, cfg)
    val_dataset = get_val_dataset(val_df, cfg)

    train_dataloader = get_train_dataloader(train_dataset, cfg)
    val_dataloader = get_val_dataloader(val_dataset, cfg)

    if cfg.train_val is True:
        train_val_dataset = get_val_dataset(train_df, cfg)
        train_val_dataloader = get_val_dataloader(train_val_dataset, cfg)

    to_device_transform = ToDeviced(keys=("input", "target", "mask",
                                          "is_annotated"),
                                    device=cfg.device)
    cfg.to_device_transform = to_device_transform
    # set model

    model = RanzcrNet(cfg)
    model.to(cfg.device)

    # set optimizer, lr scheduler
    total_steps = len(train_dataset)

    optimizer = get_optimizer(model, cfg)
    scheduler = get_scheduler(cfg, optimizer, total_steps)

    # set other tools
    if cfg.mixed_precision:
        scaler = GradScaler()
    else:
        scaler = None

    writer = SummaryWriter(str(cfg.output_dir + f"/fold{cfg.fold}/"))

    # train and val loop
    step = 0
    i = 0
    val_loss = np.inf  # so the checkpoint check below is safe on epochs without eval
    best_val_loss = np.inf
    optimizer.zero_grad()
    for epoch in range(cfg.epochs):
        print("EPOCH:", epoch)
        gc.collect()
        if cfg.train is True:
            run_train(
                model=model,
                train_dataloader=train_dataloader,
                optimizer=optimizer,
                scheduler=scheduler,
                cfg=cfg,
                scaler=scaler,
                writer=writer,
                epoch=epoch,
                iteration=i,
                step=step,
            )

        if (epoch + 1) % cfg.eval_epochs == 0 or (epoch + 1) == cfg.epochs:
            val_loss = run_eval(
                model=model,
                val_dataloader=val_dataloader,
                cfg=cfg,
                writer=writer,
                epoch=epoch,
            )

        if cfg.train_val is True:
            if (epoch + 1) % cfg.eval_train_epochs == 0 or (epoch +
                                                            1) == cfg.epochs:
                train_val_loss = run_eval(model, train_val_dataloader, cfg,
                                          writer, epoch)
                print(f"train_val_loss {train_val_loss:.5}")

        if val_loss < best_val_loss:
            print(
                f"SAVING CHECKPOINT: val_loss {best_val_loss:.5} -> {val_loss:.5}"
            )
            best_val_loss = val_loss

            checkpoint = create_checkpoint(
                model,
                optimizer,
                epoch,
                scheduler=scheduler,
                scaler=scaler,
            )
            torch.save(
                checkpoint,
                f"{cfg.output_dir}/fold{cfg.fold}/checkpoint_best_seed{cfg.seed}.pth",
            )
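
run_train is defined elsewhere; when cfg.mixed_precision is set, its inner loop presumably follows the standard GradScaler pattern. A sketch of one such step, for reference only (the batch handling and the model returning a loss are illustrative):

from torch.cuda.amp import autocast

def train_step(model, batch, optimizer, scaler):
    optimizer.zero_grad()
    if scaler is not None:
        with autocast():               # run the forward pass in mixed precision
            loss = model(batch)
        scaler.scale(loss).backward()  # scale the loss to avoid fp16 underflow
        scaler.step(optimizer)         # unscale gradients, then optimizer.step()
        scaler.update()                # adapt the loss scale for the next step
    else:
        loss = model(batch)
        loss.backward()
        optimizer.step()
    return loss.item()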