def main(args):
    """Load a JSON training config, split the raw data, and launch training.

    Args:
        args: parsed CLI arguments providing ``config_path`` (path to the JSON
            config file) and ``data_path`` (raw dataset to split into
            train/validation).
    """
    with open(args.config_path, 'r') as json_file:
        config = json.load(json_file)

    seed = config['seed']
    validation_size = config['validation_size']
    n_epochs = config['n_epochs']
    linear_scheduler = config['linear_scheduler']
    train_batch_size = config['train_batch_size']
    train_batch_split = config['train_batch_split']
    test_batch_size = config['test_batch_size']
    save_last = config['save_last']
    save_best = config['save_best']
    neg_sample = config['neg_sample']
    train_data_path = config['train_data_path']
    validation_data_path = config['validation_data_path']

    set_seed(seed)
    train_val_split_dataset(args.data_path, train_data_path,
                            validation_data_path, validation_size, seed)

    model, vocab = get_model_vocab(config)
    train_dataset = get_train_dataset(train_data_path, vocab,
                                      model.n_pos_embeddings, neg_sample)
    val_dataset = get_train_dataset(validation_data_path, vocab,
                                    model.n_pos_embeddings)

    if linear_scheduler:
        # Decay the LR linearly to zero over the total number of optimizer
        # steps: n_epochs * ceil(len / batch_size).
        # BUG FIX: the original `n_epochs * (len + bs - 1) // bs` evaluates
        # left-to-right (`*` and `//` share precedence), computing
        # (n_epochs * (len + bs - 1)) // bs instead of the intended
        # n_epochs * ceil(len / bs); parenthesize the ceiling division.
        steps_per_epoch = (len(train_dataset) + train_batch_size - 1) // train_batch_size
        config['lr_decay'] = 1 / (n_epochs * steps_per_epoch)

    trainer = get_trainer(config, model)
    trainer.train(train_data=train_dataset,
                  n_epochs=n_epochs,
                  train_batch_size=train_batch_size,
                  train_batch_split=train_batch_split,
                  test_data=val_dataset,
                  test_batch_size=test_batch_size,
                  save_last=save_last,
                  save_best=save_best)
CUDA = torch.cuda.is_available() data_dir = "E:/data/dog-breed-identification/" train_dir = os.path.join(data_dir, 'train') test_dir = os.path.join(data_dir, 'test') data_train_csv = os.path.join(data_dir, 'labels.csv') data_train_csv = pd.read_csv(data_train_csv) filenames = data_train_csv.id.values le = LabelEncoder() labels = le.fit_transform(data_train_csv.breed) filenames_train, filenames_val, labels_train, labels_val = \ train_test_split(filenames, labels, test_size=0.1, stratify=labels) dog_train = get_train_dataset(filenames_train, labels_train, BATCH_SIZE, rootdir=train_dir) dog_val = get_train_dataset(filenames_val, labels_val, BATCH_SIZE, rootdir=train_dir) net = get_resnet50(n_class=len(le.classes_)) criterion_train = nn.CrossEntropyLoss() criterion_val = nn.CrossEntropyLoss() optimizer = optim.Adam(net.fc.parameters(), lr=0.0001) # use default learning rate state = {'val_acc': [], 'lives': 4, 'best_val_acc': 0} if CUDA:
from utils import make_train_patches, get_train_dataset PATCH_SIZE = 128 DROPOUT_RATE = 0.2 STEP = 16 ALPHA = 0.001 BATCH_SIZE = 32 LEARNING_RATE = 1e-4 LOG_NUM = 1 EPOCHS = 10 # set paths train_data_dir = '../data/train_data' validation_data_dir = '../data/validation_data' train_dataset = get_train_dataset(train_data_dir, PATCH_SIZE, STEP, BATCH_SIZE, ALPHA) validation_dataset = get_train_dataset(validation_data_dir, PATCH_SIZE, PATCH_SIZE, BATCH_SIZE, ALPHA) # get unet model model = get_unet(PATCH_SIZE, DROPOUT_RATE) model.compile(optimizer=tf.keras.optimizers.Adam(lr=LEARNING_RATE), loss='binary_crossentropy', metrics=['accuracy']) # set checkpoint ckpt_dir = 'ckpt_{}'.format(LOG_NUM) os.makedirs(ckpt_dir, exist_ok=True) ckpt_file = os.path.join(ckpt_dir, 'cp.ckpt') ckpt_callback = tf.keras.callbacks.ModelCheckpoint(filepath=ckpt_file,
def __init__(self, CONFIG):
    """Drive SRGAN training or validation depending on ``CONFIG.mode``.

    Mode 1: pretrain the generator with MSE (``gan_init``), then run
    adversarial training with MSE + VGG-perceptual + GAN losses.
    Mode 2: load a trained generator, super-resolve the validation set and
    report PSNR/SSIM against a bicubic baseline.
    """
    if CONFIG.mode == 1:
        gen_model = get_model('G', CONFIG.gen_model)
        dis_model = get_model('D', CONFIG.dis_model)
        # Pre-trained VGG19 (up to pool4) used as a fixed perceptual
        # feature extractor for the VGG loss.
        VGG = tl.models.vgg19(pretrained=True, end_with='pool4', mode='static')

        lr_init = 1e-4
        lr_v = tf1.Variable(lr_init)
        beta1 = 0.9
        n_epoch_init = CONFIG.init_epoch  # n_epoch_init = 20
        n_epoch = CONFIG.total_epoch  # n_epoch = 100
        batch_size = CONFIG.batch_size  # batch_size = 8
        # decay_every = int(n_epoch / 2)
        lr_decay = 0.1

        # Resume bookkeeping: epoch offset comes from the checkpoint id.
        resume_epoch = 0
        if CONFIG.load_weights:
            resume_epoch = CONFIG.model_epoch
            if CONFIG.gan_init:
                gen_model.load_weights('Checkpoints/GAN_INIT_{}_EPID_{}.h5'.format(CONFIG.gen_model, CONFIG.model_epoch))
                resume_epoch = 0
            else:
                gen_model.load_weights('Checkpoints/GAN_{}_EPID_{}.h5'.format(CONFIG.gen_model, CONFIG.model_epoch))
                dis_model.load_weights('Checkpoints/DIS_{}_GAN_{}_EPID_{}.h5'.format(CONFIG.dis_model, CONFIG.gen_model, CONFIG.model_epoch))

        g_optimizer_init = tf2.optimizers.Adam(lr_v, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
        g_optimizer = tf2.optimizers.Adam(lr_v, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
        d_optimizer = tf2.optimizers.Adam(lr_v, beta_1=0.9, beta_2=0.999, epsilon=1e-07)

        gen_model.train()
        dis_model.train()
        VGG.train()

        train_ds = get_train_dataset(CONFIG)

        # ---- Phase 1: MSE-only generator pretraining -------------------
        if not CONFIG.load_weights or CONFIG.gan_init:
            print('## initial learning (G)')
            for epoch in range(n_epoch_init):
                for step, (lr_patchs, hr_patchs) in enumerate(train_ds):
                    if lr_patchs.shape[0] != batch_size:
                        break
                    step_time = time.time()
                    with tf1.GradientTape() as tape:
                        out_bicu = generate_bicubic_samples(lr_patchs.numpy(), CONFIG)
                        # print(lr_patchs.shape, out_bicu.shape)
                        fake_patchs = gen_model([lr_patchs, out_bicu])
                        mse_loss = tl.cost.mean_squared_error(fake_patchs, hr_patchs, is_mean=True)
                    grad = tape.gradient(mse_loss, gen_model.trainable_weights)
                    g_optimizer_init.apply_gradients(zip(grad, gen_model.trainable_weights))
                    print("Epoch: [{}/{}] step: [{}/{}] time: {:.3f}s, mse: {:.6f} ".format(
                        epoch+1+resume_epoch, resume_epoch+n_epoch_init,
                        step+1, CONFIG.no_of_batches, time.time() - step_time, mse_loss))
                path = 'Training_outputs/gan_init_{}_train_{}.png'.format(CONFIG.gen_model, epoch+1+resume_epoch)
                tl.vis.save_images(fake_patchs.numpy(), [2, CONFIG.batch_size//2], path)
                if ((epoch+1+resume_epoch) % CONFIG.save_interval) == 0:
                    gen_model.save_weights('Checkpoints/GAN_INIT_{}_EPID_{}.h5'.format(CONFIG.gen_model, epoch+1+resume_epoch))
            gen_model.save_weights('Checkpoints/GAN_INIT_{}_EPID_{}.h5'.format(CONFIG.gen_model, n_epoch_init + resume_epoch))

        # ---- Phase 2: adversarial training (G and D jointly) -----------
        print('## adversarial learning (G, D)')
        for epoch in range(n_epoch):
            for step, (lr_patchs, hr_patchs) in enumerate(train_ds):
                if lr_patchs.shape[0] != batch_size:  # if the remaining data in this epoch < batch_size
                    break
                step_time = time.time()
                # persistent=True: the same tape yields gradients for both
                # the generator and the discriminator losses.
                with tf1.GradientTape(persistent=True) as tape:
                    # out_bicu = generate_bicubic_samples(np.squeeze(lr_patchs,axis=3), CONFIG.scale)
                    out_bicu = generate_bicubic_samples(lr_patchs.numpy(), CONFIG)
                    # print(lr_patchs.shape, out_bicu.shape)
                    fake_patchs = gen_model([lr_patchs, out_bicu])
                    logits_fake = dis_model(fake_patchs)
                    logits_real = dis_model(hr_patchs)
                    feature_fake = VGG((fake_patchs+1)/2.)  # the pre-trained VGG uses the input range of [0, 1]
                    feature_real = VGG((hr_patchs+1)/2.)
                    d_loss1 = tl.cost.sigmoid_cross_entropy(logits_real, tf1.ones_like(logits_real))
                    d_loss2 = tl.cost.sigmoid_cross_entropy(logits_fake, tf1.zeros_like(logits_fake))
                    d_loss = d_loss1 + d_loss2
                    g_gan_loss = 1e-3 * tl.cost.sigmoid_cross_entropy(logits_fake, tf1.ones_like(logits_fake))
                    mse_loss = tl.cost.mean_squared_error(fake_patchs, hr_patchs, is_mean=True)
                    vgg_loss = 2e-6 * tl.cost.mean_squared_error(feature_fake, feature_real, is_mean=True)
                    g_loss = mse_loss + vgg_loss + g_gan_loss
                grad = tape.gradient(g_loss, gen_model.trainable_weights)
                g_optimizer.apply_gradients(zip(grad, gen_model.trainable_weights))
                grad = tape.gradient(d_loss, dis_model.trainable_weights)
                d_optimizer.apply_gradients(zip(grad, dis_model.trainable_weights))
                print("Epoch: [{}/{}] step: [{}/{}] time: {:.3f}s, g_loss(mse:{:.6f}, vgg:{:.6f}, adv:{:.6f}) d_loss: {:.6f}".format(
                    epoch+1+resume_epoch, resume_epoch + n_epoch, step+1, CONFIG.no_of_batches,
                    time.time() - step_time, mse_loss, vgg_loss, g_gan_loss, d_loss))

            # update the learning rate
            '''if (epoch+resume_epoch) % decay_every == 0:
                new_lr_decay = lr_decay**((epoch+resume_epoch)// decay_every)
                lr_v.assign(lr_init * new_lr_decay)
                log = " ** new learning rate: %f (for GAN)" % (lr_init * new_lr_decay)
                print(log)
            '''
            # Periodic checkpoint plus a qualitative sample sheet per batch
            # element (LR | bicubic | generated | HR side by side).
            if (epoch+1+resume_epoch) % CONFIG.save_interval == 0:
                gen_model.save_weights('Checkpoints/GAN_{}_EPID_{}.h5'.format(CONFIG.gen_model, epoch+1+resume_epoch))
                dis_model.save_weights('Checkpoints/DIS_{}_GAN_{}_EPID_{}.h5'.format(CONFIG.dis_model, CONFIG.gen_model, epoch+1+resume_epoch))
                print("Save time: {}content".format(time.asctime(time.localtime(time.time()))))
                for i in range(CONFIG.batch_size):
                    if CONFIG.gen_model == 1:
                        # Single-channel model: drop/restore the channel axis
                        # around the pad so all panels share the same rank.
                        lrimg = np.squeeze(lr_patchs[i], axis=-1)
                        lrimg = np.pad(lrimg, ((64, 64), (64, 64)), constant_values=(255.0))
                        # opimg = cast_uint8(fake_patchs[i].numpy())
                        opimg = fake_patchs[i].numpy()
                        combine_imgs = np.concatenate((lrimg[:, :, np.newaxis], out_bicu[i], opimg, hr_patchs[i]), axis=1)
                    else:
                        lrimg = np.pad(lr_patchs[i], ((192, 192), (192, 192), (0, 0)), constant_values=(255.0))
                        # opimg = cast_uint8(fake_patchs[i].numpy())
                        opimg = fake_patchs[i].numpy()
                        combine_imgs = np.concatenate((lrimg, out_bicu[i], opimg, hr_patchs[i]), axis=1)
                    path = 'Training_outputs/id_{}_gan_{}_train_{}.png'.format(i+1, CONFIG.gen_model, epoch+1+resume_epoch)
                    tl.vis.save_image(combine_imgs, path)

        gen_model.save_weights('Checkpoints/GAN_{}_FINAL.h5'.format(CONFIG.gen_model))
        dis_model.save_weights('Checkpoints/DIS_{}_GAN_{}_FINAL.h5'.format(CONFIG.dis_model, CONFIG.gen_model))

    elif CONFIG.mode == 2:  ## Validation
        model = get_model('G', CONFIG.gen_model)
        model.load_weights('Checkpoints/GAN_{}_EPID_{}.h5'.format(CONFIG.gen_model, CONFIG.model_epoch))
        model.eval()  ## disable dropout, batch norm moving avg ...
        save_time = time.time()

        ## Reading Validation dataset
        lrimg_file_list = tl.files.load_file_list(path=CONFIG.dir_val_in, regx='.*.png', printable=False)
        hrimg_file_list = tl.files.load_file_list(path=CONFIG.dir_val_target, regx='.*.png', printable=False)
        lrimg_file_list.sort(key=tl.files.natural_keys)
        hrimg_file_list.sort(key=tl.files.natural_keys)
        lrimg_list = np.array(tl.vis.read_images(lrimg_file_list, path=CONFIG.dir_val_in, n_threads=32))
        hrimg_list = np.array(tl.vis.read_images(hrimg_file_list, path=CONFIG.dir_val_target, n_threads=32))
        if CONFIG.gen_model == 1:
            # Grayscale images need an explicit channel axis.
            lrimg_list = lrimg_list[:, :, :, np.newaxis]
            hrimg_list = hrimg_list[:, :, :, np.newaxis]

        bcimg_list = generate_bicubic_samples(lrimg_list, CONFIG)
        opimg_list = model([tf1.cast(lrimg_list, tf1.float32), tf1.cast(bcimg_list, tf1.float32)])
        opimg_list = opimg_list.numpy()

        bicubic_psnr, model_psnr = PSNR(hrimg_list, bcimg_list, opimg_list)
        bicubic_ssim, model_ssim = SSIM(hrimg_list, bcimg_list, opimg_list)

        # Save one comparison sheet per validation image.
        for i in range(lrimg_list.shape[0]):
            name = lrimg_file_list[i].split('/')[-1].split('.')[0]
            if CONFIG.gen_model == 1:
                lrimg = np.pad(lrimg_list[i], ((64, 64), (64, 64), (0, 0)), constant_values=(255.0))
            else:
                lrimg = np.pad(lrimg_list[i], ((192, 192), (192, 192), (0, 0)), constant_values=(255.0))
            combine_imgs = np.concatenate((lrimg, bcimg_list[i], opimg_list[i], hrimg_list[i]), axis=1)
            path = 'Validation_outputs/{}_gan_{}_val_{}.png'.format(name, CONFIG.gen_model, CONFIG.model_epoch)
            tl.vis.save_image(combine_imgs, path)

        print(np.stack((model_psnr, bicubic_psnr), axis=-1))
        print(np.stack((model_ssim, bicubic_ssim), axis=-1))
        print(np.subtract(model_psnr, bicubic_psnr))
        print('SUM(PSNR DIFF): {}'.format(np.sum(np.subtract(model_psnr, bicubic_psnr))))
        print('AVG MODEL PSNR: {}, AVG BICUBIC PSNR: {}'.format(np.sum(model_psnr)/lrimg_list.shape[0], np.sum(bicubic_psnr)/lrimg_list.shape[0]))
        print('SUM(SSIM DIFF): {}'.format(np.sum(np.subtract(model_ssim, bicubic_ssim))))
        print('AVG MODEL SSIM: {}, AVG BICUBIC SSIM: {}'.format(np.sum(model_ssim)/lrimg_list.shape[0], np.sum(bicubic_ssim)/lrimg_list.shape[0]))
        print((time.time()-save_time)/10)
def train(args):
    """Train the two-domain disentangling autoencoder with a domain discriminator.

    Args:
        args: parsed CLI arguments (paths, ``bs``, ``resize``, ``sep``, loss
            weights, iteration counts, logging intervals, ``load``/``out`` dirs).
    """
    if not os.path.exists(args.out):
        os.makedirs(args.out)

    _iter = 0
    domA_train, domB_train = get_train_dataset(args)
    size = args.resize // 64
    dim = 512

    e_common = E_common(args.sep, size, dim=dim)
    e_separate_A = E_separate_A(args.sep, size)
    e_separate_B = E_separate_B(args.sep, size)
    decoder = Decoder(size, dim=dim)
    disc = Disc(args.sep, size, dim=dim)

    A_label = torch.full((args.bs, ), 1)
    B_label = torch.full((args.bs, ), 0)
    zero_encoding = torch.full((args.bs, args.sep * (size) * (size)), 0)
    one_encoding = torch.full((args.bs, args.sep * (size) * (size)), 1)

    l1 = nn.L1Loss()
    bce = nn.BCELoss()

    if torch.cuda.is_available():
        e_common = e_common.cuda()
        e_separate_A = e_separate_A.cuda()
        e_separate_B = e_separate_B.cuda()
        decoder = decoder.cuda()
        disc = disc.cuda()
        A_label = A_label.cuda()
        B_label = B_label.cuda()
        zero_encoding = zero_encoding.cuda()
        one_encoding = one_encoding.cuda()
        l1 = l1.cuda()
        bce = bce.cuda()

    ae_params = list(e_common.parameters()) + list(
        e_separate_A.parameters()) + list(e_separate_B.parameters()) + list(
            decoder.parameters())
    ae_optimizer = optim.Adam(ae_params, lr=args.lr, betas=(0.5, 0.999))
    # BUG FIX: `disc.parameters()` returns a one-shot generator; after Adam
    # consumed it, the later `clip_grad_norm_(disc_params, 5)` iterated an
    # exhausted generator and silently clipped nothing. Materialize it once.
    disc_params = list(disc.parameters())
    disc_optimizer = optim.Adam(disc_params, lr=args.disclr, betas=(0.5, 0.999))

    if args.load != '':
        save_file = os.path.join(args.load, 'checkpoint')
        _iter = load_model(save_file, e_common, e_separate_A, e_separate_B,
                           decoder, ae_optimizer, disc, disc_optimizer)

    e_common = e_common.train()
    e_separate_A = e_separate_A.train()
    e_separate_B = e_separate_B.train()
    decoder = decoder.train()
    disc = disc.train()

    logger = Logger(args.out)
    print('Started training...')

    while True:
        domA_loader = torch.utils.data.DataLoader(domA_train, batch_size=args.bs,
                                                  shuffle=True, num_workers=2)
        domB_loader = torch.utils.data.DataLoader(domB_train, batch_size=args.bs,
                                                  shuffle=True, num_workers=2)
        if _iter >= args.iters:
            break

        for domA_img, domB_img in zip(domA_loader, domB_loader):
            # Skip partial final batches; label/encoding tensors are sized args.bs.
            if domA_img.size(0) != args.bs or domB_img.size(0) != args.bs:
                break

            domA_img = Variable(domA_img)
            domB_img = Variable(domB_img)
            if torch.cuda.is_available():
                domA_img = domA_img.cuda()
                domB_img = domB_img.cuda()
            domA_img = domA_img.view((-1, 3, args.resize, args.resize))
            domB_img = domB_img.view((-1, 3, args.resize, args.resize))

            # --- autoencoder step ---
            ae_optimizer.zero_grad()

            A_common = e_common(domA_img)
            A_separate_A = e_separate_A(domA_img)
            A_separate_B = e_separate_B(domA_img)
            if args.no_flag:
                A_encoding = torch.cat([A_common, A_separate_A, A_separate_A], dim=1)
            else:
                A_encoding = torch.cat([A_common, A_separate_A, zero_encoding], dim=1)

            B_common = e_common(domB_img)
            B_separate_A = e_separate_A(domB_img)
            B_separate_B = e_separate_B(domB_img)
            if args.one_encoding:
                B_encoding = torch.cat([B_common, B_separate_B, one_encoding], dim=1)
            elif args.no_flag:
                B_encoding = torch.cat([B_common, B_separate_B, B_separate_B], dim=1)
            else:
                B_encoding = torch.cat([B_common, zero_encoding, B_separate_B], dim=1)

            A_decoding = decoder(A_encoding)
            B_decoding = decoder(B_encoding)

            A_reconstruction_loss = l1(A_decoding, domA_img)
            B_reconstruction_loss = l1(B_decoding, domB_img)
            # Cross-domain separate encodings are pushed toward zero.
            A_separate_B_loss = l1(A_separate_B, zero_encoding)
            B_separate_A_loss = l1(B_separate_A, zero_encoding)

            logger.add_value('A_recon', A_reconstruction_loss)
            logger.add_value('B_recon', B_reconstruction_loss)
            logger.add_value('A_sep_B', A_separate_B_loss)
            logger.add_value('B_sep_A', B_separate_A_loss)

            loss = 0
            if args.reconweight > 0:
                loss += args.reconweight * (A_reconstruction_loss + B_reconstruction_loss)
            if args.zeroweight > 0:
                loss += args.zeroweight * (A_separate_B_loss + B_separate_A_loss)
            if args.discweight > 0:
                preds_A = disc(A_common)
                preds_B = disc(B_common)
                # Adversarial term: make common encodings of both domains
                # indistinguishable (both predicted as the B label).
                distribution_adverserial_loss = args.discweight * \
                    (bce(preds_A, B_label) + bce(preds_B, B_label))
                logger.add_value('distribution_adverserial', distribution_adverserial_loss)
                loss += distribution_adverserial_loss

            loss.backward()
            torch.nn.utils.clip_grad_norm_(ae_params, 5)
            ae_optimizer.step()

            # --- discriminator step ---
            if args.discweight > 0:
                disc_optimizer.zero_grad()
                A_common = e_common(domA_img)
                B_common = e_common(domB_img)
                disc_A = disc(A_common)
                disc_B = disc(B_common)
                loss = bce(disc_A, A_label) + bce(disc_B, B_label)
                logger.add_value('dist_disc', loss)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(disc_params, 5)
                disc_optimizer.step()

            if _iter % args.progress_iter == 0:
                print('Outfile: %s <<>> Iteration %d' % (args.out, _iter))

            if _iter % args.log_iter == 0:
                logger.log(_iter)
                logger.reset()

            if _iter % args.display_iter == 0:
                e_common = e_common.eval()
                e_separate_A = e_separate_A.eval()
                e_separate_B = e_separate_B.eval()
                decoder = decoder.eval()
                # BUG FIX: removed a leftover debugger breakpoint
                # (`import pdb; pdb.set_trace()`) that froze training here.
                save_imgs(args, e_common, e_separate_A, e_separate_B, decoder,
                          _iter, size=size, BtoA=True)
                save_imgs(args, e_common, e_separate_A, e_separate_B, decoder,
                          _iter, size=size, BtoA=False)
                save_stripped_imgs(args, e_common, e_separate_A, e_separate_B,
                                   decoder, _iter, size=size, A=True)
                save_stripped_imgs(args, e_common, e_separate_A, e_separate_B,
                                   decoder, _iter, size=size, A=False)
                e_common = e_common.train()
                e_separate_A = e_separate_A.train()
                e_separate_B = e_separate_B.train()
                decoder = decoder.train()

            if _iter % args.save_iter == 0:
                save_file = os.path.join(args.out, 'checkpoint')
                save_model(save_file, e_common, e_separate_A, e_separate_B,
                           decoder, ae_optimizer, disc, disc_optimizer, _iter)

            _iter += 1
def main():
    """Train a triplet-loss face-embedding model, with optional W&B logging,
    periodic validation, and checkpointing every 5 epochs."""
    print(colored('Configuration', 'blue'))
    args = parser.parse_args()
    p = create_config(args.config)

    global log_wandb
    log_wandb = args.wandb

    if not os.path.exists(args.checkpoint_dir):
        os.makedirs(args.checkpoint_dir)
    seed_init(args.seed)

    if log_wandb:
        print(colored('Using Wandb', 'blue'))
        now = datetime.now().strftime("%d-%b %H:%M")
        wandb.init(project="Face-Unlock", name=f"Run_{now}")
        config = wandb.config
        config.batch_size = p.batch_size
        config.epochs = p.epochs
        config.learning_rate = p.optimizer_kwargs.lr
        config.scheduler = p.scheduler
        config.fc_layer_size = p.fc_layer_size
        config.train_dataset = "LFW"
        config.architechture = p.backbone

    # dataset
    print(colored('Get dataset and dataloaders', 'blue'))
    train_dataset = get_train_dataset(p, args.data_dir)
    print(train_dataset)
    val_dataset = get_val_dataset(p, args.data_dir)
    val_loader = get_val_loader(p, val_dataset)

    # model
    print(colored('Get model', 'blue'))
    model = get_model(p)
    model.to(DEVICE)
    print(model)

    # Optimizer
    print(colored('Get optimizer', 'blue'))
    optimizer = get_optimizer(p, model)
    print(optimizer)

    # scheduler
    print(colored('Get scheduler', 'blue'))
    scheduler = get_scheduler(p, optimizer)
    print(scheduler)

    # Loss function
    criterion = triplet_loss.batch_hard_triplet_loss

    # checkpoint
    if args.resume is not None and os.path.exists(args.resume):
        print(colored('Loading checkpoint {} ...'.format(args.resume), 'blue'))
        checkpoint = torch.load(args.resume)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        print(
            colored(
                'Resuming from epoch {} with lr: {}'.format(
                    start_epoch,
                    optimizer.state_dict()["param_groups"][0]['lr']), 'blue'))
    else:
        # BUG FIX: dropped a pointless `.format(args.resume)` on a message
        # that contains no placeholder.
        print(colored('No checkpoint. Training from scratch.', 'blue'))
        start_epoch = 0

    for epoch in range(start_epoch, p.epochs):
        epoch_loss = train(p, train_dataset, model, criterion, optimizer)
        scheduler.step()
        print(f"Epoch: {epoch} Loss: {epoch_loss:.3f}")
        if log_wandb:
            wandb.log(
                {
                    "epoch_loss": epoch_loss,
                    "lr": optimizer.state_dict()["param_groups"][0]['lr']
                },
                commit=True)

        # Validate every 5 epochs; the checkpoint below reuses its outputs.
        if epoch % 5 == 0:
            tar, precision, accuracy, far, best_threshold = validate(
                model, val_loader)
            print(
                "Epoch: {}\nBest Threshold: {}\nTrue Acceptance: {:.3f}\nFalse Acceptance: {:.3f}\nPrecision: {:.3f}\nAccuracy: {:.3f}"
                .format(epoch, best_threshold, tar, far, precision, accuracy))

        if epoch % 5 == 0:
            # Save model checkpoint
            state = {
                'epoch': epoch + 1,
                'embedding_dimension': p.fc_layer_size,
                'batch_size_training': p.batch_size,
                'model_state_dict': model.state_dict(),
                'model_architecture': p.backbone,
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'best_distance_threshold': best_threshold,
                'accuracy': accuracy
            }
            now = datetime.now().strftime("%d-%b %H:%M")
            path = os.path.join(
                args.checkpoint_dir,
                'model_{}_triplet_epoch_{}_{}.pt'.format(
                    p.backbone, epoch, now))
            # BUG FIX: corrected "checkoint" typo in the user-facing message.
            print(colored(f'Saving checkpoint at {path}', 'blue'))
            torch.save(state, path)
def main(argv):
    """Build the compressed-sensing model graph, then either train it or
    reconstruct images from the latest checkpoint, depending on FLAGS.phase."""
    del argv  # unused

    utils.make_output_dir(FLAGS.output_dir)
    data_processor = utils.DataProcessor()
    images = utils.get_train_dataset(data_processor, FLAGS.dataset,
                                     FLAGS.batch_size)
    # BUG FIX: '%d' truncated fractional learning rates (e.g. 1e-4) to 0 in
    # the log; use '%f' so the actual value is reported.
    logging.info('Learning rate: %f', FLAGS.learning_rate)

    # Construct optimizers.
    optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)

    # Create the networks and models.
    generator = utils.get_generator(FLAGS.dataset)
    metric_net = utils.get_metric_net(FLAGS.dataset, FLAGS.num_measurements)
    model = cs.CS(metric_net, generator, FLAGS.num_z_iters, FLAGS.z_step_size,
                  FLAGS.z_project_method)
    prior = utils.make_prior(FLAGS.num_latents)
    generator_inputs = prior.sample(FLAGS.batch_size)

    model_output = model.connect(images, generator_inputs)
    optimization_components = model_output.optimization_components
    debug_ops = model_output.debug_ops
    reconstructions, _ = utils.optimise_and_sample(generator_inputs, model,
                                                   images, is_training=False)

    global_step = tf.train.get_or_create_global_step()
    update_op = optimizer.minimize(optimization_components.loss,
                                   var_list=optimization_components.vars,
                                   global_step=global_step)

    sample_exporter = file_utils.FileExporter(
        os.path.join(FLAGS.output_dir, 'reconstructions'))

    # Hooks.
    debug_ops['it'] = global_step
    # Abort training on Nans.
    nan_hook = tf.train.NanTensorHook(optimization_components.loss)
    # Step counter.
    step_counter_hook = tf.train.StepCounterHook()
    checkpoint_saver_hook = tf.train.CheckpointSaverHook(
        checkpoint_dir=utils.get_ckpt_dir(FLAGS.output_dir), save_secs=10 * 60)
    loss_summary_saver_hook = tf.train.SummarySaverHook(
        save_steps=FLAGS.summary_every_step,
        output_dir=os.path.join(FLAGS.output_dir, 'summaries'),
        summary_op=utils.get_summaries(debug_ops))
    hooks = [
        checkpoint_saver_hook, nan_hook, step_counter_hook,
        loss_summary_saver_hook
    ]

    if FLAGS.phase == 'train':
        # Start training.
        with tf.train.MonitoredSession(hooks=hooks) as sess:
            logging.info('starting training')
            for i in range(FLAGS.num_training_iterations):
                sess.run(update_op)
                if i % FLAGS.export_every == 0:
                    reconstructions_np, data_np = sess.run(
                        [reconstructions, images])
                    # Create an object which gets data and does the processing.
                    data_np = data_processor.postprocess(data_np)
                    reconstructions_np = data_processor.postprocess(
                        reconstructions_np)
                    sample_exporter.save(reconstructions_np, 'reconstructions')
                    sample_exporter.save(data_np, 'data')
    else:
        saver = tf.train.Saver()
        # Start testing
        with tf.Session() as sess:
            init_op = tf.global_variables_initializer()
            sess.run(init_op)
            print(" [*] Reading checkpoint...")
            checkpoint_dir = utils.get_ckpt_dir(FLAGS.output_dir)
            ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
            if ckpt and ckpt.model_checkpoint_path:
                ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
                saver.restore(sess, os.path.join(checkpoint_dir, ckpt_name))
            reconstructions_np, data_np = sess.run([reconstructions, images])
            # Create an object which gets data and does the processing.
            data_np = data_processor.postprocess(data_np)
            reconstructions_np = data_processor.postprocess(reconstructions_np)
            sample_exporter.save(reconstructions_np, 'reconstructions')
            sample_exporter.save(data_np, 'data')
def main(cfg):
    """Run one training fold: build data/model/optimizer from ``cfg``, train
    for ``cfg.epochs`` epochs, evaluate periodically, and keep the checkpoint
    with the lowest validation loss."""
    os.makedirs(str(cfg.output_dir + f"/fold{cfg.fold}/"), exist_ok=True)

    # set random seed, works when use all data to train
    if cfg.seed < 0:
        cfg.seed = np.random.randint(1_000_000)
    set_seed(cfg.seed)

    # set dataset, dataloader
    train = pd.read_csv(cfg.train_df)
    # fold == -1 means "train on everything"; fold 0 is then held out only
    # as a nominal validation set.
    if cfg.fold == -1:
        val_df = train[train["fold"] == 0]
    else:
        val_df = train[train["fold"] == cfg.fold]
    train_df = train[train["fold"] != cfg.fold]

    train_dataset = get_train_dataset(train_df, cfg)
    val_dataset = get_val_dataset(val_df, cfg)
    train_dataloader = get_train_dataloader(train_dataset, cfg)
    val_dataloader = get_val_dataloader(val_dataset, cfg)
    if cfg.train_val is True:
        # Optionally also evaluate on the training split (val transforms).
        train_val_dataset = get_val_dataset(train_df, cfg)
        train_val_dataloader = get_val_dataloader(train_val_dataset, cfg)

    to_device_transform = ToDeviced(
        keys=("input", "target", "mask", "is_annotated"), device=cfg.device)
    cfg.to_device_transform = to_device_transform

    # set model
    model = RanzcrNet(cfg)
    model.to(cfg.device)

    # set optimizer, lr scheduler
    total_steps = len(train_dataset)
    optimizer = get_optimizer(model, cfg)
    scheduler = get_scheduler(cfg, optimizer, total_steps)

    # set other tools
    scaler = GradScaler() if cfg.mixed_precision else None
    writer = SummaryWriter(str(cfg.output_dir + f"/fold{cfg.fold}/"))

    # train and val loop
    step = 0
    i = 0
    best_val_loss = np.inf
    optimizer.zero_grad()

    for epoch in range(cfg.epochs):
        print("EPOCH:", epoch)
        gc.collect()

        if cfg.train is True:
            run_train(
                model=model,
                train_dataloader=train_dataloader,
                optimizer=optimizer,
                scheduler=scheduler,
                cfg=cfg,
                scaler=scaler,
                writer=writer,
                epoch=epoch,
                iteration=i,
                step=step,
            )

        if (epoch + 1) % cfg.eval_epochs == 0 or (epoch + 1) == cfg.epochs:
            val_loss = run_eval(
                model=model,
                val_dataloader=val_dataloader,
                cfg=cfg,
                writer=writer,
                epoch=epoch,
            )

            if cfg.train_val is True:
                if (epoch + 1) % cfg.eval_train_epochs == 0 or (epoch + 1) == cfg.epochs:
                    train_val_loss = run_eval(model, train_val_dataloader, cfg,
                                              writer, epoch)
                    print(f"train_val_loss {train_val_loss:.5}")

            # Keep only the best-so-far checkpoint by validation loss.
            if val_loss < best_val_loss:
                print(
                    f"SAVING CHECKPOINT: val_loss {best_val_loss:.5} -> {val_loss:.5}"
                )
                best_val_loss = val_loss
                checkpoint = create_checkpoint(
                    model,
                    optimizer,
                    epoch,
                    scheduler=scheduler,
                    scaler=scaler,
                )
                torch.save(
                    checkpoint,
                    f"{cfg.output_dir}/fold{cfg.fold}/checkpoint_best_seed{cfg.seed}.pth",
                )