loss_g.backward() optimizer_g.step() print("===> Epoch[{}]({}/{}): Loss_D: {:.4f} Loss_G: {:.4f}".format( epoch, iteration, len(training_data_loader), loss_d.item(), loss_g.item())) if iteration % 100 == 0 or iteration == 1: I = real_a.detach().cpu().numpy().transpose((0, 2, 3, 1)) I = I[:, :, :, 0:1] im = grid_of_images_default(I) imsave(f"input_{epoch:05d}.png", im) R = real_b.detach().cpu().numpy().transpose((0, 2, 3, 1)) R = (R + 1) / 2 im = grid_of_images_default(R) imsave(f"real_{epoch:05d}.png", im) F = fake_b.detach().cpu().numpy().transpose((0, 2, 3, 1)) F = (F + 1) / 2 im = grid_of_images_default(F) imsave(f"fake_{epoch:05d}.png", im) net_g_model_out_path = "netG_model_epoch_{}.pth".format(epoch) net_d_model_out_path = "netD_model_epoch_{}.pth".format(epoch) torch.save(net_g, net_g_model_out_path) torch.save(net_d, net_d_model_out_path) print("Checkpoint saved to {}".format("checkpoint" + opt.dataset)) update_learning_rate(net_g_scheduler, optimizer_g) update_learning_rate(net_d_scheduler, optimizer_d)
def train(opt):
    """Train the two-domain encoder / decoder / latent-translator GAN.

    For each domain the function builds an encoder, a decoder, a latent
    translator and a discriminator, then alternates one generator update and
    one discriminator update per batch.  Losses are appended to
    ``records.txt``, preview images are written to ``images/`` and the
    ``state_dict`` of every network is saved every ``opt.ckp_freq`` epochs,
    all below ``os.path.join(opt.checkpoints, opt.name)``.

    Args:
        opt: Parsed options object.  Attributes read here: ``gpu_id``,
            ``input_nc``, ``output_nc``, ``ngf``, ``ndf``, ``n_layers``,
            ``norm_type``, ``no_dropout``, ``init_type``, ``gan_mode``,
            ``lr``, ``beta1``, ``beta2``, ``checkpoints``, ``name``,
            ``pool_size``, ``epoch_count``, ``niter``, ``niter_decay``,
            ``epoch``, ``log_freq``, ``vis_freq``, ``ckp_freq`` and the loss
            weights ``lambda_gan``, ``lambda_idt``, ``lambda_zid``,
            ``lambda_ctc``, ``lambda_zcyc``.  It is also forwarded to the
            data loader, ``get_scheduler`` and ``update_learning_rate``.

    Returns:
        None.
    """
    #### device
    # A negative gpu_id selects the CPU.
    device = torch.device(
        'cuda:{}'.format(opt.gpu_id) if opt.gpu_id >= 0 else 'cpu')

    #### dataset
    data_loader = UnAlignedDataLoader()
    data_loader.initialize(opt)
    data_set = data_loader.load_data()
    print("The number of training images = %d." % len(data_set))

    #### initialize models
    ## declaration
    E_a2Zb = Encoder(input_nc=opt.input_nc,
                     ngf=opt.ngf,
                     norm_type=opt.norm_type,
                     use_dropout=not opt.no_dropout,
                     n_blocks=9)
    G_Zb2b = Decoder(output_nc=opt.output_nc,
                     ngf=opt.ngf,
                     norm_type=opt.norm_type)
    T_Zb2Za = LatentTranslator(n_channels=256,
                               norm_type=opt.norm_type,
                               use_dropout=not opt.no_dropout)
    D_b = Discriminator(input_nc=opt.input_nc,
                        ndf=opt.ndf,
                        n_layers=opt.n_layers,
                        norm_type=opt.norm_type)

    E_b2Za = Encoder(input_nc=opt.input_nc,
                     ngf=opt.ngf,
                     norm_type=opt.norm_type,
                     use_dropout=not opt.no_dropout,
                     n_blocks=9)
    G_Za2a = Decoder(output_nc=opt.output_nc,
                     ngf=opt.ngf,
                     norm_type=opt.norm_type)
    T_Za2Zb = LatentTranslator(n_channels=256,
                               norm_type=opt.norm_type,
                               use_dropout=not opt.no_dropout)
    D_a = Discriminator(input_nc=opt.input_nc,
                        ndf=opt.ndf,
                        n_layers=opt.n_layers,
                        norm_type=opt.norm_type)

    ## initialization
    E_a2Zb = init_net(E_a2Zb, init_type=opt.init_type).to(device)
    G_Zb2b = init_net(G_Zb2b, init_type=opt.init_type).to(device)
    T_Zb2Za = init_net(T_Zb2Za, init_type=opt.init_type).to(device)
    D_b = init_net(D_b, init_type=opt.init_type).to(device)

    E_b2Za = init_net(E_b2Za, init_type=opt.init_type).to(device)
    G_Za2a = init_net(G_Za2a, init_type=opt.init_type).to(device)
    T_Za2Zb = init_net(T_Za2Zb, init_type=opt.init_type).to(device)
    D_a = init_net(D_a, init_type=opt.init_type).to(device)

    # Checkpoint file tag -> network, in the order the files are written.
    networks = {
        'E_a2b': E_a2Zb,
        'G_b': G_Zb2b,
        'T_b2a': T_Zb2Za,
        'D_b': D_b,
        'E_b2a': E_b2Za,
        'G_a': G_Za2a,
        'T_a2b': T_Za2Zb,
        'D_a': D_a,
    }
    print(
        "+------------------------------------------------------+\nFinish initializing networks."
    )

    #### optimizer and criterion
    ## criterion
    criterionGAN = GANLoss(opt.gan_mode).to(device)
    criterionZId = nn.L1Loss()
    criterionIdt = nn.L1Loss()
    criterionCTC = nn.L1Loss()
    criterionZCyc = nn.L1Loss()

    ## optimizer
    # One optimizer drives every encoder / decoder / translator, a second
    # one drives both discriminators.
    optimizer_G = torch.optim.Adam(
        itertools.chain(E_a2Zb.parameters(), G_Zb2b.parameters(),
                        T_Zb2Za.parameters(), E_b2Za.parameters(),
                        G_Za2a.parameters(), T_Za2Zb.parameters()),
        lr=opt.lr,
        betas=(opt.beta1, opt.beta2))
    optimizer_D = torch.optim.Adam(
        itertools.chain(D_a.parameters(), D_b.parameters()),
        lr=opt.lr,
        betas=(opt.beta1, opt.beta2))

    ## scheduler
    scheduler = [
        get_scheduler(optimizer_G, opt),
        get_scheduler(optimizer_D, opt)
    ]
    print(
        "+------------------------------------------------------+\nFinish initializing the optimizers and criterions."
    )

    #### global variables
    checkpoints_pth = os.path.join(opt.checkpoints, opt.name)
    # makedirs creates the run directory and its 'images' sub-directory in
    # one call and is a no-op when they already exist (e.g. on a re-run).
    os.makedirs(os.path.join(checkpoints_pth, 'images'), exist_ok=True)

    loss_names = [
        'GAN_A', 'Adv_A', 'Idt_A', 'CTC_A', 'ZId_A', 'ZCyc_A', 'GAN_B',
        'Adv_B', 'Idt_B', 'CTC_B', 'ZId_B', 'ZCyc_B'
    ]
    # image buffers storing previously generated images
    fake_A_pool = ImagePool(opt.pool_size)
    fake_B_pool = ImagePool(opt.pool_size)
    print(
        "+------------------------------------------------------+\nFinish preparing the other works."
    )
    print(
        "+------------------------------------------------------+\nNow training is beginning .."
    )

    #### training
    cur_iter = 0
    # The context manager guarantees the record file is closed even when an
    # iteration raises.
    with open(os.path.join(checkpoints_pth, 'records.txt'),
              'w',
              encoding='utf-8') as record_fh:
        for epoch in range(opt.epoch_count, opt.niter + opt.niter_decay + 1):
            epoch_start_time = time.time()  # timer for entire epoch
            for i, data in enumerate(data_set):
                ## setup inputs
                real_A = data['A'].to(device)
                real_B = data['B'].to(device)

                ## forward
                # image cycle / GAN
                latent_B = E_a2Zb(real_A)  #-> a -> Zb : E_a2b(a)
                fake_B = G_Zb2b(latent_B)  #-> Zb -> b' : G_b(E_a2b(a))
                latent_A = E_b2Za(real_B)  #-> b -> Za : E_b2a(b)
                fake_A = G_Za2a(latent_A)  #-> Za -> a' : G_a(E_b2a(b))

                # Idt
                idt_latent_A = E_b2Za(real_A)  #-> a -> Za : E_b2a(a)
                idt_A = G_Za2a(idt_latent_A)  #-> Za -> idt_a : G_a(E_b2a(a))
                idt_latent_B = E_a2Zb(real_B)  #-> b -> Zb : E_a2b(b)
                idt_B = G_Zb2b(idt_latent_B)  #-> Zb -> idt_b : G_b(E_a2b(b))

                # ZIdt
                T_latent_A = T_Zb2Za(latent_B)  #-> Zb -> Za'' : T_b2a(E_a2b(a))
                T_rec_A = G_Za2a(T_latent_A)  #-> Za'' -> a'' : G_a(T_b2a(E_a2b(a)))
                T_latent_B = T_Za2Zb(latent_A)  #-> Za -> Zb'' : T_a2b(E_b2a(b))
                T_rec_B = G_Zb2b(T_latent_B)  #-> Zb'' -> b'' : G_b(T_a2b(E_b2a(b)))

                # CTC
                T_idt_latent_B = T_Za2Zb(idt_latent_A)  #-> a -> T_a2b(E_b2a(a))
                T_idt_latent_A = T_Zb2Za(idt_latent_B)  #-> b -> T_b2a(E_a2b(b))

                # ZCyc
                TT_latent_B = T_Za2Zb(T_latent_A)  #-> T_a2b(T_b2a(E_a2b(a)))
                TT_latent_A = T_Zb2Za(T_latent_B)  #-> T_b2a(T_a2b(E_b2a(b)))

                ### optimize parameters
                ## Generator updating
                # discriminators must not accumulate gradients here
                set_requires_grad([D_b, D_a], False)
                optimizer_G.zero_grad()

                # GAN loss
                loss_G_A = criterionGAN(D_b(fake_B), True)
                loss_G_B = criterionGAN(D_a(fake_A), True)
                loss_GAN = loss_G_A + loss_G_B

                # Idt loss
                loss_idt_A = criterionIdt(idt_A, real_A)
                loss_idt_B = criterionIdt(idt_B, real_B)
                loss_Idt = loss_idt_A + loss_idt_B

                # Latent cross-identity loss
                loss_Zid_A = criterionZId(T_rec_A, real_A)
                loss_Zid_B = criterionZId(T_rec_B, real_B)
                loss_Zid = loss_Zid_A + loss_Zid_B

                # Latent cross-translation consistency
                loss_CTC_A = criterionCTC(T_idt_latent_A, latent_A)
                loss_CTC_B = criterionCTC(T_idt_latent_B, latent_B)
                loss_CTC = loss_CTC_B + loss_CTC_A

                # Latent cycle consistency
                loss_ZCyc_A = criterionZCyc(TT_latent_A, latent_A)
                loss_ZCyc_B = criterionZCyc(TT_latent_B, latent_B)
                loss_ZCyc = loss_ZCyc_B + loss_ZCyc_A

                loss_G = (opt.lambda_gan * loss_GAN
                          + opt.lambda_idt * loss_Idt
                          + opt.lambda_zid * loss_Zid
                          + opt.lambda_ctc * loss_CTC
                          + opt.lambda_zcyc * loss_ZCyc)

                # backward and gradient updating
                loss_G.backward()
                optimizer_G.step()

                ## Discriminator updating
                set_requires_grad([D_b, D_a], True)
                optimizer_D.zero_grad()

                # backward D_b
                # The generator graph was freed by loss_G.backward(); the
                # pooled fake is detached so the discriminator loss never
                # tries to backpropagate through it, whatever the pool
                # returns.
                fake_B_ = fake_B_pool.query(fake_B)
                pred_real_B = D_b(real_B)
                loss_D_real_B = criterionGAN(pred_real_B, True)
                pred_fake_B = D_b(fake_B_.detach())
                loss_D_fake_B = criterionGAN(pred_fake_B, False)
                loss_D_B = (loss_D_real_B + loss_D_fake_B) * 0.5
                loss_D_B.backward()

                # backward D_a
                fake_A_ = fake_A_pool.query(fake_A)
                pred_real_A = D_a(real_A)
                loss_D_real_A = criterionGAN(pred_real_A, True)
                pred_fake_A = D_a(fake_A_.detach())
                loss_D_fake_A = criterionGAN(pred_fake_A, False)
                loss_D_A = (loss_D_real_A + loss_D_fake_A) * 0.5
                loss_D_A.backward()

                # update the gradients
                optimizer_D.step()

                ### validate here, both qualitatively and quantitatively
                ## record the losses
                if cur_iter % opt.log_freq == 0:
                    # same order as loss_names
                    losses = [
                        loss_G_A.item(),
                        loss_D_A.item(),
                        loss_idt_A.item(),
                        loss_CTC_A.item(),
                        loss_Zid_A.item(),
                        loss_ZCyc_A.item(),
                        loss_G_B.item(),
                        loss_D_B.item(),
                        loss_idt_B.item(),
                        loss_CTC_B.item(),
                        loss_Zid_B.item(),
                        loss_ZCyc_B.item()
                    ]
                    # record: one space-separated line per logged iteration
                    record_fh.write(
                        ' '.join('{}'.format(loss) for loss in losses) + '\n')

                    # print out: one table for the A losses, one for the B
                    # losses
                    print('Epoch: %3d/%3dIter: %9d--------------------------+' %
                          (epoch, opt.epoch, i))
                    half = len(loss_names) // 2
                    for field_names, from_front in ((loss_names[:half], True),
                                                    (loss_names[half:], False)):
                        table = PrettyTable(field_names=field_names)
                        for l_n in field_names:
                            table.align[l_n] = 'm'
                        if from_front:
                            table.add_row(losses[:len(field_names)])
                        else:
                            table.add_row(losses[-len(field_names):])
                        print(table.get_string(reversesort=True))

                ## visualize
                if cur_iter % opt.vis_freq == 0:
                    # (subplot position, title, batch) for the 2x4 preview
                    panels = (
                        (241, 'real_A', real_A),
                        (242, 'fake_B', fake_B),
                        (243, 'idt_A', idt_A),
                        (244, 'L_idt_A', T_rec_A),
                        (245, 'real_B', real_B),
                        (246, 'fake_A', fake_A),
                        (247, 'idt_B', idt_B),
                        (248, 'L_idt_B', T_rec_B),
                    )
                    for position, title, batch in panels:
                        plt.subplot(position)
                        plt.title(title)
                        # only the first sample of each batch is drawn
                        plt.imshow(
                            tensor2image_RGB(batch.detach().cpu()[0, ...]))
                    plt.savefig(
                        os.path.join(checkpoints_pth, 'images',
                                     '%03d_%09d.jpg' % (epoch, i)))
                    # start the next snapshot from an empty figure instead
                    # of stacking images onto the same axes
                    plt.clf()

                cur_iter += 1

            ## one epoch finished: update the learning rate
            update_learning_rate(schedulers=scheduler,
                                 opt=opt,
                                 optimizer=optimizer_D)

            ## save the model
            if epoch % opt.ckp_freq == 0:
                # state_dicts are written from CPU copies of the weights;
                # Module.cpu() / Module.to() move the module in place.
                if opt.gpu_id >= 0:
                    for network in networks.values():
                        network.cpu()

                # NOTE(review): '%3d' pads the epoch with spaces, so file
                # names contain blanks; kept as-is so existing loaders still
                # find them.
                for tag, network in networks.items():
                    torch.save(
                        network.state_dict(),
                        os.path.join(checkpoints_pth,
                                     'epoch_%3d-%s.pth' % (epoch, tag)))

                if opt.gpu_id >= 0:
                    for network in networks.values():
                        network.to(device)
                print("+Successfully saving models in epoch: %3d.-------------+" %
                      epoch)

    print("≧◔◡◔≦ Congratulation! Finishing the training!")
def main():
    """Train a 3D segmentation network and log Dice scores per epoch.

    Reads the options from ``Options().parse()``, pairs ``image*.nii`` and
    ``label*.nii`` files from the ``train`` / ``val`` / ``test`` sub-folders,
    trains the network selected by ``opt.network`` (``'nnunet'`` or
    ``'unetr'``) on CUDA, and after every epoch evaluates Dice on the
    validation, training and test loaders.  The weights with the best
    validation Dice are written to ``best_metric_model.pth`` and scalars /
    images are logged through a ``SummaryWriter``.

    Raises:
        ValueError: if ``opt.network`` is neither ``'nnunet'`` nor
            ``'unetr'``.
    """
    opt = Options().parse()
    # monai.config.print_config()
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # check gpus
    if opt.gpu_ids != '-1':
        num_gpus = len(opt.gpu_ids.split(','))
    else:
        num_gpus = 0
    print('number of GPU:', num_gpus)

    # Data loader creation
    # train images (the *_for_dice lists stay un-augmented)
    train_images = _sorted_files(opt.images_folder, 'train', 'image*.nii')
    train_segs = _sorted_files(opt.labels_folder, 'train', 'label*.nii')
    train_images_for_dice = _sorted_files(opt.images_folder, 'train', 'image*.nii')
    train_segs_for_dice = _sorted_files(opt.labels_folder, 'train', 'label*.nii')

    # validation images
    val_images = _sorted_files(opt.images_folder, 'val', 'image*.nii')
    val_segs = _sorted_files(opt.labels_folder, 'val', 'label*.nii')

    # test images
    test_images = _sorted_files(opt.images_folder, 'test', 'image*.nii')
    test_segs = _sorted_files(opt.labels_folder, 'test', 'label*.nii')

    # augment the data list for training: every pass appends the list to
    # itself, i.e. doubles it
    for _ in range(int(opt.increase_factor_data)):
        train_images.extend(train_images)
        train_segs.extend(train_segs)

    print('Number of training patches per epoch:', len(train_images))
    print('Number of training images per epoch:', len(train_images_for_dice))
    print('Number of validation images per epoch:', len(val_images))
    print('Number of test images per epoch:', len(test_images))

    # Creation of data directories for data_loader
    train_dicts = _image_label_dicts(train_images, train_segs)
    train_dice_dicts = _image_label_dicts(train_images_for_dice, train_segs_for_dice)
    val_dicts = _image_label_dicts(val_images, val_segs)
    test_dicts = _image_label_dicts(test_images, test_segs)

    # Transforms list
    train_transforms = Compose(_build_train_transforms(opt))
    val_transforms = Compose(_build_val_transforms(opt))

    # create a training data loader
    check_train = monai.data.Dataset(data=train_dicts, transform=train_transforms)
    train_loader = DataLoader(check_train, batch_size=opt.batch_size, shuffle=True,
                              collate_fn=list_data_collate, num_workers=opt.workers,
                              pin_memory=False)

    # create a training_dice data loader
    check_train_dice = monai.data.Dataset(data=train_dice_dicts, transform=val_transforms)
    train_dice_loader = DataLoader(check_train_dice, batch_size=1, num_workers=opt.workers,
                                   collate_fn=list_data_collate, pin_memory=False)

    # create a validation data loader
    check_val = monai.data.Dataset(data=val_dicts, transform=val_transforms)
    val_loader = DataLoader(check_val, batch_size=1, num_workers=opt.workers,
                            collate_fn=list_data_collate, pin_memory=False)

    # create a test data loader
    check_test = monai.data.Dataset(data=test_dicts, transform=val_transforms)
    test_loader = DataLoader(check_test, batch_size=1, num_workers=opt.workers,
                             collate_fn=list_data_collate, pin_memory=False)

    # build the network
    # Strings are compared with ==; identity (`is`) is not guaranteed to hold
    # for option values parsed at run time.
    if opt.network == 'nnunet':
        net = build_net()  # nn build_net
    elif opt.network == 'unetr':
        net = build_UNETR()  # UneTR
    else:
        raise ValueError(
            "unsupported opt.network {!r}; expected 'nnunet' or 'unetr'".format(opt.network))
    net.cuda()

    if num_gpus > 1:
        net = torch.nn.DataParallel(net)

    if opt.preload is not None:
        net.load_state_dict(torch.load(opt.preload))

    dice_metric = DiceMetric(include_background=True, reduction="mean", get_not_nans=False)
    post_trans = Compose([EnsureType(), Activations(sigmoid=True), AsDiscrete(threshold_values=True)])

    loss_function = monai.losses.DiceCELoss(sigmoid=True)
    torch.backends.cudnn.benchmark = opt.benchmark

    # Only the nnunet configuration uses a learning-rate scheduler.
    use_scheduler = opt.network == 'nnunet'
    if use_scheduler:
        optim = torch.optim.SGD(net.parameters(), lr=opt.lr, momentum=0.99,
                                weight_decay=3e-5, nesterov=True,)
        net_scheduler = torch.optim.lr_scheduler.LambdaLR(
            optim, lr_lambda=lambda epoch: (1 - epoch / opt.epochs) ** 0.9)
    else:
        optim = torch.optim.AdamW(net.parameters(), lr=1e-4, weight_decay=1e-5)

    def plot_dice(images_loader):
        """Run sliding-window inference over a loader and return its Dice.

        Returns ``(metric, images, labels, outputs)`` where the last three
        are those of the final batch (``None`` if the loader yields nothing).
        """
        last_images = None
        last_labels = None
        last_outputs = None
        for data in images_loader:
            last_images, last_labels = data["image"].cuda(), data["label"].cuda()
            roi_size = opt.patch_size
            sw_batch_size = 4
            last_outputs = sliding_window_inference(last_images, roi_size, sw_batch_size, net)
            last_outputs = [post_trans(i) for i in decollate_batch(last_outputs)]
            dice_metric(y_pred=last_outputs, y=last_labels)

        # aggregate the final mean dice result
        metric = dice_metric.aggregate().item()
        # reset the status for next validation round
        dice_metric.reset()
        return metric, last_images, last_labels, last_outputs

    # start a typical PyTorch training
    val_interval = 1
    best_metric = -1
    best_metric_epoch = -1
    epoch_loss_values = list()
    writer = SummaryWriter()
    # nominal number of steps per epoch; also the stride of the global step
    epoch_len = len(check_train) // train_loader.batch_size
    for epoch in range(opt.epochs):
        print("-" * 10)
        print(f"epoch {epoch + 1}/{opt.epochs}")
        net.train()
        epoch_loss = 0
        step = 0
        for batch_data in train_loader:
            step += 1
            inputs, labels = batch_data["image"].cuda(), batch_data["label"].cuda()
            optim.zero_grad()
            outputs = net(inputs)
            loss = loss_function(outputs, labels)
            loss.backward()
            optim.step()
            epoch_loss += loss.item()
            print(f"{step}/{epoch_len}, train_loss: {loss.item():.4f}")
            writer.add_scalar("train_loss", loss.item(), epoch_len * epoch + step)
        epoch_loss /= step
        epoch_loss_values.append(epoch_loss)
        print(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}")
        if use_scheduler:
            update_learning_rate(net_scheduler, optim)

        if (epoch + 1) % val_interval == 0:
            net.eval()
            with torch.no_grad():
                metric, val_images, val_labels, val_outputs = plot_dice(val_loader)

                # Save best model
                if metric > best_metric:
                    best_metric = metric
                    best_metric_epoch = epoch + 1
                    torch.save(net.state_dict(), "best_metric_model.pth")
                    print("saved new best metric model")

                metric_train, train_images, train_labels, train_outputs = plot_dice(train_dice_loader)
                metric_test, test_images, test_labels, test_outputs = plot_dice(test_loader)

                # Logger bar
                print(
                    "current epoch: {} Training dice: {:.4f} Validation dice: {:.4f} Testing dice: {:.4f} Best Validation dice: {:.4f} at epoch {}".format(
                        epoch + 1, metric_train, metric, metric_test, best_metric, best_metric_epoch
                    )
                )

                writer.add_scalar("Mean_epoch_loss", epoch_loss, epoch + 1)
                writer.add_scalar("Testing_dice", metric_test, epoch + 1)
                writer.add_scalar("Training_dice", metric_train, epoch + 1)
                writer.add_scalar("Validation_dice", metric, epoch + 1)
                # plot the last model output as GIF image in TensorBoard with the corresponding image and label
                plot_2d_or_3d_image(val_images, epoch + 1, writer, index=0, tag="validation image")
                plot_2d_or_3d_image(val_labels, epoch + 1, writer, index=0, tag="validation label")
                plot_2d_or_3d_image(val_outputs, epoch + 1, writer, index=0, tag="validation inference")
                plot_2d_or_3d_image(test_images, epoch + 1, writer, index=0, tag="test image")
                plot_2d_or_3d_image(test_labels, epoch + 1, writer, index=0, tag="test label")
                plot_2d_or_3d_image(test_outputs, epoch + 1, writer, index=0, tag="test inference")

    print(f"train completed, best_metric: {best_metric:.4f} at epoch: {best_metric_epoch}")
    writer.close()


def _sorted_files(folder, split, pattern):
    """Return the sorted paths matching ``pattern`` inside ``folder/split``."""
    return sorted(glob(os.path.join(folder, split, pattern)))


def _image_label_dicts(images, labels):
    """Pair image and label paths into ``{'image': ..., 'label': ...}`` dicts."""
    return [{'image': image_name, 'label': label_name}
            for image_name, label_name in zip(images, labels)]


def _base_transforms(opt):
    """Return the deterministic loading / intensity steps shared by all splits.

    ``Spacingd`` resampling is appended only when ``opt.resolution`` is set.
    """
    steps = [
        LoadImaged(keys=['image', 'label']),
        AddChanneld(keys=['image', 'label']),
        # ThresholdIntensityd(keys=['image'], threshold=-135, above=True, cval=-135),  # CT HU filter
        # ThresholdIntensityd(keys=['image'], threshold=215, above=False, cval=215),
        CropForegroundd(keys=['image', 'label'], source_key='image'),  # crop CropForeground
        NormalizeIntensityd(keys=['image']),  # intensity
        ScaleIntensityd(keys=['image']),
    ]
    if opt.resolution is not None:
        steps.append(Spacingd(keys=['image', 'label'], pixdim=opt.resolution,
                              mode=('bilinear', 'nearest')))  # resolution
    return steps


def _build_train_transforms(opt):
    """Return the training transform list: base steps, augmentation, patch crop."""
    # NOTE(review): the original code used a noise-std upper bound of 15 when
    # resampling and 1 otherwise; both values are preserved here - confirm
    # the difference is intentional.
    noise_std_max = 15 if opt.resolution is not None else 1
    return _base_transforms(opt) + [
        RandFlipd(keys=['image', 'label'], prob=0.15, spatial_axis=1),
        RandFlipd(keys=['image', 'label'], prob=0.15, spatial_axis=0),
        RandFlipd(keys=['image', 'label'], prob=0.15, spatial_axis=2),
        RandAffined(keys=['image', 'label'], mode=('bilinear', 'nearest'), prob=0.1,
                    rotate_range=(np.pi / 36, np.pi / 36, np.pi * 2), padding_mode="zeros"),
        RandAffined(keys=['image', 'label'], mode=('bilinear', 'nearest'), prob=0.1,
                    rotate_range=(np.pi / 36, np.pi / 2, np.pi / 36), padding_mode="zeros"),
        RandAffined(keys=['image', 'label'], mode=('bilinear', 'nearest'), prob=0.1,
                    rotate_range=(np.pi / 2, np.pi / 36, np.pi / 36), padding_mode="zeros"),
        Rand3DElasticd(keys=['image', 'label'], mode=('bilinear', 'nearest'), prob=0.1,
                       sigma_range=(5, 8), magnitude_range=(100, 200),
                       scale_range=(0.15, 0.15, 0.15), padding_mode="zeros"),
        RandGaussianSmoothd(keys=["image"], sigma_x=(0.5, 1.15), sigma_y=(0.5, 1.15),
                            sigma_z=(0.5, 1.15), prob=0.1,),
        RandAdjustContrastd(keys=['image'], gamma=(0.5, 2.5), prob=0.1),
        # mean / std / offsets are drawn once, here, when the list is built
        RandGaussianNoised(keys=['image'], prob=0.1, mean=np.random.uniform(0, 0.5),
                           std=np.random.uniform(0, noise_std_max)),
        RandShiftIntensityd(keys=['image'], offsets=np.random.uniform(0, 0.3), prob=0.1),
        # pad if the image is smaller than patch
        SpatialPadd(keys=['image', 'label'], spatial_size=opt.patch_size, method='end'),
        RandSpatialCropd(keys=['image', 'label'], roi_size=opt.patch_size, random_size=False),
        ToTensord(keys=['image', 'label']),
    ]


def _build_val_transforms(opt):
    """Return the evaluation transform list: base steps, padding, tensor conversion."""
    return _base_transforms(opt) + [
        # pad if the image is smaller than patch
        SpatialPadd(keys=['image', 'label'], spatial_size=opt.patch_size, method='end'),
        ToTensord(keys=['image', 'label']),
    ]
loss_i_per_epoch_list.append(loss_i_total.item()) loss_r_per_epoch_list.append(loss_r_total.item()) loss_i_total.backward() loss_r_total.backward() optimizer_i.step() optimizer_r.step() print( "===> Epoch[{}]({}/{}): loss_i_l1: {:.4f}, loss_r_l1: {:.4f}, loss_i_l2: {:.2f}, loss_r_l2: {:.2f}, loss_i_msssim: {:.4f}, loss_r_msssim: {:.4f}, loss_tv: {:.4f}, loss_content: {:.4f}" .format(epoch, iteration, len(training_data_loader), loss_i_l1.item(), loss_r_l1.item(), loss_i_l2.item(), loss_r_l2.item(), loss_i_msssim.item(), loss_r_msssim.item(), loss_tv.item(), loss_content.item())) update_learning_rate(net_i_scheduler, optimizer_i) update_learning_rate(net_r_scheduler, optimizer_r) loss_i_per_epoch = np.mean(loss_i_per_epoch_list) loss_r_per_epoch = np.mean(loss_r_per_epoch_list) loss_i_list.append(loss_i_per_epoch) loss_r_list.append(loss_r_per_epoch) #checkpoint, evalute images, and loss graph if epoch % 10 == 0: # test avg_psnr = 0 i = 0 for batch in testing_data_loader: # input, target = batch[0].to(device), batch[1].to(device)
def main():
    """Train the class-conditional generator / discriminator pair.

    Uses the module-level ``opt`` options.  Every epoch performs one pass
    over the training loader (discriminator update, then generator update per
    batch), steps both schedulers, renders the first 16 test predictions into
    a 4x4 preview saved under ``results/<opt.output_path>/`` and records the
    mean losses.  Both networks are pickled under
    ``checkpoint/<opt.output_path>/`` every ``opt.save_interval`` epochs and
    once more after the last epoch, after which the loss graph is written.

    Raises:
        Exception: if ``opt.cuda`` is set but no CUDA device is available.
    """
    print(f"epoch: {opt.niter+opt.niter_decay}")
    print(f"cuda: {opt.cuda}")
    print(f"dataset: {opt.dataset}")
    print(f"output: {opt.output_path}")

    if opt.cuda and not torch.cuda.is_available():
        raise Exception("No GPU found, please run without --cuda")

    cudnn.benchmark = True

    torch.manual_seed(opt.seed)
    if opt.cuda:
        torch.cuda.manual_seed(opt.seed)

    print('Loading datasets')
    train_set = get_training_set(root_path + opt.dataset, opt.direction)
    test_set = get_test_set(root_path + opt.dataset, opt.direction)
    training_data_loader = DataLoader(dataset=train_set, num_workers=opt.threads,
                                      batch_size=opt.batch_size, shuffle=True)
    testing_data_loader = DataLoader(dataset=test_set, num_workers=opt.threads,
                                     batch_size=opt.test_batch_size, shuffle=False)

    device = torch.device("cuda:0" if opt.cuda else "cpu")

    print('Building models')
    net_g = define_G(opt.input_nc, opt.output_nc, opt.g_ch, len(class_name_array),
                     'batch', False, 'normal', 0.02, gpu_id=device)
    net_d = define_D(opt.input_nc + opt.output_nc, opt.d_ch, len(class_name_array),
                     'basic', gpu_id=device)

    criterionGAN = GANLoss().to(device)
    criterionL1 = nn.L1Loss().to(device)
    criterionMSE = nn.MSELoss().to(device)

    # setup optimizer
    optimizer_g = optim.Adam(net_g.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
    optimizer_d = optim.Adam(net_d.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
    net_g_scheduler = get_scheduler(optimizer_g, opt)
    net_d_scheduler = get_scheduler(optimizer_d, opt)

    results_dir = os.path.join("results", opt.output_path)
    checkpoint_dir = os.path.join("checkpoint", opt.output_path)

    def save_models(epoch_tag):
        """Pickle both networks under the checkpoint directory."""
        # exist_ok avoids the check-then-create race of exists() + mkdir()
        os.makedirs(checkpoint_dir, exist_ok=True)
        net_g_model_out_path = "checkpoint/{}/netG_model_epoch_{}.pth".format(opt.output_path, epoch_tag)
        net_d_model_out_path = "checkpoint/{}/netD_model_epoch_{}.pth".format(opt.output_path, epoch_tag)
        # NOTE(review): the whole module objects are pickled (not their
        # state_dict); kept so existing loading code keeps working.
        torch.save(net_g, net_g_model_out_path)
        torch.save(net_d, net_d_model_out_path)

    start_time = time.time()

    # per-epoch mean losses for the loss graph
    G_loss_array = []
    D_loss_array = []
    epoch_array = []

    for epoch in tqdm(range(opt.epoch_count, opt.niter + opt.niter_decay + 1), desc="Epoch"):
        # train
        loss_g_sum = 0
        loss_d_sum = 0
        for iteration, batch in enumerate(tqdm(training_data_loader, desc="Batch"), 1):
            # forward
            real_a = batch[0].to(device)
            real_b = batch[1].to(device)
            class_label = batch[2].to(device)
            fake_b = net_g(real_a, class_label)

            ######################
            # (1) Update D network
            ######################

            optimizer_d.zero_grad()

            if opt.padding:
                real_a_for_d = padding(real_a)
                real_b_for_d = padding(real_b)
                fake_b_for_d = padding(fake_b)
            else:
                real_a_for_d = real_a
                real_b_for_d = real_b
                fake_b_for_d = fake_b

            # train with fake; detached so D's loss does not reach G
            fake_ab = torch.cat((real_a_for_d, fake_b_for_d), 1)
            pred_fake = net_d.forward(fake_ab.detach(), class_label)
            loss_d_fake = criterionGAN(pred_fake, False)

            # train with real
            real_ab = torch.cat((real_a_for_d, real_b_for_d), 1)
            pred_real = net_d.forward(real_ab, class_label)
            loss_d_real = criterionGAN(pred_real, True)

            # Combined D loss
            loss_d = (loss_d_fake + loss_d_real) * 0.5

            loss_d.backward()

            optimizer_d.step()

            ######################
            # (2) Update G network
            ######################

            optimizer_g.zero_grad()

            # First, G(A) should fake the discriminator
            fake_ab = torch.cat((real_a_for_d, fake_b_for_d), 1)
            pred_fake = net_d.forward(fake_ab, class_label)
            loss_g_gan = criterionGAN(pred_fake, True)

            # Second, G(A) = B
            loss_g_l1 = criterionL1(fake_b, real_b) * opt.lamb

            loss_g = loss_g_gan + loss_g_l1

            loss_g.backward()

            optimizer_g.step()

            loss_d_sum += loss_d.item()
            loss_g_sum += loss_g.item()

        update_learning_rate(net_g_scheduler, optimizer_g)
        update_learning_rate(net_d_scheduler, optimizer_d)

        # test
        # NOTE(review): avg_psnr is accumulated but never reported or
        # returned; kept so reporting can be added without re-deriving it.
        avg_psnr = 0
        dst = Image.new('RGB', (512*4, 256*4))
        n = 0
        # no_grad: the test pass never backpropagates, so no autograd graph
        # needs to be built or kept alive for it
        with torch.no_grad():
            for batch in tqdm(testing_data_loader, desc="Batch"):
                test_input = batch[0].to(device)
                target = batch[1].to(device)
                class_label = batch[2].to(device)

                prediction = net_g(test_input, class_label)
                mse_value = criterionMSE(prediction, target).item()
                # a perfect prediction has zero error; report infinite PSNR
                # instead of dividing by zero
                psnr = 10 * log10(1 / mse_value) if mse_value > 0 else float('inf')
                avg_psnr += psnr

                n += 1
                if n <= 16:
                    # make test preview: paste the prediction into cell
                    # (n-1) of the 4x4 grid of 512x256 tiles
                    out_img = prediction.detach().squeeze(0).cpu()
                    image_numpy = out_img.float().numpy()
                    image_numpy = (np.transpose(image_numpy, (1, 2, 0)) + 1) / 2.0 * 255.0
                    image_numpy = image_numpy.clip(0, 255)
                    image_numpy = image_numpy.astype(np.uint8)
                    image_pil = Image.fromarray(image_numpy)
                    dst.paste(image_pil, ((n-1)%4*512, (n-1)//4*256))

        os.makedirs(results_dir, exist_ok=True)
        dst.save(f"results/{opt.output_path}/epoch{epoch}_test_preview.jpg")

        epoch_array.append(epoch)
        G_loss_array.append(loss_g_sum/len(training_data_loader))
        D_loss_array.append(loss_d_sum/len(training_data_loader))

        if opt.graph_save_while_training and len(epoch_array) > 1:
            output_graph(epoch_array, G_loss_array, D_loss_array, False)

        # checkpoint
        if epoch % opt.save_interval == 0:
            save_models(epoch)

    # save the latest net
    save_models(opt.niter + opt.niter_decay)
    print("\nCheckpoint saved to {}".format("checkpoint/" + opt.output_path))

    # output loss graph
    output_graph(epoch_array, G_loss_array, D_loss_array)

    # finish training
    now_time = time.time()
    t = now_time - start_time
    print(f"Training time: {t/60:.1f}m")