def train(args, train_dataloader, valid_dataloader):
    """Train a 7-class segmentation model (FCN32s / FCN8s / UNet) and checkpoint it.

    Args:
        args: namespace providing at least ``model`` (architecture name),
            ``device`` and ``epochs``.
        train_dataloader: yields training batches for ``_run_train``.
        valid_dataloader: yields validation batches for ``_run_eval``.

    Side effects: saves weights under ``./result/`` at milestone epochs and
    whenever the validation IoU improves (after the second milestone).
    """
    # Normalize once and reuse so the model choice and the milestone choice
    # can never disagree (originally the milestone branch compared the raw,
    # un-lowercased args.model — e.g. "FCN32s" got UNet milestones).
    model_name = str(args.model).lower()
    if model_name == 'fcn32s':
        model = VGG16_FCN32s(n_classes=7)
    elif model_name == 'fcn8s':
        model = VGG16_FCN8s(n_classes=7)
    else:
        model = UNet(n_channels=3, n_classes=7)
    # model = nn.DataParallel(model, device_ids=['cuda:0','cuda:1'])
    model.to(args.device)

    # Per-class loss weights to counter class imbalance.
    # NOTE(review): the original comment listed 132 for class 6 but the tensor
    # uses 1.0 — keeping the tensor values, which are what actually ran.
    weight = torch.tensor([0.79, 0.14, 1.0, 0.73, 2.74, 1.04, 1.0])
    criterion = nn.CrossEntropyLoss(weight).to(args.device)

    optimizer = optim.SGD(model.parameters(), lr=1e-3,
                          momentum=0.9, weight_decay=5e-4)

    if model_name == 'fcn32s':
        milestones = [1, 10, 20, 50]
    elif model_name == 'fcn8s':
        milestones = [1, 10, 20, 60]
    else:
        milestones = [25, 50, 80]
    train_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=milestones, gamma=0.2)  # learning rate decay

    best_iou = 0
    for epoch in range(args.epochs):
        print(f"\tEpoch {epoch}")

        loss, acc, iou = _run_train(args, train_dataloader,
                                    model, criterion, optimizer)
        print("\t train loss:{:.5f}, acc:{:.3f}, iou:{:.2f}".format(
            loss, acc, iou))

        loss, acc, iou = _run_eval(args, valid_dataloader, model, criterion)
        print("\t valid loss:{:.5f}, acc:{:.3f}, iou:{:.2f}".format(
            loss, acc, iou))

        # BUG FIX: the scheduler was created but never stepped, so the
        # milestone LR decay never actually took effect.
        train_scheduler.step()

        if epoch in milestones:
            torch.save(model.state_dict(),
                       f"./result/{epoch}_{args.model}.pth")
            print('\t [Info] save weights')

        # Only track "best" after the warm-up milestones have passed.
        if epoch > milestones[1] and iou > best_iou:
            best_iou = iou
            torch.save(model.state_dict(),
                       f"./result/best_{args.model}.pth")
            print('\t [Info] save weights')
def main():
    """Train UNet on the LV-MHP-v2 dataset (59 classes), checkpointing every epoch.

    Side effects: writes TensorBoard logs under ``logs/`` and one checkpoint
    per epoch under ``checkpoints/``.
    """
    import os  # local: needed only for the checkpoint-dir fix below

    train_dataset = MHP('/root/dataset/LV-MHP-v2/train', n_classes=59)
    train_loader = DataLoader(dataset=train_dataset, batch_size=12,
                              shuffle=True, num_workers=0)
    model = UNet(n_channels=3, n_classes=59).cuda()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()
    writer = tbx.SummaryWriter(log_dir="logs")

    # BUG FIX: torch.save raises FileNotFoundError if the target directory
    # does not exist; the original assumed 'checkpoints/' was pre-created.
    os.makedirs('checkpoints', exist_ok=True)

    n_epochs = 10000
    for epoch in range(n_epochs):
        train_epoch(train_loader, model, criterion, optimizer, epoch, writer)
        # Keep 'state_dict' as the key so existing loaders still work;
        # also record the epoch to make checkpoints self-describing.
        state = {'epoch': epoch, 'state_dict': model.state_dict()}
        filename = 'checkpoints/{0:05d}.pth.tar'.format(epoch)
        torch.save(state, filename)

    writer.close()  # flush pending TensorBoard events
writer = SummaryWriter(args.tensorboard) # --------------------------- using pre-trained params ---------------------------------- # # (1) get param from pre-trained model # from unet_3up_ab_toge.unet.unet_model import UNet as UNet_old # from unet_3up_ab.unet_model import UNet as UNet_old from step2_add_bd_branch.unet.unet_model import UNet as UNet_old net_old = UNet_old(n_channels=3, n_classes=1) net_old.load_state_dict(torch.load('../step2_add_bd_branch/step2_checkpoints/CP196.pth')) net_old_dict = net_old.state_dict() # (2) our new model net = UNet(n_channels=3, n_classes=1) net_dict = net.state_dict() # # (3) apply pre-trained params in new model net_old_dict = {k: v for k, v in net_old_dict.items() if k in net_dict} net_dict.update(net_old_dict) # update params using pre-trained model net.load_state_dict(net_dict) # update the model if have_gpu and args.gpu: print('Using GPU !') net = net.cuda() try: train_net(image_dir=args.imagedir, label_dir=args.gt, boundary_dir=args.bd, checkpoint_dir=args.checkpoint,
def main():
    """Main training loop (PyTorch Lightning).

    Parses model/trainer CLI args, configures debug APIs based on the PROD
    env var, builds the UNet and a DDP Trainer from environment-driven
    settings, then runs tune (optional) / fit / test. On Ctrl-C the current
    weights are saved to ``INTERRUPTED.pth``.
    """
    parser = ArgumentParser()
    parser = UNet.add_model_specific_args(parser)
    parser = Trainer.add_argparse_args(parser)
    args = parser.parse_args()

    prod = bool(os.getenv("PROD"))
    logging.getLogger(__name__).setLevel(logging.INFO)
    if prod:
        # Typo fix: "Training i production mode" -> "in production mode".
        logging.info("Training in production mode, disabling all debugging APIs")
        torch.autograd.set_detect_anomaly(False)
        # NOTE(review): these profiler objects are context managers that are
        # constructed but never entered, so they have no effect as written —
        # kept for parity with the original; wrap actual code in `with` to use.
        torch.autograd.profiler.profile(enabled=False)
        torch.autograd.profiler.emit_nvtx(enabled=False)
    else:
        logging.info("Training in development mode, debugging APIs active.")
        torch.autograd.set_detect_anomaly(True)
        torch.autograd.profiler.profile(enabled=True, use_cuda=True,
                                        record_shapes=True, profile_memory=True)
        torch.autograd.profiler.emit_nvtx(enabled=True, record_shapes=True)

    model = UNet(**vars(args))
    logging.info(
        f"Network:\n"
        f"\t{model.hparams.n_channels} input channels\n"
        f"\t{model.hparams.n_classes} output channels (classes)\n"
        f'\t{"Bilinear" if model.hparams.bilinear else "Transposed conv"} upscaling'
    )

    cudnn.benchmark = True  # cudnn Autotuner
    cudnn.enabled = True  # look for optimal algorithms

    early_stop_callback = EarlyStopping(
        monitor="val_loss",
        min_delta=0.00,
        mode="min",
        patience=10 if not os.getenv("EARLY_STOP") else int(os.getenv("EARLY_STOP")),
        verbose=True,
    )
    lr_monitor = LearningRateMonitor()

    run_name = "{}_LR{}_BS{}_IS{}".format(
        datetime.now().strftime("%d-%m-%Y-%H-%M-%S"),
        args.lr,
        args.batch_size,
        args.image_size,
    ).replace(".", "_")

    log_folder = ("./logs" if not os.getenv("DIR_ROOT_DIR")
                  else os.getenv("DIR_ROOT_DIR"))
    # BUG FIX: os.mkdir fails on nested paths and races between DDP workers;
    # makedirs(exist_ok=True) is safe in both cases.
    os.makedirs(log_folder, exist_ok=True)
    logger = TensorBoardLogger(log_folder, name=run_name)

    checkpoint_callback = ModelCheckpoint(
        monitor='val_loss',
        dirpath='./checkpoints',
        filename='unet-{epoch:02d}-{val_loss:.2f}',
        save_top_k=3,
        mode='min',
    )

    # BUG FIX: float(os.getenv("LRN_RATE")) raised TypeError when the env var
    # was unset (float(None)). Read it once; unset is treated as 0.0, i.e.
    # "auto-find the learning rate" — TODO confirm that default is intended.
    lrn_rate = float(os.getenv("LRN_RATE") or 0.0)

    try:
        trainer = Trainer.from_argparse_args(
            args,
            gpus=-1,
            accelerator="ddp",
            plugins=DDPPlugin(find_unused_parameters=False),
            precision=16,
            auto_lr_find="learning_rate" if lrn_rate == 0.0 else False,
            logger=logger,
            callbacks=[early_stop_callback, lr_monitor, checkpoint_callback],
            # BUG FIX: Lightning expects an int here, not the float 1.0.
            accumulate_grad_batches=1 if not os.getenv("ACC_GRAD") else int(os.getenv("ACC_GRAD")),
            gradient_clip_val=0.0 if not os.getenv("GRAD_CLIP") else float(os.getenv("GRAD_CLIP")),
            max_epochs=100 if not os.getenv("EPOCHS") else int(os.getenv("EPOCHS")),
            val_check_interval=0.1 if not os.getenv("VAL_INT_PER") else float(os.getenv("VAL_INT_PER")),
            default_root_dir=os.getcwd() if not os.getenv("DIR_ROOT_DIR") else os.getenv("DIR_ROOT_DIR"),
            fast_dev_run=True if os.getenv("FAST_DEV_RUN") == "True" else False,
        )
        if lrn_rate == 0.0:
            trainer.tune(model)  # run the LR finder before fitting
        trainer.fit(model)
        trainer.test(model)
    except KeyboardInterrupt:
        torch.save(model.state_dict(), "INTERRUPTED.pth")
        logging.info("Saved interrupt")
        try:
            sys.exit(0)
        except SystemExit:
            os._exit(0)