def train(args):
    """Train a UNet-VGG16 segmentation model and checkpoint the best epoch.

    Drives the full training loop: builds loaders, model, optimizer,
    LR scheduler and loss from ``args``, then alternates train/validation
    epochs, recording metrics and saving the best model via ``ModelSaver``.

    Args:
        args: parsed experiment arguments; must provide the attributes used
            below (paths, device, n_classes, n_epochs, metric, ...).
    """
    # Persist the run configuration first so the experiment is reproducible.
    Arguments.save_args(args, args.args_path)
    # Test loader is unused here (training script only needs train/val).
    train_loader, val_loader, _ = get_dataloaders(args)
    model = UNetVgg16(n_classes=args.n_classes).to(args.device)
    optimizer = get_optimizer(args.optimizer, model)
    lr_scheduler = LRScheduler(args.lr_scheduler, optimizer)
    # Loss is moved to the device too — presumably it carries class-weight
    # tensors; TODO confirm against get_loss_fn.
    criterion = get_loss_fn(args.loss_type, args.ignore_index).to(args.device)
    model_saver = ModelSaver(args.model_path)
    recorder = Recorder(['train_miou', 'train_acc', 'train_loss',
                         'val_miou', 'val_acc', 'val_loss'])
    for epoch in range(args.n_epochs):
        print(f"{args.experim_name} Epoch {epoch+1}:")
        train_loss, train_acc, train_miou, train_ious = train_epoch(
            model=model,
            dataloader=train_loader,
            n_classes=args.n_classes,
            optimizer=optimizer,
            lr_scheduler=lr_scheduler,
            criterion=criterion,
            device=args.device,
        )
        print(f"train | mIoU: {train_miou:.3f} | accuracy: {train_acc:.3f} | loss: {train_loss:.3f}")
        val_loss, val_scores = eval_epoch(
            model=model,
            dataloader=val_loader,
            n_classes=args.n_classes,
            criterion=criterion,
            device=args.device,
        )
        val_miou, val_ious, val_acc = val_scores['mIoU'], val_scores['IoUs'], val_scores['accuracy']
        print(f"valid | mIoU: {val_miou:.3f} | accuracy: {val_acc:.3f} | loss: {val_loss:.3f}")
        recorder.update([train_miou, train_acc, train_loss, val_miou, val_acc, val_loss])
        # Save the record every epoch so progress survives interruption.
        recorder.save(args.record_path)
        # Model-selection metric: either a single class IoU ("IoU_<idx>")
        # or the overall validation mIoU.
        if args.metric.startswith("IoU"):
            metric = val_ious[int(args.metric.split('_')[1])]
        else:
            metric = val_miou
        # ModelSaver tracks the best metric internally and decides whether
        # to write a checkpoint for this epoch.
        model_saver.save_models(metric, epoch+1, model, ious={'train': train_ious, 'val': val_ious})
    print(f"best model at epoch {model_saver.best_epoch} with miou {model_saver.best_score:.5f}")
def main(args):
    """Train or evaluate a segmentation network selected by the config.

    In ``train`` mode: builds loaders/optimizer/scheduler from the config,
    runs up to ``config['epoch']`` epochs with ReduceLROnPlateau on the
    training loss, checkpoints on validation-IoU improvement, stops early
    once the LR has decayed to 1e-3 of its original value, then plots.

    In ``evaluate`` mode: loads the saved checkpoint, runs the test set,
    writes predictions (and optionally label overlays) to disk.

    Args:
        args: CLI namespace; uses dataset, version, mode, gpu, save,
            test_folder, overlay.
    """
    # Configuration dictionary and network architecture to use.
    config = get_config(args.dataset, args.version)
    method = config['model']

    use_cuda = args.gpu and torch.cuda.is_available()

    # Loss function. Class-balancing weights come from the balance script
    # when config['balance'] is set. (Collapsed from a four-way conditional:
    # balance and device placement are independent choices.)
    if config['balance']:
        criterion = nn.CrossEntropyLoss(weight=balance(config))
    else:
        criterion = nn.CrossEntropyLoss()
    if use_cuda:
        criterion = criterion.cuda()

    # Map the configured method name to a network class defined in models.py.
    try:
        model = model_mappings[method](K=config['n_class'])
    except KeyError:
        print('%s model does not exist' % method)
        sys.exit(1)
    if use_cuda:
        model = model.cuda()

    if args.mode == 'train':
        # Training wall-clock starts here; completed after the epoch loop.
        start = time.time()

        # Destinations for trained network, training log, and training plot.
        # These directories must already exist in MatSeg.
        model_dir = './saved/%s_%s.pth' % (config['name'], method)
        log_dir = './log/%s_%s.log' % (config['name'], method)
        plot_dir = './plots/%s_%s.png' % (config['name'], method)

        train_loader, validation_loader = get_dataloader(config)

        # Optimizer choice; includes hard-coded hyperparameters.
        if config['optimizer'] == 'Adam':
            optimizer = optim.Adam(model.parameters(), lr=config['lr'],
                                   weight_decay=5e-4)
        elif config['optimizer'] == 'SGD':
            optimizer = optim.SGD(model.parameters(), lr=config['lr'],
                                  momentum=0.9, weight_decay=5e-4)
        else:
            # BUGFIX: message read "cannot found".
            print('cannot find %s optimizer' % config['optimizer'])
            sys.exit(1)

        # Dynamic LR reduction: patience is the number of epochs without a
        # training-loss decrease before the LR is reduced.
        scheduler = ReduceLROnPlateau(optimizer, patience=config['patience'])

        # Metrics tracked per epoch; values produced by train()/evaluate().
        recorder = Recorder(('loss_train', 'acc_train', 'loss_val', 'acc_val',
                             'mean_iou', 'class_precision', 'class_iou'))
        iou_val_max = 0
        # BUGFIX: `epochs` was only assigned inside the loop body, so
        # plotting() raised NameError when config['epoch'] == 0.
        epochs = 0

        for epoch in range(1, config['epoch'] + 1):
            gc.collect()
            print('Epoch %s:' % epoch)
            loss_train, acc_train = train(config, model, criterion, optimizer,
                                          train_loader, method=method, gpu=args.gpu)
            loss_val, acc_val, iou_val, class_precision, class_iou = evaluate(
                config, model, criterion, validation_loader,
                gpu=args.gpu, method=method)

            # Scheduler steps on *training* loss, matching its patience doc.
            scheduler.step(loss_train)
            recorder.update((loss_train, acc_train, loss_val, acc_val,
                             iou_val, class_precision, class_iou))

            # Checkpoint (model + log) whenever validation mean IoU improves.
            if iou_val > iou_val_max and args.save:
                torch.save(recorder.record, log_dir)
                torch.save(
                    {
                        'epoch': epoch,
                        'version': args.version,
                        'model_state_dict': model.state_dict(),
                    }, model_dir)
                print('validation iou improved from %.5f to %.5f. Model Saved.'
                      % (iou_val_max, iou_val))
                iou_val_max = iou_val

            epochs = epoch
            # Stop once the LR has been reduced to 1e-3 of its original value
            # (three reductions at ReduceLROnPlateau's default factor of 0.1).
            if (optimizer.param_groups[0]['lr'] / config['lr']) <= 1e-3:
                print('Learning Rate Reduced to 1e-3 of Original Value',
                      'Training Stopped', sep='\n')
                break

        # Total training time, then print and plot results.
        end = time.time()
        time_taken = end - start
        print(recorder.record)
        plotting(recorder.record, config, start, time_taken, plot_dir, epochs)

    elif args.mode == 'evaluate':
        # Test data as an iterable dataset: no augmentation, verbose metrics.
        test_dir = '%s/%s' % (config['root'], args.test_folder)
        test_set = Dataset(test_dir, config['size'],
                           *get_transform(config, is_train=False))
        test_loader = DataLoader(test_set, batch_size=1, shuffle=False,
                                 num_workers=0, drop_last=False)

        # Load the trained network from the saved directory.
        # BUGFIX: without map_location, a checkpoint saved on GPU cannot be
        # loaded on a CPU-only host even though CPU evaluation is supported.
        model_dir = './saved/%s_%s.pth' % (config['name'], method)
        map_location = None if use_cuda else 'cpu'
        checkpoint = torch.load(model_dir, map_location=map_location)
        model.load_state_dict(checkpoint['model_state_dict'])

        # Output directories for predictions and overlays; created on demand.
        save_dir = '%s/predictions/%s_%s' % (test_dir, args.version, method)
        overlay_dir = '%s/overlays/%s_%s' % (test_dir, args.version, method)
        labels_dir = os.path.join(test_dir, 'labels_npy')
        # makedirs(exist_ok=True) replaces the previous two-step mkdir chain.
        os.makedirs(save_dir, exist_ok=True)

        evaluate(config, model, criterion, test_loader, gpu=args.gpu,
                 method=method, test_flag=True, save_dir=save_dir)

        # Create overlays if requested and ground-truth labels are present.
        if os.path.isdir(labels_dir) and args.overlay:
            os.makedirs(overlay_dir, exist_ok=True)
            overlay(labels_dir, save_dir, overlay_dir, config['n_class'])

    else:
        print('%s mode does not exist' % args.mode)
def main(args):
    """Train or evaluate a segmentation network (GPU-only variant).

    Mirrors the full driver but with fixed patience, fewer recorded
    metrics, and unconditional CUDA placement of model and loss.

    Args:
        args: CLI namespace; uses seed, dataset, version, mode, save,
            test_folder.
    """
    if args.seed:
        np.random.seed(int(args.seed))
        torch.backends.cudnn.deterministic = True
        # BUGFIX: torch was seeded with a hard-coded 0, ignoring args.seed,
        # so numpy and torch ran with inconsistent seeds.
        torch.manual_seed(int(args.seed))

    config = get_config(args.dataset, args.version)
    method = config['model']

    # This variant assumes a CUDA device is available.
    criterion = nn.CrossEntropyLoss().cuda()

    # Map the configured method name to a network class.
    try:
        model = model_mappings[method](K=config['n_class']).cuda()
    except KeyError:
        print('%s model does not exist' % method)
        sys.exit(1)

    model_dir = './saved/%s_%s.pth' % (config['name'], method)

    if args.mode == 'train':
        log_dir = './log/%s_%s.log' % (config['name'], method)
        train_loader, validation_loader = get_dataloader(config)

        # Optimizer choice with hard-coded hyperparameters.
        if config['optimizer'] == 'Adam':
            optimizer = optim.Adam(model.parameters(), lr=config['lr'],
                                   weight_decay=5e-4)
        elif config['optimizer'] == 'SGD':
            optimizer = optim.SGD(model.parameters(), lr=config['lr'],
                                  momentum=0.9, weight_decay=5e-4)
        else:
            # BUGFIX: message read "cannot found".
            print('cannot find %s optimizer' % config['optimizer'])
            sys.exit(1)

        # Reduce LR when training loss plateaus for 3 epochs.
        scheduler = ReduceLROnPlateau(optimizer, patience=3)
        recorder = Recorder(('loss_train', 'acc_train', 'loss_val', 'acc_val'))
        iou_val_max = 0

        for epoch in range(1, config['epoch'] + 1):
            print('Epoch %s:' % epoch)
            loss_train, acc_train = train(config, model, criterion, optimizer,
                                          train_loader, method=method)
            loss_val, acc_val, iou_val = evaluate(config, model, criterion,
                                                  validation_loader, method=method)
            scheduler.step(loss_train)
            # Update loss and accuracy per epoch.
            recorder.update((loss_train, acc_train, loss_val, acc_val))

            # Save model (and log) whenever validation IoU improves.
            if iou_val > iou_val_max and args.save:
                torch.save(recorder.record, log_dir)
                torch.save(
                    {
                        'epoch': epoch,
                        'version': args.version,
                        'model_state_dict': model.state_dict(),
                    }, model_dir)
                print('validation iou improved from %.5f to %.5f. Model Saved.'
                      % (iou_val_max, iou_val))
                iou_val_max = iou_val

    elif args.mode == 'evaluate':
        test_dir = '%s/%s' % (config['root'], args.test_folder)
        test_set = Dataset(test_dir, config['size'],
                           *get_transform(config, is_train=False))
        test_loader = DataLoader(test_set, batch_size=1, shuffle=False,
                                 num_workers=0, drop_last=False)
        model.load_state_dict(torch.load(model_dir)['model_state_dict'])

        # Save prediction results, creating the directory tree if needed
        # (makedirs(exist_ok=True) replaces the two-step mkdir chain).
        save_dir = '%s/predictions/%s_%s' % (test_dir, args.version, method)
        os.makedirs(save_dir, exist_ok=True)
        evaluate(config, model, criterion, test_loader, method=method,
                 test_flag=True, save_dir=save_dir)

    else:
        print('%s mode does not exist' % args.mode)