def main(args):
    """Main function for the training pipeline.

    Trains a COVNet classifier on the NCovDataset with class-balanced
    sampling, evaluates on a validation split each epoch, and checkpoints
    models under ``experiments/models``.

    :args: command-line arguments
    :returns: None
    """
    ##########################################################################
    #                            Basic settings                              #
    ##########################################################################
    exp_dir = 'experiments'
    log_dir = os.path.join(exp_dir, 'logs')
    model_dir = os.path.join(exp_dir, 'models')
    os.makedirs(model_dir, exist_ok=True)

    ##########################################################################
    #  Define all the necessary variables for model training and evaluation #
    ##########################################################################
    writer = SummaryWriter(log_dir)

    train_dataset = dataset.NCovDataset('data/', stage='train')
    # Class-balanced sampling: rarer classes receive proportionally higher
    # weights so each epoch sees the classes roughly equally often.
    weights = train_dataset.make_weights_for_balanced_classes()
    weights = torch.DoubleTensor(weights)
    sampler = torch.utils.data.sampler.WeightedRandomSampler(
        weights, len(train_dataset.case_ids))
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=1,
                                               num_workers=20,
                                               drop_last=False,
                                               sampler=sampler)

    val_dataset = dataset.NCovDataset('data/', stage='val')
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=11,
                                             drop_last=False)

    cov_net = model.COVNet(n_classes=3)
    if torch.cuda.is_available():
        cov_net = cov_net.cuda()

    optimizer = optim.Adam(cov_net.parameters(), lr=args.lr, weight_decay=0.1)

    if args.lr_scheduler == "plateau":
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=3, factor=.3, threshold=1e-4, verbose=True)
    elif args.lr_scheduler == "step":
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=3,
                                                    gamma=args.gamma)

    best_val_loss = float('inf')
    best_val_accu = 0.0
    iteration_change_loss = 0
    t_start_training = time.time()

    ##########################################################################
    #                          Main training loop                            #
    ##########################################################################
    for epoch in range(args.epochs):
        current_lr = get_lr(optimizer)
        t_start = time.time()

        ############################################################
        #  The actual training and validation step for each epoch  #
        ############################################################
        train_loss, train_metric = train_model(cov_net, train_loader, epoch,
                                               args.epochs, optimizer, writer,
                                               current_lr, args.log_every)
        with torch.no_grad():
            val_loss, val_metric = evaluate_model(cov_net, val_loader, epoch,
                                                  args.epochs, writer,
                                                  current_lr)

        ##############################
        #  Adjust the learning rate  #
        ##############################
        if args.lr_scheduler == 'plateau':
            scheduler.step(val_loss)
        elif args.lr_scheduler == 'step':
            scheduler.step()

        t_end = time.time()
        delta = t_end - t_start

        utils.print_epoch_progress(train_loss, val_loss, delta, train_metric,
                                   val_metric)
        iteration_change_loss += 1
        print('-' * 30)

        train_acc, val_acc = train_metric['accuracy'], val_metric['accuracy']
        # Only persist per-epoch checkpoints when model saving is enabled,
        # matching the guard used for the best-model checkpoint below
        # (previously a checkpoint was written every epoch unconditionally).
        if bool(args.save_model):
            file_name = ('train_acc_{}_val_acc_{}_epoch_{}.pth'.format(
                train_acc, val_acc, epoch))
            torch.save(cov_net, os.path.join(model_dir, file_name))

        if val_acc > best_val_accu:
            best_val_accu = val_acc
            if bool(args.save_model):
                torch.save(cov_net, os.path.join(model_dir, 'best.pth'))

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            iteration_change_loss = 0

        # Early stopping: counter resets whenever validation loss improves.
        if iteration_change_loss == args.patience:
            print(('Early stopping after {0} iterations without the decrease '
                   + 'of the val loss').format(iteration_change_loss))
            break

    t_end_training = time.time()
    print('training took {}s'.format(t_end_training - t_start_training))
def main(opts):
    """Main function for the training pipeline.

    Trains a segmentation model, validates once per epoch, and logs run
    arguments plus TensorBoard summaries under
    ``experiments/logs/<run_name>``.

    :opts: commandline arguments
    :returns: None
    """
    pprint(vars(opts))

    ##########################################################################
    #                            Basic settings                              #
    ##########################################################################
    exp_dir = 'experiments'
    log_dir = os.path.join(exp_dir, 'logs')
    model_dir = os.path.join(exp_dir, 'models')
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, opts.run_name), exist_ok=True)
    # Persist the run configuration alongside the logs for reproducibility.
    with open(os.path.join(log_dir, opts.run_name, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    ##########################################################################
    #  Define all the necessary variables for model training and evaluation #
    ##########################################################################
    writer = SummaryWriter(os.path.join(log_dir, opts.run_name))

    train_dataset = SegmentationDataset(is_train=True)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opts.batch_size,
                                               num_workers=4,
                                               drop_last=False,
                                               shuffle=True)

    val_dataset = SegmentationDataset(is_train=False)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=20,
                                             shuffle=False,
                                             num_workers=0,
                                             drop_last=False)

    model = load_model(opts, n_classes=4)
    if torch.cuda.is_available():
        model = model.cuda()

    optimizer = optim.Adam(model.parameters(), lr=opts.lr, weight_decay=0.1)

    if opts.lr_scheduler == "plateau":
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=opts.patience, factor=.3, threshold=0.1,
            verbose=True)
    elif opts.lr_scheduler == "step":
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=3,
                                                    gamma=opts.gamma)

    # NOTE: this variant has no early stopping or checkpointing; the unused
    # best-loss/accuracy/early-stop counters present before were removed.
    t_start_training = time.time()

    ##########################################################################
    #                          Main training loop                            #
    ##########################################################################
    for epoch in range(opts.epochs):
        t_start = time.time()

        ############################################################
        #  The actual training and validation step for each epoch  #
        ############################################################
        train_loss, train_metric = train_model(model, train_loader, epoch,
                                               optimizer, writer, opts)
        with torch.no_grad():
            val_loss, val_metric = evaluate_model(model, val_loader, epoch,
                                                  writer, opts)

        ##############################
        #  Adjust the learning rate  #
        ##############################
        if opts.lr_scheduler == 'plateau':
            scheduler.step(val_loss)
        elif opts.lr_scheduler == 'step':
            scheduler.step()

        t_end = time.time()
        delta = t_end - t_start

        utils.print_epoch_progress(epoch, opts.epochs, train_loss, val_loss,
                                   delta, train_metric, val_metric)

    t_end_training = time.time()
    print('training took {}s'.format(t_end_training - t_start_training))
def main(args):
    """Main function for the training pipeline.

    Trains a COVNet classifier on 2D slices with a random 60/40
    train/validation split, optional checkpointing, and early stopping.

    :args: command-line arguments
    :returns: None
    """
    print("The parameters are set as follows:")
    print(args)

    # Set up the directory where model checkpoints will be stored.
    exp_dir = 'experiments'
    model_dir = os.path.join(exp_dir, 'models')
    os.makedirs(model_dir, exist_ok=True)

    batch_size = args.batch_size
    train_set = SliceSet2D()

    # Random train/validation split over the slice indices (40% validation,
    # fixed seed so the split is reproducible across runs).
    validation_split = .4
    shuffle_dataset = True
    random_seed = 42

    set_size = len(train_set)
    indices = list(range(set_size))
    split = int(np.floor(validation_split * set_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]
    print("Set Size: {}|Train Size: {}| Validation Size: {}".format(
        len(indices), len(train_indices), len(val_indices)))
    # Fixed message: checkpoints are written to "experiments/models"
    # (the previous text pointed at a non-existent "experiments/model").
    print("Model will be saved in \" ./experiments/models\". "
          "Start training...")

    # Creating data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(train_set,
                                                    batch_size=batch_size,
                                                    sampler=valid_sampler)

    cov_net = model.COVNet(3)
    if torch.cuda.is_available():
        cov_net = cov_net.cuda()

    optimizer = Adam(cov_net.parameters(), lr=args.lr, weight_decay=0.1)

    if args.lr_scheduler == "plateau":
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=3, factor=.3, threshold=1e-4, verbose=True)
    elif args.lr_scheduler == "step":
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=3,
                                                    gamma=args.gamma)

    best_val_loss = float('inf')
    best_val_accu = 0.0
    iteration_change_loss = 0
    t_start_training = time.time()

    ##########################################################################
    #                          Main training loop                            #
    ##########################################################################
    epochs = args.epochs
    for epoch in range(epochs):
        current_lr = get_lr(optimizer)
        t_start = time.time()

        ############################################################
        #  The actual training and validation step for each epoch  #
        ############################################################
        train_loss, train_metric = train_model(cov_net, train_loader, epoch,
                                               epochs, optimizer, current_lr,
                                               100)
        with torch.no_grad():
            val_loss, val_metric = evaluate_model(cov_net, validation_loader,
                                                  epoch, epochs, current_lr)

        ##############################
        #  Adjust the learning rate  #
        ##############################
        if args.lr_scheduler == 'plateau':
            scheduler.step(val_loss)
        elif args.lr_scheduler == 'step':
            scheduler.step()

        t_end = time.time()
        delta = t_end - t_start

        utils.print_epoch_progress(epoch, epochs, train_loss, val_loss, delta,
                                   train_metric, val_metric)
        iteration_change_loss += 1
        print('-' * 60)

        train_acc, val_acc = train_metric['accuracy'], val_metric['accuracy']
        # Truthiness check instead of `is True`: argparse may store the flag
        # as an int (0/1), for which an identity test against True is always
        # False. Also matches the `bool(args.save_model)` guard below.
        if args.save_model:
            file_name = ('train_acc_{}_val_acc_{}_epoch_{}.pth'.format(
                train_acc, val_acc, epoch))
            torch.save(cov_net, os.path.join(model_dir, file_name))

        if val_acc > best_val_accu:
            best_val_accu = val_acc
            if bool(args.save_model):
                torch.save(cov_net, os.path.join(model_dir, 'best.pth'))

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            iteration_change_loss = 0

        # Early stopping: counter resets whenever validation loss improves.
        if iteration_change_loss == args.patience:
            print(('Early stopping after {0} iterations without the decrease '
                   + 'of the val loss').format(iteration_change_loss))
            break

    t_end_training = time.time()
    print('training took {}s'.format(t_end_training - t_start_training))
def main(opts):
    """Main function for the training pipeline.

    Supports three training regimes (``combined``, ``oversampling``,
    ``pretrain_and_finetune``), logs full metric suites to TensorBoard,
    checkpoints the best validation-accuracy state dict, optionally applies
    early stopping, and finally evaluates on the test set.

    :opts: command-line arguments
    :returns: None
    """
    ##########################################################################
    #                            Basic settings                              #
    ##########################################################################
    exp_dir = 'experiments'
    log_dir = os.path.join(exp_dir, 'logs')
    model_dir = os.path.join(exp_dir, 'models')
    os.makedirs(os.path.join(model_dir, opts.run_name), exist_ok=True)
    # exist_ok=True so re-running with the same run_name does not crash
    # (consistent with the model_dir creation above).
    os.makedirs(os.path.join(log_dir, opts.run_name), exist_ok=True)

    pprint(vars(opts))
    # Persist the run configuration alongside the logs for reproducibility.
    with open(os.path.join(log_dir, opts.run_name, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    # Seed every RNG in play for reproducibility.
    torch.manual_seed(opts.seed)
    np.random.seed(opts.seed)
    random.seed(opts.seed)

    ##########################################################################
    #  Define all the necessary variables for model training and evaluation #
    ##########################################################################
    writer = SummaryWriter(os.path.join(log_dir, opts.run_name), flush_secs=5)

    # Build the training set according to the requested training regime.
    if opts.train_mode == 'combined':
        train_dataset = get_train_dataset(opts.data_root, opts, opts.folder1,
                                          opts.folder2, opts.folder3)
    elif opts.train_mode == 'oversampling':
        train_dataset = get_train_dataset_by_oversampling(
            opts.data_root, opts, opts.folder1, opts.folder2, opts.folder3)
    elif opts.train_mode == 'pretrain_and_finetune':
        train_dataset, finetune_dataset = get_pretrain_and_finetune_datast(
            opts.data_root, opts, opts.folder1, opts.folder2, opts.folder3)
        finetune_loader = torch.utils.data.DataLoader(
            finetune_dataset,
            batch_size=opts.batch_size,
            num_workers=opts.num_workers,
            drop_last=False,
            shuffle=True)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opts.batch_size,
                                               num_workers=opts.num_workers,
                                               drop_last=False,
                                               shuffle=True)

    val_dataset = get_val_dataset(os.path.join('data', 'val'), opts)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=opts.eval_batch_size,
                                             shuffle=False,
                                             num_workers=opts.num_workers,
                                             drop_last=False)

    test_dataset = get_test_dataset(os.path.join('data', 'test'), opts)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=opts.eval_batch_size,
                                              shuffle=False,
                                              num_workers=opts.num_workers,
                                              drop_last=False)

    # All splits must share the same label mapping, otherwise the metrics
    # computed below would be meaningless.
    assert train_dataset.class_to_idx == val_dataset.class_to_idx \
        == test_dataset.class_to_idx, "Mapping not correct"

    model = get_model(opts)
    opts.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if torch.cuda.device_count() > 1 and not opts.no_data_parallel:
        model = nn.DataParallel(model)
    model = model.to(opts.device)

    optimizer = optim.RMSprop(model.parameters(), lr=opts.lr, alpha=0.9,
                              weight_decay=1e-5, momentum=0.9)
    scheduler = get_lr_scheduler(optimizer, opts)

    best_val_loss = float('inf')
    best_val_accu = 0.0
    best_val_rec = 0.0
    best_val_prec = 0.0
    best_val_f1 = 0.0
    best_val_auc = 0.0
    iteration_change_loss = 0
    t_start_training = time.time()

    ##########################################################################
    #                          Main training loop                            #
    ##########################################################################
    for epoch in range(opts.epochs):
        t_start = time.time()

        ############################################################
        #  The actual training and validation step for each epoch  #
        ############################################################
        train_loss, train_metric = train_model(model, train_loader, optimizer,
                                               opts)

        # After the pretraining phase ends, swap in the finetuning data and a
        # fresh optimizer/scheduler pair.
        if epoch == opts.finetune_epoch and \
                opts.train_mode == 'pretrain_and_finetune':
            train_loader = finetune_loader
            optimizer = optim.RMSprop(model.parameters(), lr=opts.lr,
                                      alpha=0.9, weight_decay=1e-5,
                                      momentum=0.9)
            scheduler = torch.optim.lr_scheduler.StepLR(
                optimizer, step_size=opts.step_size_finetuning,
                gamma=opts.gamma)

        # Run the validation set
        with torch.no_grad():
            val_loss, val_metric = evaluate_model(model, val_loader, opts)

        ##############################
        #   Write to summary writer  #
        ##############################
        train_acc, val_acc = train_metric['accuracy'], val_metric['accuracy']
        train_rec, val_rec = train_metric['recalls'], val_metric['recalls']
        train_prec, val_prec = train_metric['precisions'], val_metric[
            'precisions']
        train_f1, val_f1 = train_metric['f1'], val_metric['f1']
        train_auc, val_auc = train_metric['auc'], val_metric['auc']
        writer.add_scalar('Loss/Train', train_loss, epoch)
        writer.add_scalar('Accuracy/Train', train_acc, epoch)
        writer.add_scalar('Precision/Train', train_prec, epoch)
        writer.add_scalar('Recall/Train', train_rec, epoch)
        writer.add_scalar('F1/Train', train_f1, epoch)
        writer.add_scalar('AUC/Train', train_auc, epoch)
        writer.add_scalar('Loss/Val', val_loss, epoch)
        writer.add_scalar('Accuracy/Val', val_acc, epoch)
        writer.add_scalar('Precision/Val', val_prec, epoch)
        writer.add_scalar('Recall/Val', val_rec, epoch)
        writer.add_scalar('F1/Val', val_f1, epoch)
        writer.add_scalar('AUC/Val', val_auc, epoch)

        ##############################
        #  Adjust the learning rate  #
        ##############################
        if opts.lr_scheduler == 'plateau':
            scheduler.step(val_loss)
        elif opts.lr_scheduler in ['step', 'cosine']:
            scheduler.step()

        t_end = time.time()
        delta = t_end - t_start

        print_epoch_progress(epoch, opts.epochs, train_loss, val_loss, delta,
                             train_metric, val_metric)
        iteration_change_loss += 1
        print('-' * 30)

        # Track the best value of every validation metric; only the best
        # accuracy triggers a checkpoint.
        if val_acc > best_val_accu:
            best_val_accu = val_acc
            if bool(opts.save_model):
                torch.save(
                    model.state_dict(),
                    os.path.join(model_dir, opts.run_name,
                                 'best_state_dict.pth'))

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            iteration_change_loss = 0

        if val_rec > best_val_rec:
            best_val_rec = val_rec

        if val_prec > best_val_prec:
            best_val_prec = val_prec

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            print(f'The best validation F1-score is now {best_val_f1}')
            print(
                f'The validation accuracy and AUC are now {val_acc} and {val_auc}'
            )

        if val_auc > best_val_auc:
            best_val_auc = val_auc

        # Early stopping: counter resets whenever validation loss improves.
        if iteration_change_loss == opts.patience and opts.early_stopping:
            print(('Early stopping after {0} iterations without the decrease '
                   + 'of the val loss').format(iteration_change_loss))
            break

    t_end_training = time.time()
    print(f'training took {t_end_training - t_start_training}s')
    print(f'Best validation accuracy: {best_val_accu}')
    print(f'Best validation loss: {best_val_loss}')
    print(f'Best validation precision: {best_val_prec}')
    print(f'Best validation recall: {best_val_rec}')
    print(f'Best validation f1: {best_val_f1}')
    print(f'Best validation AUC: {best_val_auc}')

    ##########################################################################
    #        Final evaluation of the trained model on the test set           #
    ##########################################################################
    with torch.no_grad():
        if opts.train_mode in ['combined', 'oversampling']:
            # Reload the best checkpoint before testing; for
            # pretrain_and_finetune the current weights are evaluated as-is.
            model.load_state_dict(
                torch.load(
                    os.path.join(model_dir, opts.run_name,
                                 'best_state_dict.pth')))
        test_loss, test_metric = evaluate_model(model, test_loader, opts)
        print(f'The best test F1: {test_metric["f1"]}')
        print(f'The best test auc: {test_metric["auc"]}')
        print(f'The best test accuracy: {test_metric["accuracy"]}')
def main(opts):
    """Main function for the training pipeline.

    Trains a two-class baseline model with class-balanced sampling, logs
    losses and metrics to TensorBoard, keeps the best checkpoint by
    validation accuracy, and optionally applies early stopping.

    :opts: command-line arguments
    :returns: None
    """
    ##########################################################################
    #                            Basic settings                              #
    ##########################################################################
    exp_dir = 'experiments'
    log_dir = os.path.join(exp_dir, 'logs')
    model_dir = os.path.join(exp_dir, 'models')
    os.makedirs(os.path.join(model_dir, opts.run_name), exist_ok=True)

    ##########################################################################
    #  Define all the necessary variables for model training and evaluation #
    ##########################################################################
    writer = SummaryWriter(os.path.join(log_dir, opts.run_name), flush_secs=5)

    train_dataset = get_train_dataset(root=os.path.join('data', 'train'))
    # Class-balanced sampling: each image is weighted inversely to its class
    # frequency so every class is drawn roughly equally often.
    weights = make_weights_for_balanced_classes(train_dataset.imgs,
                                                len(train_dataset.classes))
    weights = torch.DoubleTensor(weights)
    sampler = torch.utils.data.sampler.WeightedRandomSampler(
        weights, len(weights))
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opts.batch_size,
                                               num_workers=6,
                                               drop_last=False,
                                               sampler=sampler)

    val_dataset = get_val_dataset(root=os.path.join('data', 'val'))
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=opts.batch_size,
                                             shuffle=False,
                                             num_workers=6,
                                             drop_last=False)

    # Both splits must share the same label mapping for metrics to line up.
    assert train_dataset.class_to_idx == val_dataset.class_to_idx, \
        "Mapping not correct"

    model = load_baseline(n_classes=2)
    if torch.cuda.is_available():
        model = model.cuda()

    optimizer = optim.Adam(model.parameters(), lr=opts.lr, weight_decay=0.1)

    if opts.lr_scheduler == "plateau":
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=3, factor=.3, threshold=1e-4, verbose=True)
    elif opts.lr_scheduler == "step":
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=3,
                                                    gamma=opts.gamma)

    best_val_loss = float('inf')
    best_val_accu = 0.0
    iteration_change_loss = 0
    t_start_training = time.time()

    ##########################################################################
    #                          Main training loop                            #
    ##########################################################################
    for epoch in range(opts.epochs):
        current_lr = get_lr(optimizer)
        t_start = time.time()

        ############################################################
        #  The actual training and validation step for each epoch  #
        ############################################################
        train_loss, train_metric = train_model(model, train_loader, epoch,
                                               opts.epochs, optimizer, writer,
                                               current_lr, opts.log_every)
        with torch.no_grad():
            val_loss, val_metric = evaluate_model(model, val_loader, epoch,
                                                  opts.epochs, writer,
                                                  current_lr)

        ##############################
        #   Write to summary writer  #
        ##############################
        writer.add_scalar('Loss/Train', train_loss, epoch)
        writer.add_scalar('Accuracy/Train', train_metric['accuracy'], epoch)
        writer.add_scalar('Precision/Train', train_metric['precisions'],
                          epoch)
        writer.add_scalar('Recall/Train', train_metric['recalls'], epoch)
        writer.add_scalar('F1/Train', train_metric['f1'], epoch)
        writer.add_scalar('Loss/Val', val_loss, epoch)
        writer.add_scalar('Accuracy/Val', val_metric['accuracy'], epoch)
        writer.add_scalar('Precision/Val', val_metric['precisions'], epoch)
        writer.add_scalar('Recall/Val', val_metric['recalls'], epoch)
        writer.add_scalar('F1/Val', val_metric['f1'], epoch)

        ##############################
        #  Adjust the learning rate  #
        ##############################
        if opts.lr_scheduler == 'plateau':
            scheduler.step(val_loss)
        elif opts.lr_scheduler == 'step':
            scheduler.step()

        t_end = time.time()
        delta = t_end - t_start

        print_epoch_progress(train_loss, val_loss, delta, train_metric,
                             val_metric)
        iteration_change_loss += 1
        print('-' * 30)

        # Keep only the best checkpoint by validation accuracy (the dead,
        # commented-out per-epoch checkpointing code was removed).
        val_acc = val_metric['accuracy']
        if val_acc > best_val_accu:
            best_val_accu = val_acc
            if bool(opts.save_model):
                torch.save(model,
                           os.path.join(model_dir, opts.run_name, 'best.pth'))

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            iteration_change_loss = 0

        # Early stopping: counter resets whenever validation loss improves.
        if iteration_change_loss == opts.patience and opts.early_stopping:
            print(('Early stopping after {0} iterations without the decrease '
                   + 'of the val loss').format(iteration_change_loss))
            break

    t_end_training = time.time()
    print('training took {}s'.format(t_end_training - t_start_training))