def copy_and_unpack_data(logger, pids, fold_dir, source_dir, target_dir):
    """
    write the .npz file names of the given pids to a file list, rsync them from source_dir to
    target_dir and unpack them there.
    """
    start_time = time.time()
    with open(os.path.join(fold_dir, 'file_list.txt'), 'w') as handle:
        for pid in pids:
            handle.write('{}_img.npz\n'.format(pid))
            handle.write('{}_rois.npz\n'.format(pid))

    subprocess.call('rsync -av --files-from {} {} {}'.format(
        os.path.join(fold_dir, 'file_list.txt'), source_dir, target_dir), shell=True)
    n_threads = 8
    dutils.unpack_dataset(target_dir, threads=n_threads)
    copied_files = os.listdir(target_dir)
    t = utils.get_formatted_duration(time.time() - start_time)
    logger.info("\ncopying and unpacking data set finished using {} threads.\n"
                "{} files in target dir: {}. Took {}\n".format(n_threads, len(copied_files), target_dir, t))
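
# --------------------------------------------------------------------------------------------------
# Illustrative sketch, not part of the original pipeline: the rsync call above goes through the
# shell. An equivalent, shell-free variant passes the argument vector directly, which avoids quoting
# problems for paths containing spaces. It assumes only that rsync is available on PATH; the helper
# name below is hypothetical.
def _sync_file_list_sketch(file_list_path, source_dir, target_dir):
    """Copy the files listed in file_list_path from source_dir to target_dir via rsync."""
    import subprocess  # local import keeps this sketch self-contained
    subprocess.check_call(['rsync', '-av', '--files-from', file_list_path, source_dir, target_dir])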
                                      net=None, logger=logger, mode='analysis')
                results_list = predictor.load_saved_predictions(apply_wbc=True)
                logger.info('starting evaluation...')
                evaluator = Evaluator(cf, logger, mode='test')
                evaluator.evaluate_predictions(results_list)
                evaluator.score_test_df()
            else:
                logger.info("Skipping fold {} since no model parameters found.".format(fold))

    # create experiment folder and copy scripts without starting the job.
    # useful for cloud deployment where configs might change before the job actually runs.
    elif args.mode == 'create_exp':
        cf = utils.prep_exp(args.exp_source, args.exp_dir, args.server_env, use_stored_settings=False)
        logger = utils.get_logger(cf.exp_dir)
        logger.info('created experiment directory at {}'.format(cf.exp_dir))

    else:
        raise RuntimeError('mode "{}" specified in args is not implemented.'.format(args.mode))

    t = utils.get_formatted_duration(time.time() - stime)
    logger.info("{} total runtime: {}".format(os.path.split(__file__)[1], t))
    del logger
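
# --------------------------------------------------------------------------------------------------
# Illustrative stand-in (an assumption, not the project's own helper): utils.get_formatted_duration
# is called above and in train() with an optional format argument ("ms"). A minimal, call-compatible
# sketch could look like the following; the real helper in utils may format its output differently.
def _format_duration_sketch(seconds, fmt="hms"):
    """Return a human-readable duration; fmt 'hms' -> hours/minutes/seconds, 'ms' -> minutes/seconds."""
    h, rem = divmod(int(seconds), 3600)
    m, s = divmod(rem, 60)
    if fmt == "ms":
        return "{}m {}s".format(h * 60 + m, s)
    return "{}h {}m {}s".format(h, m, s)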
def train(logger):
    """
    perform the training routine for a given fold. saves plots and selected parameters to the
    experiment dir specified in the configs.
    """
    logger.info('performing training in {}D over fold {} on experiment {} with model {}'.format(
        cf.dim, cf.fold, cf.exp_dir, cf.model))

    net = model.net(cf, logger).cuda()
    if hasattr(cf, "optimizer") and cf.optimizer.lower() == "adam":
        logger.info("Using Adam optimizer.")
        optimizer = torch.optim.Adam(utils.parse_params_for_optim(net, weight_decay=cf.weight_decay,
                                                                  exclude_from_wd=cf.exclude_from_wd),
                                     lr=cf.learning_rate[0])
    else:
        logger.info("Using AdamW optimizer.")
        optimizer = torch.optim.AdamW(utils.parse_params_for_optim(net, weight_decay=cf.weight_decay,
                                                                   exclude_from_wd=cf.exclude_from_wd),
                                      lr=cf.learning_rate[0])

    if cf.dynamic_lr_scheduling:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode=cf.scheduling_mode,
                                                               factor=cf.lr_decay_factor,
                                                               patience=cf.scheduling_patience)

    model_selector = utils.ModelSelector(cf, logger)
    train_evaluator = Evaluator(cf, logger, mode='train')
    val_evaluator = Evaluator(cf, logger, mode=cf.val_mode)

    starting_epoch = 1

    # prepare monitoring
    monitor_metrics = utils.prepare_monitoring(cf)

    if cf.resume:
        checkpoint_path = os.path.join(cf.fold_dir, "last_checkpoint")
        starting_epoch, net, optimizer, monitor_metrics = \
            utils.load_checkpoint(checkpoint_path, net, optimizer)
        logger.info('resumed from checkpoint {} to epoch {}'.format(checkpoint_path, starting_epoch))

    logger.info('loading dataset and initializing batch generators...')
    batch_gen = data_loader.get_train_generators(cf, logger)

    # prepare MLFlow: log the initial experiment dir as an artifact.
    best_loss = 1e3  # currently unused
    step = 1  # currently unused
    mlflow.log_artifacts(cf.exp_dir, "exp")

    for epoch in range(starting_epoch, cf.num_epochs + 1):

        logger.info('starting training epoch {}'.format(epoch))
        start_time = time.time()

        net.train()
        train_results_list = []
        bix = 0
        seen_pids = []
        # drain the train generator completely each epoch instead of a fixed number of batches.
        while True:
            bix = bix + 1
            try:
                batch = next(batch_gen['train'])
            except StopIteration:
                break
            for pid in batch['pid']:
                seen_pids.append(pid)
            # print(f'\rtr. batch {bix}: {batch["pid"]}')
            tic_fw = time.time()
            results_dict = net.train_forward(batch)
            tic_bw = time.time()
            optimizer.zero_grad()
            results_dict['torch_loss'].backward()
            optimizer.step()
            print('\rtr. batch {0} (ep. {1}) fw {2:.2f}s / bw {3:.2f} s / total {4:.2f} s || '.format(
                bix, epoch, tic_bw - tic_fw, time.time() - tic_bw, time.time() - tic_fw)
                  + results_dict['logger_string'], flush=True, end="")
            train_results_list.append(
                ({k: v for k, v in results_dict.items() if k != "seg_preds"}, batch["pid"]))
        print(f"Seen pids (unique): {len(np.unique(seen_pids))}")
        print()

        _, monitor_metrics['train'] = train_evaluator.evaluate_predictions(train_results_list,
                                                                           monitor_metrics['train'])

        logger.info('generating training example plot.')
        utils.split_off_process(plot_batch_prediction, batch, results_dict, cf,
                                outfile=os.path.join(cf.plot_dir,
                                                     'pred_example_{}_train.png'.format(cf.fold)))

        train_time = time.time() - start_time

        logger.info('starting validation in mode {}.'.format(cf.val_mode))
        with torch.no_grad():
            net.eval()
            if cf.do_validation:
                val_results_list = []
                val_predictor = Predictor(cf, net, logger, mode='val')
                while True:
                    try:
                        batch = next(batch_gen[cf.val_mode])
                    except StopIteration:
                        break
                    if cf.val_mode == 'val_patient':
                        results_dict = val_predictor.predict_patient(batch)
                    elif cf.val_mode == 'val_sampling':
                        results_dict = net.train_forward(batch, is_validation=True)
                    val_results_list.append(
                        ({k: v for k, v in results_dict.items() if k != "seg_preds"}, batch["pid"]))

                _, monitor_metrics['val'] = val_evaluator.evaluate_predictions(val_results_list,
                                                                               monitor_metrics['val'])
                best_model_path = model_selector.run_model_selection(net, optimizer, monitor_metrics, epoch)
                # save best model
                mlflow.log_artifacts(best_model_path,
                                     os.path.join("exp", os.path.basename(cf.fold_dir), 'best_checkpoint'))
                # save logs and plots
                mlflow.log_artifacts(os.path.join(cf.exp_dir, "logs"), os.path.join("exp", 'logs'))
                mlflow.log_artifacts(cf.plot_dir, os.path.join("exp", os.path.basename(cf.plot_dir)))

            # update monitoring and prediction plots
            monitor_metrics.update(
                {"lr": {str(g): group['lr'] for (g, group) in enumerate(optimizer.param_groups)}})

            # replace tboard metrics with MLFlow
            # logger.metrics2tboard(monitor_metrics, global_step=epoch)
            mlflow.log_metric('learning rate', optimizer.param_groups[0]['lr'],
                              cf.num_epochs * cf.fold + epoch)
            for key in ['train', 'val']:
                for tag, val in monitor_metrics[key].items():
                    val = val[-1]  # maybe remove list wrapping, recording in evaluator?
                    if not np.isnan(val):  # loss and non-loss metrics are logged identically
                        mlflow.log_metric(f'{key}_{tag}', val, cf.num_epochs * cf.fold + epoch)

            epoch_time = time.time() - start_time
            logger.info('trained epoch {}: took {} ({} train / {} val)'.format(
                epoch, utils.get_formatted_duration(epoch_time, "ms"),
                utils.get_formatted_duration(train_time, "ms"),
                utils.get_formatted_duration(epoch_time - train_time, "ms")))

            batch = next(batch_gen['val_sampling'])
            results_dict = net.train_forward(batch, is_validation=True)
            logger.info('generating validation-sampling example plot.')
            utils.split_off_process(plot_batch_prediction, batch, results_dict, cf,
                                    outfile=os.path.join(cf.plot_dir,
                                                         'pred_example_{}_val.png'.format(cf.fold)))

        # -------------- scheduling -----------------
        if cf.dynamic_lr_scheduling:
            scheduler.step(monitor_metrics["val"][cf.scheduling_criterion][-1])
        else:
            for param_group in optimizer.param_groups:
                param_group['lr'] = cf.learning_rate[epoch - 1]

    # save the whole experiment dir to MLFlow once training is done.
    mlflow.log_artifacts(cf.exp_dir, "exp")
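
# --------------------------------------------------------------------------------------------------
# Usage sketch (an assumption about the calling side, which is not shown in this file): the MLFlow
# calls in train() log to whatever run is currently active, so a caller would typically open a run
# first. The tracking URI, experiment name and helper name below are placeholders.
def _train_within_mlflow_run_sketch():
    mlflow.set_tracking_uri("file:./mlruns")           # placeholder tracking location
    mlflow.set_experiment("placeholder_experiment")    # placeholder experiment name
    with mlflow.start_run(run_name="fold_{}".format(cf.fold)):
        train(logger=utils.get_logger(cf.exp_dir))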
def train(logger):
    """
    perform the training routine for a given fold. saves plots and selected parameters to the
    experiment dir specified in the configs.
    """
    logger.info('performing training in {}D over fold {} on experiment {} with model {}'.format(
        cf.dim, cf.fold, cf.exp_dir, cf.model))

    net = model.net(cf, logger).cuda()
    if hasattr(cf, "optimizer") and cf.optimizer.lower() == "adam":
        logger.info("Using Adam optimizer.")
        optimizer = torch.optim.Adam(utils.parse_params_for_optim(net, weight_decay=cf.weight_decay,
                                                                  exclude_from_wd=cf.exclude_from_wd),
                                     lr=cf.learning_rate[0])
    else:
        logger.info("Using AdamW optimizer.")
        optimizer = torch.optim.AdamW(utils.parse_params_for_optim(net, weight_decay=cf.weight_decay,
                                                                   exclude_from_wd=cf.exclude_from_wd),
                                      lr=cf.learning_rate[0])

    if cf.dynamic_lr_scheduling:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode=cf.scheduling_mode,
                                                               factor=cf.lr_decay_factor,
                                                               patience=cf.scheduling_patience)

    model_selector = utils.ModelSelector(cf, logger)
    train_evaluator = Evaluator(cf, logger, mode='train')
    val_evaluator = Evaluator(cf, logger, mode=cf.val_mode)

    starting_epoch = 1

    # prepare monitoring
    monitor_metrics = utils.prepare_monitoring(cf)

    if cf.resume:
        checkpoint_path = os.path.join(cf.fold_dir, "last_checkpoint")
        starting_epoch, net, optimizer, monitor_metrics = \
            utils.load_checkpoint(checkpoint_path, net, optimizer)
        logger.info('resumed from checkpoint {} to epoch {}'.format(checkpoint_path, starting_epoch))

    logger.info('loading dataset and initializing batch generators...')
    batch_gen = data_loader.get_train_generators(cf, logger)

    for epoch in range(starting_epoch, cf.num_epochs + 1):

        logger.info('starting training epoch {}'.format(epoch))
        start_time = time.time()

        net.train()
        train_results_list = []
        for bix in range(cf.num_train_batches):
            batch = next(batch_gen['train'])
            tic_fw = time.time()
            results_dict = net.train_forward(batch)
            tic_bw = time.time()
            optimizer.zero_grad()
            results_dict['torch_loss'].backward()
            optimizer.step()
            print('\rtr. batch {0}/{1} (ep. {2}) fw {3:.2f}s / bw {4:.2f} s / total {5:.2f} s || '.format(
                bix + 1, cf.num_train_batches, epoch, tic_bw - tic_fw, time.time() - tic_bw,
                time.time() - tic_fw)
                  + results_dict['logger_string'], flush=True, end="")
            train_results_list.append(
                ({k: v for k, v in results_dict.items() if k != "seg_preds"}, batch["pid"]))
        print()

        _, monitor_metrics['train'] = train_evaluator.evaluate_predictions(train_results_list,
                                                                           monitor_metrics['train'])

        logger.info('generating training example plot.')
        utils.split_off_process(plot_batch_prediction, batch, results_dict, cf,
                                outfile=os.path.join(cf.plot_dir,
                                                     'pred_example_{}_train.png'.format(cf.fold)))

        train_time = time.time() - start_time

        logger.info('starting validation in mode {}.'.format(cf.val_mode))
        with torch.no_grad():
            net.eval()
            if cf.do_validation:
                val_results_list = []
                val_predictor = Predictor(cf, net, logger, mode='val')
                for _ in range(batch_gen['n_val']):
                    batch = next(batch_gen[cf.val_mode])
                    if cf.val_mode == 'val_patient':
                        results_dict = val_predictor.predict_patient(batch)
                    elif cf.val_mode == 'val_sampling':
                        results_dict = net.train_forward(batch, is_validation=True)
                    # val_results_list.append([results_dict['boxes'], batch['pid']])
                    val_results_list.append(
                        ({k: v for k, v in results_dict.items() if k != "seg_preds"}, batch["pid"]))

                _, monitor_metrics['val'] = val_evaluator.evaluate_predictions(val_results_list,
                                                                               monitor_metrics['val'])
                model_selector.run_model_selection(net, optimizer, monitor_metrics, epoch)

            # update monitoring and prediction plots
            monitor_metrics.update(
                {"lr": {str(g): group['lr'] for (g, group) in enumerate(optimizer.param_groups)}})
            logger.metrics2tboard(monitor_metrics, global_step=epoch)

            epoch_time = time.time() - start_time
            logger.info('trained epoch {}: took {} ({} train / {} val)'.format(
                epoch, utils.get_formatted_duration(epoch_time, "ms"),
                utils.get_formatted_duration(train_time, "ms"),
                utils.get_formatted_duration(epoch_time - train_time, "ms")))

            batch = next(batch_gen['val_sampling'])
            results_dict = net.train_forward(batch, is_validation=True)
            logger.info('generating validation-sampling example plot.')
            utils.split_off_process(plot_batch_prediction, batch, results_dict, cf,
                                    outfile=os.path.join(cf.plot_dir,
                                                         'pred_example_{}_val.png'.format(cf.fold)))

        # -------------- scheduling -----------------
        if cf.dynamic_lr_scheduling:
            scheduler.step(monitor_metrics["val"][cf.scheduling_criterion][-1])
        else:
            for param_group in optimizer.param_groups:
                param_group['lr'] = cf.learning_rate[epoch - 1]
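
# --------------------------------------------------------------------------------------------------
# Illustrative sketch (an assumption about the configs, not taken from this repository): the static
# branch of the scheduling block above indexes cf.learning_rate with (epoch - 1), i.e. it expects
# one learning rate per epoch. A config could build such a per-epoch schedule, for example an
# exponential decay; the function name and default values below are placeholders.
def _per_epoch_lr_schedule_sketch(num_epochs=100, initial_lr=1e-4, decay=0.985):
    """Return a list with one learning rate per epoch, decayed exponentially from initial_lr."""
    return [initial_lr * decay ** e for e in range(num_epochs)]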