Example #1
0
def copy_and_unpack_data(logger, pids, fold_dir, source_dir, target_dir):

    start_time = time.time()
    with open(os.path.join(fold_dir, 'file_list.txt'), 'w') as handle:
        for pid in pids:
            handle.write('{}_img.npz\n'.format(pid))
            handle.write('{}_rois.npz\n'.format(pid))

    subprocess.call('rsync -av --files-from {} {} {}'.format(
        os.path.join(fold_dir, 'file_list.txt'), source_dir, target_dir),
                    shell=True)
    n_threads = 8
    dutils.unpack_dataset(target_dir, threads=n_threads)
    copied_files = os.listdir(target_dir)
    t = utils.get_formatted_duration(time.time() - start_time)
    logger.info(
        "\ncopying and unpacking data set finished using {} threads.\n{} files in target dir: {}. Took {}\n"
        .format(n_threads, len(copied_files), target_dir, t))
Example #2
0
                                          net=None,
                                          logger=logger,
                                          mode='analysis')
                    results_list = predictor.load_saved_predictions(
                        apply_wbc=True)
                    logger.info('starting evaluation...')
                    evaluator = Evaluator(cf, logger, mode='test')
                    evaluator.evaluate_predictions(results_list)
                    evaluator.score_test_df()
                else:
                    logger.info(
                        "Skipping fold {} since no model parameters found.".
                        format(fold))

    # create experiment folder and copy scripts without starting job.
    # useful for cloud deployment where configs might change before job actually runs.
    elif args.mode == 'create_exp':
        cf = utils.prep_exp(args.exp_source,
                            args.exp_dir,
                            args.server_env,
                            use_stored_settings=False)
        logger = utils.get_logger(cf.exp_dir)
        logger.info('created experiment directory at {}'.format(cf.exp_dir))

    else:
        raise RuntimeError('mode specified in args is not implemented...')

    t = utils.get_formatted_duration(time.time() - stime)
    logger.info("{} total runtime: {}".format(os.path.split(__file__)[1], t))
    del logger
Example #3
0
def train(logger):
    """
    perform the training routine for a given fold. saves plots and selected parameters to the experiment dir
    specified in the configs.
    """
    logger.info(
        'performing training in {}D over fold {} on experiment {} with model {}'
        .format(cf.dim, cf.fold, cf.exp_dir, cf.model))

    net = model.net(cf, logger).cuda()
    if hasattr(cf, "optimizer") and cf.optimizer.lower() == "adam":
        logger.info("Using Adam optimizer.")
        optimizer = torch.optim.Adam(utils.parse_params_for_optim(
            net,
            weight_decay=cf.weight_decay,
            exclude_from_wd=cf.exclude_from_wd),
                                     lr=cf.learning_rate[0])
    else:
        logger.info("Using AdamW optimizer.")
        optimizer = torch.optim.AdamW(utils.parse_params_for_optim(
            net,
            weight_decay=cf.weight_decay,
            exclude_from_wd=cf.exclude_from_wd),
                                      lr=cf.learning_rate[0])

    if cf.dynamic_lr_scheduling:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode=cf.scheduling_mode,
            factor=cf.lr_decay_factor,
            patience=cf.scheduling_patience)

    model_selector = utils.ModelSelector(cf, logger)
    train_evaluator = Evaluator(cf, logger, mode='train')
    val_evaluator = Evaluator(cf, logger, mode=cf.val_mode)

    starting_epoch = 1

    # prepare monitoring
    monitor_metrics = utils.prepare_monitoring(cf)

    if cf.resume:
        checkpoint_path = os.path.join(cf.fold_dir, "last_checkpoint")
        starting_epoch, net, optimizer, monitor_metrics = \
            utils.load_checkpoint(checkpoint_path, net, optimizer)
        logger.info('resumed from checkpoint {} to epoch {}'.format(
            checkpoint_path, starting_epoch))

    logger.info('loading dataset and initializing batch generators...')
    batch_gen = data_loader.get_train_generators(cf, logger)

    # Prepare MLFlow
    best_loss = 1e3
    step = 1
    mlflow.log_artifacts(cf.exp_dir, "exp")

    for epoch in range(starting_epoch, cf.num_epochs + 1):

        logger.info('starting training epoch {}'.format(epoch))
        start_time = time.time()

        net.train()
        train_results_list = []
        bix = 0
        seen_pids = []
        while True:
            bix = bix + 1
            try:
                batch = next(batch_gen['train'])
            except StopIteration:
                break
            for pid in batch['pid']:
                seen_pids.append(pid)
            # print(f'\rtr. batch {bix}: {batch["pid"]}')
            tic_fw = time.time()
            results_dict = net.train_forward(batch)
            tic_bw = time.time()
            optimizer.zero_grad()
            results_dict['torch_loss'].backward()
            optimizer.step()
            print(
                '\rtr. batch {0} (ep. {1}) fw {2:.2f}s / bw {3:.2f} s / total {4:.2f} s || '
                .format(bix + 1, epoch, tic_bw - tic_fw,
                        time.time() - tic_bw,
                        time.time() - tic_fw) + results_dict['logger_string'],
                flush=True,
                end="")
            train_results_list.append(
                ({k: v
                  for k, v in results_dict.items()
                  if k != "seg_preds"}, batch["pid"]))
        print(f"Seen pids (unique): {len(np.unique(seen_pids))}")
        print()

        _, monitor_metrics['train'] = train_evaluator.evaluate_predictions(
            train_results_list, monitor_metrics['train'])

        logger.info('generating training example plot.')
        utils.split_off_process(
            plot_batch_prediction,
            batch,
            results_dict,
            cf,
            outfile=os.path.join(cf.plot_dir,
                                 'pred_example_{}_train.png'.format(cf.fold)))

        train_time = time.time() - start_time

        logger.info('starting validation in mode {}.'.format(cf.val_mode))
        with torch.no_grad():
            net.eval()
            if cf.do_validation:
                val_results_list = []
                val_predictor = Predictor(cf, net, logger, mode='val')
                while True:
                    try:
                        batch = next(batch_gen[cf.val_mode])
                    except StopIteration:
                        break
                    if cf.val_mode == 'val_patient':
                        results_dict = val_predictor.predict_patient(batch)
                    elif cf.val_mode == 'val_sampling':
                        results_dict = net.train_forward(batch,
                                                         is_validation=True)
                    val_results_list.append(({
                        k: v
                        for k, v in results_dict.items() if k != "seg_preds"
                    }, batch["pid"]))

                _, monitor_metrics['val'] = val_evaluator.evaluate_predictions(
                    val_results_list, monitor_metrics['val'])
                best_model_path = model_selector.run_model_selection(
                    net, optimizer, monitor_metrics, epoch)
                # Save best model
                mlflow.log_artifacts(
                    best_model_path,
                    os.path.join("exp", os.path.basename(cf.fold_dir),
                                 'best_checkpoint'))

            # Save logs and plots
            mlflow.log_artifacts(os.path.join(cf.exp_dir, "logs"),
                                 os.path.join("exp", 'logs'))
            mlflow.log_artifacts(
                cf.plot_dir, os.path.join("exp",
                                          os.path.basename(cf.plot_dir)))

            # update monitoring and prediction plots
            monitor_metrics.update({
                "lr": {
                    str(g): group['lr']
                    for (g, group) in enumerate(optimizer.param_groups)
                }
            })

            # replace tboard metrics with MLFlow
            #logger.metrics2tboard(monitor_metrics, global_step=epoch)
            mlflow.log_metric('learning rate', optimizer.param_groups[0]['lr'],
                              cf.num_epochs * cf.fold + epoch)
            for key in ['train', 'val']:
                for tag, val in monitor_metrics[key].items():
                    val = val[
                        -1]  # maybe remove list wrapping, recording in evaluator?
                    if 'loss' in tag.lower() and not np.isnan(val):
                        mlflow.log_metric(f'{key}_{tag}', val,
                                          cf.num_epochs * cf.fold + epoch)
                    elif not np.isnan(val):
                        mlflow.log_metric(f'{key}_{tag}', val,
                                          cf.num_epochs * cf.fold + epoch)

            epoch_time = time.time() - start_time
            logger.info('trained epoch {}: took {} ({} train / {} val)'.format(
                epoch, utils.get_formatted_duration(epoch_time, "ms"),
                utils.get_formatted_duration(train_time, "ms"),
                utils.get_formatted_duration(epoch_time - train_time, "ms")))
            batch = next(batch_gen['val_sampling'])
            results_dict = net.train_forward(batch, is_validation=True)
            logger.info('generating validation-sampling example plot.')
            utils.split_off_process(plot_batch_prediction,
                                    batch,
                                    results_dict,
                                    cf,
                                    outfile=os.path.join(
                                        cf.plot_dir,
                                        'pred_example_{}_val.png'.format(
                                            cf.fold)))

        # -------------- scheduling -----------------
        if cf.dynamic_lr_scheduling:
            scheduler.step(monitor_metrics["val"][cf.scheduling_criterion][-1])
        else:
            for param_group in optimizer.param_groups:
                param_group['lr'] = cf.learning_rate[epoch - 1]
    # Save whole experiment to MLFlow
    mlflow.log_artifacts(cf.exp_dir, "exp")
Example #4
0
def train(logger):
    """
    perform the training routine for a given fold. saves plots and selected parameters to the experiment dir
    specified in the configs.
    """
    logger.info(
        'performing training in {}D over fold {} on experiment {} with model {}'
        .format(cf.dim, cf.fold, cf.exp_dir, cf.model))

    net = model.net(cf, logger).cuda()
    if hasattr(cf, "optimizer") and cf.optimizer.lower() == "adam":
        logger.info("Using Adam optimizer.")
        optimizer = torch.optim.Adam(utils.parse_params_for_optim(
            net,
            weight_decay=cf.weight_decay,
            exclude_from_wd=cf.exclude_from_wd),
                                     lr=cf.learning_rate[0])
    else:
        logger.info("Using AdamW optimizer.")
        optimizer = torch.optim.AdamW(utils.parse_params_for_optim(
            net,
            weight_decay=cf.weight_decay,
            exclude_from_wd=cf.exclude_from_wd),
                                      lr=cf.learning_rate[0])

    if cf.dynamic_lr_scheduling:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode=cf.scheduling_mode,
            factor=cf.lr_decay_factor,
            patience=cf.scheduling_patience)

    model_selector = utils.ModelSelector(cf, logger)
    train_evaluator = Evaluator(cf, logger, mode='train')
    val_evaluator = Evaluator(cf, logger, mode=cf.val_mode)

    starting_epoch = 1

    # prepare monitoring
    monitor_metrics = utils.prepare_monitoring(cf)

    if cf.resume:
        checkpoint_path = os.path.join(cf.fold_dir, "last_checkpoint")
        starting_epoch, net, optimizer, monitor_metrics = \
            utils.load_checkpoint(checkpoint_path, net, optimizer)
        logger.info('resumed from checkpoint {} to epoch {}'.format(
            checkpoint_path, starting_epoch))

    logger.info('loading dataset and initializing batch generators...')
    batch_gen = data_loader.get_train_generators(cf, logger)

    for epoch in range(starting_epoch, cf.num_epochs + 1):

        logger.info('starting training epoch {}'.format(epoch))
        start_time = time.time()

        net.train()
        train_results_list = []
        for bix in range(cf.num_train_batches):
            batch = next(batch_gen['train'])
            tic_fw = time.time()
            results_dict = net.train_forward(batch)
            tic_bw = time.time()
            optimizer.zero_grad()
            results_dict['torch_loss'].backward()
            optimizer.step()
            print(
                '\rtr. batch {0}/{1} (ep. {2}) fw {3:.2f}s / bw {4:.2f} s / total {5:.2f} s || '
                .format(bix + 1, cf.num_train_batches, epoch, tic_bw - tic_fw,
                        time.time() - tic_bw,
                        time.time() - tic_fw) + results_dict['logger_string'],
                flush=True,
                end="")
            train_results_list.append(
                ({k: v
                  for k, v in results_dict.items()
                  if k != "seg_preds"}, batch["pid"]))
        print()

        _, monitor_metrics['train'] = train_evaluator.evaluate_predictions(
            train_results_list, monitor_metrics['train'])

        logger.info('generating training example plot.')
        utils.split_off_process(
            plot_batch_prediction,
            batch,
            results_dict,
            cf,
            outfile=os.path.join(cf.plot_dir,
                                 'pred_example_{}_train.png'.format(cf.fold)))

        train_time = time.time() - start_time

        logger.info('starting validation in mode {}.'.format(cf.val_mode))
        with torch.no_grad():
            net.eval()
            if cf.do_validation:
                val_results_list = []
                val_predictor = Predictor(cf, net, logger, mode='val')
                for _ in range(batch_gen['n_val']):
                    batch = next(batch_gen[cf.val_mode])
                    if cf.val_mode == 'val_patient':
                        results_dict = val_predictor.predict_patient(batch)
                    elif cf.val_mode == 'val_sampling':
                        results_dict = net.train_forward(batch,
                                                         is_validation=True)
                    #val_results_list.append([results_dict['boxes'], batch['pid']])
                    val_results_list.append(({
                        k: v
                        for k, v in results_dict.items() if k != "seg_preds"
                    }, batch["pid"]))

                _, monitor_metrics['val'] = val_evaluator.evaluate_predictions(
                    val_results_list, monitor_metrics['val'])
                model_selector.run_model_selection(net, optimizer,
                                                   monitor_metrics, epoch)

            # update monitoring and prediction plots
            monitor_metrics.update({
                "lr": {
                    str(g): group['lr']
                    for (g, group) in enumerate(optimizer.param_groups)
                }
            })
            logger.metrics2tboard(monitor_metrics, global_step=epoch)

            epoch_time = time.time() - start_time
            logger.info('trained epoch {}: took {} ({} train / {} val)'.format(
                epoch, utils.get_formatted_duration(epoch_time, "ms"),
                utils.get_formatted_duration(train_time, "ms"),
                utils.get_formatted_duration(epoch_time - train_time, "ms")))
            batch = next(batch_gen['val_sampling'])
            results_dict = net.train_forward(batch, is_validation=True)
            logger.info('generating validation-sampling example plot.')
            utils.split_off_process(plot_batch_prediction,
                                    batch,
                                    results_dict,
                                    cf,
                                    outfile=os.path.join(
                                        cf.plot_dir,
                                        'pred_example_{}_val.png'.format(
                                            cf.fold)))

        # -------------- scheduling -----------------
        if cf.dynamic_lr_scheduling:
            scheduler.step(monitor_metrics["val"][cf.scheduling_criterion][-1])
        else:
            for param_group in optimizer.param_groups:
                param_group['lr'] = cf.learning_rate[epoch - 1]