Esempio n. 1
0
def consolidate_folds(output_folder_base,
                      validation_folder_name: str = 'validation_raw',
                      advanced_postprocessing: bool = False,
                      folds: Tuple[int, ...] = (0, 1, 2, 3, 4)):
    """
    Determine one postprocessing configuration for a whole experiment once all folds have been validated.

    In the validation of each fold, the postprocessing can only be determined on the cases within that fold. This
    can result in different postprocessing decisions for different folds. In the end, we can only decide for one
    postprocessing per experiment, so the predictions of all folds are collected into one folder and the
    postprocessing is determined again on the pooled predictions.

    Side effects: ``<output_folder_base>/cv_niftis_raw`` is deleted (if present) and rebuilt, a ``summary.json`` is
    written into it, and determine_postprocessing is run with ``cv_niftis_postprocessed`` as the final subfolder
    name.

    :param output_folder_base: experiment output folder (fold_0, fold_1, etc must be subfolders of the given
        folder). It must also contain the ``gt_niftis`` folder.
    :param validation_folder_name: name of the per-fold validation folder (dont use this, keep the default)
    :param advanced_postprocessing: forwarded unchanged to determine_postprocessing
    :param folds: the folds to consolidate. The class labels are read from the summary.json of the first entry.
    :return: None
    :raises AssertionError: if the number of collected predictions differs from the number of ground truth niftis
    """
    output_folder_raw = output_folder_base + "/" + "cv_niftis_raw"
    # always start from a clean folder so that stale predictions of an earlier run cannot leak into the evaluation
    if isdir(output_folder_raw):
        shutil.rmtree(output_folder_raw)

    output_folder_gt = output_folder_base + "/" + "gt_niftis"
    collect_cv_niftis(output_folder_base, output_folder_raw,
                      validation_folder_name, folds)

    # every ground truth case must have exactly one collected prediction
    num_niftis_gt = len(subfiles(output_folder_gt, suffix='.nii.gz'))
    num_niftis = len(subfiles(output_folder_raw, suffix='.nii.gz'))
    if num_niftis != num_niftis_gt:
        raise AssertionError(
            "If does not seem like you trained all the folds! Train all folds first!"
        )

    # load a summary file so that we can know what class labels to expect. We take the first of the requested folds
    # (not a hard-coded fold_0) so that this also works when fold 0 is not part of `folds`
    summary_first_fold = load_json(output_folder_base + "/" +
                                   "fold_%d" % folds[0] + "/" +
                                   validation_folder_name + "/" +
                                   "summary.json")['results']['mean']
    classes = [int(i) for i in summary_first_fold.keys()]
    niftis = subfiles(output_folder_raw, join=False, suffix=".nii.gz")
    test_pred_pairs = [(output_folder_gt + "/" + i,
                        output_folder_raw + "/" + i) for i in niftis]

    # determine_postprocessing needs a summary.json file in the folder where the raw predictions are. We could compute
    # that from the summary files of the individual folds, but here it is simply recomputed on the pooled predictions
    aggregate_scores(test_pred_pairs,
                     labels=classes,
                     json_output_file=output_folder_raw + "/" + "summary.json",
                     num_threads=default_num_threads)

    determine_postprocessing(output_folder_base,
                             output_folder_gt,
                             'cv_niftis_raw',
                             final_subf_name="cv_niftis_postprocessed",
                             processes=default_num_threads,
                             advanced_postprocessing=advanced_postprocessing)
Esempio n. 2
0
    def validate(self,
                 do_mirroring: bool = True,
                 use_sliding_window: bool = True,
                 step_size: float = 0.5,
                 save_softmax: bool = True,
                 use_gaussian: bool = True,
                 overwrite: bool = True,
                 validation_folder_name: str = 'validation_raw',
                 debug: bool = False,
                 all_in_gpu: bool = False,
                 segmentation_export_kwargs: dict = None,
                 run_postprocessing_on_folds: bool = True):
        """
        Predict the validation cases in a distributed fashion, export them as niftis and evaluate them.

        The validation keys are distributed over the ranks (``all_keys[local_rank::world_size]``); every rank
        predicts and exports only its own share. After a barrier, local rank 0 aggregates the scores into
        ``summary.json``, optionally determines the postprocessing and copies the ground truth niftis into
        ``<output_folder_base>/gt_niftis``. Deep supervision (``do_ds``) is switched off for the duration of the
        validation and the train/eval mode of the network is restored at the end.

        :param do_mirroring: use mirroring during prediction; requires that training used mirroring
        :param use_sliding_window: forwarded to predict_preprocessed_data_return_seg_and_softmax
        :param step_size: forwarded to predict_preprocessed_data_return_seg_and_softmax
        :param save_softmax: if True the export additionally writes ``<case>.npz`` next to the nifti
        :param use_gaussian: forwarded to predict_preprocessed_data_return_seg_and_softmax
        :param overwrite: if False, cases whose output files already exist are not predicted again
        :param validation_folder_name: subfolder of ``self.output_folder`` that receives the predictions
        :param debug: forwarded to determine_postprocessing
        :param all_in_gpu: forwarded to predict_preprocessed_data_return_seg_and_softmax
        :param segmentation_export_kwargs: dict with the keys 'force_separate_z', 'interpolation_order' and
            'interpolation_order_z'. If None, the values are taken from the plans (with defaults as fallback).
        :param run_postprocessing_on_folds: if True, local rank 0 runs determine_postprocessing for this fold
        :return: None
        :raises RuntimeError: if do_mirroring is set but the network was not trained with mirroring
        :raises OSError: if a ground truth nifti could not be copied after 10 attempts
        """
        if isinstance(self.network, DDP):
            net = self.network.module
        else:
            net = self.network
        # deep supervision must be off for inference; the previous setting is restored at the end
        ds = net.do_ds
        net.do_ds = False

        current_mode = self.network.training
        self.network.eval()

        assert self.was_initialized, "must initialize, ideally with checkpoint (or train first)"
        if self.dataset_val is None:
            self.load_dataset()
            self.do_split()

        if segmentation_export_kwargs is None:
            if 'segmentation_export_params' in self.plans.keys():
                force_separate_z = self.plans['segmentation_export_params'][
                    'force_separate_z']
                interpolation_order = self.plans['segmentation_export_params'][
                    'interpolation_order']
                interpolation_order_z = self.plans[
                    'segmentation_export_params']['interpolation_order_z']
            else:
                force_separate_z = None
                interpolation_order = 1
                interpolation_order_z = 0
        else:
            force_separate_z = segmentation_export_kwargs['force_separate_z']
            interpolation_order = segmentation_export_kwargs[
                'interpolation_order']
            interpolation_order_z = segmentation_export_kwargs[
                'interpolation_order_z']

        # predictions as they come from the network go here
        output_folder = self.output_folder + "/" + validation_folder_name
        if not os.path.isdir(output_folder):
            os.makedirs(output_folder)
        # this is for debug purposes
        my_input_args = {
            'do_mirroring': do_mirroring,
            'use_sliding_window': use_sliding_window,
            'step_size': step_size,
            'save_softmax': save_softmax,
            'use_gaussian': use_gaussian,
            'overwrite': overwrite,
            'validation_folder_name': validation_folder_name,
            'debug': debug,
            'all_in_gpu': all_in_gpu,
            'segmentation_export_kwargs': segmentation_export_kwargs,
        }
        save_json(my_input_args, output_folder + "/" + "validation_args.json")

        if do_mirroring:
            if not self.data_aug_params['do_mirror']:
                raise RuntimeError(
                    "We did not train with mirroring so you cannot do inference with mirroring enabled"
                )
            mirror_axes = self.data_aug_params['mirror_axes']
        else:
            mirror_axes = ()

        pred_gt_tuples = []

        export_pool = Pool(default_num_threads)
        results = []

        all_keys = list(self.dataset_val.keys())
        my_keys = set(all_keys[self.local_rank::dist.get_world_size()])
        # We iterate over all_keys (and not only over my_keys) because the evaluation, which is done by local rank 0,
        # needs the pred_gt_tuples of all cases. The prediction itself is only run for the cases of this rank.
        for k in all_keys:
            properties = load_pickle(self.dataset[k]['properties_file'])
            # drops the last 12 characters of the file name (presumably the '_0000.nii.gz' suffix - TODO confirm)
            fname = properties['list_of_data_files'][0].split("/")[-1][:-12]
            pred_gt_tuples.append([
                output_folder + "/" + fname + ".nii.gz",
                self.gt_niftis_folder + "/" + fname + ".nii.gz"
            ])
            if k not in my_keys:
                continue

            nifti_missing = not isfile(output_folder + "/" + fname + ".nii.gz")
            softmax_missing = save_softmax and not isfile(output_folder + "/" + fname + ".npz")
            if not (overwrite or nifti_missing or softmax_missing):
                continue

            data = np.load(self.dataset[k]['data_file'])['data']

            print(k, data.shape)
            # the last channel is not fed to the network (only data[:-1] is); its -1 entries are set to 0
            data[-1][data[-1] == -1] = 0

            softmax_pred = self.predict_preprocessed_data_return_seg_and_softmax(
                data[:-1],
                do_mirroring=do_mirroring,
                mirror_axes=mirror_axes,
                use_sliding_window=use_sliding_window,
                step_size=step_size,
                use_gaussian=use_gaussian,
                all_in_gpu=all_in_gpu,
                mixed_precision=self.fp16)[1]

            # axis 0 stays in place, the remaining axes are permuted according to self.transpose_backward
            softmax_pred = softmax_pred.transpose(
                [0] + [i + 1 for i in self.transpose_backward])

            if save_softmax:
                softmax_fname = output_folder + "/" + fname + ".npz"
            else:
                softmax_fname = None

            # There is a problem with python process communication that prevents us from communicating objects
            # larger than 2 GB between processes (basically when the length of the pickle string that will be sent
            # is communicated by the multiprocessing.Pipe object then the placeholder (%i I think) does not allow
            # for long enough strings). This could be fixed by changing i to l (for long) but that would require
            # manually patching system python code. We circumvent that problem here by saving softmax_pred to a npy
            # file that will then be read (and finally deleted) by the Process.
            # save_segmentation_nifti_from_softmax can take either filename or np.ndarray and will handle this
            # automatically
            if np.prod(softmax_pred.shape) > (
                    2e9 / 4 * 0.85):  # *0.85 just to be safe
                np.save(output_folder + "/" + fname + ".npy",
                        softmax_pred)
                softmax_pred = output_folder + "/" + fname + ".npy"

            results.append(
                export_pool.starmap_async(
                    save_segmentation_nifti_from_softmax,
                    ((softmax_pred, output_folder + "/" + fname +
                      ".nii.gz", properties, interpolation_order,
                      self.regions_class_order, None, None,
                      softmax_fname, None, force_separate_z,
                      interpolation_order_z), )))

        # wait for all exports of this rank, then release the worker processes
        _ = [i.get() for i in results]
        export_pool.close()
        export_pool.join()
        self.print_to_log_file("finished prediction")

        # all ranks must have written their predictions before rank 0 starts the evaluation
        distributed.barrier()

        if self.local_rank == 0:
            # evaluate raw predictions
            self.print_to_log_file("evaluation of raw predictions")
            task = self.dataset_directory.split("/")[-1]
            job_name = self.experiment_name
            _ = aggregate_scores(pred_gt_tuples,
                                 labels=list(range(self.num_classes)),
                                 json_output_file=output_folder + "/" +
                                 "summary.json",
                                 json_name=job_name + " val tiled %s" %
                                 (str(use_sliding_window)),
                                 json_author="Fabian",
                                 json_task=task,
                                 num_threads=default_num_threads)

            if run_postprocessing_on_folds:
                # in the old tuframework we would stop here. Now we add a postprocessing. This postprocessing can
                # remove everything except the largest connected component for each class. To see if this improves
                # results, we do this for all classes and then rerun the evaluation. Those classes for which this
                # resulted in an improved dice score will have this applied during inference as well
                self.print_to_log_file("determining postprocessing")
                determine_postprocessing(
                    self.output_folder,
                    self.gt_niftis_folder,
                    validation_folder_name,
                    final_subf_name=validation_folder_name + "_postprocessed",
                    debug=debug)
                # after this the final predictions for the validation set can be found in
                # validation_folder_name + "_postprocessed". They are always in that folder, even if no
                # postprocessing was applied!

            # determining postprocessing on a per-fold basis may be OK for this fold but what if another fold finds
            # another postprocessing to be better? In this case we need to consolidate. At the time the consolidation
            # is going to be done we won't know what self.gt_niftis_folder was, so now we copy all the niftis into a
            # separate folder to be used later
            gt_nifti_folder = self.output_folder_base + "/" + "gt_niftis"
            if not os.path.isdir(gt_nifti_folder):
                os.makedirs(gt_nifti_folder)
            for f in subfiles(self.gt_niftis_folder, suffix=".nii.gz"):
                success = False
                attempts = 0
                # The name bound by `except ... as` is deleted when the except clause ends, so the exception has to
                # be stored under a different name to be re-raised after the retry loop
                last_error = None
                while not success and attempts < 10:
                    try:
                        shutil.copy(f, gt_nifti_folder)
                        success = True
                    except OSError as err:
                        last_error = err
                        attempts += 1
                        sleep(1)
                if not success:
                    print("Could not copy gt nifti file %s into folder %s" %
                          (f, gt_nifti_folder))
                    if last_error is not None:
                        raise last_error

        self.network.train(current_mode)
        net.do_ds = ds
    def validate(self,
                 do_mirroring: bool = True,
                 use_sliding_window: bool = True,
                 step_size: float = 0.5,
                 save_softmax: bool = True,
                 use_gaussian: bool = True,
                 overwrite: bool = True,
                 validation_folder_name: str = 'validation_raw',
                 debug: bool = False,
                 all_in_gpu: bool = False,
                 segmentation_export_kwargs: dict = None,
                 run_postprocessing_on_folds: bool = True):
        """
        Predict all validation cases, export them as niftis and evaluate them.

        For every case the segmentation of the previous stage is loaded from
        ``self.folder_with_segs_from_prev_stage``, one-hot encoded (classes 1 .. num_classes - 1) and concatenated
        to the image channels before it is passed to the network. Afterwards the scores are aggregated into
        ``summary.json``, the postprocessing is optionally determined and the ground truth niftis are copied into
        ``<output_folder_base>/gt_niftis``. The train/eval mode of the network is restored at the end.

        :param do_mirroring: use mirroring (with the axes from self.data_aug_params) during prediction
        :param use_sliding_window: forwarded to predict_preprocessed_data_return_seg_and_softmax
        :param step_size: forwarded to predict_preprocessed_data_return_seg_and_softmax
        :param save_softmax: if True the export additionally writes ``<case>.npz`` next to the nifti
        :param use_gaussian: forwarded to predict_preprocessed_data_return_seg_and_softmax
        :param overwrite: not used by this implementation, all cases are always predicted
        :param validation_folder_name: subfolder of ``self.output_folder`` that receives the predictions
        :param debug: forwarded to determine_postprocessing
        :param all_in_gpu: forwarded to predict_preprocessed_data_return_seg_and_softmax
        :param segmentation_export_kwargs: dict with the keys 'force_separate_z', 'interpolation_order' and
            'interpolation_order_z'. If None, the values are taken from the plans (with defaults as fallback).
        :param run_postprocessing_on_folds: if True, determine_postprocessing is run for this fold
        :return: None
        """
        current_mode = self.network.training
        self.network.eval()

        assert self.was_initialized, "must initialize, ideally with checkpoint (or train first)"
        if self.dataset_val is None:
            self.load_dataset()
            self.do_split()

        if segmentation_export_kwargs is None:
            if 'segmentation_export_params' in self.plans.keys():
                force_separate_z = self.plans['segmentation_export_params'][
                    'force_separate_z']
                interpolation_order = self.plans['segmentation_export_params'][
                    'interpolation_order']
                interpolation_order_z = self.plans[
                    'segmentation_export_params']['interpolation_order_z']
            else:
                force_separate_z = None
                interpolation_order = 1
                interpolation_order_z = 0
        else:
            force_separate_z = segmentation_export_kwargs['force_separate_z']
            interpolation_order = segmentation_export_kwargs[
                'interpolation_order']
            interpolation_order_z = segmentation_export_kwargs[
                'interpolation_order_z']

        output_folder = self.output_folder + "/" + validation_folder_name
        if not os.path.isdir(output_folder):
            os.makedirs(output_folder)

        if do_mirroring:
            mirror_axes = self.data_aug_params['mirror_axes']
        else:
            mirror_axes = ()

        pred_gt_tuples = []

        export_pool = Pool(2)
        results = []

        # loop invariant, so it is looked up only once. None means that no transposition is applied
        transpose_backward = self.plans.get('transpose_backward')

        for k in self.dataset_val.keys():
            properties = load_pickle(self.dataset[k]['properties_file'])
            data = np.load(self.dataset[k]['data_file'])['data']

            # concat segmentation of previous step
            seg_from_prev_stage = np.load(
                self.folder_with_segs_from_prev_stage + "/" + k +
                "_segFromPrevStage.npz")['data'][None]

            print(data.shape)
            # the last channel is not fed to the network (only data[:-1] is); its -1 entries are set to 0
            data[-1][data[-1] == -1] = 0
            data_for_net = np.concatenate(
                (data[:-1],
                 to_one_hot(seg_from_prev_stage[0], range(1,
                                                          self.num_classes))))

            softmax_pred = self.predict_preprocessed_data_return_seg_and_softmax(
                data_for_net,
                do_mirroring=do_mirroring,
                mirror_axes=mirror_axes,
                use_sliding_window=use_sliding_window,
                step_size=step_size,
                use_gaussian=use_gaussian,
                all_in_gpu=all_in_gpu,
                mixed_precision=self.fp16)[1]

            if transpose_backward is not None:
                # axis 0 stays in place, the remaining axes are permuted according to transpose_backward
                softmax_pred = softmax_pred.transpose(
                    [0] + [i + 1 for i in transpose_backward])

            # drops the last 12 characters of the file name (presumably the '_0000.nii.gz' suffix - TODO confirm)
            fname = properties['list_of_data_files'][0].split("/")[-1][:-12]

            if save_softmax:
                softmax_fname = output_folder + "/" + fname + ".npz"
            else:
                softmax_fname = None

            # There is a problem with python process communication that prevents us from communicating objects
            # larger than 2 GB between processes (basically when the length of the pickle string that will be sent is
            # communicated by the multiprocessing.Pipe object then the placeholder (%i I think) does not allow for
            # long enough strings). This could be fixed by changing i to l (for long) but that would require manually
            # patching system python code. We circumvent that problem here by saving softmax_pred to a npy file that
            # will then be read (and finally deleted) by the Process. save_segmentation_nifti_from_softmax can take
            # either filename or np.ndarray and will handle this automatically
            if np.prod(softmax_pred.shape) > (2e9 / 4 *
                                              0.85):  # *0.85 just to be safe
                # the temporary file goes into the output folder, not into the current working directory (which may
                # be read-only or shared with other runs)
                softmax_npy_file = output_folder + "/" + fname + ".npy"
                np.save(softmax_npy_file, softmax_pred)
                softmax_pred = softmax_npy_file

            results.append(
                export_pool.starmap_async(
                    save_segmentation_nifti_from_softmax,
                    ((softmax_pred, output_folder + "/" + fname + ".nii.gz",
                      properties, interpolation_order,
                      self.regions_class_order, None, None, softmax_fname,
                      None, force_separate_z, interpolation_order_z), )))

            pred_gt_tuples.append([
                output_folder + "/" + fname + ".nii.gz",
                self.gt_niftis_folder + "/" + fname + ".nii.gz"
            ])

        # wait for all exports, then release the worker processes before the evaluation starts
        _ = [i.get() for i in results]
        export_pool.close()
        export_pool.join()

        task = self.dataset_directory.split("/")[-1]
        job_name = self.experiment_name
        _ = aggregate_scores(pred_gt_tuples,
                             labels=list(range(self.num_classes)),
                             json_output_file=output_folder + "/" +
                             "summary.json",
                             json_name=job_name,
                             json_author="Fabian",
                             json_description="",
                             json_task=task)

        if run_postprocessing_on_folds:
            # in the old tuframework we would stop here. Now we add a postprocessing. This postprocessing can remove
            # everything except the largest connected component for each class. To see if this improves results, we
            # do this for all classes and then rerun the evaluation. Those classes for which this resulted in an
            # improved dice score will have this applied during inference as well
            self.print_to_log_file("determining postprocessing")
            determine_postprocessing(self.output_folder,
                                     self.gt_niftis_folder,
                                     validation_folder_name,
                                     final_subf_name=validation_folder_name +
                                     "_postprocessed",
                                     debug=debug)
            # after this the final predictions for the validation set can be found in
            # validation_folder_name + "_postprocessed". They are always in that folder, even if no postprocessing
            # was applied!

        # determining postprocessing on a per-fold basis may be OK for this fold but what if another fold finds
        # another postprocessing to be better? In this case we need to consolidate. At the time the consolidation is
        # going to be done we won't know what self.gt_niftis_folder was, so now we copy all the niftis into a
        # separate folder to be used later
        gt_nifti_folder = self.output_folder_base + "/" + "gt_niftis"
        if not os.path.isdir(gt_nifti_folder):
            os.makedirs(gt_nifti_folder)
        for f in subfiles(self.gt_niftis_folder, suffix=".nii.gz"):
            success = False
            attempts = 0
            while not success and attempts < 10:
                try:
                    shutil.copy(f, gt_nifti_folder)
                    success = True
                except OSError:
                    attempts += 1
                    sleep(1)
            if not success:
                # copying stays best-effort (no exception), but a missing file must not go unnoticed
                print("Could not copy gt nifti file %s into folder %s" %
                      (f, gt_nifti_folder))

        self.network.train(current_mode)
Esempio n. 4
0
def ensemble(training_output_folder1,
             training_output_folder2,
             output_folder,
             task,
             validation_folder,
             folds,
             allow_ensembling: bool = True):
    """
    Ensemble the cross-validation predictions of two trained models and evaluate the result.

    For every fold the ``.npz`` files of both models are paired by patient identifier and handed to ``merge``,
    which writes the ensembled prediction into ``<output_folder>/ensembled_raw``. A ``summary.json`` is computed
    for the ensembled predictions if it does not exist yet. If ``allow_ensembling`` is set and no
    ``postprocessing.json`` exists in ``output_folder``, the postprocessing is determined on the ensembled
    predictions and the resulting summary is copied into the global ``summary_jsons`` folder.

    :param training_output_folder1: output folder of the first model (contains plans.pkl and the fold_X folders)
    :param training_output_folder2: output folder of the second model (contains the fold_X folders)
    :param output_folder: folder that receives ``ensembled_raw`` and ``ensembled_postprocessed``
    :param task: task name; used to locate the ground truth segmentations and to name the summary
    :param validation_folder: name of the validation folder inside each fold_X folder
    :param folds: iterable with the (integer) folds to use
    :param allow_ensembling: if False, the postprocessing step (and the export of its summary) is skipped
    :return: None
    :raises AssertionError: if a validation folder or its summary.json is missing, if npz files are missing or if
        the npz files of the two models do not match
    """
    print("\nEnsembling folders\n", training_output_folder1, "\n",
          training_output_folder2)

    output_folder_base = output_folder
    output_folder = output_folder_base + "/" + "ensembled_raw"

    dataset_directory = preprocessing_output_dir + "/" + task
    plans = load_pickle(training_output_folder1 + "/" +
                        "plans.pkl")  # we need this only for the labels

    files1 = []
    files2 = []
    property_files = []
    out_files = []
    gt_segmentations = []

    folder_with_gt_segs = dataset_directory + "/" + "gt_segmentations"
    # in the correct shape and we need the original geometry to restore the niftis

    for f in folds:
        validation_folder_net1 = training_output_folder1 + "/" + "fold_%d" % f + "/" + validation_folder
        validation_folder_net2 = training_output_folder2 + "/" + "fold_%d" % f + "/" + validation_folder

        if not isdir(validation_folder_net1):
            raise AssertionError(
                "Validation directory missing: %s. Please rerun validation with `tuframework_train CONFIG TRAINER TASK FOLD -val --npz`"
                % validation_folder_net1)
        if not isdir(validation_folder_net2):
            raise AssertionError(
                "Validation directory missing: %s. Please rerun validation with `tuframework_train CONFIG TRAINER TASK FOLD -val --npz`"
                % validation_folder_net2)

        # we need to ensure the validation was successful. We can verify this via the presence of the summary.json file
        if not isfile(validation_folder_net1 + "/" + 'summary.json'):
            raise AssertionError(
                "Validation directory incomplete: %s. Please rerun validation with `tuframework_train CONFIG TRAINER TASK FOLD -val --npz`"
                % validation_folder_net1)
        if not isfile(validation_folder_net2 + "/" + 'summary.json'):
            # the directory exists at this point, so it is incomplete (not missing)
            raise AssertionError(
                "Validation directory incomplete: %s. Please rerun validation with `tuframework_train CONFIG TRAINER TASK FOLD -val --npz`"
                % validation_folder_net2)

        # [:-4] strips the '.npz' extension
        patient_identifiers1_npz = [
            i[:-4]
            for i in subfiles(validation_folder_net1, False, None, 'npz', True)
        ]
        patient_identifiers2_npz = [
            i[:-4]
            for i in subfiles(validation_folder_net2, False, None, 'npz', True)
        ]

        # we don't do postprocessing anymore so there should not be any of that noPostProcess
        # [:-7] strips the '.nii.gz' extension
        patient_identifiers1_nii = [
            i[:-7] for i in subfiles(validation_folder_net1,
                                     False,
                                     None,
                                     suffix='nii.gz',
                                     sort=True)
            if not i.endswith("noPostProcess.nii.gz")
            and not i.endswith('_postprocessed.nii.gz')
        ]
        patient_identifiers2_nii = [
            i[:-7] for i in subfiles(validation_folder_net2,
                                     False,
                                     None,
                                     suffix='nii.gz',
                                     sort=True)
            if not i.endswith("noPostProcess.nii.gz")
            and not i.endswith('_postprocessed.nii.gz')
        ]

        # every exported nifti must have a matching npz; sets make the membership tests O(1)
        npz_ids1 = set(patient_identifiers1_npz)
        if not all(i in npz_ids1 for i in patient_identifiers1_nii):
            raise AssertionError(
                "Missing npz files in folder %s. Please run the validation for all models and folds with the '--npz' flag."
                % (validation_folder_net1))
        npz_ids2 = set(patient_identifiers2_npz)
        if not all(i in npz_ids2 for i in patient_identifiers2_nii):
            raise AssertionError(
                "Missing npz files in folder %s. Please run the validation for all models and folds with the '--npz' flag."
                % (validation_folder_net2))

        patient_identifiers1_npz.sort()
        patient_identifiers2_npz.sort()

        # compare the full lists: an element-wise comparison via zip() stops at the shorter list and would therefore
        # not notice when one model has fewer npz files than the other
        if patient_identifiers1_npz != patient_identifiers2_npz:
            raise AssertionError(
                "npz filenames do not match. This should not happen.")

        os.makedirs(output_folder, exist_ok=True)

        for identifier in patient_identifiers1_npz:
            files1.append(validation_folder_net1 + "/" + identifier + '.npz')
            files2.append(validation_folder_net2 + "/" + identifier + '.npz')
            property_files.append(validation_folder_net1 + "/" + identifier + ".pkl")
            out_files.append(output_folder + "/" + identifier + ".nii.gz")
            gt_segmentations.append(folder_with_gt_segs + "/" + identifier + ".nii.gz")

    pool = Pool(default_num_threads)
    try:
        pool.map(merge, zip(files1, files2, property_files, out_files))
    finally:
        # release the worker processes even if merge fails for one of the cases
        pool.close()
        pool.join()

    if not isfile(output_folder + "/" + "summary.json") and len(out_files) > 0:
        aggregate_scores(tuple(zip(out_files, gt_segmentations)),
                         labels=plans['all_classes'],
                         json_output_file=output_folder + "/" + "summary.json",
                         json_task=task,
                         json_name=task + "__" +
                         output_folder_base.split("/")[-1],
                         num_threads=default_num_threads)

    if allow_ensembling and not isfile(output_folder_base + "/" +
                                       "postprocessing.json"):
        # now lets also look at postprocessing. We cannot just take what we determined in cross-validation and apply it
        # here because things may have changed and may also be too inconsistent between the two networks
        determine_postprocessing(output_folder_base,
                                 folder_with_gt_segs,
                                 "ensembled_raw",
                                 "temp",
                                 "ensembled_postprocessed",
                                 default_num_threads,
                                 dice_threshold=0)

        out_dir_all_json = network_training_output_dir + "/" + "summary_jsons"
        postprocessed_summary = (output_folder_base + "/" +
                                 "ensembled_postprocessed" + "/" +
                                 "summary.json")
        json_out = load_json(postprocessed_summary)

        # label the summary with the name of the ensemble folder before it is exported
        json_out["experiment_name"] = output_folder_base.split("/")[-1]
        save_json(json_out, postprocessed_summary)

        os.makedirs(out_dir_all_json, exist_ok=True)
        shutil.copy(
            postprocessed_summary, out_dir_all_json + "/" + "%s__%s.json" %
            (task, output_folder_base.split("/")[-1]))