Code example #1
    def validate(self,
                 do_mirroring=True,
                 use_train_mode=False,
                 tiled=True,
                 step=2,
                 save_softmax=True,
                 use_gaussian=True,
                 compute_global_dice=True,
                 override=True,
                 validation_folder_name='validation'):
        """
        2018_12_05: I added global accumulation of TP, FP and FN for the validation in here. This is because I believe
        that selecting models is easier when computing the Dice globally instead of independently for each case and
        then averaging over cases. The Lung dataset in particular is very unstable because of the small size of the
        Lung Lesions. My theory is that even though the global Dice is different than the acutal target metric it is
        still a good enough substitute that allows us to get a lot more stable results when rerunning the same
        experiment twice. FYI: computer vision community uses the global jaccard for the evaluation of Cityscapes etc,
        not the per-image jaccard averaged over images.
        The reason I am accumulating TP/FP/FN here and not from the nifti files (which are used by our Evaluator) is
        that all predictions made here will have identical voxel spacing whereas voxel spacings in the nifti files
        will be different (which we could compensate for by using the volume per voxel but that would require the
        evaluator to understand spacings which is does not at this point)

        :param do_mirroring:
        :param use_train_mode:
        :param mirror_axes:
        :param tiled:
        :param tile_in_z:
        :param step:
        :param use_nifti:
        :param save_softmax:
        :param use_gaussian:
        :param use_temporal_models:
        :return:
        """
        assert self.was_initialized, "must initialize, ideally with checkpoint (or train first)"
        if self.dataset_val is None:
            self.load_dataset()
            self.do_split()

        output_folder = join(self.output_folder, validation_folder_name)
        maybe_mkdir_p(output_folder)

        if do_mirroring:
            mirror_axes = self.data_aug_params['mirror_axes']
        else:
            mirror_axes = ()

        pred_gt_tuples = []

        export_pool = Pool(4)
        results = []
        global_tp = OrderedDict()
        global_fp = OrderedDict()
        global_fn = OrderedDict()

        for k in self.dataset_val.keys():
            print(k)
            properties = self.dataset[k]['properties']
            fname = properties['list_of_data_files'][0].split("/")[-1][:-12]
            if override or (not isfile(join(output_folder,
                                            fname + ".nii.gz"))):
                data = np.load(self.dataset[k]['data_file'])['data']

                print(k, data.shape)
                data[-1][data[-1] == -1] = 0

                softmax_pred = self.predict_preprocessed_data_return_softmax(
                    data[:-1],
                    do_mirroring,
                    1,
                    use_train_mode,
                    1,
                    mirror_axes,
                    tiled,
                    True,
                    step,
                    self.patch_size,
                    use_gaussian=use_gaussian)

                if compute_global_dice:
                    predicted_segmentation = softmax_pred.argmax(0)
                    gt_segmentation = data[-1]
                    labels = properties['classes']
                    labels = [int(i) for i in labels if i > 0]
                    for l in labels:
                        if l not in global_fn.keys():
                            global_fn[l] = 0
                        if l not in global_fp.keys():
                            global_fp[l] = 0
                        if l not in global_tp.keys():
                            global_tp[l] = 0
                        conf = ConfusionMatrix(
                            (predicted_segmentation == l).astype(int),
                            (gt_segmentation == l).astype(int))
                        conf.compute()
                        global_fn[l] += conf.fn
                        global_fp[l] += conf.fp
                        global_tp[l] += conf.tp

                softmax_pred = softmax_pred.transpose(
                    [0] + [i + 1 for i in self.transpose_backward])

                if save_softmax:
                    softmax_fname = join(output_folder, fname + ".npz")
                else:
                    softmax_fname = None
                """There is a problem with python process communication that prevents us from communicating obejcts 
                larger than 2 GB between processes (basically when the length of the pickle string that will be sent is 
                communicated by the multiprocessing.Pipe object then the placeholder (\%i I think) does not allow for long 
                enough strings (lol). This could be fixed by changing i to l (for long) but that would require manually 
                patching system python code. We circumvent that problem here by saving softmax_pred to a npy file that will 
                then be read (and finally deleted) by the Process. save_segmentation_nifti_from_softmax can take either 
                filename or np.ndarray and will handle this automatically"""
                if np.prod(softmax_pred.shape) > (2e9 / 4 *
                                                  0.9):  # *0.9 just to be save
                    np.save(join(output_folder, fname + ".npy"), softmax_pred)
                    softmax_pred = join(output_folder, fname + ".npy")
                results.append(
                    export_pool.starmap_async(
                        save_segmentation_nifti_from_softmax,
                        ((softmax_pred, join(output_folder,
                                             fname + ".nii.gz"), properties, 1,
                          None, None, None, softmax_fname, None), )))
                # save_segmentation_nifti_from_softmax(softmax_pred, join(output_folder, fname + ".nii.gz"),
                #                                               properties, 3, None, None,
                #                                               None,
                #                                               softmax_fname,
                #                                               None)

            pred_gt_tuples.append([
                join(output_folder, fname + ".nii.gz"),
                join(self.gt_niftis_folder, fname + ".nii.gz")
            ])

        _ = [i.get() for i in results]
        print("finished prediction, now evaluating...")

        task = self.dataset_directory.split("/")[-1]
        job_name = self.experiment_name
        _ = aggregate_scores(
            pred_gt_tuples,
            labels=list(range(self.num_classes)),
            json_output_file=join(output_folder, "summary.json"),
            json_name=job_name + " val tiled %s" % (str(tiled)),
            json_author="Fabian",
            json_task=task,
            num_threads=3)
        if compute_global_dice:
            global_dice = OrderedDict()
            all_labels = list(global_fn.keys())
            for l in all_labels:
                global_dice[int(l)] = float(
                    2 * global_tp[l] /
                    (2 * global_tp[l] + global_fn[l] + global_fp[l]))
            write_json(global_dice, join(output_folder, "global_dice.json"))
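
The docstring above motivates accumulating TP/FP/FN globally rather than averaging Dice over cases. A minimal numpy sketch of the difference (the two cases below are invented for illustration): a single missed tiny lesion drags the per-case mean down hard, while the global Dice stays dominated by the bulk of the voxels, which is why it tends to be more stable across reruns.

import numpy as np

def dice(tp, fp, fn):
    return 2 * tp / (2 * tp + fp + fn)

# two hypothetical cases: one large lesion segmented well, one tiny lesion missed entirely
cases = [
    {"tp": 900, "fp": 50, "fn": 50},  # large lesion, Dice ~0.95
    {"tp": 0, "fp": 2, "fn": 3},      # tiny lesion, Dice 0.0
]

mean_dice = np.mean([dice(c["tp"], c["fp"], c["fn"]) for c in cases])
global_dice = dice(sum(c["tp"] for c in cases),
                   sum(c["fp"] for c in cases),
                   sum(c["fn"] for c in cases))
print(mean_dice)    # ~0.47: the missed tiny lesion halves the average
print(global_dice)  # ~0.94: barely affected by the tiny case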
Code example #2
    def validate(self,
                 do_mirroring: bool = True,
                 use_sliding_window: bool = True,
                 step_size: float = 0.5,
                 save_softmax: bool = True,
                 use_gaussian: bool = True,
                 overwrite: bool = True,
                 validation_folder_name: str = 'validation_raw',
                 debug: bool = False,
                 all_in_gpu: bool = False,
                 force_separate_z: bool = None,
                 interpolation_order: int = 3,
                 interpolation_order_z: int = 0):
        """
        if debug=True then the temporary files generated for postprocessing determination will be kept
        :return:
        """

        current_mode = self.network.training
        self.network.eval()

        assert self.was_initialized, "must initialize, ideally with checkpoint (or train first)"
        if self.dataset_val is None:
            self.load_dataset()
            self.do_split()

        # predictions as they come from the network go here
        output_folder = join(self.output_folder, validation_folder_name)
        maybe_mkdir_p(output_folder)
        # this is for debug purposes
        my_input_args = {
            'do_mirroring': do_mirroring,
            'use_sliding_window': use_sliding_window,
            'step_size': step_size,
            'save_softmax': save_softmax,
            'use_gaussian': use_gaussian,
            'overwrite': overwrite,
            'validation_folder_name': validation_folder_name,
            'debug': debug,
            'all_in_gpu': all_in_gpu,
            'force_separate_z': force_separate_z,
            'interpolation_order': interpolation_order,
            'interpolation_order_z': interpolation_order_z,
        }
        save_json(my_input_args, join(output_folder, "validation_args.json"))

        if do_mirroring:
            if not self.data_aug_params['do_mirror']:
                raise RuntimeError(
                    "We did not train with mirroring so you cannot do inference with mirroring enabled"
                )
            mirror_axes = self.data_aug_params['mirror_axes']
        else:
            mirror_axes = ()

        pred_gt_tuples = []

        export_pool = Pool(default_num_threads)
        results = []

        for k in self.dataset_val.keys():
            properties = self.dataset[k]['properties']
            fname = properties['list_of_data_files'][0].split("/")[-1][:-12]
            if overwrite or (not isfile(join(output_folder, fname + ".nii.gz"))) or \
                    (save_softmax and not isfile(join(output_folder, fname + ".npz"))):
                data = np.load(self.dataset[k]['data_file'])['data']

                print(k, data.shape)
                data[-1][data[-1] == -1] = 0

                softmax_pred = self.predict_preprocessed_data_return_seg_and_softmax(
                    data[:-1],
                    do_mirroring,
                    mirror_axes,
                    use_sliding_window,
                    step_size,
                    use_gaussian,
                    all_in_gpu=all_in_gpu)[1]

                softmax_pred = softmax_pred.transpose(
                    [0] + [i + 1 for i in self.transpose_backward])

                if save_softmax:
                    softmax_fname = join(output_folder, fname + ".npz")
                else:
                    softmax_fname = None
                """There is a problem with python process communication that prevents us from communicating obejcts
                larger than 2 GB between processes (basically when the length of the pickle string that will be sent is
                communicated by the multiprocessing.Pipe object then the placeholder (\%i I think) does not allow for long
                enough strings (lol). This could be fixed by changing i to l (for long) but that would require manually
                patching system python code. We circumvent that problem here by saving softmax_pred to a npy file that will
                then be read (and finally deleted) by the Process. save_segmentation_nifti_from_softmax can take either
                filename or np.ndarray and will handle this automatically"""
                if np.prod(softmax_pred.shape) > (
                        2e9 / 4 * 0.85):  # *0.85 just to be save
                    np.save(join(output_folder, fname + ".npy"), softmax_pred)
                    softmax_pred = join(output_folder, fname + ".npy")

                results.append(
                    export_pool.starmap_async(
                        save_segmentation_nifti_from_softmax,
                        ((softmax_pred, join(output_folder,
                                             fname + ".nii.gz"), properties,
                          interpolation_order, None, None, None, softmax_fname,
                          None, force_separate_z, interpolation_order_z), )))

            pred_gt_tuples.append([
                join(output_folder, fname + ".nii.gz"),
                join(self.gt_niftis_folder, fname + ".nii.gz")
            ])

        _ = [i.get() for i in results]
        self.print_to_log_file("finished prediction")

        # evaluate raw predictions
        self.print_to_log_file("evaluation of raw predictions")
        task = self.dataset_directory.split("/")[-1]
        job_name = self.experiment_name
        _ = aggregate_scores(
            pred_gt_tuples,
            labels=list(range(self.num_classes)),
            json_output_file=join(output_folder, "summary.json"),
            json_name=job_name + " val tiled %s" % (str(use_sliding_window)),
            json_author="Fabian",
            json_task=task,
            num_threads=default_num_threads)

        # in the old nnunet we would stop here. Now we add a postprocessing step. This postprocessing can remove
        # everything except the largest connected component for each class. To see if this improves results, we do
        # this for all classes and then rerun the evaluation. Those classes for which this resulted in an improved
        # dice score will have this applied during inference as well
        self.print_to_log_file("determining postprocessing")
        determine_postprocessing(self.output_folder,
                                 self.gt_niftis_folder,
                                 validation_folder_name,
                                 final_subf_name=validation_folder_name +
                                 "_postprocessed",
                                 debug=debug)
        # after this the final predictions for the validation set can be found in validation_folder_name_base + "_postprocessed"
        # They are always in that folder, even if no postprocessing was applied!

        # determining postprocessing on a per-fold basis may be OK for this fold, but what if another fold finds a
        # different postprocessing to be better? In this case we need to consolidate. At the time the consolidation
        # is going to be done we won't know what self.gt_niftis_folder was, so now we copy all the niftis into a
        # separate folder to be used later
        gt_nifti_folder = join(self.output_folder_base, "gt_niftis")
        maybe_mkdir_p(gt_nifti_folder)
        for f in subfiles(self.gt_niftis_folder, suffix=".nii.gz"):
            success = False
            attempts = 0
            e = None
            while not success and attempts < 10:
                try:
                    shutil.copy(f, gt_nifti_folder)
                    success = True
                except OSError as exc:
                    # 'except OSError as e' would unbind e when the except block
                    # exits (Python 3), so keep our own reference for the re-raise
                    e = exc
                    attempts += 1
                    sleep(1)
            if not success:
                print("Could not copy gt nifti file %s into folder %s" %
                      (f, gt_nifti_folder))
                if e is not None:
                    raise e

        self.network.train(current_mode)
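
The 2 GB docstring above describes the workaround only in the abstract. Here is a self-contained sketch of the same pattern (the worker and file names are hypothetical): arrays too large to pickle are written to disk and only the path crosses the process boundary, with the worker deleting the temp file once it has been read.

import os
from multiprocessing import Pool
import numpy as np

def export_worker(arr_or_path, out_path):
    # workers accept either an ndarray or a path to a .npy file, mirroring
    # what save_segmentation_nifti_from_softmax does with its first argument
    if isinstance(arr_or_path, str):
        arr = np.load(arr_or_path)
        os.remove(arr_or_path)  # the temp file is deleted once read
    else:
        arr = arr_or_path
    np.save(out_path, arr.argmax(0))

if __name__ == "__main__":
    softmax = np.random.rand(3, 64, 64, 64).astype(np.float32)
    payload = softmax
    # pickled messages over multiprocessing.Pipe fail above ~2 GB, so large
    # arrays are saved to disk and only the filename is sent to the pool
    if softmax.nbytes > 2e9 * 0.85:
        np.save("tmp_softmax.npy", softmax)
        payload = "tmp_softmax.npy"
    with Pool(2) as pool:
        pool.starmap(export_worker, [(payload, "seg.npy")])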
Code example #3
def determine_postprocessing(base, gt_labels_folder, raw_subfolder_name="validation_raw",
                             temp_folder="temp",
                             final_subf_name="validation_final", processes=default_num_threads,
                             dice_threshold=0, debug=False,
                             advanced_postprocessing=False,
                             pp_filename="postprocessing.json"):
    """
    :param base:
    :param gt_labels_folder: subfolder of base with niftis of ground truth labels
    :param raw_subfolder_name: subfolder of base with niftis of predicted (non-postprocessed) segmentations
    :param temp_folder: used to store temporary data, will be deleted after we are done here unless debug=True
    :param final_subf_name: final results will be stored here (subfolder of base)
    :param processes:
    :param dice_threshold: only apply postprocessing if the result is better than old_result + dice_threshold (can be used as eps)
    :param debug: if True then the temporary files will not be deleted
    :return:
    """
    # let's see what classes are in the dataset
    classes = [int(i) for i in load_json(join(base, raw_subfolder_name, "summary.json"))['results']['mean'].keys() if
               int(i) != 0]

    folder_all_classes_as_fg = join(base, temp_folder + "_allClasses")
    folder_per_class = join(base, temp_folder + "_perClass")

    if isdir(folder_all_classes_as_fg):
        shutil.rmtree(folder_all_classes_as_fg)
    if isdir(folder_per_class):
        shutil.rmtree(folder_per_class)

    # multiprocessing rules
    p = Pool(processes)

    assert isfile(join(base, raw_subfolder_name, "summary.json")), "join(base, raw_subfolder_name) does not " \
                                                                   "contain a summary.json"

    # these are all the files we will be dealing with
    fnames = subfiles(join(base, raw_subfolder_name), suffix=".nii.gz", join=False)

    # make output and temp dir
    maybe_mkdir_p(folder_all_classes_as_fg)
    maybe_mkdir_p(folder_per_class)
    maybe_mkdir_p(join(base, final_subf_name))

    pp_results = {}
    pp_results['dc_per_class_raw'] = {}
    pp_results['dc_per_class_pp_all'] = {}  # dice scores after treating all foreground classes as one
    pp_results['dc_per_class_pp_per_class'] = {}  # dice scores after removing everything except the largest cc,
    # independently for each class, after we already did dc_per_class_pp_all
    pp_results['for_which_classes'] = []
    pp_results['min_valid_object_sizes'] = {}


    validation_result_raw = load_json(join(base, raw_subfolder_name, "summary.json"))['results']
    pp_results['num_samples'] = len(validation_result_raw['all'])
    validation_result_raw = validation_result_raw['mean']

    if advanced_postprocessing:
        # first treat all foreground classes as one and remove all but the largest foreground connected component
        results = []
        for f in fnames:
            predicted_segmentation = join(base, raw_subfolder_name, f)
            # now remove all but the largest connected component for each class
            output_file = join(folder_all_classes_as_fg, f)
            results.append(p.starmap_async(load_remove_save, ((predicted_segmentation, output_file, (classes,)),)))

        results = [i.get() for i in results]

        # aggregate max_size_removed and min_size_kept
        max_size_removed = {}
        min_size_kept = {}
        for tmp in results:
            mx_rem, min_kept = tmp[0]
            for k in mx_rem:
                if mx_rem[k] is not None:
                    if max_size_removed.get(k) is None:
                        max_size_removed[k] = mx_rem[k]
                    else:
                        max_size_removed[k] = max(max_size_removed[k], mx_rem[k])
            for k in min_kept:
                if min_kept[k] is not None:
                    if min_size_kept.get(k) is None:
                        min_size_kept[k] = min_kept[k]
                    else:
                        min_size_kept[k] = min(min_size_kept[k], min_kept[k])

        print("foreground vs background, smallest valid object size was", min_size_kept[tuple(classes)])
        print("removing only objects smaller than that...")

    else:
        min_size_kept = None

    # we need to rerun the step from above, now with the size constraint
    pred_gt_tuples = []
    results = []
    # first treat all foreground classes as one and remove all but the largest foreground connected component
    for f in fnames:
        predicted_segmentation = join(base, raw_subfolder_name, f)
        # now remove all but the largest connected component for each class
        output_file = join(folder_all_classes_as_fg, f)
        results.append(
            p.starmap_async(load_remove_save, ((predicted_segmentation, output_file, (classes,), min_size_kept),)))
        pred_gt_tuples.append([output_file, join(gt_labels_folder, f)])

    _ = [i.get() for i in results]

    # evaluate postprocessed predictions
    _ = aggregate_scores(pred_gt_tuples, labels=classes,
                         json_output_file=join(folder_all_classes_as_fg, "summary.json"),
                         json_author="Fabian", num_threads=processes)

    # now we need to figure out if doing this improved the dice scores. We will implement that defensively insofar
    # as we won't do it if a single class got worse as a result. We can change this in the future but right now I
    # prefer to do it this way
    validation_result_PP_test = load_json(join(folder_all_classes_as_fg, "summary.json"))['results']['mean']

    for c in classes:
        dc_raw = validation_result_raw[str(c)]['Dice']
        dc_pp = validation_result_PP_test[str(c)]['Dice']
        pp_results['dc_per_class_raw'][str(c)] = dc_raw
        pp_results['dc_per_class_pp_all'][str(c)] = dc_pp

    # true if new is better
    do_fg_cc = False
    comp = [pp_results['dc_per_class_pp_all'][str(cl)] > (pp_results['dc_per_class_raw'][str(cl)] + dice_threshold) for
            cl in classes]
    before = np.mean([pp_results['dc_per_class_raw'][str(cl)] for cl in classes])
    after = np.mean([pp_results['dc_per_class_pp_all'][str(cl)] for cl in classes])
    print("Foreground vs background")
    print("before:", before)
    print("after: ", after)
    if any(comp):
        # at least one class improved - yay!
        # now check if another got worse
        # true if new is worse
        any_worse = any(
            [pp_results['dc_per_class_pp_all'][str(cl)] < pp_results['dc_per_class_raw'][str(cl)] for cl in classes])
        if not any_worse:
            pp_results['for_which_classes'].append(classes)
            if min_size_kept is not None:
                pp_results['min_valid_object_sizes'].update(deepcopy(min_size_kept))
            do_fg_cc = True
            print("Removing all but the largest foreground region improved results!")
            print('for_which_classes', classes)
            print('min_valid_object_sizes', min_size_kept)
    else:
        # did not improve things - don't do it
        pass

    if len(classes) > 1:
        # depending on whether we removed all but the largest foreground connected component, the source dir for
        # the next step is either the raw dir or the temp dir
        if do_fg_cc:
            source = folder_all_classes_as_fg
        else:
            source = join(base, raw_subfolder_name)

        if advanced_postprocessing:
            # now run this for each class separately
            results = []
            for f in fnames:
                predicted_segmentation = join(source, f)
                output_file = join(folder_per_class, f)
                results.append(p.starmap_async(load_remove_save, ((predicted_segmentation, output_file, classes),)))

            results = [i.get() for i in results]

            # aggregate max_size_removed and min_size_kept
            max_size_removed = {}
            min_size_kept = {}
            for tmp in results:
                mx_rem, min_kept = tmp[0]
                for k in mx_rem:
                    if mx_rem[k] is not None:
                        if max_size_removed.get(k) is None:
                            max_size_removed[k] = mx_rem[k]
                        else:
                            max_size_removed[k] = max(max_size_removed[k], mx_rem[k])
                for k in min_kept:
                    if min_kept[k] is not None:
                        if min_size_kept.get(k) is None:
                            min_size_kept[k] = min_kept[k]
                        else:
                            min_size_kept[k] = min(min_size_kept[k], min_kept[k])

            print("classes treated separately, smallest valid object sizes are")
            print(min_size_kept)
            print("removing only objects smaller than that")
        else:
            min_size_kept = None

        # rerun with the size thresholds from above
        pred_gt_tuples = []
        results = []
        for f in fnames:
            predicted_segmentation = join(source, f)
            output_file = join(folder_per_class, f)
            results.append(p.starmap_async(load_remove_save, ((predicted_segmentation, output_file, classes, min_size_kept),)))
            pred_gt_tuples.append([output_file, join(gt_labels_folder, f)])

        _ = [i.get() for i in results]

        # evaluate postprocessed predictions
        _ = aggregate_scores(pred_gt_tuples, labels=classes,
                             json_output_file=join(folder_per_class, "summary.json"),
                             json_author="Fabian", num_threads=processes)

        if do_fg_cc:
            old_res = deepcopy(validation_result_PP_test)
        else:
            old_res = validation_result_raw

        # these are the new dice scores
        validation_result_PP_test = load_json(join(folder_per_class, "summary.json"))['results']['mean']

        for c in classes:
            dc_raw = old_res[str(c)]['Dice']
            dc_pp = validation_result_PP_test[str(c)]['Dice']
            pp_results['dc_per_class_pp_per_class'][str(c)] = dc_pp
            print(c)
            print("before:", dc_raw)
            print("after: ", dc_pp)

            if dc_pp > (dc_raw + dice_threshold):
                pp_results['for_which_classes'].append(int(c))
                if min_size_kept is not None:
                    pp_results['min_valid_object_sizes'].update({c: min_size_kept[c]})
                print("Removing all but the largest region for class %d improved results!" % c)
                print('min_valid_object_sizes', min_size_kept)
    else:
        print("Only one class present, no need to do each class separately as this is covered in fg vs bg")

    if not advanced_postprocessing:
        pp_results['min_valid_object_sizes'] = None

    print("done")
    print("for which classes:")
    print(pp_results['for_which_classes'])
    print("min_object_sizes")
    print(pp_results['min_valid_object_sizes'])

    pp_results['validation_raw'] = raw_subfolder_name
    pp_results['validation_final'] = final_subf_name

    # now that we have a proper for_which_classes, apply that
    pred_gt_tuples = []
    results = []
    for f in fnames:
        predicted_segmentation = join(base, raw_subfolder_name, f)

        # now remove all but the largest connected component for each class
        output_file = join(base, final_subf_name, f)
        results.append(p.starmap_async(load_remove_save, (
            (predicted_segmentation, output_file, pp_results['for_which_classes'],
             pp_results['min_valid_object_sizes']),)))

        pred_gt_tuples.append([output_file,
                               join(gt_labels_folder, f)])

    _ = [i.get() for i in results]
    # evaluate postprocessed predictions
    _ = aggregate_scores(pred_gt_tuples, labels=classes,
                         json_output_file=join(base, final_subf_name, "summary.json"),
                         json_author="Fabian", num_threads=processes)

    pp_results['min_valid_object_sizes'] = str(pp_results['min_valid_object_sizes'])

    save_json(pp_results, join(base, pp_filename))

    # delete temp
    if not debug:
        shutil.rmtree(folder_per_class)
        shutil.rmtree(folder_all_classes_as_fg)

    p.close()
    p.join()
    print("done")
Code example #4
File: ensemble.py  Project: 975150313/nnunet-1
        assert all([isfile(join(validation_folder_net2, i)) for i in all_patient_identifiers])

        maybe_mkdir_p(output_folder)

        for p in all_patient_identifiers:
            files1.append(join(validation_folder_net1, p))
            files2.append(join(validation_folder_net2, p))
            property_files.append(join(folder_where_some_pkl_are, p)[:-3] + "pkl")
            out_files.append(join(output_folder, p[:-4] + ".nii.gz"))
            gt_segmentations.append(join(folder_with_gt_segs, p[:-4] + ".nii.gz"))

    p = Pool(8)
    p.map(merge, zip(files1, files2, property_files, out_files))
    p.close()
    p.join()

    """for args in zip(files1, files2, property_files, out_files, [only_keep_largest_connected_component] * len(files1)):
        print(args[0], args[1])
        merge(args)"""

    if not isfile(join(output_folder, "summary_allFolds.json")) and len(out_files) > 0:
        out_dir_all_json = join(network_training_output_dir, "summary_jsons")
        # now evaluate if all these gt files exist
        aggregate_scores(tuple(zip(out_files, gt_segmentations)), labels=plans['all_classes'],
                         json_output_file=join(output_folder, "summary_allFolds.json"), json_task=task,
                         json_name=task + "__" + output_folder.split("/")[-1], num_threads=4)
        json_out = load_json(join(output_folder, "summary_allFolds.json"))
        json_out["experiment_name"] = output_folder.split("/")[-1]
        save_json(json_out, join(output_folder, "summary_allFolds.json"))
        shutil.copy(join(output_folder, "summary_allFolds.json"), join(out_dir_all_json, "%s__%s.json" % (task, output_folder.split("/")[-1])))
Code example #5
File: ensemble.py  Project: yilinshen/3DSegmentation
def ensemble2(training_output_folder1, training_output_folder2, output_folder,
              task, validation_folder, folds):
    print("\nEnsembling folders\n", training_output_folder1, "\n",
          training_output_folder2)

    output_folder_base = output_folder
    output_folder = join(output_folder_base, "ensembled_raw")

    # only_keep_largest_connected_component is the same for all stages
    dataset_directory = join(preprocessing_output_dir, task)
    plans = load_pickle(join(training_output_folder1,
                             "plans.pkl"))  # we need this only for the labels

    files1 = []
    files2 = []
    property_files = []
    out_files = []
    gt_segmentations = []

    folder_with_gt_segs = join(dataset_directory, "gt_segmentations")
    # we need the gt segmentations in the correct shape, and the original geometry to restore the niftis

    for f in folds:
        validation_folder_net1 = join(training_output_folder1, "fold_%d" % f,
                                      validation_folder)
        validation_folder_net2 = join(training_output_folder2, "fold_%d" % f,
                                      validation_folder)
        patient_identifiers1 = subfiles(validation_folder_net1, False, None,
                                        'npz', True)
        patient_identifiers2 = subfiles(validation_folder_net2, False, None,
                                        'npz', True)
        # we don't do postprocessing anymore so there should not be any of that noPostProcess
        patient_identifiers1_nii = [
            i for i in subfiles(validation_folder_net1,
                                False,
                                None,
                                suffix='nii.gz',
                                sort=True)
            if not i.endswith("noPostProcess.nii.gz")
            and not i.endswith('_postprocessed.nii.gz')
        ]
        patient_identifiers2_nii = [
            i for i in subfiles(validation_folder_net2,
                                False,
                                None,
                                suffix='nii.gz',
                                sort=True)
            if not i.endswith("noPostProcess.nii.gz")
            and not i.endswith('_postprocessed.nii.gz')
        ]
        assert len(patient_identifiers1) == len(
            patient_identifiers1_nii
        ), "npz seem to be missing. run validation with --npz"
        assert len(patient_identifiers2) == len(
            patient_identifiers2_nii
        ), "npz seem to be missing. run validation with --npz"
        assert all([
            i[:-4] == j[:-7]
            for i, j in zip(patient_identifiers1, patient_identifiers1_nii)
        ]), "npz seem to be missing. run validation with --npz"
        assert all([
            i[:-4] == j[:-7]
            for i, j in zip(patient_identifiers2, patient_identifiers2_nii)
        ]), "npz seem to be missing. run validation with --npz"

        all_patient_identifiers = patient_identifiers1
        for p in patient_identifiers2:
            if p not in all_patient_identifiers:
                all_patient_identifiers.append(p)

        # assert these patients exist for both methods
        assert all([
            isfile(join(validation_folder_net1, i))
            for i in all_patient_identifiers
        ])
        assert all([
            isfile(join(validation_folder_net2, i))
            for i in all_patient_identifiers
        ])

        maybe_mkdir_p(output_folder)

        for p in all_patient_identifiers:
            files1.append(join(validation_folder_net1, p))
            files2.append(join(validation_folder_net2, p))
            property_files.append(join(validation_folder_net1, p)[:-3] + "pkl")
            out_files.append(join(output_folder, p[:-4] + ".nii.gz"))
            gt_segmentations.append(
                join(folder_with_gt_segs, p[:-4] + ".nii.gz"))

    p = Pool(default_num_threads)
    p.map(merge, zip(files1, files2, property_files, out_files))
    p.close()
    p.join()

    if not isfile(join(output_folder, "summary.json")) and len(out_files) > 0:
        aggregate_scores(tuple(zip(out_files, gt_segmentations)),
                         labels=plans['all_classes'],
                         json_output_file=join(output_folder, "summary.json"),
                         json_task=task,
                         json_name=task + "__" +
                         output_folder_base.split("/")[-1],
                         num_threads=default_num_threads)

    if not isfile(join(output_folder_base, "postprocessing.json")):
        # now let's also look at postprocessing. We cannot just take what we determined in cross-validation and apply it
        # here because things may have changed and may also be too inconsistent between the two networks
        determine_postprocessing(output_folder_base,
                                 folder_with_gt_segs,
                                 "ensembled_raw",
                                 "temp",
                                 "ensembled_postprocessed",
                                 default_num_threads,
                                 dice_threshold=0)

        out_dir_all_json = join(network_training_output_dir, "summary_jsons")
        json_out = load_json(
            join(output_folder_base, "ensembled_postprocessed",
                 "summary.json"))

        json_out["experiment_name"] = output_folder_base.split("/")[-1]
        save_json(
            json_out,
            join(output_folder_base, "ensembled_postprocessed",
                 "summary.json"))

        maybe_mkdir_p(out_dir_all_json)
        shutil.copy(
            join(output_folder_base, "ensembled_postprocessed",
                 "summary.json"),
            join(out_dir_all_json,
                 "%s__%s.json" % (task, output_folder_base.split("/")[-1])))
Code example #6
def consolidate_folds(output_folder_base,
                      validation_folder_name='validation_raw',
                      advanced_postprocessing: bool = False):
    """
    Used to determine the postprocessing for an experiment after all five folds have been completed. In the validation of
    each fold, the postprocessing can only be determined on the cases within that fold. This can result in different
    postprocessing decisions for different folds. In the end, we can only decide for one postprocessing per experiment,
    so we have to rerun it
    :param advanced_postprocessing:
    :param output_folder_base: experiment output folder (fold_0, fold_1, etc must be subfolders of the given folder)
    :param validation_folder_name: don't use this
    :return:
    """
    folds = list(range(5))
    folders_folds = [join(output_folder_base, "fold_%d" % i) for i in folds]

    assert all([isdir(i) for i in folders_folds]), "some folds are missing"

    # now collect, for each fold, the folder that contains its raw validation predictions
    validation_raw_folders = [
        join(output_folder_base, "fold_%d" % i, validation_folder_name)
        for i in folds
    ]

    # count niftis in there
    num_niftis = 0
    for v in validation_raw_folders:
        num_niftis += len(subfiles(v, suffix=".nii.gz"))

    num_niftis_gt = len(subfiles(join(output_folder_base, "gt_niftis")))
    assert num_niftis == num_niftis_gt, "some folds are missing predicted niftis :-(. Make sure you ran all folds properly"

    # now copy all raw niftis into cv_niftis_raw
    output_folder_raw = join(output_folder_base, "cv_niftis_raw")
    maybe_mkdir_p(output_folder_raw)
    for f in folds:
        niftis = subfiles(validation_raw_folders[f], suffix=".nii.gz")
        for n in niftis:
            shutil.copy(n, join(output_folder_raw))

    # load a summary file so that we can know what class labels to expect
    summary_fold0 = load_json(
        join(output_folder_base, "fold_0", validation_folder_name,
             "summary.json"))['results']['mean']
    classes = [int(i) for i in summary_fold0.keys()]
    niftis = subfiles(output_folder_raw, join=False, suffix=".nii.gz")
    test_pred_pairs = [(join(output_folder_base, "gt_niftis",
                             i), join(output_folder_raw, i)) for i in niftis]

    # determine_postprocessing needs a summary.json file in the folder where the raw predictions are. We could compute
    # that from the summary files of the five folds but I am feeling lazy today
    aggregate_scores(test_pred_pairs,
                     labels=classes,
                     json_output_file=join(output_folder_raw, "summary.json"),
                     num_threads=default_num_threads)

    determine_postprocessing(output_folder_base,
                             join(output_folder_base, "gt_niftis"),
                             'cv_niftis_raw',
                             final_subf_name="cv_niftis_postprocessed",
                             processes=default_num_threads,
                             advanced_postprocessing=advanced_postprocessing)
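
Typical usage, assuming the standard nnU-Net results layout (the path below is a hypothetical placeholder):

# fold_0 ... fold_4 must be subfolders of this directory, each containing a
# completed validation_raw with predicted niftis and a summary.json
consolidate_folds("/path/to/nnUNet_results/3d_fullres/TaskXXX/nnUNetTrainerV2__nnUNetPlansv2.1")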
Code example #7
    def validate(self, do_mirroring=True, use_train_mode=False, tiled=True, step=2, save_softmax=True,
                 use_gaussian=True, validation_folder_name='validation'):
        """

        :param do_mirroring:
        :param use_train_mode:
        :param mirror_axes:
        :param tiled:
        :param tile_in_z:
        :param step:
        :param use_nifti:
        :param save_softmax:
        :param use_gaussian:
        :param use_temporal_models:
        :return:
        """
        assert self.was_initialized, "must initialize, ideally with checkpoint (or train first)"
        if self.dataset_val is None:
            self.load_dataset()
            self.do_split()

        output_folder = join(self.output_folder, validation_folder_name)
        maybe_mkdir_p(output_folder)

        if do_mirroring:
            mirror_axes = self.data_aug_params['mirror_axes']
        else:
            mirror_axes = ()

        pred_gt_tuples = []

        process_manager = Pool(2)
        results = []

        transpose_backward = self.plans.get('transpose_backward')

        for k in self.dataset_val.keys():
            properties = self.dataset[k]['properties']
            data = np.load(self.dataset[k]['data_file'])['data']

            # concat segmentation of previous step
            seg_from_prev_stage = np.load(join(self.folder_with_segs_from_prev_stage,
                                               k + "_segFromPrevStage.npz"))['data'][None]

            print(data.shape)
            data[-1][data[-1] == -1] = 0
            data_for_net = np.concatenate((data[:-1], to_one_hot(seg_from_prev_stage[0], range(1, self.num_classes))))
            softmax_pred = self.predict_preprocessed_data_return_softmax(data_for_net, do_mirroring, 1,
                                                                         use_train_mode, 1, mirror_axes, tiled,
                                                                         True, step, self.patch_size,
                                                                         use_gaussian=use_gaussian)

            if transpose_backward is not None:
                softmax_pred = softmax_pred.transpose([0] + [i + 1 for i in transpose_backward])

            fname = properties['list_of_data_files'][0].split("/")[-1][:-12]

            if save_softmax:
                softmax_fname = join(output_folder, fname + ".npz")
            else:
                softmax_fname = None

            """There is a problem with python process communication that prevents us from communicating obejcts 
            larger than 2 GB between processes (basically when the length of the pickle string that will be sent is 
            communicated by the multiprocessing.Pipe object then the placeholder (\%i I think) does not allow for long 
            enough strings (lol). This could be fixed by changing i to l (for long) but that would require manually 
            patching system python code. We circumvent that problem here by saving softmax_pred to a npy file that will 
            then be read (and finally deleted) by the Process. save_segmentation_nifti_from_softmax can take either 
            filename or np.ndarray and will handle this automatically"""
            if np.prod(softmax_pred.shape) > (2e9 / 4 * 0.85):  # *0.85 just to be save
                np.save(fname + ".npy", softmax_pred)
                softmax_pred = fname + ".npy"
            results.append(process_manager.starmap_async(save_segmentation_nifti_from_softmax,
                                                         ((softmax_pred, join(output_folder, fname + ".nii.gz"),
                                                           properties, 1, None, None, None, softmax_fname, None),
                                                          )
                                                         )
                           )

            pred_gt_tuples.append([join(output_folder, fname + ".nii.gz"),
                                   join(self.gt_niftis_folder, fname + ".nii.gz")])

        _ = [i.get() for i in results]

        task = self.dataset_directory.split("/")[-1]
        job_name = self.experiment_name
        _ = aggregate_scores(pred_gt_tuples, labels=list(range(self.num_classes)),
                             json_output_file=join(output_folder, "summary.json"), json_name=job_name,
                             json_author="Fabian", json_description="",
                             json_task=task)
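
This cascade variant feeds the previous stage's segmentation to the network as additional one-hot channels. to_one_hot is not defined in this listing; a minimal sketch of the expected behavior (an assumption based on how it is called above):

import numpy as np

def to_one_hot(seg, all_seg_labels):
    """Turn an integer segmentation map into one binary channel per label,
    e.g. range(1, num_classes) yields one channel per foreground class."""
    out = np.zeros([len(all_seg_labels)] + list(seg.shape), dtype=seg.dtype)
    for i, l in enumerate(all_seg_labels):
        out[i][seg == l] = 1
    return out

# usage mirroring validate() above: stack the image channels and the one-hot
# encoded previous-stage segmentation along the channel axis
# data_for_net = np.concatenate((data[:-1], to_one_hot(seg_prev, range(1, num_classes))))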
Code example #8
def ensemble(training_output_folder1, training_output_folder2, output_folder, task, validation_folder, folds, allow_ensembling: bool = True):
    print("\nEnsembling folders\n", training_output_folder1, "\n", training_output_folder2)

    output_folder_base = output_folder
    output_folder = join(output_folder_base, "ensembled_raw")

    # only_keep_largest_connected_component is the same for all stages
    dataset_directory = join(preprocessing_output_dir, task)
    plans = load_pickle(join(training_output_folder1, "plans.pkl"))  # we need this only for the labels

    files1 = []
    files2 = []
    property_files = []
    out_files = []
    gt_segmentations = []

    folder_with_gt_segs = join(dataset_directory, "gt_segmentations")
    # we need the gt segmentations in the correct shape, and the original geometry to restore the niftis

    for f in folds:
        validation_folder_net1 = join(training_output_folder1, "fold_%d" % f, validation_folder)
        validation_folder_net2 = join(training_output_folder2, "fold_%d" % f, validation_folder)

        if not isdir(validation_folder_net1):
            raise AssertionError("Validation directory missing: %s. Please rerun validation with `nnUNet_train CONFIG TRAINER TASK FOLD -val --npz`" % validation_folder_net1)
        if not isdir(validation_folder_net2):
            raise AssertionError("Validation directory missing: %s. Please rerun validation with `nnUNet_train CONFIG TRAINER TASK FOLD -val --npz`" % validation_folder_net2)

        # we need to ensure the validation was successful. We can verify this via the presence of the summary.json file
        if not isfile(join(validation_folder_net1, 'summary.json')):
            raise AssertionError("Validation directory incomplete: %s. Please rerun validation with `nnUNet_train CONFIG TRAINER TASK FOLD -val --npz`" % validation_folder_net1)
        if not isfile(join(validation_folder_net2, 'summary.json')):
            raise AssertionError("Validation directory incomplete: %s. Please rerun validation with `nnUNet_train CONFIG TRAINER TASK FOLD -val --npz`" % validation_folder_net2)

        patient_identifiers1_npz = [i[:-4] for i in subfiles(validation_folder_net1, False, None, 'npz', True)]
        patient_identifiers2_npz = [i[:-4] for i in subfiles(validation_folder_net2, False, None, 'npz', True)]

        # we don't do postprocessing anymore so there should not be any of that noPostProcess
        patient_identifiers1_nii = [i[:-7] for i in subfiles(validation_folder_net1, False, None, suffix='nii.gz', sort=True) if not i.endswith("noPostProcess.nii.gz") and not i.endswith('_postprocessed.nii.gz')]
        patient_identifiers2_nii = [i[:-7] for i in subfiles(validation_folder_net2, False, None, suffix='nii.gz', sort=True) if not i.endswith("noPostProcess.nii.gz") and not i.endswith('_postprocessed.nii.gz')]

        if not all([i in patient_identifiers1_npz for i in patient_identifiers1_nii]):
            raise AssertionError("Missing npz files in folder %s. Please run the validation for all models and folds with the '--npz' flag." % (validation_folder_net1))
        if not all([i in patient_identifiers2_npz for i in patient_identifiers2_nii]):
            raise AssertionError("Missing npz files in folder %s. Please run the validation for all models and folds with the '--npz' flag." % (validation_folder_net2))

        patient_identifiers1_npz.sort()
        patient_identifiers2_npz.sort()

        assert all([i == j for i, j in zip(patient_identifiers1_npz, patient_identifiers2_npz)]), "npz filenames do not match. This should not happen."

        maybe_mkdir_p(output_folder)

        for p in patient_identifiers1_npz:
            files1.append(join(validation_folder_net1, p + '.npz'))
            files2.append(join(validation_folder_net2, p + '.npz'))
            property_files.append(join(validation_folder_net1, p) + ".pkl")
            out_files.append(join(output_folder, p + ".nii.gz"))
            gt_segmentations.append(join(folder_with_gt_segs, p + ".nii.gz"))

    p = Pool(default_num_threads)
    p.map(merge, zip(files1, files2, property_files, out_files))
    p.close()
    p.join()

    if not isfile(join(output_folder, "summary.json")) and len(out_files) > 0:
        aggregate_scores(tuple(zip(out_files, gt_segmentations)), labels=plans['all_classes'],
                     json_output_file=join(output_folder, "summary.json"), json_task=task,
                     json_name=task + "__" + Path(output_folder_base).parts[-1], num_threads=default_num_threads)

    if allow_ensembling and not isfile(join(output_folder_base, "postprocessing.json")):
        # now let's also look at postprocessing. We cannot just take what we determined in cross-validation and apply it
        # here because things may have changed and may also be too inconsistent between the two networks
        determine_postprocessing(output_folder_base, folder_with_gt_segs, "ensembled_raw", "temp",
                                 "ensembled_postprocessed", default_num_threads, dice_threshold=0)

        out_dir_all_json = join(network_training_output_dir, "summary_jsons")
        json_out = load_json(join(output_folder_base, "ensembled_postprocessed", "summary.json"))

        json_out["experiment_name"] = Path(output_folder_base).parts[-1]
        save_json(json_out, join(output_folder_base, "ensembled_postprocessed", "summary.json"))

        maybe_mkdir_p(out_dir_all_json)
        shutil.copy(join(output_folder_base, "ensembled_postprocessed", "summary.json"),
                    join(out_dir_all_json, "%s__%s.json" % (task, Path(output_folder_base).parts[-1])))
Code example #9
    def validate(self,
                 do_mirroring: bool = True,
                 use_sliding_window: bool = True,
                 step_size: float = 0.5,
                 save_softmax: bool = True,
                 use_gaussian: bool = True,
                 overwrite: bool = True,
                 validation_folder_name: str = 'validation_raw',
                 debug: bool = False,
                 all_in_gpu: bool = False,
                 segmentation_export_kwargs: dict = None):

        current_mode = self.network.training
        self.network.eval()

        assert self.was_initialized, "must initialize, ideally with checkpoint (or train first)"
        if self.dataset_val is None:
            self.load_dataset()
            self.do_split()

        if segmentation_export_kwargs is None:
            if 'segmentation_export_params' in self.plans.keys():
                force_separate_z = self.plans['segmentation_export_params'][
                    'force_separate_z']
                interpolation_order = self.plans['segmentation_export_params'][
                    'interpolation_order']
                interpolation_order_z = self.plans[
                    'segmentation_export_params']['interpolation_order_z']
            else:
                force_separate_z = None
                interpolation_order = 1
                interpolation_order_z = 0
        else:
            force_separate_z = segmentation_export_kwargs['force_separate_z']
            interpolation_order = segmentation_export_kwargs[
                'interpolation_order']
            interpolation_order_z = segmentation_export_kwargs[
                'interpolation_order_z']

        output_folder = join(self.output_folder, validation_folder_name)
        maybe_mkdir_p(output_folder)

        if do_mirroring:
            mirror_axes = self.data_aug_params['mirror_axes']
        else:
            mirror_axes = ()

        pred_gt_tuples = []

        export_pool = Pool(2)
        results = []

        transpose_backward = self.plans.get('transpose_backward')

        for k in self.dataset_val.keys():
            properties = load_pickle(self.dataset[k]['properties_file'])
            data = np.load(self.dataset[k]['data_file'])['data']

            # concat segmentation of previous step
            seg_from_prev_stage = np.load(
                join(self.folder_with_segs_from_prev_stage,
                     k + "_segFromPrevStage.npz"))['data'][None]

            print(data.shape)
            data[-1][data[-1] == -1] = 0
            data_for_net = np.concatenate(
                (data[:-1],
                 to_one_hot(seg_from_prev_stage[0], range(1,
                                                          self.num_classes))))

            softmax_pred = self.predict_preprocessed_data_return_seg_and_softmax(
                data_for_net,
                do_mirroring,
                mirror_axes,
                use_sliding_window,
                step_size,
                use_gaussian,
                all_in_gpu=all_in_gpu,
                mixed_precision=self.fp16)[1]

            if transpose_backward is not None:
                softmax_pred = softmax_pred.transpose(
                    [0] + [i + 1 for i in transpose_backward])

            fname = properties['list_of_data_files'][0].split("/")[-1][:-12]

            if save_softmax:
                softmax_fname = join(output_folder, fname + ".npz")
            else:
                softmax_fname = None
            """There is a problem with python process communication that prevents us from communicating obejcts 
            larger than 2 GB between processes (basically when the length of the pickle string that will be sent is 
            communicated by the multiprocessing.Pipe object then the placeholder (\%i I think) does not allow for long 
            enough strings (lol). This could be fixed by changing i to l (for long) but that would require manually 
            patching system python code. We circumvent that problem here by saving softmax_pred to a npy file that will 
            then be read (and finally deleted) by the Process. save_segmentation_nifti_from_softmax can take either 
            filename or np.ndarray and will handle this automatically"""
            if np.prod(softmax_pred.shape) > (2e9 / 4 *
                                              0.85):  # *0.85 just to be save
                np.save(fname + ".npy", softmax_pred)
                softmax_pred = fname + ".npy"

            results.append(
                export_pool.starmap_async(
                    save_segmentation_nifti_from_softmax,
                    ((softmax_pred, join(output_folder, fname + ".nii.gz"),
                      properties, interpolation_order,
                      self.regions_class_order, None, None, softmax_fname,
                      None, force_separate_z, interpolation_order_z), )))

            pred_gt_tuples.append([
                join(output_folder, fname + ".nii.gz"),
                join(self.gt_niftis_folder, fname + ".nii.gz")
            ])

        _ = [i.get() for i in results]

        task = self.dataset_directory.split("/")[-1]
        job_name = self.experiment_name
        _ = aggregate_scores(pred_gt_tuples,
                             labels=list(range(self.num_classes)),
                             json_output_file=join(output_folder,
                                                   "summary.json"),
                             json_name=job_name,
                             json_author="Fabian",
                             json_description="",
                             json_task=task)

        # in the old nnunet we would stop here. Now we add a postprocessing step. This postprocessing can remove
        # everything except the largest connected component for each class. To see if this improves results, we do
        # this for all classes and then rerun the evaluation. Those classes for which this resulted in an improved
        # dice score will have this applied during inference as well
        self.print_to_log_file("determining postprocessing")
        determine_postprocessing(self.output_folder,
                                 self.gt_niftis_folder,
                                 validation_folder_name,
                                 final_subf_name=validation_folder_name +
                                 "_postprocessed",
                                 debug=debug)
        # after this the final predictions for the validation set can be found in validation_folder_name_base + "_postprocessed"
        # They are always in that folder, even if no postprocessing was applied!

        # determining postprocessing on a per-fold basis may be OK for this fold, but what if another fold finds a
        # different postprocessing to be better? In this case we need to consolidate. At the time the consolidation
        # is going to be done we won't know what self.gt_niftis_folder was, so now we copy all the niftis into a
        # separate folder to be used later
        gt_nifti_folder = join(self.output_folder_base, "gt_niftis")
        maybe_mkdir_p(gt_nifti_folder)
        for f in subfiles(self.gt_niftis_folder, suffix=".nii.gz"):
            success = False
            attempts = 0
            while not success and attempts < 10:
                try:
                    shutil.copy(f, gt_nifti_folder)
                    success = True
                except OSError:
                    attempts += 1
                    sleep(1)

        self.network.train(current_mode)
        export_pool.close()
        export_pool.join()
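
Both full validate() implementations bracket inference with the same idiom: remember self.network.training, switch to eval() for prediction (disabling dropout and using running batchnorm statistics), and restore the original mode at the end. The same pattern in isolation, as a reusable helper (the helper name is hypothetical):

import torch

def run_in_eval_mode(network: torch.nn.Module, fn):
    # remember whether the network was in train mode, run fn() in eval mode,
    # then restore the original mode even if fn() raises
    current_mode = network.training
    network.eval()
    try:
        return fn()
    finally:
        network.train(current_mode)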