import collections
import inspect
from collections import OrderedDict
from multiprocessing import Pool

import numpy as np
import pandas as pd

# ConfusionMatrix, ALL_METRICS, aggregate_scores, save_segmentation_nifti_from_softmax,
# write_json and the path helpers (join, isfile, maybe_mkdir_p) are provided by the
# surrounding project and are assumed to be imported elsewhere in this module.


def __init__(self, test=None, reference=None, labels=None, metrics=None,
             advanced_metrics=None, nan_for_nonexisting=True):
    self.test = None
    self.reference = None
    # ------- added by Camila
    self.testcl = None
    self.referencecl = None
    # ------- end added by Camila
    self.confusion_matrix = ConfusionMatrix()
    self.labels = None
    self.nan_for_nonexisting = nan_for_nonexisting
    self.result = None

    self.metrics = []
    if metrics is None:
        for m in self.default_metrics:
            self.metrics.append(m)
    else:
        for m in metrics:
            self.metrics.append(m)

    self.advanced_metrics = []
    if advanced_metrics is None:
        for m in self.default_advanced_metrics:
            self.advanced_metrics.append(m)
    else:
        for m in advanced_metrics:
            self.advanced_metrics.append(m)

    self.set_reference(reference)
    self.set_test(test)
    if labels is not None:
        self.set_labels(labels)
    else:
        if test is not None and reference is not None:
            self.construct_labels()
def validate(self, do_mirroring=True, use_train_mode=False, tiled=True, step=2, save_softmax=True,
             use_gaussian=True, compute_global_dice=True, override=True,
             validation_folder_name='validation'):
    """
    2018_12_05: I added global accumulation of TP, FP and FN for the validation in here. This is because I
    believe that selecting models is easier when computing the Dice globally instead of independently for each
    case and then averaging over cases. The Lung dataset in particular is very unstable because of the small
    size of the Lung Lesions. My theory is that even though the global Dice is different from the actual target
    metric it is still a good enough substitute that allows us to get a lot more stable results when rerunning
    the same experiment twice. FYI: the computer vision community uses the global Jaccard for the evaluation of
    Cityscapes etc, not the per-image Jaccard averaged over images.
    The reason I am accumulating TP/FP/FN here and not from the nifti files (which are used by our Evaluator)
    is that all predictions made here will have identical voxel spacing whereas voxel spacings in the nifti
    files will be different (which we could compensate for by using the volume per voxel, but that would
    require the evaluator to understand spacings, which it does not at this point)
    :param do_mirroring:
    :param use_train_mode:
    :param mirror_axes:
    :param tiled:
    :param tile_in_z:
    :param step:
    :param use_nifti:
    :param save_softmax:
    :param use_gaussian:
    :param use_temporal_models:
    :return:
    """
    assert self.was_initialized, "must initialize, ideally with checkpoint (or train first)"
    if self.dataset_val is None:
        self.load_dataset()
        self.do_split()

    output_folder = join(self.output_folder, validation_folder_name)
    maybe_mkdir_p(output_folder)

    if do_mirroring:
        mirror_axes = self.data_aug_params['mirror_axes']
    else:
        mirror_axes = ()

    pred_gt_tuples = []

    export_pool = Pool(4)
    results = []
    global_tp = OrderedDict()
    global_fp = OrderedDict()
    global_fn = OrderedDict()

    for k in self.dataset_val.keys():
        print(k)
        properties = self.dataset[k]['properties']
        fname = properties['list_of_data_files'][0].split("/")[-1][:-12]
        if override or (not isfile(join(output_folder, fname + ".nii.gz"))):
            data = np.load(self.dataset[k]['data_file'])['data']

            print(k, data.shape)
            data[-1][data[-1] == -1] = 0

            softmax_pred = self.predict_preprocessed_data_return_softmax(
                data[:-1], do_mirroring, 1, use_train_mode, 1, mirror_axes, tiled, True, step,
                self.patch_size, use_gaussian=use_gaussian)

            if compute_global_dice:
                predicted_segmentation = softmax_pred.argmax(0)
                gt_segmentation = data[-1]
                labels = properties['classes']
                labels = [int(i) for i in labels if i > 0]
                for l in labels:
                    if l not in global_fn.keys():
                        global_fn[l] = 0
                    if l not in global_fp.keys():
                        global_fp[l] = 0
                    if l not in global_tp.keys():
                        global_tp[l] = 0
                    conf = ConfusionMatrix((predicted_segmentation == l).astype(int),
                                           (gt_segmentation == l).astype(int))
                    conf.compute()
                    global_fn[l] += conf.fn
                    global_fp[l] += conf.fp
                    global_tp[l] += conf.tp

            softmax_pred = softmax_pred.transpose([0] + [i + 1 for i in self.transpose_backward])

            if save_softmax:
                softmax_fname = join(output_folder, fname + ".npz")
            else:
                softmax_fname = None

            """There is a problem with python process communication that prevents us from communicating
            objects larger than 2 GB between processes (basically when the length of the pickle string that
            will be sent is communicated by the multiprocessing.Pipe object then the placeholder (%i, I think)
            does not allow for long enough strings (lol). This could be fixed by changing i to l (for long)
            but that would require manually patching system python code. We circumvent that problem here by
            saving softmax_pred to an npy file that will then be read (and finally deleted) by the background
            process. save_segmentation_nifti_from_softmax can take either a filename or a np.ndarray and will
            handle this automatically"""
            if np.prod(softmax_pred.shape) > (2e9 / 4 * 0.9):  # *0.9 just to be safe
                np.save(join(output_folder, fname + ".npy"), softmax_pred)
                softmax_pred = join(output_folder, fname + ".npy")

            results.append(export_pool.starmap_async(
                save_segmentation_nifti_from_softmax,
                ((softmax_pred, join(output_folder, fname + ".nii.gz"),
                  properties, 3, None, None, None, softmax_fname, None),)))
            # save_segmentation_nifti_from_softmax(softmax_pred, join(output_folder, fname + ".nii.gz"),
            #                                      properties, 3, None, None, None, softmax_fname, None)

        pred_gt_tuples.append([join(output_folder, fname + ".nii.gz"),
                               join(self.gt_niftis_folder, fname + ".nii.gz")])

    _ = [i.get() for i in results]
    print("finished prediction, now evaluating...")

    task = self.dataset_directory.split("/")[-1]
    job_name = self.experiment_name
    _ = aggregate_scores(pred_gt_tuples, labels=list(range(self.num_classes)),
                         json_output_file=join(output_folder, "summary.json"),
                         json_name=job_name + " val tiled %s" % (str(tiled)),
                         json_author="Fabian", json_task=task, num_threads=3)

    if compute_global_dice:
        global_dice = OrderedDict()
        all_labels = list(global_fn.keys())
        for l in all_labels:
            global_dice[int(l)] = float(2 * global_tp[l] / (2 * global_tp[l] + global_fn[l] + global_fp[l]))
        write_json(global_dice, join(output_folder, "global_dice.json"))
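# A minimal sketch of the global-Dice idea from the docstring above: TP, FP and FN are
# accumulated over all cases first and a single Dice is computed from the sums, instead of
# computing one Dice per case and averaging over cases. `global_foreground_dice` and the toy
# values in the comments are illustrative assumptions, not part of the project code.
def global_foreground_dice(predictions, references):
    """Accumulate TP/FP/FN over binary prediction/reference pairs, then compute one Dice."""
    tp = fp = fn = 0
    for pred, ref in zip(predictions, references):
        pred = np.asarray(pred, dtype=bool)
        ref = np.asarray(ref, dtype=bool)
        tp += int(np.sum(pred & ref))
        fp += int(np.sum(pred & ~ref))
        fn += int(np.sum(~pred & ref))
    return 2 * tp / (2 * tp + fp + fn)

# Example: preds = [[0, 1], [1, 1, 1, 0]], refs = [[1, 0], [1, 1, 1, 1]]
# per-case Dice: 0.0 and ~0.857, mean ~0.43
# global Dice:   2*3 / (2*3 + 1 + 2) ~ 0.67  -- the tiny no-overlap case no longer dominates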
class Evaluator:
    """Object that holds test and reference segmentations with label information
    and computes a number of metrics on the two. 'labels' must either be an
    iterable of numeric values (or tuples thereof) or a dictionary with string
    names and numeric values.
    """

    default_metrics = [
        "False Positive Rate",
        "Dice",
        "Jaccard",
        "Precision",
        "Recall",
        "Accuracy",
        "False Omission Rate",
        "Negative Predictive Value",
        "False Negative Rate",
        "True Negative Rate",
        "False Discovery Rate",
        "Total Positives Test",
        "Total Positives Reference"
    ]

    default_advanced_metrics = [
        "Hausdorff Distance",
        "Hausdorff Distance 95",
        "Avg. Surface Distance",
        "Avg. Symmetric Surface Distance"
    ]

    def __init__(self, test=None, reference=None, labels=None, metrics=None,
                 advanced_metrics=None, nan_for_nonexisting=True, use_label=None):
        self.test = None
        self.reference = None
        self.confusion_matrix = ConfusionMatrix()
        self.labels = None
        self.nan_for_nonexisting = nan_for_nonexisting
        self.result = None
        self.use_label = use_label

        self.metrics = []
        if metrics is None:
            for m in self.default_metrics:
                self.metrics.append(m)
        else:
            for m in metrics:
                self.metrics.append(m)

        self.advanced_metrics = []
        if advanced_metrics is None:
            for m in self.default_advanced_metrics:
                self.advanced_metrics.append(m)
        else:
            for m in advanced_metrics:
                self.advanced_metrics.append(m)

        self.set_reference(reference)
        self.set_test(test)
        if labels is not None:
            self.set_labels(labels)
        else:
            if test is not None and reference is not None:
                self.construct_labels()

    def set_test(self, test):
        """Set the test segmentation."""
        self.test = test

    def set_reference(self, reference):
        """Set the reference segmentation."""
        self.reference = reference

    def set_use_label(self, use_label):
        """Set which label to use for evaluation."""
        self.use_label = use_label

    def set_labels(self, labels):
        """Set the labels.
        :param labels: may be a dictionary (int->str), a set (of ints), a tuple (of ints)
            or a list (of ints). Labels will only have names if you pass a dictionary."""
        if isinstance(labels, dict):
            self.labels = collections.OrderedDict(labels)
        elif isinstance(labels, set):
            self.labels = list(labels)
        elif isinstance(labels, np.ndarray):
            self.labels = [i for i in labels]
        elif isinstance(labels, (list, tuple)):
            self.labels = labels
        else:
            raise TypeError(
                "Can only handle dict, list, tuple, set & numpy array, but input is of type {}"
                .format(type(labels)))

    def construct_labels(self):
        """Construct label set from unique entries in segmentations."""
        if self.test is None and self.reference is None:
            raise ValueError("No test or reference segmentations.")
        elif self.test is None:
            labels = np.unique(self.reference)
        else:
            labels = np.union1d(np.unique(self.test), np.unique(self.reference))
        self.labels = list(map(lambda x: int(x), labels))

    def set_metrics(self, metrics):
        """Set evaluation metrics."""
        if isinstance(metrics, set):
            self.metrics = list(metrics)
        elif isinstance(metrics, (list, tuple, np.ndarray)):
            self.metrics = metrics
        else:
            raise TypeError(
                "Can only handle list, tuple, set & numpy array, but input is of type {}"
                .format(type(metrics)))

    def add_metric(self, metric):
        if metric not in self.metrics:
            self.metrics.append(metric)

    def evaluate(self, test=None, reference=None, advanced=False, **metric_kwargs):
        """Compute metrics for segmentations."""
        if test is not None:
            self.set_test(test)
        if reference is not None:
            self.set_reference(reference)
        if self.test is None or self.reference is None:
            raise ValueError("Need both test and reference segmentations.")
        if self.labels is None:
            self.construct_labels()

        if self.use_label is not None:
            if self.use_label == "organ":
                self.reference = (self.reference == 1).astype(int)
            elif self.use_label == "tumor":
                self.reference = (self.reference == 2).astype(int)
            elif self.use_label == "both":
                self.reference = (self.reference >= 1).astype(int)

        self.metrics.sort()

        # get functions for evaluation
        # somewhat convoluted, but allows users to define additional metrics on the fly,
        # e.g. inside an IPython console
        _funcs = {m: ALL_METRICS[m] for m in self.metrics + self.advanced_metrics}
        frames = inspect.getouterframes(inspect.currentframe())
        for metric in self.metrics:
            for f in frames:
                if metric in f[0].f_locals:
                    _funcs[metric] = f[0].f_locals[metric]
                    break
            else:
                if metric in _funcs:
                    continue
                else:
                    raise NotImplementedError("Metric {} not implemented.".format(metric))

        # get results
        self.result = OrderedDict()

        eval_metrics = list(self.metrics)  # copy so repeated calls do not grow self.metrics
        if advanced:
            eval_metrics += self.advanced_metrics

        if isinstance(self.labels, dict):
            for label, name in self.labels.items():
                k = str(name)
                self.result[k] = OrderedDict()
                if not hasattr(label, "__iter__"):
                    self.confusion_matrix.set_test(self.test == label)
                    self.confusion_matrix.set_reference(self.reference == label)
                else:
                    current_test = 0
                    current_reference = 0
                    for l in label:
                        current_test += (self.test == l)
                        current_reference += (self.reference == l)
                    self.confusion_matrix.set_test(current_test)
                    self.confusion_matrix.set_reference(current_reference)
                for metric in eval_metrics:
                    self.result[k][metric] = _funcs[metric](
                        confusion_matrix=self.confusion_matrix,
                        nan_for_nonexisting=self.nan_for_nonexisting,
                        **metric_kwargs)
        else:
            for i, l in enumerate(self.labels):
                k = str(l)
                self.result[k] = OrderedDict()
                self.confusion_matrix.set_test(self.test == l)
                self.confusion_matrix.set_reference(self.reference == l)
                for metric in eval_metrics:
                    self.result[k][metric] = _funcs[metric](
                        confusion_matrix=self.confusion_matrix,
                        nan_for_nonexisting=self.nan_for_nonexisting,
                        **metric_kwargs)

        return self.result

    def to_dict(self):
        if self.result is None:
            self.evaluate()
        return self.result

    def to_array(self):
        """Return result as numpy array (labels x metrics)."""
        if self.result is None:
            self.evaluate()

        result_metrics = sorted(self.result[list(self.result.keys())[0]].keys())

        a = np.zeros((len(self.labels), len(result_metrics)), dtype=np.float32)

        # result keys are stored as strings in evaluate()
        if isinstance(self.labels, dict):
            for i, label in enumerate(self.labels.keys()):
                for j, metric in enumerate(result_metrics):
                    a[i][j] = self.result[str(self.labels[label])][metric]
        else:
            for i, label in enumerate(self.labels):
                for j, metric in enumerate(result_metrics):
                    a[i][j] = self.result[str(label)][metric]

        return a

    def to_pandas(self):
        """Return result as pandas DataFrame."""
        a = self.to_array()

        if isinstance(self.labels, dict):
            labels = list(self.labels.values())
        else:
            labels = self.labels

        result_metrics = sorted(self.result[list(self.result.keys())[0]].keys())

        return pd.DataFrame(a, index=labels, columns=result_metrics)
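# A short usage sketch for the Evaluator above, assuming ConfusionMatrix and ALL_METRICS from
# this project are importable. The toy arrays, the label names and the local Dice override
# are illustrative assumptions, not part of the project code.
if __name__ == "__main__":
    test = np.array([[0, 1, 1],
                     [0, 2, 2]])
    reference = np.array([[0, 1, 0],
                          [0, 2, 2]])

    evaluator = Evaluator(test=test, reference=reference, labels={1: "organ", 2: "tumor"})
    results = evaluator.evaluate()   # OrderedDict: {"organ": {...}, "tumor": {...}}
    print(evaluator.to_pandas())     # labels x metrics DataFrame

    # Because evaluate() searches the calling frames before falling back to ALL_METRICS,
    # a metric can be overridden on the fly, e.g. inside an IPython session:
    def Dice(confusion_matrix=None, nan_for_nonexisting=True, **kwargs):
        confusion_matrix.compute()
        tp, fp, fn = confusion_matrix.tp, confusion_matrix.fp, confusion_matrix.fn
        denom = 2 * tp + fp + fn
        return 2 * tp / denom if denom > 0 else float("nan")

    results = evaluator.evaluate()   # now picks up the local Dice defined above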