Exemple #1
0
def get_new_subject_split(path_folder, center_test, split_method, random_seed,
                          train_frac, test_frac, log_directory, balance):
    """Split the subjects of a dataset into training / validation / testing lists.

    The participants table of the dataset is handed to ``split_dataset`` (once for
    the whole table, or once per category of the ``balance`` column) and the
    resulting lists are saved in log_directory + "/split_datasets.joblib".

    Args:
        path_folder (string): Dataset folder.
        center_test (list): List of centers to include in the testing set.
        split_method (string): See imed_loader_utils.split_dataset.
        random_seed (int): Random seed.
        train_frac (float): Training dataset proportion, between 0 and 1.
        test_frac (float): Testing dataset proportion, between 0 and 1.
        log_directory (string): Output folder.
        balance (string): Metadata contained in "participants.tsv" file with categorical values. Each category is
            split separately so that it is distributed over the training, validation and testing datasets.

    Returns:
        list, list, list: Training, validation and testing subjects lists.
    """
    # Participants table ("participants.tsv") as a pandas dataframe
    participants = bids.BIDS(path_folder).participants.content

    # Default: a single group holding every participant. When a valid "balance"
    # column is requested, use one group per distinct value of that column.
    groups = [participants]
    if balance:
        if balance in participants.keys():
            categories = participants[balance].unique().tolist()
            groups = [
                participants[participants[balance] == category]
                for category in categories
            ]
        else:
            logger.warning(
                "No column named '{}' was found in 'participants.tsv' file. Not taken into account to split "
                "the dataset.".format(balance))

    # Accumulate the per-group splits directly into the dictionary that is saved
    split_dct = {'train': [], 'valid': [], 'test': []}
    for group in groups:
        train_part, valid_part, test_part = split_dataset(
            df=group,
            center_test_lst=center_test,
            split_method=split_method,
            random_seed=random_seed,
            train_frac=train_frac,
            test_frac=test_frac)
        split_dct['train'] += train_part
        split_dct['valid'] += valid_part
        split_dct['test'] += test_part

    # Persist the subject distribution next to the other log files
    joblib.dump(split_dct,
                os.path.join(log_directory, "split_datasets.joblib"))

    return split_dct['train'], split_dct['valid'], split_dct['test']
Exemple #2
0
def run_command(context, n_gif=0, thr_increment=None, resume_training=False):
    """Run main command.

    This function is central in the ivadomed project as training / testing / evaluation commands are run via this
    function. All the process parameters are defined in the config.

    Args:
        context (dict): Dictionary containing all parameters that are needed for a given process. See
            :doc:`configuration_file` for more details.
        n_gif (int): Generates a GIF during training if larger than zero, one frame per epoch for a given slice. The
            parameter indicates the number of 2D slices used to generate GIFs, one GIF per slice. A GIF shows
            predictions of a given slice from the validation sub-dataset. They are saved within the log directory.
        thr_increment (float): A threshold analysis is performed at the end of the training using the trained model and
            the training + validation sub-dataset to find the optimal binarization threshold. The specified value
            indicates the increment between 0 and 1 used during the ROC analysis (e.g. 0.1).
        resume_training (bool): Load a saved model ("checkpoint.pth.tar" in the log_directory) for resume training.
            This training state is saved everytime a new best model is saved in the log
            directory.

    Returns:
        If "train" command: the four values returned by ``imed_training.train``, in this order: best training
            dice, best training loss, best validation dice, best validation loss.
        If "test" command: a tuple ``(df_results, pred_metrics)`` holding the output of
            ``imed_evaluation.evaluate`` and the output of ``imed_testing.test``.
        If "segment" command (or any other command): No return value.

    Raises:
        SystemExit: If several models are marked as applied in the configuration (other than the
            Modified3DUNet + FiLMedUnet pair).
    """
    command = copy.deepcopy(context["command"])
    log_directory = copy.deepcopy(context["log_directory"])
    if not os.path.isdir(log_directory):
        print('Creating log directory: {}'.format(log_directory))
        os.makedirs(log_directory)
    else:
        print('Log directory already exists: {}'.format(log_directory))

    # Define device
    cuda_available, device = imed_utils.define_device(context['gpu'])

    # Get subject lists
    train_lst, valid_lst, test_lst = imed_loader_utils.get_subdatasets_subjects_list(
        context["split_dataset"], context['loader_parameters']['bids_path'],
        log_directory)

    # Loader params: the contrast list depends on the command being run
    loader_params = copy.deepcopy(context["loader_parameters"])
    if command == "train":
        loader_params["contrast_params"]["contrast_lst"] = loader_params[
            "contrast_params"]["training_validation"]
    else:
        loader_params["contrast_params"]["contrast_lst"] = loader_params[
            "contrast_params"]["testing"]
    if "FiLMedUnet" in context and context["FiLMedUnet"]["applied"]:
        loader_params.update(
            {"metadata_type": context["FiLMedUnet"]["metadata"]})

    # Get transforms for each subdataset
    transform_train_params, transform_valid_params, transform_test_params = \
        imed_transforms.get_subdatasets_transforms(context["transformation"])

    # MODEL PARAMETERS
    model_params = copy.deepcopy(context["default_model"])
    model_params["folder_name"] = copy.deepcopy(context["model_name"])
    # Names of the models flagged as "applied" in the configuration
    model_context_list = [
        model_name for model_name in MODEL_LIST
        if model_name in context and context[model_name]["applied"]
    ]
    if len(model_context_list) == 1:
        model_params["name"] = model_context_list[0]
        model_params.update(context[model_context_list[0]])
    elif 'Modified3DUNet' in model_context_list and 'FiLMedUnet' in model_context_list and len(
            model_context_list) == 2:
        # The only accepted combination of two models: parameters of both are merged
        model_params["name"] = 'Modified3DUNet'
        for selected_model in model_context_list:
            model_params.update(context[selected_model])
    elif len(model_context_list) > 1:
        print(
            'ERROR: Several models are selected in the configuration file: {}.'
            'Please select only one (i.e. only one where: "applied": true).'.
            format(model_context_list))
        exit()

    model_params['is_2d'] = False if "Modified3DUNet" in model_params[
        'name'] else model_params['is_2d']
    # Get in_channel from contrast_lst
    if loader_params["multichannel"]:
        model_params["in_channel"] = len(
            loader_params["contrast_params"]["contrast_lst"])
    else:
        model_params["in_channel"] = 1
    # Get out_channel from target_suffix
    model_params["out_channel"] = len(loader_params["target_suffix"])
    # If multi-class output, then add background class
    if model_params["out_channel"] > 1:
        model_params.update({"out_channel": model_params["out_channel"] + 1})
    # Display for spec' check
    imed_utils.display_selected_model_spec(params=model_params)
    # Update loader params
    if 'object_detection_params' in context:
        # NOTE: this is the dictionary stored in context (not a copy), so the
        # update below is also visible through context.
        object_detection_params = context['object_detection_params']
        object_detection_params.update({
            "gpu":
            context['gpu'],
            "log_directory":
            context['log_directory']
        })
        loader_params.update(
            {"object_detection_params": object_detection_params})

    loader_params.update({"model_params": model_params})

    # TESTING PARAMS
    # Aleatoric uncertainty: the training transforms are used at test time
    if context['uncertainty'][
            'aleatoric'] and context['uncertainty']['n_it'] > 0:
        transformation_dict = transform_train_params
    else:
        transformation_dict = transform_test_params
    undo_transforms = imed_transforms.UndoCompose(
        imed_transforms.Compose(transformation_dict, requires_undo=True))
    testing_params = copy.deepcopy(context["training_parameters"])
    testing_params.update({'uncertainty': context["uncertainty"]})
    testing_params.update({
        'target_suffix': loader_params["target_suffix"],
        'undo_transforms': undo_transforms,
        'slice_axis': loader_params['slice_axis']
    })
    if command == "train":
        imed_utils.display_selected_transfoms(transform_train_params,
                                              dataset_type=["training"])
        imed_utils.display_selected_transfoms(transform_valid_params,
                                              dataset_type=["validation"])
    elif command == "test":
        imed_utils.display_selected_transfoms(transformation_dict,
                                              dataset_type=["testing"])

    if command == 'train':
        # LOAD DATASET
        # Get Validation dataset
        ds_valid = imed_loader.load_dataset(**{
            **loader_params,
            **{
                'data_list': valid_lst,
                'transforms_params': transform_valid_params,
                'dataset_type': 'validation'
            }
        },
                                            device=device,
                                            cuda_available=cuda_available)
        # Get Training dataset
        ds_train = imed_loader.load_dataset(**{
            **loader_params,
            **{
                'data_list': train_lst,
                'transforms_params': transform_train_params,
                'dataset_type': 'training'
            }
        },
                                            device=device,
                                            cuda_available=cuda_available)

        metric_fns = imed_metrics.get_metric_fns(ds_train.task)

        # If FiLM, normalize data
        if 'film_layers' in model_params and any(model_params['film_layers']):
            # Normalize metadata before sending to the FiLM network
            results = imed_film.get_film_metadata_models(
                ds_train=ds_train,
                metadata_type=model_params['metadata'],
                debugging=context["debugging"])
            ds_train, train_onehotencoder, metadata_clustering_models = results
            ds_valid = imed_film.normalize_metadata(
                ds_valid, metadata_clustering_models, context["debugging"],
                model_params['metadata'])
            model_params.update({
                "film_onehotencoder":
                train_onehotencoder,
                "n_metadata":
                len([ll for l in train_onehotencoder.categories_ for ll in l])
            })
            # Saved with os.path.join so that the files land exactly where the
            # "test" command looks for them, including when log_directory is an
            # absolute path (prefixing it with "./" would turn it into a
            # different, relative location).
            joblib.dump(
                metadata_clustering_models,
                os.path.join(log_directory, "clustering_models.joblib"))
            joblib.dump(
                train_onehotencoder,
                os.path.join(log_directory, "one_hot_encoder.joblib"))

        # Model directory
        path_model = os.path.join(log_directory, context["model_name"])
        if not os.path.isdir(path_model):
            print('Creating model directory: {}'.format(path_model))
            os.makedirs(path_model)
            # The FiLM artifacts are only copied when the directory is first created
            if 'film_layers' in model_params and any(
                    model_params['film_layers']):
                joblib.dump(train_onehotencoder,
                            os.path.join(path_model, "one_hot_encoder.joblib"))
                if 'metadata_dict' in ds_train[0]['input_metadata'][0]:
                    metadata_dict = ds_train[0]['input_metadata'][0][
                        'metadata_dict']
                    joblib.dump(
                        metadata_dict,
                        os.path.join(path_model, "metadata_dict.joblib"))

        else:
            print('Model directory already exists: {}'.format(path_model))

        # RUN TRAINING
        best_training_dice, best_training_loss, best_validation_dice, best_validation_loss = imed_training.train(
            model_params=model_params,
            dataset_train=ds_train,
            dataset_val=ds_valid,
            training_params=context["training_parameters"],
            log_directory=log_directory,
            device=device,
            cuda_available=cuda_available,
            metric_fns=metric_fns,
            n_gif=n_gif,
            resume_training=resume_training,
            debugging=context["debugging"])

    if thr_increment:
        # LOAD DATASET
        if command != 'train':  # If command == train, then ds_valid already load
            # Get Validation dataset
            ds_valid = imed_loader.load_dataset(**{
                **loader_params,
                **{
                    'data_list': valid_lst,
                    'transforms_params': transform_valid_params,
                    'dataset_type': 'validation'
                }
            },
                                                device=device,
                                                cuda_available=cuda_available)
        # Get Training dataset with no Data Augmentation
        # (the validation transforms are applied to the training subjects)
        ds_train = imed_loader.load_dataset(**{
            **loader_params,
            **{
                'data_list': train_lst,
                'transforms_params': transform_valid_params,
                'dataset_type': 'training'
            }
        },
                                            device=device,
                                            cuda_available=cuda_available)

        # Choice of optimisation metric
        metric = "recall_specificity" if model_params[
            "name"] in imed_utils.CLASSIFIER_LIST else "dice"
        # Model path
        model_path = os.path.join(log_directory, "best_model.pt")
        # Run analysis
        thr = imed_testing.threshold_analysis(model_path=model_path,
                                              ds_lst=[ds_train, ds_valid],
                                              model_params=model_params,
                                              testing_params=testing_params,
                                              metric=metric,
                                              increment=thr_increment,
                                              fname_out=os.path.join(
                                                  log_directory, "roc.png"),
                                              cuda_available=cuda_available)

        # Update threshold in config file
        context["postprocessing"]["binarize_prediction"] = {"thr": thr}

    if command == 'train':
        # Save config file within log_directory and log_directory/model_name
        # Done after the threshold_analysis to propagate this info in the config files
        with open(os.path.join(log_directory, "config_file.json"), 'w') as fp:
            json.dump(context, fp, indent=4)
        with open(
                os.path.join(log_directory, context["model_name"],
                             context["model_name"] + ".json"), 'w') as fp:
            json.dump(context, fp, indent=4)

        return best_training_dice, best_training_loss, best_validation_dice, best_validation_loss

    if command == 'test':
        # LOAD DATASET
        ds_test = imed_loader.load_dataset(**{
            **loader_params,
            **{
                'data_list': test_lst,
                'transforms_params': transformation_dict,
                'dataset_type': 'testing',
                'requires_undo': True
            }
        },
                                           device=device,
                                           cuda_available=cuda_available)

        metric_fns = imed_metrics.get_metric_fns(ds_test.task)

        if 'film_layers' in model_params and any(model_params['film_layers']):
            # Reload the FiLM artifacts written by the "train" command
            clustering_path = os.path.join(log_directory,
                                           "clustering_models.joblib")
            metadata_clustering_models = joblib.load(clustering_path)
            ohe_path = os.path.join(log_directory, "one_hot_encoder.joblib")
            one_hot_encoder = joblib.load(ohe_path)
            ds_test = imed_film.normalize_metadata(ds_test,
                                                   metadata_clustering_models,
                                                   context["debugging"],
                                                   model_params['metadata'])
            model_params.update({
                "film_onehotencoder":
                one_hot_encoder,
                "n_metadata":
                len([ll for l in one_hot_encoder.categories_ for ll in l])
            })

        # RUN INFERENCE
        pred_metrics = imed_testing.test(
            model_params=model_params,
            dataset_test=ds_test,
            testing_params=testing_params,
            log_directory=log_directory,
            device=device,
            cuda_available=cuda_available,
            metric_fns=metric_fns,
            postprocessing=context['postprocessing'])

        # RUN EVALUATION
        df_results = imed_evaluation.evaluate(
            bids_path=loader_params['bids_path'],
            log_directory=log_directory,
            target_suffix=loader_params["target_suffix"],
            eval_params=context["evaluation_parameters"])
        return df_results, pred_metrics

    if command == 'segment':
        bids_ds = bids.BIDS(context["loader_parameters"]["bids_path"])
        df = bids_ds.participants.content
        subj_lst = df['participant_id'].tolist()
        # Keep only the subjects listed in the participants table
        bids_subjects = [
            s for s in bids_ds.get_subjects()
            if s.record["subject_id"] in subj_lst
        ]

        # Add postprocessing to packaged model
        path_model = os.path.join(context['log_directory'],
                                  context['model_name'])
        path_model_config = os.path.join(path_model,
                                         context['model_name'] + ".json")
        model_config = imed_config_manager.load_json(path_model_config)
        model_config['postprocessing'] = context['postprocessing']
        with open(path_model_config, 'w') as fp:
            json.dump(model_config, fp, indent=4)

        options = None
        for subject in bids_subjects:
            fname_img = subject.record["absolute_path"]
            if 'film_layers' in model_params and any(
                    model_params['film_layers']) and model_params['metadata']:
                # Forward the subject's metadata value to the FiLM model
                subj_id = subject.record['subject_id']
                metadata = df[df['participant_id'] == subj_id][
                    model_params['metadata']].values[0]
                options = {'metadata': metadata}
            pred = imed_inference.segment_volume(path_model,
                                                 fname_image=fname_img,
                                                 gpu_number=context['gpu'],
                                                 options=options)
            pred_path = os.path.join(context['log_directory'], "pred_masks")
            if not os.path.exists(pred_path):
                os.makedirs(pred_path)
            filename = subject.record['subject_id'] + "_" + subject.record[
                'modality'] + "_pred" + ".nii.gz"
            nib.save(pred, os.path.join(pred_path, filename))
Exemple #3
0
def run_segment_command(context, model_params):
    """Segment the images of every subject of the configured datasets.

    The postprocessing section of ``context`` is first written into the packaged
    model configuration, then ``imed_inference.segment_volume`` is called for each
    subject (or, in multichannel mode, once per subject with all the required
    contrasts) and every prediction is saved in ``<path_output>/pred_masks``.

    Args:
        context (dict): Configuration dictionary. The keys read here are
            "loader_parameters" ("path_data", "multichannel", "contrast_params"),
            "path_output", "model_name", "postprocessing" and "gpu_ids".
        model_params (dict): Model parameters. When "film_layers" has a truthy entry and
            "metadata" is set, the subject's value of that participants column is passed
            to the inference through ``options``.
    """
    path_data = imed_utils.format_path_data(
        context["loader_parameters"]["path_data"])
    bids_ds = [bids.BIDS(bids_folder) for bids_folder in path_data]

    # Get the merged df from all dataset paths
    df = imed_loader_utils.merge_bids_datasets(path_data)
    subj_lst = df['participant_id'].tolist()

    # Append subjects from all BIDSdatasets into a list
    bids_subjects = []
    for dataset in bids_ds:
        bids_subjects += [
            s for s in dataset.get_subjects()
            if s.record["subject_id"] in subj_lst
        ]

    # Add postprocessing to packaged model
    path_model = os.path.join(context['path_output'], context['model_name'])
    path_model_config = os.path.join(path_model,
                                     context['model_name'] + ".json")
    model_config = imed_config_manager.load_json(path_model_config)
    model_config['postprocessing'] = context['postprocessing']
    with open(path_model_config, 'w') as fp:
        json.dump(model_config, fp, indent=4)

    multichannel = context['loader_parameters']['multichannel']
    if multichannel:
        contrasts = context['loader_parameters']['contrast_params'][
            'testing']
        # Index the image paths by (subject id, contrast) once, up front.
        # Removing entries from bids_subjects while looping over it (as was
        # done before) makes the iteration skip subjects.
        paths_by_key = {}
        for s in bids_subjects:
            key = (s.record['subject_id'], s.record['modality'])
            paths_by_key.setdefault(key, []).append(s.record['absolute_path'])
    # Subject ids already handled in multichannel mode (one inference per subject)
    processed_ids = set()

    options = None
    for subject in bids_subjects:
        subj_id = subject.record['subject_id']
        if multichannel:
            if subj_id in processed_ids:
                continue
            processed_ids.add(subj_id)

            fname_img = []
            provided_contrasts = []
            # Keep contrast order
            for c in contrasts:
                # pop: a given image is used at most once
                for path in paths_by_key.pop((subj_id, c), []):
                    provided_contrasts.append(c)
                    fname_img.append(path)
            if len(fname_img) != len(contrasts):
                logger.warning(
                    "Missing contrast for subject {}. {} were provided but {} are required. Skipping "
                    "subject.".format(subject.record['subject_id'],
                                      provided_contrasts, contrasts))
                continue
        else:
            fname_img = [subject.record['absolute_path']]

        if 'film_layers' in model_params and any(
                model_params['film_layers']) and model_params['metadata']:
            # Forward the subject's metadata value to the FiLM model
            metadata = df[df['participant_id'] == subj_id][
                model_params['metadata']].values[0]
            options = {'metadata': metadata}
        pred_list, target_list = imed_inference.segment_volume(
            path_model,
            fname_images=fname_img,
            gpu_id=context['gpu_ids'][0],
            options=options)
        pred_path = os.path.join(context['path_output'], "pred_masks")
        if not os.path.exists(pred_path):
            os.makedirs(pred_path)

        # One output file per predicted target
        for pred, target in zip(pred_list, target_list):
            filename = subject.record['subject_id'] + "_" + subject.record['modality'] + target + "_pred" + \
                        ".nii.gz"
            nib.save(pred, os.path.join(pred_path, filename))
Exemple #4
0
    def __init__(self,
                 root_dir,
                 subject_lst,
                 target_suffix,
                 contrast_params,
                 slice_axis=2,
                 cache=True,
                 transform=None,
                 metadata_choice=False,
                 slice_filter_fn=None,
                 roi_params=None,
                 multichannel=False,
                 object_detection_params=None,
                 task="segmentation",
                 soft_gt=False):
        """Collect the (image, target, ROI, metadata) entries of a BIDS dataset.

        Every image of ``root_dir`` whose subject id is in ``subject_lst`` and whose
        modality is in ``contrast_params["contrast_lst"]`` is paired with its label
        derivatives. The resulting ``self.filename_pairs`` is passed to the parent
        constructor.

        Args:
            root_dir (str): Path of the BIDS dataset.
            subject_lst (list): Subject ids to keep.
            target_suffix (list): Suffixes of the label derivatives; a derivative is matched when its
                name ends with modality + suffix + ".nii.gz".
            contrast_params (dict): Must hold "contrast_lst" (contrasts to load) and "balance"
                (dict mapping a contrast to the maximal fraction of its images to keep).
            slice_axis (int): Forwarded to the parent constructor and to the bounding box loader.
            cache (bool): Forwarded to the parent constructor.
            transform: Forwarded to the parent constructor.
            metadata_choice (str or bool): 'mri_params', 'contrasts', the name of a column of the
                participants table, or a falsy value for no extra metadata.
            slice_filter_fn: Forwarded to the parent constructor.
            roi_params (dict): Holds "suffix" and "slice_filter_roi". Defaults to both set to None.
            multichannel (bool): If True, the contrasts of a subject are grouped in a single entry and
                only subjects having every contrast of "contrast_lst" are kept.
            object_detection_params (dict): Forwarded to ``imed_obj_detect.load_bounding_boxes``.
            task (str): Forwarded to the parent constructor.
            soft_gt (bool): Forwarded to the parent constructor.

        Raises:
            ValueError: If ``metadata_choice`` names a column missing from the participants table.
        """
        self.bids_ds = bids.BIDS(root_dir)
        self.roi_params = roi_params if roi_params is not None else {
            "suffix": None,
            "slice_filter_roi": None
        }
        self.soft_gt = soft_gt
        self.filename_pairs = []
        if metadata_choice == 'mri_params':
            self.metadata = {
                "FlipAngle": [],
                "RepetitionTime": [],
                "EchoTime": [],
                "Manufacturer": []
            }

        bids_subjects = [
            s for s in self.bids_ds.get_subjects()
            if s.record["subject_id"] in subject_lst
        ]

        # Create a dictionary with the number of subjects for each contrast of contrast_balance
        tot = {
            contrast:
            len([s for s in bids_subjects if s.record["modality"] == contrast])
            for contrast in contrast_params["balance"].keys()
        }

        # Create a counter that helps to balance the contrasts
        c = {contrast: 0 for contrast in contrast_params["balance"].keys()}

        multichannel_subjects = {}
        if multichannel:
            num_contrast = len(contrast_params["contrast_lst"])
            # Position of each contrast in the multichannel input
            idx_dict = {
                contrast: idx
                for idx, contrast in enumerate(contrast_params["contrast_lst"])
            }
            multichannel_subjects = {
                subject: {
                    "absolute_paths": [None] * num_contrast,
                    "deriv_path": None,
                    "roi_filename": None,
                    "metadata": [None] * num_contrast
                }
                for subject in subject_lst
            }

        bounding_box_dict = imed_obj_detect.load_bounding_boxes(
            object_detection_params, self.bids_ds.get_subjects(), slice_axis,
            contrast_params["contrast_lst"])

        # Participants table and the index built from it are the same for every
        # subject: they are loaded lazily, on first need, instead of re-parsing
        # the dataset at each iteration.
        participants_df = None
        metadata_index = None

        for subject in tqdm(bids_subjects, desc="Loading dataset"):
            modality = subject.record["modality"]
            if modality in contrast_params["contrast_lst"]:
                # Training & Validation: do not consider the contrasts over the threshold contained in contrast_balance
                if modality in contrast_params["balance"].keys():
                    c[modality] += 1
                    if c[modality] / tot[modality] > contrast_params[
                            "balance"][modality]:
                        continue

                if not subject.has_derivative("labels"):
                    print("Subject without derivative, skipping.")
                    continue
                derivatives = subject.get_derivatives("labels")
                target_filename = [None] * len(target_suffix)
                roi_filename = None

                for deriv in derivatives:
                    for idx, suffix in enumerate(target_suffix):
                        if deriv.endswith(modality + suffix + ".nii.gz"):
                            target_filename[idx] = deriv

                    if self.roi_params["suffix"] is not None and \
                            deriv.endswith(modality + self.roi_params["suffix"] + ".nii.gz"):
                        roi_filename = [deriv]

                # Skip when no target was found, or when a ROI is required but missing
                if (not any(target_filename)) or (
                        self.roi_params["suffix"] is not None and
                        roi_filename is None):
                    continue

                if not subject.has_metadata():
                    metadata = {}
                else:
                    metadata = subject.metadata()

                # add contrast to metadata
                metadata['contrast'] = modality

                if len(bounding_box_dict):
                    # Take only one bounding box for cropping
                    metadata['bounding_box'] = bounding_box_dict[str(
                        subject.record["absolute_path"])][0]

                if metadata_choice == 'mri_params':
                    if not all([
                            imed_film.check_isMRIparam(m, metadata, subject,
                                                       self.metadata)
                            for m in self.metadata.keys()
                    ]):
                        continue

                elif metadata_choice and metadata_choice != 'contrasts':
                    # add custom data to metadata
                    subject_id = subject.record["subject_id"]
                    if participants_df is None:
                        participants_df = self.bids_ds.participants.content
                    if metadata_choice not in participants_df.columns:
                        raise ValueError(
                            "The following metadata cannot be found in participants.tsv file: {}. "
                            "Invalid metadata choice.".format(metadata_choice))

                    metadata[metadata_choice] = participants_df[
                        participants_df['participant_id'] ==
                        subject_id][metadata_choice].values[0]

                    # Create metadata dict for OHE (value -> index, values sorted)
                    if metadata_index is None:
                        data_lst = sorted(
                            set(participants_df[metadata_choice].values))
                        metadata_index = {
                            data: idx
                            for idx, data in enumerate(data_lst)
                        }

                    # Each subject gets its own copy of the mapping
                    metadata['metadata_dict'] = dict(metadata_index)

                # Fill multichannel dictionary
                if multichannel:
                    idx = idx_dict[modality]
                    subj_id = subject.record["subject_id"]
                    multichannel_subjects[subj_id]["absolute_paths"][
                        idx] = subject.record.absolute_path
                    multichannel_subjects[subj_id][
                        "deriv_path"] = target_filename
                    multichannel_subjects[subj_id]["metadata"][idx] = metadata
                    if roi_filename:
                        multichannel_subjects[subj_id][
                            "roi_filename"] = roi_filename

                else:
                    self.filename_pairs.append(
                        ([subject.record.absolute_path], target_filename,
                         roi_filename, [metadata]))

        if multichannel:
            # Only keep the subjects for which every contrast was found
            for subject in multichannel_subjects.values():
                if None not in subject["absolute_paths"]:
                    self.filename_pairs.append(
                        (subject["absolute_paths"], subject["deriv_path"],
                         subject["roi_filename"], subject["metadata"]))

        super().__init__(self.filename_pairs, slice_axis, cache, transform,
                         slice_filter_fn, task, self.roi_params, self.soft_gt)
Exemple #5
0
    def __init__(self,
                 root_dir,
                 subject_lst,
                 target_suffix,
                 contrast_lst,
                 hdf5_name,
                 contrast_balance=None,
                 slice_axis=2,
                 metadata_choice=False,
                 slice_filter_fn=None,
                 roi_params=None,
                 transform=None,
                 object_detection_params=None,
                 soft_gt=False):
        """Collect the files of a BIDS dataset and write them into an HDF5 file.

        The HDF5 file ``hdf5_name`` is opened for writing, the images of the selected
        subjects are paired with their label derivatives in ``self.filename_pairs``,
        dataset-level attributes are stored in the file and ``self._load_filenames()``
        is finally called to save the images.

        Args:
            root_dir (str): Path of the BIDS dataset.
            subject_lst (list): Subject ids to keep.
            target_suffix (list): Suffixes of the label derivatives; a derivative is matched when its
                name ends with modality + suffix + ".nii.gz".
            contrast_lst (list): Contrasts to load.
            hdf5_name (str): Path of the HDF5 file to create (opened in "w" mode).
            contrast_balance (dict): Maps a contrast to the maximal fraction of its images to keep.
                None (default) means no balancing.
            slice_axis (int): Stored on the instance and in the HDF5 attributes.
            metadata_choice (str or bool): If 'mri_params', subjects failing
                ``imed_film.check_isMRIparam`` for any MRI parameter are skipped.
            slice_filter_fn: Stored on the instance.
            roi_params (dict): Only the "suffix" key is read here. None (default) means no ROI.
            transform: Unpacked into ``self.prepro_transforms`` and ``self.transform``.
            object_detection_params (dict): Forwarded to ``imed_obj_detect.load_bounding_boxes``.
            soft_gt (bool): Stored on the instance.
        """
        print("Starting conversion")
        # The None defaults would otherwise crash on ".keys()" / '["suffix"]' below
        if contrast_balance is None:
            contrast_balance = {}
        if roi_params is None:
            roi_params = {"suffix": None, "slice_filter_roi": None}

        # Getting all patients id
        self.bids_ds = bids.BIDS(root_dir)
        bids_subjects = [
            s for s in self.bids_ds.get_subjects()
            if s.record["subject_id"] in subject_lst
        ]
        self.soft_gt = soft_gt
        self.dt = h5py.special_dtype(vlen=str)
        # opening an hdf5 file with write access and writing metadata
        self.hdf5_file = h5py.File(hdf5_name, "w")

        list_patients = []

        self.filename_pairs = []

        if metadata_choice == 'mri_params':
            self.metadata = {
                "FlipAngle": [],
                "RepetitionTime": [],
                "EchoTime": [],
                "Manufacturer": []
            }

        # NOTE(review): transform has to be a two-item iterable; the None default
        # of the signature cannot be unpacked here - confirm what callers pass.
        self.prepro_transforms, self.transform = transform

        # Create a dictionary with the number of subjects for each contrast of contrast_balance
        tot = {
            contrast:
            len([s for s in bids_subjects if s.record["modality"] == contrast])
            for contrast in contrast_balance.keys()
        }

        # Create a counter that helps to balance the contrasts
        c = {contrast: 0 for contrast in contrast_balance.keys()}

        self.has_bounding_box = True
        bounding_box_dict = imed_obj_detect.load_bounding_boxes(
            object_detection_params, self.bids_ds.get_subjects(), slice_axis,
            contrast_lst)

        for subject in tqdm(bids_subjects, desc="Loading dataset"):
            modality = subject.record["modality"]

            if modality in contrast_lst:

                # Training & Validation: do not consider the contrasts over the threshold contained in contrast_balance
                if modality in contrast_balance.keys():
                    c[modality] += 1
                    if c[modality] / tot[modality] > contrast_balance[modality]:
                        continue

                if not subject.has_derivative("labels"):
                    print("Subject without derivative, skipping.")
                    continue
                derivatives = subject.get_derivatives("labels")

                target_filename = [None] * len(target_suffix)
                roi_filename = None

                for deriv in derivatives:
                    for idx, suffix in enumerate(target_suffix):
                        if deriv.endswith(modality + suffix + ".nii.gz"):
                            target_filename[idx] = deriv

                    if roi_params["suffix"] is not None and \
                            deriv.endswith(modality + roi_params["suffix"] + ".nii.gz"):
                        roi_filename = [deriv]

                # Skip when no target was found, or when a ROI is required but missing
                if (not any(target_filename)) or (
                        roi_params["suffix"] is not None and
                        roi_filename is None):
                    continue

                if not subject.has_metadata():
                    print("Subject without metadata.")
                    metadata = {}
                else:
                    metadata = subject.metadata()
                # add contrast to metadata
                metadata['contrast'] = modality

                if metadata_choice == 'mri_params':
                    if not all([
                            imed_film.check_isMRIparam(m, metadata)
                            for m in self.metadata.keys()
                    ]):
                        continue

                if len(bounding_box_dict):
                    # Take only one bounding box for cropping
                    metadata['bounding_box'] = bounding_box_dict[str(
                        subject.record["absolute_path"])][0]

                self.filename_pairs.append(
                    (subject.record["subject_id"],
                     [subject.record.absolute_path
                      ], target_filename, roi_filename, [metadata]))

                list_patients.append(subject.record["subject_id"])

        self.slice_axis = slice_axis
        self.slice_filter_fn = slice_filter_fn

        # Update HDF5 metadata
        self.hdf5_file.attrs.create('patients_id',
                                    list(set(list_patients)),
                                    dtype=self.dt)
        self.hdf5_file.attrs['slice_axis'] = slice_axis

        # NOTE(review): these values are hard-coded, independently of the
        # slice_filter_fn argument - confirm this is intended.
        self.hdf5_file.attrs['slice_filter_fn'] = [
            ('filter_empty_input', True), ('filter_empty_mask', False)
        ]
        self.hdf5_file.attrs['metadata_choice'] = metadata_choice

        # Save images into HDF5 file
        self._load_filenames()
        print("Files loaded.")
Exemple #6
0
def get_new_subject_split(path_folder, center_test, split_method, random_seed,
                          train_frac, test_frac, log_directory, balance, subject_selection=None):
    """Split the subjects of a BIDS dataset into training / validation / testing lists.

    The participants table of the dataset is optionally sub-sampled (``subject_selection``),
    optionally partitioned by the values of one column (``balance``), and each partition is
    passed to ``split_dataset``. The resulting lists are concatenated and saved as a dictionary
    with keys 'train', 'valid' and 'test' in log_directory + "/split_datasets.joblib".

    Args:
        path_folder (string): Dataset folder.
        center_test (list): List of centers to include in the testing set.
        split_method (string): See imed_loader_utils.split_dataset.
        random_seed (int): Random seed, used both for the sub-sampling and for the split.
        train_frac (float): Training dataset proportion, between 0 and 1.
        test_frac (float): Testing dataset proportion, between 0 and 1.
        log_directory (string): Output folder.
        balance (string): Column of the "participants.tsv" file holding categorical values. The split is
            performed separately for each category. If the column does not exist, a warning is logged and
            the parameter is ignored.
        subject_selection (dict): Custom subject selection. Expected keys are "metadata", "n" and "value",
            each holding a list; the i-th entries select "n" random rows whose "metadata" column equals
            "value".

    Returns:
        list, list, list: Training, validation and testing subjects lists.

    Raises:
        ValueError: If the lists in ``subject_selection`` do not all have the same length.
    """
    # Participants table (participants.tsv) as a pandas dataframe
    participants = bids.BIDS(path_folder).participants.content

    if subject_selection is not None:
        # The three lists are consumed in lockstep, so they must have the same length
        if (len(subject_selection["metadata"]) != len(subject_selection["n"])
                or len(subject_selection["n"]) != len(subject_selection["value"])):
            raise ValueError("All lists in subject_selection parameter should have the same length.")

        criteria = zip(subject_selection["metadata"], subject_selection["n"], subject_selection["value"])
        sampled = [
            participants[participants[column] == wanted].sample(n=count, random_state=random_seed)
            for column, count, wanted in criteria
        ]

        # An empty selection leaves the participants table untouched
        if sampled:
            participants = pd.concat(sampled)

    # One dataframe per category of the "balance" column, or the whole table when not balancing
    if balance and balance in participants.keys():
        categories = participants[balance].unique().tolist()
        groups = [participants[participants[balance] == category] for category in categories]
    else:
        if balance:
            logger.warning("No column named '{}' was found in 'participants.tsv' file. Not taken into account to split "
                           "the dataset.".format(balance))
        groups = [participants]

    train_lst, valid_lst, test_lst = [], [], []
    for group in groups:
        # Split each group independently, then accumulate the results
        train_part, valid_part, test_part = split_dataset(
            df=group,
            center_test_lst=center_test,
            split_method=split_method,
            random_seed=random_seed,
            train_frac=train_frac,
            test_frac=test_frac,
        )
        train_lst.extend(train_part)
        valid_lst.extend(valid_part)
        test_lst.extend(test_part)

    # Persist the subject distribution next to the logs
    joblib.dump({'train': train_lst, 'valid': valid_lst, 'test': test_lst},
                os.path.join(log_directory, "split_datasets.joblib"))

    return train_lst, valid_lst, test_lst
Exemple #7
0
def merge_bids_datasets(path_data):
    """Read the participants.tsv from one or several BIDS folders and merge them into a single dataframe.

    A "path_output" column is added (as the last column) to record which BIDS folder each subject
    comes from. Rows are de-duplicated on "participant_id", keeping the first occurrence, and
    remaining NaNs are replaced with the string "-".

    Args:
        path_data (list) or (str): BIDS folders paths

    Returns:
        df: dataframe with merged subjects and columns

    Raises:
        ValueError: If no dataset folder is provided.
    """
    path_data = imed_utils.format_path_data(path_data)

    if not path_data:
        raise ValueError("No dataset folder selected")

    if len(path_data) == 1:
        # read participants.tsv as pandas dataframe
        df = bids.BIDS(path_data[0]).participants.content
        # Append a new column to show which dataset the Subjects belong to (this will be used later for loading)
        df['path_output'] = [path_data[0]] * len(df)
    else:
        # Merge multiple .tsv files into the same dataframe
        df = None
        for folder in path_data:
            df_next = pd.read_table(os.path.join(folder, 'participants.tsv'),
                                    encoding="ISO-8859-1")
            # Convert to string to get rid of potential TypeError during merging within the same column
            df_next = df_next.astype(str)
            # Add the BIDS path to the dataframe
            df_next['path_output'] = [folder] * len(df_next)
            # Merge the .tsv files (this also keeps non-overlapping fields)
            df = df_next if df is None else pd.merge(left=df, right=df_next, how='outer')

    # Get rid of duplicate entries based on the field "participant_id" (the same subject could in theory be
    # included in several datasets): only the first row encountered for each participant is kept.
    # NOTE(review): an outer merge sorts the join keys, so "first" follows the merged row order, which is not
    # necessarily the order of the folders in path_data -- confirm this matches the intended precedence.
    # drop_duplicates works on values, so it does not depend on the dataframe having a default integer index.
    df = df.drop_duplicates(subset='participant_id', keep='first')

    # Rearrange the bids paths to be last column of the dataframe
    cols = [col for col in df.columns if col != "path_output"]
    df = df[cols + ["path_output"]]

    # Substitute NaNs with string: "-". This helps with metadata selection
    df = df.fillna("-")

    return df
# Script: build a train/validation/test split in which every subject previously used for
# testing (according to the pickled table below) stays in the testing set.

# Pickled table of the subjects already used. Its 'train_valid_test' column encodes the set:
# 1 for training, 2 for validating, 3 for testing.
subjectsUsedFile = '/home/nas/Desktop/dataset-training-sct.pkl'  # train_valid_test: 1 for training, 2 for validating, 3 for testing

# Output file
outputFile = '/home/nas/Consulting/ivado-project/Datasets/merged_SCTLARGE_MULTISUBJECT/split_datasets_converted.joblib'

# NOTE(review): read_pickle can execute arbitrary code; only use it on a trusted file.
dataUsedOnSct = pd.read_pickle(subjectsUsedFile)

# Subjects flagged as testing (train_valid_test == 3) in the pickled table
subjectsUsedForTesting = dataUsedOnSct[
    dataUsedOnSct['train_valid_test'] == 3]['subject'].to_list(
    )  # These must end up in the testing set only, never in the training or validation sets

# Load the merged participants.tsv
merged_folder = '/home/nas/Consulting/ivado-project/Datasets/merged_SCTLARGE_MULTISUBJECT/'
df_merged = bids.BIDS(merged_folder).participants.content

# Fractions used further down to compute the split boundaries of the shuffled remaining subjects,
# so that no previously-tested subject lands in the training or validation lists
percentage_train = 0.6
percentage_validation = 0.2

# Whatever was used in sct testing, will stay in the testing side of the joblib as well
# (rows whose 'data_id' appears in subjectsUsedForTesting)
test = df_merged[np.in1d(df_merged['data_id'], subjectsUsedForTesting)]
# Keep only the rest of the subjects for splitting to training/validation/testing sets
df_merged_reduced = df_merged[np.invert(
    np.in1d(df_merged['data_id'], subjectsUsedForTesting))]

train, validate, test2 = np.split(df_merged_reduced.sample(frac=1), [
    int(percentage_train * (len(df_merged_reduced) + len(test) / 2)),
    int((percentage_train + percentage_validation) * len(df_merged_reduced) +
        len(test) / 2)