def get_new_subject_split(path_folder, center_test, split_method, random_seed,
                          train_frac, test_frac, log_directory, balance):
    """Randomly split dataset between training / validation / testing.

    Randomly split dataset between training / validation / testing
    and save it in log_directory + "/split_datasets.joblib".

    Args:
        path_folder (string): Dataset folder.
        center_test (list): List of centers to include in the testing set.
        split_method (string): See imed_loader_utils.split_dataset.
        random_seed (int): Random seed.
        train_frac (float): Training dataset proportion, between 0 and 1.
        test_frac (float): Testing dataset proportion, between 0 and 1.
        log_directory (string): Output folder.
        balance (string): Metadata contained in "participants.tsv" file with
            categorical values. Each category is split separately, so that it
            is distributed across the training, validation and testing
            datasets. Subjects with no value in this column form their own
            group. If the column does not exist, a warning is logged and the
            dataset is split as a whole.

    Returns:
        list, list, list: Training, validation and testing subjects lists.
    """
    # read participants.tsv as pandas dataframe
    df = bids.BIDS(path_folder).participants.content

    # If balance, then split the dataframe for each categorical value of the "balance" column
    if balance and balance in df.keys():
        balance_column = df[balance]
        # NaN never compares equal to itself, so missing values are handled
        # explicitly instead of through the `== value` selection.
        df_list = [df[balance_column == value]
                   for value in balance_column.dropna().unique().tolist()]
        df_missing = df[balance_column.isna()]
        if len(df_missing) > 0:
            logger.warning("{} subject(s) have no value in column '{}' of 'participants.tsv' file. They are split "
                           "as a separate group.".format(len(df_missing), balance))
            df_list.append(df_missing)
    else:
        if balance:
            logger.warning("No column named '{}' was found in 'participants.tsv' file. Not taken into account to split "
                           "the dataset.".format(balance))
        df_list = [df]

    train_lst, valid_lst, test_lst = [], [], []
    for df_tmp in df_list:
        # Split dataset on each section of subjects
        train_tmp, valid_tmp, test_tmp = split_dataset(df=df_tmp,
                                                       center_test_lst=center_test,
                                                       split_method=split_method,
                                                       random_seed=random_seed,
                                                       train_frac=train_frac,
                                                       test_frac=test_frac)
        # Update the dataset lists
        train_lst += train_tmp
        valid_lst += valid_tmp
        test_lst += test_tmp

    # save the subject distribution
    split_dct = {'train': train_lst, 'valid': valid_lst, 'test': test_lst}
    split_path = os.path.join(log_directory, "split_datasets.joblib")
    joblib.dump(split_dct, split_path)

    return train_lst, valid_lst, test_lst
def run_command(context, n_gif=0, thr_increment=None, resume_training=False):
    """Run main command.

    This function is central in the ivadomed project as training / testing / evaluation commands are run via this
    function. All the process parameters are defined in the config.

    Args:
        context (dict): Dictionary containing all parameters that are needed for a given process. See
            :doc:`configuration_file` for more details.
        n_gif (int): Generates a GIF during training if larger than zero, one frame per epoch for a given slice. The
            parameter indicates the number of 2D slices used to generate GIFs, one GIF per slice. A GIF shows
            predictions of a given slice from the validation sub-dataset. They are saved within the log directory.
        thr_increment (float): A threshold analysis is performed at the end of the training using the trained model and
            the training + validation sub-dataset to find the optimal binarization threshold. The specified value
            indicates the increment between 0 and 1 used during the ROC analysis (e.g. 0.1).
        resume_training (bool): Load a saved model ("checkpoint.pth.tar" in the log_directory) for resume training.
            This training state is saved everytime a new best model is saved in the log directory.

    Returns:
        If "train" command: tuple of four values (best training dice, best training loss, best validation dice,
        best validation loss) as returned by ``imed_training.train``.
        If "test" command: tuple ``(df_results, pred_metrics)``: the evaluation results returned by
        ``imed_evaluation.evaluate`` and the prediction metrics returned by ``imed_testing.test``.
        If "segment" command: No return value.
    """
    command = copy.deepcopy(context["command"])
    log_directory = copy.deepcopy(context["log_directory"])
    if not os.path.isdir(log_directory):
        print('Creating log directory: {}'.format(log_directory))
        os.makedirs(log_directory)
    else:
        print('Log directory already exists: {}'.format(log_directory))

    # Define device
    cuda_available, device = imed_utils.define_device(context['gpu'])

    # Get subject lists
    train_lst, valid_lst, test_lst = imed_loader_utils.get_subdatasets_subjects_list(
        context["split_dataset"],
        context['loader_parameters']['bids_path'],
        log_directory)

    # Loader params
    loader_params = copy.deepcopy(context["loader_parameters"])
    if command == "train":
        loader_params["contrast_params"]["contrast_lst"] = loader_params["contrast_params"]["training_validation"]
    else:
        loader_params["contrast_params"]["contrast_lst"] = loader_params["contrast_params"]["testing"]
    if "FiLMedUnet" in context and context["FiLMedUnet"]["applied"]:
        loader_params.update({"metadata_type": context["FiLMedUnet"]["metadata"]})

    # Get transforms for each subdataset
    transform_train_params, transform_valid_params, transform_test_params = \
        imed_transforms.get_subdatasets_transforms(context["transformation"])

    # MODEL PARAMETERS
    model_params = copy.deepcopy(context["default_model"])
    model_params["folder_name"] = copy.deepcopy(context["model_name"])
    model_context_list = [model_name for model_name in MODEL_LIST
                          if model_name in context and context[model_name]["applied"]]
    if len(model_context_list) == 1:
        model_params["name"] = model_context_list[0]
        model_params.update(context[model_context_list[0]])
    elif 'Modified3DUNet' in model_context_list and 'FiLMedUnet' in model_context_list \
            and len(model_context_list) == 2:
        # The only accepted combination of two models: a 3D U-Net with FiLM.
        model_params["name"] = 'Modified3DUNet'
        for i in range(len(model_context_list)):
            model_params.update(context[model_context_list[i]])
    elif len(model_context_list) > 1:
        print('ERROR: Several models are selected in the configuration file: {}.'
              'Please select only one (i.e. only one where: "applied": true).'.format(model_context_list))
        exit()

    model_params['is_2d'] = False if "Modified3DUNet" in model_params['name'] else model_params['is_2d']
    # Get in_channel from contrast_lst
    if loader_params["multichannel"]:
        model_params["in_channel"] = len(loader_params["contrast_params"]["contrast_lst"])
    else:
        model_params["in_channel"] = 1
    # Get out_channel from target_suffix
    model_params["out_channel"] = len(loader_params["target_suffix"])
    # If multi-class output, then add background class
    if model_params["out_channel"] > 1:
        model_params.update({"out_channel": model_params["out_channel"] + 1})
    # Display for spec' check
    imed_utils.display_selected_model_spec(params=model_params)
    # Update loader params
    if 'object_detection_params' in context:
        # Note: this updates the dict stored in `context` in place.
        object_detection_params = context['object_detection_params']
        object_detection_params.update({"gpu": context['gpu'],
                                        "log_directory": context['log_directory']})
        loader_params.update({"object_detection_params": object_detection_params})

    loader_params.update({"model_params": model_params})

    # TESTING PARAMS
    # Aleatoric uncertainty
    if context['uncertainty']['aleatoric'] and context['uncertainty']['n_it'] > 0:
        transformation_dict = transform_train_params
    else:
        transformation_dict = transform_test_params
    undo_transforms = imed_transforms.UndoCompose(imed_transforms.Compose(transformation_dict, requires_undo=True))
    testing_params = copy.deepcopy(context["training_parameters"])
    testing_params.update({'uncertainty': context["uncertainty"]})
    testing_params.update({'target_suffix': loader_params["target_suffix"],
                           'undo_transforms': undo_transforms,
                           'slice_axis': loader_params['slice_axis']})
    if command == "train":
        imed_utils.display_selected_transfoms(transform_train_params, dataset_type=["training"])
        imed_utils.display_selected_transfoms(transform_valid_params, dataset_type=["validation"])
    elif command == "test":
        imed_utils.display_selected_transfoms(transformation_dict, dataset_type=["testing"])

    if command == 'train':
        # LOAD DATASET
        # Get Validation dataset
        ds_valid = imed_loader.load_dataset(**{**loader_params,
                                               **{'data_list': valid_lst,
                                                  'transforms_params': transform_valid_params,
                                                  'dataset_type': 'validation'}},
                                            device=device, cuda_available=cuda_available)
        # Get Training dataset
        ds_train = imed_loader.load_dataset(**{**loader_params,
                                               **{'data_list': train_lst,
                                                  'transforms_params': transform_train_params,
                                                  'dataset_type': 'training'}},
                                            device=device, cuda_available=cuda_available)
        metric_fns = imed_metrics.get_metric_fns(ds_train.task)

        # If FiLM, normalize data
        if 'film_layers' in model_params and any(model_params['film_layers']):
            # Normalize metadata before sending to the FiLM network
            results = imed_film.get_film_metadata_models(ds_train=ds_train,
                                                         metadata_type=model_params['metadata'],
                                                         debugging=context["debugging"])
            ds_train, train_onehotencoder, metadata_clustering_models = results
            ds_valid = imed_film.normalize_metadata(ds_valid, metadata_clustering_models, context["debugging"],
                                                    model_params['metadata'])
            model_params.update({"film_onehotencoder": train_onehotencoder,
                                 "n_metadata": len([value
                                                    for categories in train_onehotencoder.categories_
                                                    for value in categories])})
            # Save next to the other outputs with os.path.join: the "test" command reloads these files from
            # os.path.join(log_directory, ...), and prefixing "./" would break absolute log directories.
            joblib.dump(metadata_clustering_models, os.path.join(log_directory, "clustering_models.joblib"))
            joblib.dump(train_onehotencoder, os.path.join(log_directory, "one_hot_encoder.joblib"))

        # Model directory
        path_model = os.path.join(log_directory, context["model_name"])
        if not os.path.isdir(path_model):
            print('Creating model directory: {}'.format(path_model))
            os.makedirs(path_model)
            if 'film_layers' in model_params and any(model_params['film_layers']):
                joblib.dump(train_onehotencoder, os.path.join(path_model, "one_hot_encoder.joblib"))
                if 'metadata_dict' in ds_train[0]['input_metadata'][0]:
                    metadata_dict = ds_train[0]['input_metadata'][0]['metadata_dict']
                    joblib.dump(metadata_dict, os.path.join(path_model, "metadata_dict.joblib"))
        else:
            print('Model directory already exists: {}'.format(path_model))

        # RUN TRAINING
        best_training_dice, best_training_loss, best_validation_dice, best_validation_loss = imed_training.train(
            model_params=model_params,
            dataset_train=ds_train,
            dataset_val=ds_valid,
            training_params=context["training_parameters"],
            log_directory=log_directory,
            device=device,
            cuda_available=cuda_available,
            metric_fns=metric_fns,
            n_gif=n_gif,
            resume_training=resume_training,
            debugging=context["debugging"])

    if thr_increment:
        # LOAD DATASET
        if command != 'train':  # If command == train, then ds_valid already load
            # Get Validation dataset
            ds_valid = imed_loader.load_dataset(**{**loader_params,
                                                   **{'data_list': valid_lst,
                                                      'transforms_params': transform_valid_params,
                                                      'dataset_type': 'validation'}},
                                                device=device, cuda_available=cuda_available)
        # Get Training dataset with no Data Augmentation
        ds_train = imed_loader.load_dataset(**{**loader_params,
                                               **{'data_list': train_lst,
                                                  'transforms_params': transform_valid_params,
                                                  'dataset_type': 'training'}},
                                            device=device, cuda_available=cuda_available)

        # Choice of optimisation metric
        metric = "recall_specificity" if model_params["name"] in imed_utils.CLASSIFIER_LIST else "dice"
        # Model path
        model_path = os.path.join(log_directory, "best_model.pt")
        # Run analysis
        thr = imed_testing.threshold_analysis(model_path=model_path,
                                              ds_lst=[ds_train, ds_valid],
                                              model_params=model_params,
                                              testing_params=testing_params,
                                              metric=metric,
                                              increment=thr_increment,
                                              fname_out=os.path.join(log_directory, "roc.png"),
                                              cuda_available=cuda_available)

        # Update threshold in config file
        context["postprocessing"]["binarize_prediction"] = {"thr": thr}

    if command == 'train':
        # Save config file within log_directory and log_directory/model_name
        # Done after the threshold_analysis to propagate this info in the config files
        with open(os.path.join(log_directory, "config_file.json"), 'w') as fp:
            json.dump(context, fp, indent=4)
        with open(os.path.join(log_directory, context["model_name"], context["model_name"] + ".json"), 'w') as fp:
            json.dump(context, fp, indent=4)

        return best_training_dice, best_training_loss, best_validation_dice, best_validation_loss

    if command == 'test':
        # LOAD DATASET
        ds_test = imed_loader.load_dataset(**{**loader_params,
                                              **{'data_list': test_lst,
                                                 'transforms_params': transformation_dict,
                                                 'dataset_type': 'testing',
                                                 'requires_undo': True}},
                                           device=device, cuda_available=cuda_available)

        metric_fns = imed_metrics.get_metric_fns(ds_test.task)

        if 'film_layers' in model_params and any(model_params['film_layers']):
            # Reload the models fitted on the training set during the "train" command
            clustering_path = os.path.join(log_directory, "clustering_models.joblib")
            metadata_clustering_models = joblib.load(clustering_path)
            ohe_path = os.path.join(log_directory, "one_hot_encoder.joblib")
            one_hot_encoder = joblib.load(ohe_path)
            ds_test = imed_film.normalize_metadata(ds_test, metadata_clustering_models, context["debugging"],
                                                   model_params['metadata'])
            model_params.update({"film_onehotencoder": one_hot_encoder,
                                 "n_metadata": len([value
                                                    for categories in one_hot_encoder.categories_
                                                    for value in categories])})

        # RUN INFERENCE
        pred_metrics = imed_testing.test(model_params=model_params,
                                         dataset_test=ds_test,
                                         testing_params=testing_params,
                                         log_directory=log_directory,
                                         device=device,
                                         cuda_available=cuda_available,
                                         metric_fns=metric_fns,
                                         postprocessing=context['postprocessing'])

        # RUN EVALUATION
        df_results = imed_evaluation.evaluate(bids_path=loader_params['bids_path'],
                                              log_directory=log_directory,
                                              target_suffix=loader_params["target_suffix"],
                                              eval_params=context["evaluation_parameters"])
        return df_results, pred_metrics

    if command == 'segment':
        bids_ds = bids.BIDS(context["loader_parameters"]["bids_path"])
        df = bids_ds.participants.content
        subj_lst = df['participant_id'].tolist()
        bids_subjects = [s for s in bids_ds.get_subjects() if s.record["subject_id"] in subj_lst]

        # Add postprocessing to packaged model
        path_model = os.path.join(context['log_directory'], context['model_name'])
        path_model_config = os.path.join(path_model, context['model_name'] + ".json")
        model_config = imed_config_manager.load_json(path_model_config)
        model_config['postprocessing'] = context['postprocessing']
        with open(path_model_config, 'w') as fp:
            json.dump(model_config, fp, indent=4)

        options = None
        for subject in bids_subjects:
            fname_img = subject.record["absolute_path"]
            if 'film_layers' in model_params and any(model_params['film_layers']) and model_params['metadata']:
                subj_id = subject.record['subject_id']
                metadata = df[df['participant_id'] == subj_id][model_params['metadata']].values[0]
                options = {'metadata': metadata}
            pred = imed_inference.segment_volume(path_model,
                                                 fname_image=fname_img,
                                                 gpu_number=context['gpu'],
                                                 options=options)
            pred_path = os.path.join(context['log_directory'], "pred_masks")
            if not os.path.exists(pred_path):
                os.makedirs(pred_path)
            filename = subject.record['subject_id'] + "_" + subject.record['modality'] + "_pred" + ".nii.gz"
            nib.save(pred, os.path.join(pred_path, filename))
def run_segment_command(context, model_params):
    """Segment the subjects of one or several BIDS folders with a packaged model.

    The postprocessing section of ``context`` is first written into the packaged model configuration file
    (``<path_output>/<model_name>/<model_name>.json``). Each prediction returned by
    ``imed_inference.segment_volume`` is then saved under ``<path_output>/pred_masks``.

    Args:
        context (dict): Configuration dictionary. Keys read here: ``loader_parameters`` (``path_data``,
            ``multichannel``, ``contrast_params["testing"]``), ``path_output``, ``model_name``, ``postprocessing``
            and ``gpu_ids`` (only the first id is used).
        model_params (dict): Model parameters. When ``film_layers`` contains a truthy value and ``metadata`` is set,
            the subject's value of that ``participants.tsv`` column is passed to the inference as
            ``options={'metadata': ...}``.
    """
    path_data = imed_utils.format_path_data(context["loader_parameters"]["path_data"])
    bids_ds = [bids.BIDS(bids_folder) for bids_folder in path_data]

    # Get the merged df from all dataset paths
    df = imed_loader_utils.merge_bids_datasets(path_data)
    subj_lst = df['participant_id'].tolist()

    # Append subjects from all BIDSdatasets into a list
    bids_subjects = []
    for dataset in bids_ds:
        bids_subjects += [s for s in dataset.get_subjects() if s.record["subject_id"] in subj_lst]

    # Add postprocessing to packaged model
    path_model = os.path.join(context['path_output'], context['model_name'])
    path_model_config = os.path.join(path_model, context['model_name'] + ".json")
    model_config = imed_config_manager.load_json(path_model_config)
    model_config['postprocessing'] = context['postprocessing']
    with open(path_model_config, 'w') as fp:
        json.dump(model_config, fp, indent=4)

    multichannel = context['loader_parameters']['multichannel']
    pred_path = os.path.join(context['path_output'], "pred_masks")
    # Subject ids already handled in multichannel mode. Tracking them here replaces removing entries from
    # `bids_subjects` while iterating over it, which made the loop skip subjects.
    processed_subjects = set()
    options = None

    for subject in bids_subjects:
        subj_id = subject.record['subject_id']

        if multichannel:
            # All the entries (one per contrast) of a subject are consumed by the first one encountered
            if subj_id in processed_subjects:
                continue
            processed_subjects.add(subj_id)

            fname_img = []
            provided_contrasts = []
            contrasts = context['loader_parameters']['contrast_params']['testing']
            # Keep contrast order
            for c in contrasts:
                for s in bids_subjects:
                    if s.record['subject_id'] == subj_id and s.record['modality'] == c:
                        provided_contrasts.append(c)
                        fname_img.append(s.record['absolute_path'])

            if len(fname_img) != len(contrasts):
                logger.warning("Missing contrast for subject {}. {} were provided but {} are required. Skipping "
                               "subject.".format(subj_id, provided_contrasts, contrasts))
                continue
        else:
            fname_img = [subject.record['absolute_path']]

        if 'film_layers' in model_params and any(model_params['film_layers']) and model_params['metadata']:
            metadata = df[df['participant_id'] == subj_id][model_params['metadata']].values[0]
            options = {'metadata': metadata}

        pred_list, target_list = imed_inference.segment_volume(path_model,
                                                               fname_images=fname_img,
                                                               gpu_id=context['gpu_ids'][0],
                                                               options=options)
        os.makedirs(pred_path, exist_ok=True)

        for pred, target in zip(pred_list, target_list):
            filename = subject.record['subject_id'] + "_" + subject.record['modality'] + target + "_pred" + \
                       ".nii.gz"
            nib.save(pred, os.path.join(pred_path, filename))
def __init__(self, root_dir, subject_lst, target_suffix, contrast_params, slice_axis=2, cache=True,
             transform=None, metadata_choice=False, slice_filter_fn=None, roi_params=None,
             multichannel=False, object_detection_params=None, task="segmentation", soft_gt=False):
    """Collect the (image, target, ROI, metadata) filenames of a BIDS dataset and initialise the parent dataset.

    Args:
        root_dir (str): Path of the BIDS dataset.
        subject_lst (list): Subject ids to include.
        target_suffix (list): Suffixes of the derivative files used as targets, one per target.
        contrast_params (dict): Must contain ``"contrast_lst"`` (contrasts to load) and ``"balance"`` (dict mapping
            a contrast to the maximal fraction of its subjects to keep).
        slice_axis (int): Forwarded to the parent constructor.
        cache (bool): Forwarded to the parent constructor.
        transform: Forwarded to the parent constructor.
        metadata_choice (str or bool): ``'mri_params'``, ``'contrasts'``, the name of a ``participants.tsv`` column,
            or a falsy value for no metadata.
        slice_filter_fn: Forwarded to the parent constructor.
        roi_params (dict): Dictionary with keys ``"suffix"`` and ``"slice_filter_roi"``. Defaults to both ``None``.
        multichannel (bool): If True, the contrasts of a subject are grouped into a single entry, kept only when
            every contrast of ``contrast_lst`` is available.
        object_detection_params (dict): Forwarded to ``imed_obj_detect.load_bounding_boxes``.
        task (str): Forwarded to the parent constructor.
        soft_gt (bool): Forwarded to the parent constructor.

    Raises:
        ValueError: If ``metadata_choice`` names a column that is missing from ``participants.tsv``.
    """
    self.bids_ds = bids.BIDS(root_dir)
    self.roi_params = roi_params if roi_params is not None else {"suffix": None, "slice_filter_roi": None}
    self.soft_gt = soft_gt
    self.filename_pairs = []
    if metadata_choice == 'mri_params':
        self.metadata = {"FlipAngle": [], "RepetitionTime": [], "EchoTime": [], "Manufacturer": []}

    bids_subjects = [s for s in self.bids_ds.get_subjects() if s.record["subject_id"] in subject_lst]

    # Create a dictionary with the number of subjects for each contrast of contrast_balance
    tot = {contrast: len([s for s in bids_subjects if s.record["modality"] == contrast])
           for contrast in contrast_params["balance"].keys()}

    # Create a counter that helps to balance the contrasts
    c = {contrast: 0 for contrast in contrast_params["balance"].keys()}

    multichannel_subjects = {}
    if multichannel:
        num_contrast = len(contrast_params["contrast_lst"])
        idx_dict = {}
        for idx, contrast in enumerate(contrast_params["contrast_lst"]):
            idx_dict[contrast] = idx
        multichannel_subjects = {subject: {"absolute_paths": [None] * num_contrast,
                                           "deriv_path": None,
                                           "roi_filename": None,
                                           "metadata": [None] * num_contrast} for subject in subject_lst}

    bounding_box_dict = imed_obj_detect.load_bounding_boxes(object_detection_params,
                                                            self.bids_ds.get_subjects(),
                                                            slice_axis,
                                                            contrast_params["contrast_lst"])

    # participants.tsv content and the sorted values of the chosen column: loaded on first use, then reused for
    # every subject instead of re-parsing the BIDS folder at each iteration.
    participants_df = None
    metadata_values = None

    for subject in tqdm(bids_subjects, desc="Loading dataset"):
        modality = subject.record["modality"]
        if modality not in contrast_params["contrast_lst"]:
            continue

        # Training & Validation: do not consider the contrasts over the threshold contained in contrast_balance
        if modality in contrast_params["balance"].keys():
            c[modality] = c[modality] + 1
            if c[modality] / tot[modality] > contrast_params["balance"][modality]:
                continue

        if not subject.has_derivative("labels"):
            print("Subject without derivative, skipping.")
            continue
        derivatives = subject.get_derivatives("labels")
        target_filename, roi_filename = [None] * len(target_suffix), None

        for deriv in derivatives:
            for idx, suffix in enumerate(target_suffix):
                if deriv.endswith(modality + suffix + ".nii.gz"):
                    target_filename[idx] = deriv

            if not (self.roi_params["suffix"] is None) and \
                    deriv.endswith(modality + self.roi_params["suffix"] + ".nii.gz"):
                roi_filename = [deriv]

        # Skip the subject when no target was found, or when a ROI is requested but missing
        if (not any(target_filename)) or (not (self.roi_params["suffix"] is None) and (roi_filename is None)):
            continue

        if not subject.has_metadata():
            metadata = {}
        else:
            metadata = subject.metadata()

        # add contrast to metadata
        metadata['contrast'] = modality

        if len(bounding_box_dict):
            # Take only one bounding box for cropping
            metadata['bounding_box'] = bounding_box_dict[str(subject.record["absolute_path"])][0]

        if metadata_choice == 'mri_params':
            # A list (not a generator) so that check_isMRIparam is called for every parameter
            if not all([imed_film.check_isMRIparam(m, metadata, subject, self.metadata)
                        for m in self.metadata.keys()]):
                continue

        elif metadata_choice and metadata_choice != 'contrasts':
            # add custom data to metadata
            subject_id = subject.record["subject_id"]
            if participants_df is None:
                participants_df = bids.BIDS(root_dir).participants.content
            if metadata_choice not in participants_df.columns:
                raise ValueError("The following metadata cannot be found in participants.tsv file: {}. "
                                 "Invalid metadata choice.".format(metadata_choice))
            if metadata_values is None:
                metadata_values = sorted(set(participants_df[metadata_choice].values))

            metadata[metadata_choice] = participants_df[
                participants_df['participant_id'] == subject_id][metadata_choice].values[0]

            # Create metadata dict for OHE (a fresh dict for each subject)
            metadata['metadata_dict'] = {data: idx for idx, data in enumerate(metadata_values)}

        # Fill multichannel dictionary
        if multichannel:
            idx = idx_dict[modality]
            subj_id = subject.record["subject_id"]
            multichannel_subjects[subj_id]["absolute_paths"][idx] = subject.record.absolute_path
            multichannel_subjects[subj_id]["deriv_path"] = target_filename
            multichannel_subjects[subj_id]["metadata"][idx] = metadata
            if roi_filename:
                multichannel_subjects[subj_id]["roi_filename"] = roi_filename
        else:
            self.filename_pairs.append(([subject.record.absolute_path],
                                        target_filename, roi_filename, [metadata]))

    if multichannel:
        for subject in multichannel_subjects.values():
            # Keep only the subjects for which every contrast was found
            if None not in subject["absolute_paths"]:
                self.filename_pairs.append((subject["absolute_paths"], subject["deriv_path"],
                                            subject["roi_filename"], subject["metadata"]))

    super().__init__(self.filename_pairs, slice_axis, cache, transform, slice_filter_fn, task, self.roi_params,
                     self.soft_gt)
def __init__(self, root_dir, subject_lst, target_suffix, contrast_lst, hdf5_name, contrast_balance=None,
             slice_axis=2, metadata_choice=False, slice_filter_fn=None, roi_params=None, transform=None,
             object_detection_params=None, soft_gt=False):
    """Collect the files of a BIDS dataset and convert them into an HDF5 file.

    Args:
        root_dir (str): Path of the BIDS dataset.
        subject_lst (list): Subject ids to include.
        target_suffix (list): Suffixes of the derivative files used as targets, one per target.
        contrast_lst (list): Contrasts to load.
        hdf5_name (str): Path of the HDF5 file, opened in write mode.
        contrast_balance (dict): Maps a contrast to the maximal fraction of its subjects to keep. ``None`` (default)
            is treated as an empty dictionary, i.e. no balancing.
        slice_axis (int): Stored on the instance and in the HDF5 attributes.
        metadata_choice (str or bool): ``'mri_params'`` to require MRI parameters; stored in the HDF5 attributes.
        slice_filter_fn: Stored on the instance.
        roi_params (dict): Dictionary whose ``"suffix"`` key gives the ROI file suffix. ``None`` (default) is
            treated as no ROI.
        transform: Unpacked into ``self.prepro_transforms`` and ``self.transform``, so it must be an iterable of
            two items (the ``None`` default cannot be unpacked).
        object_detection_params (dict): Forwarded to ``imed_obj_detect.load_bounding_boxes``.
        soft_gt (bool): Stored on the instance.
    """
    print("Starting conversion")

    # `None` defaults would otherwise crash on `.keys()` / `["suffix"]` below
    if contrast_balance is None:
        contrast_balance = {}
    if roi_params is None:
        roi_params = {"suffix": None, "slice_filter_roi": None}

    # Getting all patients id
    self.bids_ds = bids.BIDS(root_dir)
    bids_subjects = [s for s in self.bids_ds.get_subjects() if s.record["subject_id"] in subject_lst]

    self.soft_gt = soft_gt
    self.dt = h5py.special_dtype(vlen=str)
    # opening an hdf5 file with write access and writing metadata
    self.hdf5_file = h5py.File(hdf5_name, "w")

    list_patients = []

    self.filename_pairs = []

    if metadata_choice == 'mri_params':
        self.metadata = {"FlipAngle": [], "RepetitionTime": [], "EchoTime": [], "Manufacturer": []}

    self.prepro_transforms, self.transform = transform

    # Create a dictionary with the number of subjects for each contrast of contrast_balance
    tot = {contrast: len([s for s in bids_subjects if s.record["modality"] == contrast])
           for contrast in contrast_balance.keys()}

    # Create a counter that helps to balance the contrasts
    c = {contrast: 0 for contrast in contrast_balance.keys()}

    self.has_bounding_box = True
    bounding_box_dict = imed_obj_detect.load_bounding_boxes(object_detection_params,
                                                            self.bids_ds.get_subjects(),
                                                            slice_axis,
                                                            contrast_lst)

    for subject in tqdm(bids_subjects, desc="Loading dataset"):
        modality = subject.record["modality"]
        if modality not in contrast_lst:
            continue

        # Training & Validation: do not consider the contrasts over the threshold contained in contrast_balance
        if modality in contrast_balance.keys():
            c[modality] = c[modality] + 1
            if c[modality] / tot[modality] > contrast_balance[modality]:
                continue

        if not subject.has_derivative("labels"):
            print("Subject without derivative, skipping.")
            continue
        derivatives = subject.get_derivatives("labels")
        target_filename, roi_filename = [None] * len(target_suffix), None

        for deriv in derivatives:
            for idx, suffix in enumerate(target_suffix):
                if deriv.endswith(modality + suffix + ".nii.gz"):
                    target_filename[idx] = deriv

            if not (roi_params["suffix"] is None) and \
                    deriv.endswith(modality + roi_params["suffix"] + ".nii.gz"):
                roi_filename = [deriv]

        # Skip the subject when no target was found, or when a ROI is requested but missing
        if (not any(target_filename)) or (not (roi_params["suffix"] is None) and (roi_filename is None)):
            continue

        if not subject.has_metadata():
            print("Subject without metadata.")
            metadata = {}
        else:
            metadata = subject.metadata()
            # add contrast to metadata
            metadata['contrast'] = modality

        if metadata_choice == 'mri_params':
            # NOTE(review): check_isMRIparam is called with two arguments here; confirm this matches its signature.
            if not all([imed_film.check_isMRIparam(m, metadata) for m in self.metadata.keys()]):
                continue

        if len(bounding_box_dict):
            # Take only one bounding box for cropping
            metadata['bounding_box'] = bounding_box_dict[str(subject.record["absolute_path"])][0]

        self.filename_pairs.append((subject.record["subject_id"], [subject.record.absolute_path],
                                    target_filename, roi_filename, [metadata]))

        list_patients.append(subject.record["subject_id"])

    self.slice_axis = slice_axis
    self.slice_filter_fn = slice_filter_fn

    # Update HDF5 metadata
    self.hdf5_file.attrs.create('patients_id', list(set(list_patients)), dtype=self.dt)
    self.hdf5_file.attrs['slice_axis'] = slice_axis

    self.hdf5_file.attrs['slice_filter_fn'] = [('filter_empty_input', True), ('filter_empty_mask', False)]
    self.hdf5_file.attrs['metadata_choice'] = metadata_choice

    # Save images into HDF5 file
    self._load_filenames()
    print("Files loaded.")
def get_new_subject_split(path_folder, center_test, split_method, random_seed,
                          train_frac, test_frac, log_directory, balance, subject_selection=None):
    """Randomly split dataset between training / validation / testing.

    Randomly split dataset between training / validation / testing
    and save it in log_directory + "/split_datasets.joblib".

    Args:
        path_folder (string): Dataset folder.
        center_test (list): List of centers to include in the testing set.
        split_method (string): See imed_loader_utils.split_dataset.
        random_seed (int): Random seed.
        train_frac (float): Training dataset proportion, between 0 and 1.
        test_frac (float): Testing dataset proportion, between 0 and 1.
        log_directory (string): Output folder.
        balance (string): Metadata contained in "participants.tsv" file with
            categorical values. Each category is split separately, so that it
            is distributed across the training, validation and testing
            datasets. Subjects with no value in this column form their own
            group. If the column does not exist, a warning is logged and the
            dataset is split as a whole.
        subject_selection (dict): Used to specify a custom subject selection
            from a dataset. Contains three lists of the same length under the
            keys "metadata", "n" and "value": for each triple, ``n`` subjects
            whose column ``metadata`` equals ``value`` are sampled (seeded
            with ``random_seed``), and only the sampled subjects are split.

    Returns:
        list, list, list: Training, validation and testing subjects lists.

    Raises:
        ValueError: If the lists of ``subject_selection`` have different
            lengths.
    """
    # read participants.tsv as pandas dataframe
    df = bids.BIDS(path_folder).participants.content

    if subject_selection is not None:
        # Verify subject_selection format
        if not (len(subject_selection["metadata"]) == len(subject_selection["n"]) == len(subject_selection["value"])):
            raise ValueError("All lists in subject_selection parameter should have the same length.")

        sampled_dfs = [df[df[m] == v].sample(n=n, random_state=random_seed)
                       for m, n, v in zip(subject_selection["metadata"],
                                          subject_selection["n"],
                                          subject_selection["value"])]

        # With empty lists there is nothing to select: keep the whole dataset
        if sampled_dfs:
            df = pd.concat(sampled_dfs)

    # If balance, then split the dataframe for each categorical value of the "balance" column
    if balance and balance in df.keys():
        balance_column = df[balance]
        # NaN never compares equal to itself, so missing values are handled
        # explicitly instead of through the `== value` selection.
        df_list = [df[balance_column == value]
                   for value in balance_column.dropna().unique().tolist()]
        df_missing = df[balance_column.isna()]
        if len(df_missing) > 0:
            logger.warning("{} subject(s) have no value in column '{}' of 'participants.tsv' file. They are split "
                           "as a separate group.".format(len(df_missing), balance))
            df_list.append(df_missing)
    else:
        if balance:
            logger.warning("No column named '{}' was found in 'participants.tsv' file. Not taken into account to split "
                           "the dataset.".format(balance))
        df_list = [df]

    train_lst, valid_lst, test_lst = [], [], []
    for df_tmp in df_list:
        # Split dataset on each section of subjects
        train_tmp, valid_tmp, test_tmp = split_dataset(df=df_tmp,
                                                       center_test_lst=center_test,
                                                       split_method=split_method,
                                                       random_seed=random_seed,
                                                       train_frac=train_frac,
                                                       test_frac=test_frac)
        # Update the dataset lists
        train_lst += train_tmp
        valid_lst += valid_tmp
        test_lst += test_tmp

    # save the subject distribution
    split_dct = {'train': train_lst, 'valid': valid_lst, 'test': test_lst}
    split_path = os.path.join(log_directory, "split_datasets.joblib")
    joblib.dump(split_dct, split_path)

    return train_lst, valid_lst, test_lst
def merge_bids_datasets(path_data):
    """Read the participants.tsv from several BIDS folders and merge them into a single dataframe.

    Args:
        path_data (list) or (str): BIDS folders paths

    Returns:
        df: dataframe with merged subjects and columns. It holds one row per "participant_id", an extra last column
        "path_output" with the folder each subject was read from, and "-" in place of missing values.

    Raises:
        Exception: If no dataset folder is given.
    """
    path_data = imed_utils.format_path_data(path_data)

    if len(path_data) == 0:
        raise Exception("No dataset folder selected")

    if len(path_data) == 1:
        # read participants.tsv as pandas dataframe
        df = bids.BIDS(path_data[0]).participants.content
        # Append a new column to show which dataset the Subjects belong to (this will be used later for loading)
        df['path_output'] = [path_data[0]] * len(df)
    else:
        # Merge multiple .tsv files into the same dataframe
        df = None
        for folder in path_data:
            df_next = pd.read_table(os.path.join(folder, 'participants.tsv'), encoding="ISO-8859-1")
            # Convert to string to get rid of potential TypeError during merging within the same column
            df_next = df_next.astype(str)
            # Add the Bids_path to the dataframe
            df_next['path_output'] = [folder] * len(df_next)
            # Merge the .tsv files (This keeps also non-overlapping fields)
            df = df_next if df is None else pd.merge(left=df, right=df_next, how='outer')

    # Get rid of duplicate entries based on the field "participant_id" (the same subject could in theory be
    # included in several datasets). Only the first row of each participant is kept; this does not depend on the
    # index of the dataframe.
    # NOTE(review): an outer merge may reorder rows by join key, so "first" means first in the merged frame.
    # Confirm this matches the intended precedence of the first folder.
    df = df.drop_duplicates(subset="participant_id", keep="first")

    # Rearrange the bids paths to be last column of the dataframe
    cols = list(df.columns.values)
    cols.remove("path_output")
    cols.append("path_output")
    df = df[cols]

    # Substitute NaNs with string: "-". This helps with metadata selection
    df = df.fillna("-")

    return df
# First load the used dataset list subjectsUsedFile = '/home/nas/Desktop/dataset-training-sct.pkl' # train_valid_test: 1 for training, 2 for validating, 3 for testing # Output file outputFile = '/home/nas/Consulting/ivado-project/Datasets/merged_SCTLARGE_MULTISUBJECT/split_datasets_converted.joblib' dataUsedOnSct = pd.read_pickle(subjectsUsedFile) subjectsUsedForTesting = dataUsedOnSct[ dataUsedOnSct['train_valid_test'] == 3]['subject'].to_list( ) # THESE WILL FOR SURE BE USED IN THE TESTING SET, NOT IN THE OTHER TWO # Load the merged participants.tsv merged_folder = '/home/nas/Consulting/ivado-project/Datasets/merged_SCTLARGE_MULTISUBJECT/' df_merged = bids.BIDS(merged_folder).participants.content # NOW SHUFFLE AVAILABLE SUBJECTS AND MAKE SURE THERE ARE NO SUBJECTS FROM THE SCT_TESTING IN THE TRAINING AND VALIDATION LISTS percentage_train = 0.6 percentage_validation = 0.2 # Whatever was used in sct testing, will stay in the testing side of the joblib as well test = df_merged[np.in1d(df_merged['data_id'], subjectsUsedForTesting)] # Keep only the rest of the subjects for splitting to training/validation/testing sets df_merged_reduced = df_merged[np.invert( np.in1d(df_merged['data_id'], subjectsUsedForTesting))] train, validate, test2 = np.split(df_merged_reduced.sample(frac=1), [ int(percentage_train * (len(df_merged_reduced) + len(test) / 2)), int((percentage_train + percentage_validation) * len(df_merged_reduced) + len(test) / 2)