def write_splits(
    diagnosis: str,
    diagnosis_df: pd.DataFrame,
    split_label: str,
    n_splits: int,
    train_path: str,
    test_path: str,
    supplementary_diagnoses: Optional[List[str]] = None,
) -> None:
    """
    Splits data at the subject level into train and test sets with equivalent
    distributions of split_label, and writes the train and test DataFrames of
    each split.

    Args:
        diagnosis: diagnosis on which the split is done.
        diagnosis_df: DataFrame with columns including
            ['participant_id', 'session_id', 'diagnosis'].
        split_label: label on which the split is done (categorical variable).
        n_splits: number of splits in the k-fold cross-validation.
        train_path: path to the training data.
        test_path: path to the test data.
        supplementary_diagnoses: list of supplementary diagnoses to add to the data.
    """
    baseline_df = extract_baseline(diagnosis_df)

    if split_label is None:
        diagnoses_list = list(baseline_df.diagnosis)
        unique = list(set(diagnoses_list))
        y = np.array([unique.index(x) for x in diagnoses_list])
    else:
        stratification_list = list(baseline_df[split_label])
        unique = list(set(stratification_list))
        y = np.array([unique.index(x) for x in stratification_list])

    splits = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2)

    logger.info(f"Label {diagnosis}")
    for i, indices in enumerate(splits.split(np.zeros(len(y)), y)):
        logger.info(f"Split {i}")
        train_index, test_index = indices

        test_df = baseline_df.iloc[test_index]
        train_df = baseline_df.iloc[train_index]

        if supplementary_diagnoses is not None:
            for supplementary_diagnosis in supplementary_diagnoses:
                sup_train_df = pd.read_csv(
                    path.join(
                        train_path,
                        f"split-{i}",
                        f"{supplementary_diagnosis}_baseline.tsv",
                    ),
                    sep="\t",
                )
                train_df = pd.concat([train_df, sup_train_df])
                sup_test_df = pd.read_csv(
                    path.join(
                        test_path,
                        f"split-{i}",
                        f"{supplementary_diagnosis}_baseline.tsv",
                    ),
                    sep="\t",
                )
                test_df = pd.concat([test_df, sup_test_df])

            train_df.reset_index(inplace=True, drop=True)
            test_df.reset_index(inplace=True, drop=True)

        train_df.to_csv(
            path.join(train_path, f"split-{i}", f"{diagnosis}_baseline.tsv"),
            sep="\t",
            index=False,
        )
        test_df.to_csv(
            path.join(test_path, f"split-{i}", f"{diagnosis}_baseline.tsv"),
            sep="\t",
            index=False,
        )

        long_train_df = retrieve_longitudinal(train_df, diagnosis_df)
        long_train_df.to_csv(
            path.join(train_path, f"split-{i}", f"{diagnosis}.tsv"),
            sep="\t",
            index=False,
        )
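
# Usage sketch (hypothetical paths, not part of this module): assuming
# `labels/AD.tsv` was produced by `clinicadl tsvtool getlabels` and the
# `split-<i>` subfolders already exist, a 5-fold subject-level split
# stratified on diagnosis could be written with:
#
#     diagnosis_df = pd.read_csv("labels/AD.tsv", sep="\t")
#     write_splits(
#         diagnosis="AD",
#         diagnosis_df=diagnosis_df,
#         split_label=None,  # None falls back to stratifying on diagnosis
#         n_splits=5,
#         train_path="labels/train",
#         test_path="labels/validation",
#     )
#
# Each split-<i> folder then contains AD_baseline.tsv (baseline sessions)
# and, on the train side, AD.tsv with all longitudinal sessions.
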
def create_split(
    diagnosis,
    diagnosis_df,
    split_label,
    n_test,
    p_age_threshold=0.80,
    p_sex_threshold=0.80,
    supplementary_train_df=None,
    ignore_demographics=False,
):
    """
    Splits data at the subject level into train and test sets with equivalent
    age, sex and split_label distributions.

    Args:
        diagnosis: (str) diagnosis on which the split is done.
        diagnosis_df: DataFrame with columns including
            ['participant_id', 'session_id', 'diagnosis'].
        split_label: (str) label on which the split is done (categorical variable).
        n_test: (float) If >= 1, number of subjects to put in the test set.
            If < 1, proportion of subjects to put in the test set.
        p_age_threshold: (float) threshold for the t-test on age.
        p_sex_threshold: (float) threshold for the chi2 test on sex.
        supplementary_train_df: (DataFrame) data that must be included in the train set.
        ignore_demographics: (bool) If True, the diagnoses are split without
            taking the demographics distributions (age, sex) into account.

    Returns:
        train_df (DataFrame) subjects in the train set
        test_df (DataFrame) subjects in the test set
    """
    if supplementary_train_df is not None:
        sup_train_sex = [sex_dict[x] for x in supplementary_train_df.sex.values]
        sup_train_age = [float(x) for x in supplementary_train_df.age.values]
    else:
        sup_train_sex = []
        sup_train_age = []

    baseline_df = extract_baseline(diagnosis_df)

    if n_test >= 1:
        n_test = int(n_test)
    else:
        n_test = int(n_test * len(baseline_df))

    if split_label not in baseline_df.columns.values:
        raise ClinicaDLArgumentError(
            f"The column {split_label} is missing. "
            f"Please add it using the --variables_of_interest flag in getlabels."
        )

    if not ignore_demographics:
        try:
            sex_label = find_label(baseline_df.columns.values, "sex")
            age_label = find_label(baseline_df.columns.values, "age")
        except ClinicaDLArgumentError:
            raise ClinicaDLArgumentError(
                "This dataset does not have age or sex values. "
                "Please add the flag --ignore_demographics to split "
                "without trying to balance age or sex distributions."
            )

        sex = list(baseline_df[sex_label].values)
        age = list(baseline_df[age_label].values)
        category = list(baseline_df[split_label].values)
        category = category_conversion(category)
        category = remove_unicity(category)

        flag_selection = True
        n_try = 0

        while flag_selection:
            splits = StratifiedShuffleSplit(n_splits=1, test_size=n_test)
            for train_index, test_index in splits.split(category, category):
                # Compare the demographics (age & sex) of the two sets
                if len(set(age)) != 1:
                    age_test = [float(age[idx]) for idx in test_index]
                    age_train = [float(age[idx]) for idx in train_index] + sup_train_age
                    _, p_age = ttest_ind(age_test, age_train, nan_policy="omit")
                else:
                    p_age = 1

                if len(set(sex)) != 1:
                    sex_test = [sex_dict[sex[idx]] for idx in test_index]
                    sex_train = [sex_dict[sex[idx]] for idx in train_index] + sup_train_sex
                    _, p_sex = chi2(sex_test, sex_train)
                else:
                    p_sex = 1

                logger.debug(f"p_age={p_age:.4f}, p_sex={p_sex:.4f}")

                if p_sex >= p_sex_threshold and p_age >= p_age_threshold:
                    flag_selection = False
                    test_df = baseline_df.loc[test_index]
                    train_df = baseline_df.loc[train_index]
                    if supplementary_train_df is not None:
                        train_df = pd.concat([train_df, supplementary_train_df])
                        train_df.reset_index(drop=True, inplace=True)

                n_try += 1

        logger.info(f"Split for diagnosis {diagnosis} was found after {n_try} trials.")
    else:
        idx = np.arange(len(baseline_df))
        idx_test = np.random.choice(idx, size=n_test, replace=False)
        idx_test.sort()
        idx_train = complementary_list(idx, idx_test)
        test_df = baseline_df.loc[idx_test]
        train_df = baseline_df.loc[idx_train]

    return train_df, test_df
def generate_trivial_dataset(
    caps_directory: str,
    output_dir: str,
    n_subjects: int,
    tsv_path: Optional[str] = None,
    preprocessing: str = "t1-linear",
    mask_path: Optional[str] = None,
    atrophy_percent: float = 60,
    multi_cohort: bool = False,
    uncropped_image: bool = False,
    acq_label: str = "fdg",
    suvr_reference_region: str = "pons",
):
    """
    Generates a fully separable dataset.

    Generates a dataset, based on the images of the CAPS directory, where one
    half of each image is processed using a mask that occludes a specific
    region. This procedure creates a fully separable dataset (images with the
    right half processed vs. images with the left half processed).

    Args:
        caps_directory: path to the CAPS directory.
        output_dir: folder containing the synthetic dataset in CAPS format.
        n_subjects: number of subjects in each class of the synthetic dataset.
        tsv_path: path to a TSV file listing the subjects/sessions.
        preprocessing: preprocessing performed. Must be in ['linear', 'extensive'].
        mask_path: path to the extracted masks used to generate the two labels.
        atrophy_percent: percentage of atrophy applied.
        multi_cohort: If True, caps_directory is the path to a TSV file linking
            cohort names and paths.
        uncropped_image: If True, the uncropped image of `t1-linear` or
            `pet-linear` will be used.
        acq_label: name of the tracer when using `pet-linear` preprocessing.
        suvr_reference_region: name of the reference region when using
            `pet-linear` preprocessing.

    Returns:
        Folder structure where images are stored in CAPS format.

    Raises:
        ValueError: if `n_subjects` is higher than the length of the TSV file
            at `tsv_path`.
    """
    from pathlib import Path

    commandline_to_json(
        {
            "output_dir": output_dir,
            "caps_dir": caps_directory,
            "preprocessing": preprocessing,
            "n_subjects": n_subjects,
            "atrophy_percent": atrophy_percent,
        }
    )

    # Transform caps_directory in dict
    caps_dict = CapsDataset.create_caps_dict(caps_directory, multi_cohort=multi_cohort)

    # Read DataFrame
    data_df = load_and_check_tsv(tsv_path, caps_dict, output_dir)
    data_df = extract_baseline(data_df)

    home = str(Path.home())
    cache_clinicadl = join(home, ".cache", "clinicadl", "ressources", "masks")
    url_aramis = "https://aramislab.paris.inria.fr/files/data/masks/"
    FILE1 = RemoteFileStructure(
        filename="AAL2.tar.gz",
        url=url_aramis,
        checksum="89427970921674792481bffd2de095c8fbf49509d615e7e09e4bc6f0e0564471",
    )
    makedirs(cache_clinicadl, exist_ok=True)

    if n_subjects > len(data_df):
        raise ValueError(
            f"The number of subjects {n_subjects} cannot be higher "
            f"than the number of subjects in the baseline dataset of size {len(data_df)}"
        )

    if mask_path is None:
        if not exists(join(cache_clinicadl, "AAL2")):
            try:
                print("Trying to download AAL2 masks")
                mask_path_tar = fetch_file(FILE1, cache_clinicadl)
                tar_file = tarfile.open(mask_path_tar)
                print("File: " + mask_path_tar)
                try:
                    tar_file.extractall(cache_clinicadl)
                    tar_file.close()
                    mask_path = join(cache_clinicadl, "AAL2")
                except RuntimeError:
                    print("Unable to extract downloaded files")
            except IOError as err:
                print("Unable to download required templates:", err)
                raise ValueError(
                    """Unable to download masks, please download them manually at
                    https://aramislab.paris.inria.fr/files/data/masks/
                    and provide a valid path."""
                )
        else:
            mask_path = join(cache_clinicadl, "AAL2")

    # Create subjects dir
    makedirs(join(output_dir, "subjects"), exist_ok=True)

    # Output tsv file
    columns = ["participant_id", "session_id", "diagnosis", "age_bl", "sex"]
    output_df = pd.DataFrame(columns=columns)
    diagnosis_list = ["AD", "CN"]

    # Find appropriate preprocessing file type
    file_type = find_file_type(
        preprocessing, uncropped_image, acq_label, suvr_reference_region
    )

    for i in range(2 * n_subjects):
        data_idx = i // 2
        label = i % 2

        participant_id = data_df.loc[data_idx, "participant_id"]
        session_id = data_df.loc[data_idx, "session_id"]
        cohort = data_df.loc[data_idx, "cohort"]
        image_paths = clinica_file_reader(
            [participant_id], [session_id], caps_dict[cohort], file_type
        )
        image_nii = nib.load(image_paths[0])
        # get_fdata() replaces the get_data() method removed from nibabel
        image = image_nii.get_fdata()

        input_filename = basename(image_paths[0])
        filename_pattern = "_".join(input_filename.split("_")[2:])
        trivial_image_nii_dir = join(
            output_dir, "subjects", f"sub-TRIV{i}", session_id, preprocessing
        )
        trivial_image_nii_filename = f"sub-TRIV{i}_{session_id}_{filename_pattern}"
        makedirs(trivial_image_nii_dir, exist_ok=True)

        atlas_to_mask = nib.load(join(mask_path, f"mask-{label + 1}.nii")).get_fdata()

        # Create atrophied image
        trivial_image = im_loss_roi_gaussian_distribution(
            image, atlas_to_mask, atrophy_percent
        )
        trivial_image_nii = nib.Nifti1Image(trivial_image, affine=image_nii.affine)
        trivial_image_nii.to_filename(
            join(trivial_image_nii_dir, trivial_image_nii_filename)
        )
        print(join(trivial_image_nii_dir, trivial_image_nii_filename))

        # Append row to output tsv
        row = [f"sub-TRIV{i}", session_id, diagnosis_list[label], 60, "F"]
        row_df = pd.DataFrame([row], columns=columns)
        # pd.concat replaces the DataFrame.append method removed in pandas 2.0
        output_df = pd.concat([output_df, row_df])

    output_df.to_csv(join(output_dir, "data.tsv"), sep="\t", index=False)

    write_missing_mods(output_dir, output_df)
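
# Usage sketch (hypothetical paths): building a synthetic dataset of
# 2 * 100 images from an existing t1-linear CAPS directory:
#
#     generate_trivial_dataset(
#         caps_directory="/data/CAPS",
#         output_dir="/data/CAPS_trivial",
#         n_subjects=100,
#         tsv_path="labels/train/AD.tsv",
#         preprocessing="t1-linear",
#     )
#
# Each source image yields one "AD" and one "CN" synthetic subject, each with
# a different half of the brain atrophied through the AAL2-derived masks, so
# the two classes are trivially separable by a classifier.
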
def split_diagnoses(
    formatted_data_path,
    n_test=100,
    subset_name="test",
    MCI_sub_categories=True,
    p_age_threshold=0.80,
    p_sex_threshold=0.80,
    categorical_split_variable=None,
    ignore_demographics=False,
    verbose=0,
):
    """
    Performs a single split for each label independently at the subject level.
    The train folder contains two lists per diagnosis (baseline and
    longitudinal), whereas the test folder only includes the list of baseline
    sessions.

    The differences between the age and sex distributions of the two sets must
    be non-significant (according to a t-test and a chi-square test,
    respectively).

    Args:
        formatted_data_path (str): Path to the folder containing data extracted
            by clinicadl tsvtool getlabels.
        n_test (float): If >= 1, number of subjects to put in the set named
            'subset_name'. If < 1, proportion of subjects to put in that set.
            If 0, no training set is created and the whole dataset is
            considered as one set named 'subset_name'.
        subset_name (str): Name of the subset that is complementary to train.
        MCI_sub_categories (bool): If True, manages MCI sub-categories to avoid
            data leakage.
        p_age_threshold (float): The threshold used for the t-test on age
            distributions.
        p_sex_threshold (float): The threshold used for the chi-square test on
            sex distributions.
        categorical_split_variable (str): name of a categorical variable used
            to perform a stratified split.
        ignore_demographics (bool): If True, the diagnoses are split without
            taking the demographics distributions (age, sex) into account.
        verbose (int): level of verbosity.

    Returns:
        writes three files per <label>.tsv file present in formatted_data_path:
            - formatted_data_path/train/<label>.tsv
            - formatted_data_path/train/<label>_baseline.tsv
            - formatted_data_path/<subset_name>/<label>_baseline.tsv
    """
    commandline_to_json(
        {
            "output_dir": formatted_data_path,
            "n_test": n_test,
            "subset_name": subset_name,
            "MCI_sub_categories": MCI_sub_categories,
            "p_age_threshold": p_age_threshold,
            "p_sex_threshold": p_sex_threshold,
            "categorical_split_variable": categorical_split_variable,
            "ignore_demographics": ignore_demographics,
        },
        filename="split.json",
    )

    # Read files
    results_path = formatted_data_path

    train_path = path.join(results_path, "train")
    if path.exists(train_path):
        shutil.rmtree(train_path)
    if n_test > 0:
        os.makedirs(train_path)

    if categorical_split_variable is None:
        categorical_split_variable = "diagnosis"

    test_path = path.join(results_path, subset_name)
    if path.exists(test_path):
        shutil.rmtree(test_path)
    os.makedirs(test_path)

    diagnosis_df_paths = os.listdir(results_path)
    diagnosis_df_paths = [x for x in diagnosis_df_paths if x.endswith(".tsv")]
    diagnosis_df_paths = [
        x for x in diagnosis_df_paths if not x.endswith("_baseline.tsv")
    ]

    MCI_special_treatment = False

    if "MCI.tsv" in diagnosis_df_paths and n_test > 0:
        if MCI_sub_categories:
            diagnosis_df_paths.remove("MCI.tsv")
            MCI_special_treatment = True
        elif "sMCI.tsv" in diagnosis_df_paths or "pMCI.tsv" in diagnosis_df_paths:
            logger.warning(
                "MCI special treatment was deactivated though MCI subgroups were found. "
                "Be aware that it may cause data leakage in transfer learning tasks."
            )

    # Baseline sessions must be extracted before splitting, otherwise all
    # sessions of a subject would be mixed between the sets.
    for diagnosis_df_path in diagnosis_df_paths:
        diagnosis_df = pd.read_csv(path.join(results_path, diagnosis_df_path), sep="\t")
        interest_columns = diagnosis_df.columns.values
        diagnosis = diagnosis_df_path.split(".")[0]
        logger.info(f"Running split for diagnosis {diagnosis}")
        if n_test > 0:
            train_df, test_df = create_split(
                diagnosis,
                diagnosis_df,
                categorical_split_variable,
                n_test=n_test,
                p_age_threshold=p_age_threshold,
                p_sex_threshold=p_sex_threshold,
                ignore_demographics=ignore_demographics,
            )
            # Save baseline splits
            train_df.to_csv(
                path.join(train_path, f"{diagnosis}_baseline.tsv"),
                sep="\t",
                index=False,
            )
            test_df.to_csv(
                path.join(test_path, f"{diagnosis}_baseline.tsv"),
                sep="\t",
                index=False,
            )

            long_train_df = retrieve_longitudinal(train_df, diagnosis_df)
            long_train_df.to_csv(
                path.join(train_path, f"{diagnosis}.tsv"), sep="\t", index=False
            )
            long_test_df = retrieve_longitudinal(test_df, diagnosis_df)
            long_test_df.to_csv(
                path.join(test_path, f"{diagnosis}.tsv"), sep="\t", index=False
            )
        else:
            baseline_df = extract_baseline(diagnosis_df)
            test_df = baseline_df[interest_columns]
            test_df.to_csv(
                path.join(test_path, f"{diagnosis}_baseline.tsv"),
                sep="\t",
                index=False,
            )
            long_test_df = retrieve_longitudinal(test_df, diagnosis_df)
            long_test_df.to_csv(
                path.join(test_path, f"{diagnosis}.tsv"), sep="\t", index=False
            )

    if MCI_special_treatment:
        # Extraction of MCI subjects without intersection with the sMCI / pMCI train
        diagnosis_df = pd.read_csv(path.join(results_path, "MCI.tsv"), sep="\t")
        MCI_df = diagnosis_df.set_index(["participant_id", "session_id"])
        baseline_df = extract_baseline(MCI_df, set_index=False)

        if n_test >= 1:
            n_test = int(n_test)
        else:
            n_test = int(n_test * len(baseline_df))

        MCI_df, supplementary_diagnoses = remove_sub_labels(
            MCI_df, ["sMCI", "pMCI"], diagnosis_df_paths, results_path
        )
        if len(supplementary_diagnoses) == 0:
            raise ClinicaDLArgumentError(
                "The MCI_sub_categories flag is not needed as there are no "
                "intersections with MCI subcategories."
            )

        # Construction of supplementary train
        supplementary_train_df = pd.DataFrame()
        for diagnosis in supplementary_diagnoses:
            sup_baseline_train_df = pd.read_csv(
                path.join(train_path, f"{diagnosis}_baseline.tsv"), sep="\t"
            )
            supplementary_train_df = pd.concat(
                [supplementary_train_df, sup_baseline_train_df]
            )
            sub_df = (
                supplementary_train_df.reset_index()
                .groupby("participant_id")["session_id"]
                .nunique()
            )
            logger.debug(
                f"supplementary_train_df {len(sub_df)} subjects, "
                f"{len(supplementary_train_df)} scans"
            )

        supplementary_train_df.reset_index(drop=True, inplace=True)

        # MCI selection
        MCI_df.reset_index(inplace=True)
        baseline_df = extract_baseline(MCI_df)
        train_df, test_df = create_split(
            "MCI",
            baseline_df,
            categorical_split_variable,
            n_test=n_test,
            p_age_threshold=p_age_threshold,
            p_sex_threshold=p_sex_threshold,
            ignore_demographics=ignore_demographics,
            supplementary_train_df=supplementary_train_df,
        )

        # Write selection of MCI
        train_df.to_csv(
            path.join(train_path, "MCI_baseline.tsv"), sep="\t", index=False
        )
        test_df.to_csv(path.join(test_path, "MCI_baseline.tsv"), sep="\t", index=False)

        long_train_df = retrieve_longitudinal(train_df, diagnosis_df)
        long_train_df.to_csv(path.join(train_path, "MCI.tsv"), sep="\t", index=False)
        long_test_df = retrieve_longitudinal(test_df, diagnosis_df)
        long_test_df.to_csv(path.join(test_path, "MCI.tsv"), sep="\t", index=False)
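
# Usage sketch (hypothetical folder layout): given a `labels/` folder
# containing AD.tsv, CN.tsv, MCI.tsv, sMCI.tsv and pMCI.tsv produced by
# `clinicadl tsvtool getlabels`, a train/test split with 100 test subjects
# per label is obtained with:
#
#     split_diagnoses("labels", n_test=100, subset_name="test")
#
# With MCI_sub_categories=True (the default), MCI subjects intersecting the
# sMCI / pMCI train lists are forced into the MCI train set, so the same
# subject cannot end up in both a train and a test list (data leakage).
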