Python retrieve_longitudinal Exemples

Langage de programmation: Python

Espace de noms/Paquet: clinicadl.utils.tsvtools_utils

Méthode/Fonction: retrieve_longitudinal

Exemples sur hotexamples.com: 2

Python retrieve_longitudinal - 2 exemples trouvés. Ce sont les exemples réels les mieux notés de clinicadl.utils.tsvtools_utils.retrieve_longitudinal extraits de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Exemple #1

0

Afficher le fichier

def write_splits(
    diagnosis: str,
    diagnosis_df: pd.DataFrame,
    split_label: str,
    n_splits: int,
    train_path: str,
    test_path: str,
    supplementary_diagnoses: List[str] = None,
) -> None:
    """
    Write the train/test TSV files of a stratified k-fold split done at the subject level.

    The baseline sessions extracted from ``diagnosis_df`` are distributed over
    ``n_splits`` folds with a shuffled ``StratifiedKFold`` (fixed ``random_state=2``).
    For each split ``i`` three files are written:

    - ``<train_path>/split-<i>/<diagnosis>_baseline.tsv``
    - ``<test_path>/split-<i>/<diagnosis>_baseline.tsv``
    - ``<train_path>/split-<i>/<diagnosis>.tsv`` (all sessions of the training subjects)

    The ``split-<i>`` directories are not created here.

    Args:
        diagnosis: diagnosis on which the split is done; used to name the output files.
        diagnosis_df: DataFrame with columns including
            ['participant_id', 'session_id', 'diagnosis'].
        split_label: name of the categorical column used for stratification.
            If None, the 'diagnosis' column is used.
        n_splits: number of splits in the k-fold cross-validation.
        train_path: path to the training data.
        test_path: path to the test data.
        supplementary_diagnoses: diagnoses whose already-written
            ``<name>_baseline.tsv`` files are appended to the train and test
            sets of every split.
    """
    baseline_df = extract_baseline(diagnosis_df)

    # Pick the values to stratify on, then encode each category as an integer class.
    if split_label is None:
        stratification_values = list(baseline_df.diagnosis)
    else:
        stratification_values = list(baseline_df[split_label])
    categories = list(set(stratification_values))
    y = np.array([categories.index(value) for value in stratification_values])

    k_fold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2)

    print(f"Label {diagnosis}")
    # Only the length of X matters to StratifiedKFold, hence the zero placeholder.
    for i, (train_index, test_index) in enumerate(k_fold.split(np.zeros(len(y)), y)):
        print(f"Split {i}")
        split_dir = f"split-{i}"
        baseline_filename = f"{diagnosis}_baseline.tsv"

        test_df = baseline_df.iloc[test_index]
        train_df = baseline_df.iloc[train_index]

        if supplementary_diagnoses is not None:
            for supplementary_diagnosis in supplementary_diagnoses:
                sup_filename = f"{supplementary_diagnosis}_baseline.tsv"
                sup_train_df = pd.read_csv(
                    path.join(train_path, split_dir, sup_filename), sep="\t"
                )
                train_df = pd.concat([train_df, sup_train_df])
                sup_test_df = pd.read_csv(
                    path.join(test_path, split_dir, sup_filename), sep="\t"
                )
                test_df = pd.concat([test_df, sup_test_df])

            # Concatenation leaves duplicated index values; renumber the rows.
            train_df = train_df.reset_index(drop=True)
            test_df = test_df.reset_index(drop=True)

        train_df.to_csv(
            path.join(train_path, split_dir, baseline_filename),
            sep="\t",
            index=False,
        )
        test_df.to_csv(
            path.join(test_path, split_dir, baseline_filename),
            sep="\t",
            index=False,
        )

        # Only the training set gets its longitudinal (all sessions) version.
        long_train_df = retrieve_longitudinal(train_df, diagnosis_df)
        long_train_df.to_csv(
            path.join(train_path, split_dir, f"{diagnosis}.tsv"),
            sep="\t",
            index=False,
        )

Exemple #2

0

Afficher le fichier

def _write_tsv(df, folder, filename):
    """Write ``df`` to ``folder/filename`` as a tab-separated file without the index."""
    df.to_csv(path.join(folder, filename), sep="\t", index=False)


def split_diagnoses(
    formatted_data_path,
    n_test=100,
    subset_name="test",
    MCI_sub_categories=True,
    p_age_threshold=0.80,
    p_sex_threshold=0.80,
    categorical_split_variable=None,
    ignore_demographics=False,
    verbose=0,
):
    """
    Performs a single split for each label independently on the subject level.

    Every ``<label>.tsv`` file found in ``formatted_data_path`` (files ending with
    ``_baseline.tsv`` are ignored) is split into a train set and a complementary
    set named ``subset_name``. Both folders receive a baseline list and a
    longitudinal list per label. Any existing ``train`` and ``subset_name``
    folders are deleted first.

    The split itself is delegated to ``create_split``, which receives the age and
    sex thresholds. The differences between the age and sex distributions of the
    two sets must be non-significant (according to T-test and chi-square).

    Args:
        formatted_data_path (str): Path to the folder containing data extracted by
            clinicadl tsvtool getlabels.
        n_test (float):
            If >= 1, number of subjects to put in set with name 'subset_name'.
            If < 1, proportion of subjects to put in set with name 'subset_name'.
            If 0, no training set is created and the whole dataset is considered
            as one set with name 'subset_name'.
        subset_name (str): Name of the subset that is complementary to train.
        MCI_sub_categories (bool): If True, manages MCI sub-categories to avoid
            data leakage.
        p_age_threshold (float): The threshold used for the T-test on age
            distributions.
        p_sex_threshold (float): The threshold used for the chi-square test on sex
            distributions.
        categorical_split_variable (str): name of a categorical variable to perform
            a stratified split. Defaults to "diagnosis" when None.
        ignore_demographics (bool): If True the diagnoses are split without taking
            into account the demographics distributions (age, sex).
        verbose (int): level of verbosity. Currently unused by this function.

    Returns:
        None. Writes, for each <label>.tsv file present in formatted_data_path:
            - formatted_data_path/train/<label>.tsv (only if n_test > 0)
            - formatted_data_path/train/<label>_baseline.tsv (only if n_test > 0)
            - formatted_data_path/<subset_name>/<label>.tsv
            - formatted_data_path/<subset_name>/<label>_baseline.tsv
        The call parameters are also passed to ``commandline_to_json`` with
        filename "split.json".

    Raises:
        ClinicaDLArgumentError: if MCI special treatment is active but no
            supplementary diagnosis is returned by ``remove_sub_labels``.
    """
    commandline_to_json(
        {
            "output_dir": formatted_data_path,
            "n_test": n_test,
            "subset_name": subset_name,
            "MCI_sub_categories": MCI_sub_categories,
            "p_age_threshold": p_age_threshold,
            "p_sex_threshold": p_sex_threshold,
            "categorical_split_variable": categorical_split_variable,
            "ignore_demographics": ignore_demographics,
        },
        filename="split.json",
    )

    # Read files
    results_path = formatted_data_path

    # Start from clean output folders; the train folder only exists when a
    # training set is actually produced.
    train_path = path.join(results_path, "train")
    if path.exists(train_path):
        shutil.rmtree(train_path)
    if n_test > 0:
        os.makedirs(train_path)

    if categorical_split_variable is None:
        categorical_split_variable = "diagnosis"

    test_path = path.join(results_path, subset_name)
    if path.exists(test_path):
        shutil.rmtree(test_path)
    os.makedirs(test_path)

    # Keep the longitudinal label files only (<label>.tsv, not <label>_baseline.tsv).
    diagnosis_df_paths = [
        x
        for x in os.listdir(results_path)
        if x.endswith(".tsv") and not x.endswith("_baseline.tsv")
    ]

    MCI_special_treatment = False
    if "MCI.tsv" in diagnosis_df_paths and n_test > 0:
        if MCI_sub_categories:
            # MCI is split after the other labels, see the dedicated section below.
            diagnosis_df_paths.remove("MCI.tsv")
            MCI_special_treatment = True
        elif "sMCI.tsv" in diagnosis_df_paths or "pMCI.tsv" in diagnosis_df_paths:
            logger.warning(
                "MCI special treatment was deactivated though MCI subgroups were found. "
                "Be aware that it may cause data leakage in transfer learning tasks."
            )

    # The baseline session must be kept before or we are taking all the sessions to mix them
    for diagnosis_df_path in diagnosis_df_paths:
        diagnosis_df = pd.read_csv(path.join(results_path, diagnosis_df_path), sep="\t")
        diagnosis = diagnosis_df_path.split(".")[0]
        logger.info(f"Running split for diagnosis {diagnosis}")

        if n_test > 0:
            train_df, test_df = create_split(
                diagnosis,
                diagnosis_df,
                categorical_split_variable,
                n_test=n_test,
                p_age_threshold=p_age_threshold,
                p_sex_threshold=p_sex_threshold,
                ignore_demographics=ignore_demographics,
            )

            # Save baseline splits
            _write_tsv(train_df, train_path, f"{diagnosis}_baseline.tsv")
            _write_tsv(test_df, test_path, f"{diagnosis}_baseline.tsv")

            # Save the longitudinal version of both sets
            long_train_df = retrieve_longitudinal(train_df, diagnosis_df)
            _write_tsv(long_train_df, train_path, f"{diagnosis}.tsv")
            long_test_df = retrieve_longitudinal(test_df, diagnosis_df)
            _write_tsv(long_test_df, test_path, f"{diagnosis}.tsv")
        else:
            # No training set: the whole dataset goes to the complementary subset,
            # restricted to the columns of the input file.
            interest_columns = diagnosis_df.columns.values
            baseline_df = extract_baseline(diagnosis_df)
            test_df = baseline_df[interest_columns]
            _write_tsv(test_df, test_path, f"{diagnosis}_baseline.tsv")

            long_test_df = retrieve_longitudinal(test_df, diagnosis_df)
            _write_tsv(long_test_df, test_path, f"{diagnosis}.tsv")

    if MCI_special_treatment:
        # Extraction of MCI subjects without intersection with the sMCI / pMCI train
        diagnosis_df = pd.read_csv(path.join(results_path, "MCI.tsv"), sep="\t")
        MCI_df = diagnosis_df.set_index(["participant_id", "session_id"])
        baseline_df = extract_baseline(MCI_df, set_index=False)

        # Convert n_test to a number of subjects, computed on the full MCI
        # baseline before sub-labels are removed. As documented, a value >= 1 is
        # already a count (so n_test == 1 means one subject, not 100%).
        if n_test >= 1:
            n_test = int(n_test)
        else:
            n_test = int(n_test * len(baseline_df))

        MCI_df, supplementary_diagnoses = remove_sub_labels(
            MCI_df, ["sMCI", "pMCI"], diagnosis_df_paths, results_path
        )
        if len(supplementary_diagnoses) == 0:
            raise ClinicaDLArgumentError(
                "The MCI_sub_categories flag is not needed as there are no intersections with "
                "MCI subcategories."
            )

        # Construction of supplementary train
        supplementary_train_df = pd.DataFrame()
        for diagnosis in supplementary_diagnoses:
            sup_baseline_train_df = pd.read_csv(
                path.join(train_path, f"{diagnosis}_baseline.tsv"), sep="\t"
            )
            supplementary_train_df = pd.concat(
                [supplementary_train_df, sup_baseline_train_df]
            )
            # Running totals after this diagnosis has been appended.
            sub_df = (
                supplementary_train_df.reset_index()
                .groupby("participant_id")["session_id"]
                .nunique()
            )
            logger.debug(
                f"supplementary_train_df {len(sub_df)} subjects, "
                f"{len(supplementary_train_df)} scans"
            )

        supplementary_train_df.reset_index(drop=True, inplace=True)

        # MCI selection
        MCI_df.reset_index(inplace=True)
        baseline_df = extract_baseline(MCI_df)

        train_df, test_df = create_split(
            "MCI",
            baseline_df,
            categorical_split_variable,
            n_test=n_test,
            p_age_threshold=p_age_threshold,
            p_sex_threshold=p_sex_threshold,
            ignore_demographics=ignore_demographics,
            supplementary_train_df=supplementary_train_df,
        )

        # Write selection of MCI
        _write_tsv(train_df, train_path, "MCI_baseline.tsv")
        _write_tsv(test_df, test_path, "MCI_baseline.tsv")

        # Longitudinal sessions are retrieved from the full MCI.tsv content.
        long_train_df = retrieve_longitudinal(train_df, diagnosis_df)
        _write_tsv(long_train_df, train_path, "MCI.tsv")
        long_test_df = retrieve_longitudinal(test_df, diagnosis_df)
        _write_tsv(long_test_df, test_path, "MCI.tsv")