Exemple #1
0
def write_splits(
    diagnosis: str,
    diagnosis_df: pd.DataFrame,
    split_label: str,
    n_splits: int,
    train_path: str,
    test_path: str,
    supplementary_diagnoses: List[str] = None,
) -> None:
    """
    Build a stratified k-fold partition of the baseline sessions and write it to disk.

    The baseline sessions are extracted from `diagnosis_df`, then distributed over
    `n_splits` folds so that each fold has a comparable distribution of the
    stratification label. For every fold `i`, three TSV files are written:
        - <train_path>/split-<i>/<diagnosis>_baseline.tsv
        - <test_path>/split-<i>/<diagnosis>_baseline.tsv
        - <train_path>/split-<i>/<diagnosis>.tsv (longitudinal version of the train set)

    The `split-<i>` folders are not created here: they must already exist.

    Args:
        diagnosis: name of the diagnosis, used to name the output files.
        diagnosis_df: DataFrame with columns including ['participant_id', 'session_id', 'diagnosis']
        split_label: name of the categorical column used for the stratification.
            If None, the 'diagnosis' column is used.
        n_splits: number of folds of the cross-validation.
        train_path: root folder of the training folds.
        test_path: root folder of the test folds.
        supplementary_diagnoses: diagnoses whose already written
            `<name>_baseline.tsv` files are appended to the train and test sets of
            each fold before writing.
    """
    baseline_df = extract_baseline(diagnosis_df)

    # Pick the column to stratify on, then encode each value as an integer class id.
    if split_label is None:
        stratification_values = list(baseline_df.diagnosis)
    else:
        stratification_values = list(baseline_df[split_label])
    categories = list(set(stratification_values))
    y = np.array([categories.index(value) for value in stratification_values])

    # Fixed seed: the fold assignment is reproducible from one run to the next.
    k_fold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2)

    print(f"Label {diagnosis}")
    # Only the labels matter for the stratification, hence the dummy feature array.
    for i, (train_index, test_index) in enumerate(k_fold.split(np.zeros(len(y)), y)):
        print(f"Split {i}")
        train_dir = path.join(train_path, f"split-{i}")
        test_dir = path.join(test_path, f"split-{i}")

        train_df = baseline_df.iloc[train_index]
        test_df = baseline_df.iloc[test_index]

        if supplementary_diagnoses is not None:
            for supplementary_diagnosis in supplementary_diagnoses:
                sup_filename = f"{supplementary_diagnosis}_baseline.tsv"
                sup_train_df = pd.read_csv(path.join(train_dir, sup_filename), sep="\t")
                train_df = pd.concat([train_df, sup_train_df])
                sup_test_df = pd.read_csv(path.join(test_dir, sup_filename), sep="\t")
                test_df = pd.concat([test_df, sup_test_df])

            # Concatenation leaves duplicated index values: renumber the rows.
            train_df = train_df.reset_index(drop=True)
            test_df = test_df.reset_index(drop=True)

        baseline_filename = f"{diagnosis}_baseline.tsv"
        train_df.to_csv(path.join(train_dir, baseline_filename), sep="\t", index=False)
        test_df.to_csv(path.join(test_dir, baseline_filename), sep="\t", index=False)

        # Only the training set gets a longitudinal file in this function.
        long_train_df = retrieve_longitudinal(train_df, diagnosis_df)
        long_train_df.to_csv(
            path.join(train_dir, f"{diagnosis}.tsv"), sep="\t", index=False
        )
Exemple #2
0
def _write_tsv(df, directory, filename):
    """Write `df` as a tab-separated file `filename` in `directory`, without the index."""
    df.to_csv(path.join(directory, filename), sep="\t", index=False)


def split_diagnoses(
    formatted_data_path,
    n_test=100,
    subset_name="test",
    MCI_sub_categories=True,
    p_age_threshold=0.80,
    p_sex_threshold=0.80,
    categorical_split_variable=None,
    ignore_demographics=False,
    verbose=0,
):
    """
    Performs a single split for each label independently on the subject level.
    Both the train folder and the <subset_name> folder will contain two lists per
    diagnosis (baseline and longitudinal).

    The age and sex distributions between the two sets must be non-significant (according to T-test and chi-square).

    Existing `train` and `<subset_name>` folders in `formatted_data_path` are deleted
    before the new split is written.

    Args:
        formatted_data_path (str): Path to the folder containing data extracted by clinicadl tsvtool getlabels.
        n_test (float):
            If >= 1, number of subjects to put in set with name 'subset_name'.
            If < 1, proportion of subjects to put in set with name 'subset_name'.
            If 0, no training set is created and the whole dataset is considered as one set with name 'subset_name'.
        subset_name (str): Name of the subset that is complementary to train.
        MCI_sub_categories (bool): If True, manages MCI sub-categories to avoid data leakage.
        p_age_threshold (float): The threshold used for the T-test on age distributions.
        p_sex_threshold (float): The threshold used for the chi-square test on sex distributions.
        categorical_split_variable (str): name of a categorical variable to perform a stratified split.
            If None, "diagnosis" is used.
        ignore_demographics (bool): If True the diagnoses are split without taking into account the demographics
            distributions (age, sex).
        verbose (int): level of verbosity (currently not used by this function).

    Returns:
        None. Writes, for each <label>.tsv file present in formatted_data_path:
            - formatted_data_path/train/<label>.tsv (only if n_test > 0)
            - formatted_data_path/train/<label>_baseline.tsv (only if n_test > 0)
            - formatted_data_path/<subset_name>/<label>.tsv
            - formatted_data_path/<subset_name>/<label>_baseline.tsv

    Raises:
        ClinicaDLArgumentError: if MCI_sub_categories is True and MCI.tsv exists, but no
            MCI sub-category intersecting with MCI was found.
    """
    commandline_to_json(
        {
            "output_dir": formatted_data_path,
            "n_test": n_test,
            "subset_name": subset_name,
            "MCI_sub_categories": MCI_sub_categories,
            "p_age_threshold": p_age_threshold,
            "p_sex_threshold": p_sex_threshold,
            "categorical_split_variable": categorical_split_variable,
            "ignore_demographics": ignore_demographics,
        },
        filename="split.json",
    )

    # Read files
    results_path = formatted_data_path

    # Start from clean output folders; the train folder only exists for a real split.
    train_path = path.join(results_path, "train")
    if path.exists(train_path):
        shutil.rmtree(train_path)
    if n_test > 0:
        os.makedirs(train_path)

    if categorical_split_variable is None:
        categorical_split_variable = "diagnosis"

    test_path = path.join(results_path, subset_name)
    if path.exists(test_path):
        shutil.rmtree(test_path)
    os.makedirs(test_path)

    # Keep the longitudinal label files only (<label>.tsv, not <label>_baseline.tsv).
    diagnosis_df_paths = [
        x
        for x in os.listdir(results_path)
        if x.endswith(".tsv") and not x.endswith("_baseline.tsv")
    ]

    MCI_special_treatment = False

    if "MCI.tsv" in diagnosis_df_paths and n_test > 0:
        if MCI_sub_categories:
            # MCI is split last, once its sub-categories have been written.
            diagnosis_df_paths.remove("MCI.tsv")
            MCI_special_treatment = True
        elif "sMCI.tsv" in diagnosis_df_paths or "pMCI.tsv" in diagnosis_df_paths:
            logger.warning(
                "MCI special treatment was deactivated though MCI subgroups were found. "
                "Be aware that it may cause data leakage in transfer learning tasks."
            )

    # The baseline session must be kept before or we are taking all the sessions to mix them
    for diagnosis_df_path in diagnosis_df_paths:
        diagnosis_df = pd.read_csv(path.join(results_path, diagnosis_df_path), sep="\t")
        interest_columns = diagnosis_df.columns.values
        diagnosis = diagnosis_df_path.split(".")[0]
        logger.info(f"Running split for diagnosis {diagnosis}")
        if n_test > 0:
            train_df, test_df = create_split(
                diagnosis,
                diagnosis_df,
                categorical_split_variable,
                n_test=n_test,
                p_age_threshold=p_age_threshold,
                p_sex_threshold=p_sex_threshold,
                ignore_demographics=ignore_demographics,
            )
            # Save baseline splits
            _write_tsv(train_df, train_path, f"{diagnosis}_baseline.tsv")
            _write_tsv(test_df, test_path, f"{diagnosis}_baseline.tsv")

            long_train_df = retrieve_longitudinal(train_df, diagnosis_df)
            _write_tsv(long_train_df, train_path, f"{diagnosis}.tsv")
        else:
            # No training set: every baseline session goes to the subset.
            baseline_df = extract_baseline(diagnosis_df)
            test_df = baseline_df[interest_columns]
            _write_tsv(test_df, test_path, f"{diagnosis}_baseline.tsv")

        long_test_df = retrieve_longitudinal(test_df, diagnosis_df)
        _write_tsv(long_test_df, test_path, f"{diagnosis}.tsv")

    if MCI_special_treatment:

        # Extraction of MCI subjects without intersection with the sMCI / pMCI train
        diagnosis_df = pd.read_csv(path.join(results_path, "MCI.tsv"), sep="\t")
        MCI_df = diagnosis_df.set_index(["participant_id", "session_id"])
        baseline_df = extract_baseline(MCI_df, set_index=False)

        # Convert n_test to a number of subjects, relative to the whole MCI set.
        # `>= 1` matches the documented contract: n_test == 1 means one subject,
        # not 100% of the subjects.
        if n_test >= 1:
            n_test = int(n_test)
        else:
            n_test = int(n_test * len(baseline_df))

        MCI_df, supplementary_diagnoses = remove_sub_labels(
            MCI_df, ["sMCI", "pMCI"], diagnosis_df_paths, results_path
        )
        if len(supplementary_diagnoses) == 0:
            raise ClinicaDLArgumentError(
                "The MCI_sub_categories flag is not needed as there are no intersections with "
                "MCI subcategories."
            )

        # Construction of supplementary train
        supplementary_train_df = pd.DataFrame()
        for diagnosis in supplementary_diagnoses:
            sup_baseline_train_df = pd.read_csv(
                path.join(train_path, f"{diagnosis}_baseline.tsv"), sep="\t"
            )
            supplementary_train_df = pd.concat(
                [supplementary_train_df, sup_baseline_train_df]
            )
            sub_df = (
                supplementary_train_df.reset_index()
                .groupby("participant_id")["session_id"]
                .nunique()
            )
            logger.debug(
                f"supplementary_train_df {len(sub_df)} subjects, {len(supplementary_train_df)} scans"
            )

        supplementary_train_df.reset_index(drop=True, inplace=True)

        # MCI selection
        MCI_df.reset_index(inplace=True)
        baseline_df = extract_baseline(MCI_df)

        train_df, test_df = create_split(
            "MCI",
            baseline_df,
            categorical_split_variable,
            n_test=n_test,
            p_age_threshold=p_age_threshold,
            p_sex_threshold=p_sex_threshold,
            ignore_demographics=ignore_demographics,
            supplementary_train_df=supplementary_train_df,
        )

        # Write selection of MCI
        _write_tsv(train_df, train_path, "MCI_baseline.tsv")
        _write_tsv(test_df, test_path, "MCI_baseline.tsv")

        long_train_df = retrieve_longitudinal(train_df, diagnosis_df)
        _write_tsv(long_train_df, train_path, "MCI.tsv")
        long_test_df = retrieve_longitudinal(test_df, diagnosis_df)
        _write_tsv(long_test_df, test_path, "MCI.tsv")