Esempio n. 1
0
def create_split(
    diagnosis,
    diagnosis_df,
    split_label,
    n_test,
    p_age_threshold=0.80,
    p_sex_threshold=0.80,
    supplementary_train_df=None,
    ignore_demographics=False,
):
    """
    Split data at the subject-level in training and test set with equivalent age, sex and split_label distributions

    When demographics are taken into account, stratified shuffle splits on
    split_label are drawn repeatedly until both the t-test on age and the
    chi2 test on sex give p-values at or above their thresholds. When they
    are ignored, the test subjects are drawn uniformly at random.

    Args:
        diagnosis: (str) diagnosis on which the split is done (only used in the log message)
        diagnosis_df: DataFrame with columns including ['participant_id', 'session_id', 'diagnosis']
        split_label: (str) label on which the split is done (categorical variables)
        n_test: (float)
            If >= 1 number of subjects to put in the test set.
            If < 1 proportion of subjects to put in the test set.
        p_age_threshold: (float) threshold for the t-test on age.
        p_sex_threshold: (float) threshold for the chi2 test on sex.
        supplementary_train_df: (DataFrame) Add data that must be included in the train set.
            Its 'sex' and 'age' columns are read only when demographics are balanced.
        ignore_demographics: (bool): If True the diagnoses are split without taking into account the demographics
            distributions (age, sex).

    Returns:
        train_df (DataFrame) subjects in the train set
        test_df (DataFrame) subjects in the test set

    Raises:
        ClinicaDLArgumentError: if split_label is not a column of the baseline data,
            or if the age / sex columns cannot be found while ignore_demographics is False.
    """

    baseline_df = extract_baseline(diagnosis_df)

    # A value below 1 is a proportion of the baseline subjects.
    if n_test >= 1:
        n_test = int(n_test)
    else:
        n_test = int(n_test * len(baseline_df))

    if split_label not in set(baseline_df.columns.values):
        raise ClinicaDLArgumentError(
            f"The column {split_label} is missing. "
            f"Please add it using the --variables_of_interest flag in getlabels."
        )

    if not ignore_demographics:
        try:
            sex_label = find_label(baseline_df.columns.values, "sex")
            age_label = find_label(baseline_df.columns.values, "age")
        except ClinicaDLArgumentError as error:
            raise ClinicaDLArgumentError(
                "This dataset do not have age or sex values. "
                "Please add the flag --ignore_demographics to split "
                "without trying to balance age or sex distributions."
            ) from error

        # The supplementary demographics are only needed for the statistical
        # tests, so they are converted here rather than unconditionally.
        if supplementary_train_df is not None:
            sup_train_sex = [
                sex_dict[x] for x in supplementary_train_df.sex.values
            ]
            sup_train_age = [
                float(x) for x in supplementary_train_df.age.values
            ]
        else:
            sup_train_sex = []
            sup_train_age = []

        sex = list(baseline_df[sex_label].values)
        age = list(baseline_df[age_label].values)
        category = list(baseline_df[split_label].values)
        category = category_conversion(category)
        category = remove_unicity(category)

        n_try = 0

        # NOTE(review): there is no upper bound on the number of trials; the
        # loop only ends once both p-values reach their thresholds.
        while True:
            n_try += 1
            splits = StratifiedShuffleSplit(n_splits=1, test_size=n_test)
            train_index, test_index = next(splits.split(category, category))

            # Find the value for different demographics (age & sex).
            # A constant variable cannot be tested: the p-value is forced to 1.
            if len(set(age)) != 1:
                age_test = [float(age[idx]) for idx in test_index]
                age_train = [float(age[idx])
                             for idx in train_index] + sup_train_age
                _, p_age = ttest_ind(age_test, age_train, nan_policy="omit")
            else:
                p_age = 1

            if len(set(sex)) != 1:
                sex_test = [sex_dict[sex[idx]] for idx in test_index]
                sex_train = [sex_dict[sex[idx]]
                             for idx in train_index] + sup_train_sex
                _, p_sex = chi2(sex_test, sex_train)
            else:
                p_sex = 1

            logger.debug(f"p_age={p_age:.2f}, p_sex={p_sex:.4f}")

            if p_sex >= p_sex_threshold and p_age >= p_age_threshold:
                # assumes baseline_df has a default integer index so that the
                # positional split indices are valid labels — TODO confirm
                test_df = baseline_df.loc[test_index]
                train_df = baseline_df.loc[train_index]
                break

        logger.info(
            f"Split for diagnosis {diagnosis} was found after {n_try} trials.")

    else:
        idx = np.arange(len(baseline_df))
        idx_test = np.random.choice(idx, size=n_test, replace=False)
        idx_test.sort()
        idx_train = complementary_list(idx, idx_test)
        test_df = baseline_df.loc[idx_test]
        train_df = baseline_df.loc[idx_train]

    # The supplementary subjects must end up in the train set whichever
    # splitting strategy was used.
    if supplementary_train_df is not None:
        train_df = pd.concat([train_df, supplementary_train_df])
        train_df.reset_index(drop=True, inplace=True)

    return train_df, test_df
Esempio n. 2
0
def get_labels(
    merged_tsv: str,
    missing_mods: str,
    results_path: str,
    diagnoses: List[str],
    modality: str = "t1w",
    restriction_path: str = None,
    time_horizon: int = 36,
    variables_of_interest: List[str] = None,
    remove_smc: bool = True,
):
    """
    Writes one TSV file per label in diagnoses argument based on merged_tsv and missing_mods.

    The labels handled are AD, BV, CN, MCI, sMCI and pMCI; any other value in
    diagnoses is ignored. Each label is written to ``<label>.tsv`` in
    results_path, and the call arguments are saved through commandline_to_json.

    Args:
        merged_tsv: Path to the file obtained by the command clinica iotools merge-tsv.
        missing_mods: Path to the folder where the outputs of clinica iotools check-missing-modalities are.
        results_path: Path to the folder where tsv files are extracted.
        diagnoses: Labels that must be extracted from merged_tsv.
        modality: Modality to select sessions. Sessions which do not include the modality will be excluded.
        restriction_path: Path to a tsv containing the sessions that can be included.
        time_horizon: Time horizon to analyse stability of MCI subjects.
        variables_of_interest: columns that should be kept in the output tsv files.
        remove_smc: if True SMC participants are removed from the lists.

    Raises:
        ValueError: if a requested variable is not a column of merged_tsv, or if
            a TSV file of missing_mods is empty.
    """

    commandline_to_json(
        {
            "output_dir": results_path,
            "merged_tsv": merged_tsv,
            "missing_mods": missing_mods,
            "diagnoses": diagnoses,
            "modality": modality,
            "restriction_path": restriction_path,
            "time_horizon": time_horizon,
            "variables_of_interest": variables_of_interest,
            "remove_smc": remove_smc,
        },
        filename="getlabels.json",
    )

    # Reading files
    bids_df = pd.read_csv(merged_tsv, sep="\t")
    bids_df.set_index(["participant_id", "session_id"], inplace=True)
    variables_list = ["diagnosis"]
    try:
        variables_list.append(find_label(bids_df.columns.values, "age"))
        variables_list.append(find_label(bids_df.columns.values, "sex"))
    except ValueError:
        # NOTE(review): assumes the error raised by find_label is a ValueError
        # (or a subclass of it) — confirm against its definition.
        logger.warning("The age or sex values were not found in the dataset.")
    if variables_of_interest is not None:
        variables_set = set(variables_of_interest) | set(variables_list)
        variables_list = list(variables_set)
        if not set(variables_list).issubset(set(bids_df.columns.values)):
            raise ValueError(
                f"The variables asked by the user {variables_of_interest} do not "
                f"exist in the data set.")

    # One DataFrame per session, keyed by the last "_"-separated token of the
    # file name and indexed by participant_id.
    missing_mods_dict = {}
    for file in os.listdir(missing_mods):
        filename, fileext = path.splitext(file)
        if fileext != ".tsv":
            continue
        session = filename.split("_")[-1]
        missing_mods_df = pd.read_csv(path.join(missing_mods, file), sep="\t")
        if len(missing_mods_df) == 0:
            raise ValueError(
                f"Empty DataFrame at path {path.join(missing_mods, file)}")

        missing_mods_df.set_index("participant_id", drop=True, inplace=True)
        missing_mods_dict[session] = missing_mods_df

    # Creating results path
    os.makedirs(results_path, exist_ok=True)

    # Remove SMC patients
    if remove_smc:
        if "diagnosis_bl" in bids_df.columns.values:  # Retro-compatibility
            bids_df = bids_df[~(bids_df.diagnosis_bl == "SMC")]
        if "diagnosis_sc" in bids_df.columns.values:
            bids_df = bids_df[~(bids_df.diagnosis_sc == "SMC")]

    # Adding the field baseline_diagnosis.
    # The column is created with object dtype because it receives diagnosis
    # labels; starting from a float column relied on silent upcasting, which
    # pandas has deprecated. Every row is overwritten by the loop below.
    bids_copy_df = copy(bids_df)
    bids_copy_df["baseline_diagnosis"] = pd.Series(index=bids_df.index,
                                                   dtype=object)
    for subject, subject_df in bids_df.groupby(level=0):
        # A subject without a "ses-M00" session makes this lookup raise.
        baseline_diagnosis = subject_df.loc[(subject, "ses-M00"), "diagnosis"]
        bids_copy_df.loc[subject, "baseline_diagnosis"] = baseline_diagnosis

    bids_df = copy(bids_copy_df)

    # Labels obtained directly through stable_selection, in output order.
    for label in ("AD", "BV", "CN"):
        if label not in diagnoses:
            continue
        logger.info(f"Beginning the selection of {label} label")
        output_df = stable_selection(bids_df, diagnosis=label)
        output_df = _select_sessions(output_df, missing_mods_dict, modality,
                                     restriction_path)
        _write_label_tsv(output_df, label, variables_list, results_path)

    if "MCI" in diagnoses:
        logger.info("Beginning of the selection of MCI label")
        MCI_df = mci_stability(
            bids_df, 10**4)  # Remove rMCI independently from time horizon
        output_df = diagnosis_removal(MCI_df, diagnosis_list=["rMCI"])
        output_df = _select_sessions(output_df, missing_mods_dict, modality,
                                     restriction_path)

        # Relabelling everything as MCI
        output_df.diagnosis = ["MCI"] * len(output_df)

        _write_label_tsv(output_df, "MCI", variables_list, results_path)

    # Computed at most once and shared between the sMCI and pMCI selections.
    time_MCI_df = None
    if "sMCI" in diagnoses:
        logger.info("Beginning of the selection of sMCI label")
        time_MCI_df = mci_stability(bids_df, time_horizon)
        output_df = diagnosis_removal(time_MCI_df,
                                      diagnosis_list=["rMCI", "pMCI"])
        output_df = output_df[output_df.diagnosis == "sMCI"]
        output_df = _select_sessions(output_df, missing_mods_dict, modality,
                                     restriction_path)
        _write_label_tsv(output_df, "sMCI", variables_list, results_path)

    if "pMCI" in diagnoses:
        logger.info("Beginning of the selection of pMCI label")
        if time_MCI_df is None:
            time_MCI_df = mci_stability(bids_df, time_horizon)
        output_df = time_MCI_df[time_MCI_df.diagnosis == "pMCI"]
        output_df = _select_sessions(output_df, missing_mods_dict, modality,
                                     restriction_path)
        _write_label_tsv(output_df, "pMCI", variables_list, results_path)


def _select_sessions(label_df, missing_mods_dict, modality, restriction_path):
    """Apply mod_selection then apply_restriction to the sessions of one label."""
    label_df = mod_selection(label_df, missing_mods_dict, modality)
    return apply_restriction(label_df, restriction_path)


def _write_label_tsv(label_df, label, variables_list, results_path):
    """Write the variables_list columns to ``<label>.tsv`` and log the subject and session counts."""
    diagnosis_df = label_df[variables_list]
    diagnosis_df.to_csv(path.join(results_path, f"{label}.tsv"), sep="\t")
    sub_df = (diagnosis_df.reset_index().groupby("participant_id")
              ["session_id"].nunique())
    logger.info(
        f"Found {len(sub_df)} {label} subjects for a total of {len(diagnosis_df)} sessions\n"
    )
Esempio n. 3
0
def demographics_analysis(merged_tsv, formatted_data_path, results_path,
                          diagnoses):
    """
    Build a table of demographic statistics with one row per diagnosis label
    and write it as a tsv file.

    For each label, ``<label>.tsv`` is read from formatted_data_path (falling
    back to ``<label>_baseline.tsv`` when it does not exist). Age, MMSE and
    CDR are taken from the first session of each subject whose diagnosis in
    merged_tsv is not a float.

    Args:
        merged_tsv (str): Path to the file obtained by the command clinica iotools merge-tsv.
        formatted_data_path (str): Path to the folder containing data extracted by clinicadl tsvtool getlabels.
        results_path (str): Path to the output tsv file (filename included).
        diagnoses (list): Labels selected for the demographic analysis.

    Returns:
        Nothing. Writes one tsv file at results_path containing the
        demographic analysis of the tsv files in formatted_data_path.

    Raises:
        ValueError: if a subject has neither "F" nor "M" as sex value.
    """

    merged_df = pd.read_csv(merged_tsv, sep="\t")
    merged_df.set_index(["participant_id", "session_id"], inplace=True)
    os.makedirs(path.abspath(path.join(results_path, os.pardir)),
                exist_ok=True)

    age_column = find_label(merged_df.columns.values, "age")
    sex_column = find_label(merged_df.columns.values, "sex")
    mmse_column = find_label(merged_df.columns.values, "mms")
    cdr_column = "cdr_global"

    # CDR values that have their own counter column, tested in this order.
    cdr_columns = (
        (0, "CDR_0"),
        (0.5, "CDR_0.5"),
        (1, "CDR_1"),
        (2, "CDR_2"),
        (3, "CDR_3"),
    )
    # Reducers paired with the prefix of the column they fill.
    reducers = (
        ("mean", np.nanmean),
        ("std", np.nanstd),
        ("min", np.nanmin),
        ("max", np.nanmax),
    )

    columns = ["n_subjects"]
    columns += [f"{prefix}_age" for prefix, _ in reducers]
    columns += ["sexF", "sexM"]
    columns += [f"{prefix}_MMSE" for prefix, _ in reducers]
    columns += [column for _, column in cdr_columns]
    columns += ["mean_scans", "std_scans", "n_scans"]

    results_df = pd.DataFrame(np.zeros((len(diagnoses), len(columns))),
                              index=diagnoses,
                              columns=columns)

    # Need all values for mean and variance (age, MMSE and scans)
    collected = {}
    for diagnosis in diagnoses:
        values = {"age": [], "MMSE": [], "scans": []}
        collected[diagnosis] = values

        tsv_path = path.join(formatted_data_path, diagnosis + ".tsv")
        if not path.exists(tsv_path):
            print("TSV file with all sessions was not found for diagnosis %s. "
                  "Loads baseline version instead." % diagnosis)
            tsv_path = path.join(formatted_data_path,
                                 diagnosis + "_baseline.tsv")
        diagnosis_df = pd.read_csv(tsv_path, sep="\t")

        # NOTE(review): the result of add_demographics is not used further in
        # this function; the call is kept as-is.
        with_demographics_df = add_demographics(diagnosis_df, merged_df,
                                                diagnosis)
        with_demographics_df.set_index(["participant_id", "session_id"],
                                       inplace=True)
        diagnosis_df.set_index(["participant_id", "session_id"], inplace=True)

        for subject, subject_df in diagnosis_df.groupby(level=0):
            # Move forward until a session whose diagnosis is not a float.
            session_id = first_session(subject_df)
            while isinstance(
                    merged_df.loc[(subject, session_id), "diagnosis"], float):
                session_id = next_session(subject_df, session_id)
            all_sessions_df = merged_df.loc[subject]
            n_sessions = len(subject_df)

            # Extract features
            results_df.loc[diagnosis, "n_subjects"] += 1
            results_df.loc[diagnosis, "n_scans"] += n_sessions
            values["age"].append(merged_df.loc[(subject, session_id),
                                               age_column])
            values["MMSE"].append(merged_df.loc[(subject, session_id),
                                                mmse_column])
            values["scans"].append(n_sessions)

            # "F" takes precedence when both values appear for one subject.
            has_female = all_sessions_df[sex_column].isin(["F"]).any()
            has_male = all_sessions_df[sex_column].isin(["M"]).any()
            if has_female:
                results_df.loc[diagnosis, "sexF"] += 1
            elif has_male:
                results_df.loc[diagnosis, "sexM"] += 1
            else:
                raise ValueError("Patient %s has no sex" % subject)

            cdr = merged_df.at[(subject, session_id), cdr_column]
            for cdr_value, cdr_counter in cdr_columns:
                if cdr == cdr_value:
                    results_df.loc[diagnosis, cdr_counter] += 1
                    break
            else:
                warn(f"Patient {subject} has CDR {cdr}")

    for diagnosis in diagnoses:
        values = collected[diagnosis]
        for feature in ("age", "MMSE"):
            for prefix, reducer in reducers:
                results_df.loc[diagnosis,
                               f"{prefix}_{feature}"] = reducer(values[feature])
        # Only the mean and standard deviation are reported for scans.
        for prefix, reducer in reducers[:2]:
            results_df.loc[diagnosis,
                           f"{prefix}_scans"] = reducer(values["scans"])

        for key, collected_values in values.items():
            if np.isnan(collected_values).any():
                warn(
                    f"NaN values were found for {key} values associated to diagnosis {diagnosis}"
                )

    results_df.index.name = "diagnosis"

    results_df.to_csv(results_path, sep="\t")