Example #1
File: generate.py Project: mdiazmel/AD-DL
def generate_shepplogan_dataset(output_dir,
                                img_size,
                                labels_distribution,
                                samples=100,
                                smoothing=True):

    check_and_clean(join(output_dir, "subjects"))
    commandline_to_json({
        "output_dir": output_dir,
        "img_size": img_size,
        "labels_distribution": labels_distribution,
        "samples": samples,
        "smoothing": smoothing,
    })
    columns = ["participant_id", "session_id", "diagnosis", "subtype"]
    data_df = pd.DataFrame(columns=columns)

    for i, label in enumerate(labels_distribution.keys()):
        samples_per_subtype = np.array(labels_distribution[label]) * samples
        for subtype in range(len(samples_per_subtype)):
            for j in range(int(samples_per_subtype[subtype])):
                participant_id = "sub-CLNC%i%04d" % (
                    i,
                    j + np.sum(samples_per_subtype[:subtype]).astype(int),
                )
                session_id = "ses-M00"
                row_df = pd.DataFrame(
                    [[participant_id, session_id, label, subtype]],
                    columns=columns)
                data_df = data_df.append(row_df)

                # Image generation
                path_out = join(
                    output_dir,
                    "subjects",
                    "%s_%s%s.pt" %
                    (participant_id, session_id, FILENAME_TYPE["shepplogan"]),
                )
                img = generate_shepplogan_phantom(img_size,
                                                  label=subtype,
                                                  smoothing=smoothing)
                torch_img = torch.from_numpy(img).float().unsqueeze(0)
                torch.save(torch_img, path_out)

    data_df.to_csv(join(output_dir, "data.tsv"), sep="\t", index=False)

    missing_path = join(output_dir, "missing_mods")
    if not exists(missing_path):
        makedirs(missing_path)

    sessions = data_df.session_id.unique()
    for session in sessions:
        session_df = data_df[data_df.session_id == session]
        out_df = copy(session_df[["participant_id"]])
        out_df["t1w"] = [1] * len(out_df)
        out_df.to_csv(join(missing_path, "missing_mods_%s.tsv" % session),
                      sep="\t",
                      index=False)
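
Below is a minimal usage sketch for this version of the function. The import path and all argument values are illustrative assumptions, not taken from the project:

# Hypothetical usage sketch; the module path is assumed for illustration.
from clinicadl.tools.data.generate import generate_shepplogan_dataset

# In this version, each label maps to per-subtype proportions that are
# multiplied by `samples` to obtain a deterministic count per subtype.
labels_distribution = {
    "AD": [0.05, 0.85, 0.10],
    "CN": [1.0, 0.0, 0.0],
}
generate_shepplogan_dataset(
    output_dir="shepplogan_caps",
    img_size=128,
    labels_distribution=labels_distribution,
    samples=100,
    smoothing=True,
)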
Example #2
def split_diagnoses(
    formatted_data_path: str,
    n_splits: int = 5,
    subset_name: str = "validation",
    MCI_sub_categories: bool = True,
    stratification: str = None,
):
    """
    Performs a k-fold split for each label independently on the subject level.
    The train folder will contain two lists per fold per diagnosis (baseline and longitudinal),
    whereas the test folder will only include the list of baseline sessions for each split.

    Writes three files per split per <label>.tsv file present in formatted_data_path:
            - formatted_data_path/train_splits-<n_splits>/split-<split>/<label>.tsv
            - formatted_data_path/train_splits-<n_splits>/split-<split>/<label>_baseline.tsv
            - formatted_data_path/<subset_name>_splits-<n_splits>/split-<split>/<label>_baseline.tsv

    Args:
        formatted_data_path: Path to the folder containing data extracted by clinicadl tsvtool getlabels.
        n_splits: Number of splits in the k-fold cross-validation.
        subset_name: Name of the subset that is complementary to train.
        MCI_sub_categories: If True, manages MCI sub-categories to avoid data leakage.
        stratification: Name of variable used to stratify k-fold.
    """
    commandline_to_json(
        {
            "output_dir": formatted_data_path,
            "n_splits": n_splits,
            "subset_name": subset_name,
            "MCI_sub_categories": MCI_sub_categories,
            "stratification": stratification,
        },
        filename="kfold.json",
    )

    # Read files
    results_path = formatted_data_path

    train_path = path.join(results_path, f"train_splits-{n_splits}")
    if path.exists(train_path):
        shutil.rmtree(train_path)
    os.makedirs(train_path)
    for i in range(n_splits):
        os.mkdir(path.join(train_path, f"split-{i}"))

    test_path = path.join(results_path, f"{subset_name}_splits-{n_splits}")
    if path.exists(test_path):
        shutil.rmtree(test_path)
    os.makedirs(test_path)
    for i in range(n_splits):
        os.mkdir(path.join(test_path, f"split-{i}"))

    diagnosis_df_paths = os.listdir(results_path)
    diagnosis_df_paths = [
        x for x in diagnosis_df_paths
        if (x.endswith(".tsv") and not x.endswith("_baseline.tsv"))
    ]

    MCI_special_treatment = False

    if "MCI.tsv" in diagnosis_df_paths:
        if MCI_sub_categories:
            diagnosis_df_paths.remove("MCI.tsv")
            MCI_special_treatment = True
        elif "sMCI.tsv" in diagnosis_df_paths or "pMCI.tsv" in diagnosis_df_paths:
            logger.warning(
                "MCI special treatment was deactivated though MCI subgroups were found. "
                "Be aware that it may cause data leakage in transfer learning tasks."
            )

    # The baseline sessions must be extracted first, otherwise all the sessions would be mixed together
    for diagnosis_df_path in diagnosis_df_paths:
        diagnosis = diagnosis_df_path.split(".")[0]

        diagnosis_df = pd.read_csv(path.join(results_path, diagnosis_df_path),
                                   sep="\t")
        write_splits(diagnosis, diagnosis_df, stratification, n_splits,
                     train_path, test_path)

        logger.info(f"K-fold split for diagnosis {diagnosis} is done.")

    if MCI_special_treatment:

        # Extraction of MCI subjects without intersection with the sMCI / pMCI train
        diagnosis_df = pd.read_csv(path.join(results_path, "MCI.tsv"),
                                   sep="\t")
        MCI_df = diagnosis_df.set_index(["participant_id", "session_id"])
        MCI_df, supplementary_diagnoses = remove_sub_labels(
            MCI_df, ["sMCI", "pMCI"], diagnosis_df_paths, results_path)

        if len(supplementary_diagnoses) == 0:
            raise ClinicaDLArgumentError(
                "The MCI_sub_categories flag is not needed as there are no intersections with "
                "MCI subcategories.")

        MCI_df.reset_index(drop=False, inplace=True)
        logger.debug(MCI_df)
        write_splits(
            "MCI",
            MCI_df,
            stratification,
            n_splits,
            train_path,
            test_path,
            supplementary_diagnoses=supplementary_diagnoses,
        )
        logger.info("K-fold split for diagnosis MCI is done.")
Example #3
File: getlabels.py Project: 14thibea/AD-DL
def get_labels(
    merged_tsv: str,
    missing_mods: str,
    results_path: str,
    diagnoses: List[str],
    modality: str = "t1w",
    restriction_path: str = None,
    time_horizon: int = 36,
    variables_of_interest: List[str] = None,
    remove_smc: bool = True,
):
    """
    Writes one TSV file per label in diagnoses argument based on merged_tsv and missing_mods.

    Args:
        merged_tsv: Path to the file obtained by the command clinica iotools merge-tsv.
        missing_mods: Path to the folder where the outputs of clinica iotools check-missing-modalities are.
        results_path: Path to the folder where tsv files are extracted.
        diagnoses: Labels that must be extracted from merged_tsv.
        modality: Modality to select sessions. Sessions which do not include the modality will be excluded.
        restriction_path: Path to a tsv containing the sessions that can be included.
        time_horizon: Time horizon to analyse stability of MCI subjects.
        variables_of_interest: columns that should be kept in the output tsv files.
        remove_smc: if True SMC participants are removed from the lists.
    """

    commandline_to_json(
        {
            "output_dir": results_path,
            "merged_tsv": merged_tsv,
            "missing_mods": missing_mods,
            "diagnoses": diagnoses,
            "modality": modality,
            "restriction_path": restriction_path,
            "time_horizon": time_horizon,
            "variables_of_interest": variables_of_interest,
            "remove_smc": remove_smc,
        },
        filename="getlabels.json",
    )

    # Reading files
    bids_df = pd.read_csv(merged_tsv, sep="\t")
    bids_df.set_index(["participant_id", "session_id"], inplace=True)
    variables_list = ["diagnosis"]
    try:
        variables_list.append(find_label(bids_df.columns.values, "age"))
        variables_list.append(find_label(bids_df.columns.values, "sex"))
    except ValueError:
        logger.warning("The age or sex values were not found in the dataset.")
    if variables_of_interest is not None:
        variables_set = set(variables_of_interest) | set(variables_list)
        variables_list = list(variables_set)
        if not set(variables_list).issubset(set(bids_df.columns.values)):
            raise ValueError(
                f"The variables asked by the user {variables_of_interest} do not "
                f"exist in the data set.")

    list_files = os.listdir(missing_mods)
    missing_mods_dict = {}

    for file in list_files:
        filename, fileext = path.splitext(file)
        if fileext == ".tsv":
            session = filename.split("_")[-1]
            missing_mods_df = pd.read_csv(path.join(missing_mods, file),
                                          sep="\t")
            if len(missing_mods_df) == 0:
                raise ValueError(
                    f"Empty DataFrame at path {path.join(missing_mods, file)}")

            missing_mods_df.set_index("participant_id",
                                      drop=True,
                                      inplace=True)
            missing_mods_dict[session] = missing_mods_df

    # Creating results path
    os.makedirs(results_path, exist_ok=True)

    # Remove SMC patients
    if remove_smc:
        if "diagnosis_bl" in bids_df.columns.values:  # Retro-compatibility
            bids_df = bids_df[~(bids_df.diagnosis_bl == "SMC")]
        if "diagnosis_sc" in bids_df.columns.values:
            bids_df = bids_df[~(bids_df.diagnosis_sc == "SMC")]

    # Adding the field baseline_diagnosis
    bids_copy_df = copy(bids_df)
    bids_copy_df["baseline_diagnosis"] = pd.Series(np.zeros(len(bids_df)),
                                                   index=bids_df.index)
    for subject, subject_df in bids_df.groupby(level=0):
        baseline_diagnosis = subject_df.loc[(subject, "ses-M00"), "diagnosis"]
        bids_copy_df.loc[subject, "baseline_diagnosis"] = baseline_diagnosis

    bids_df = copy(bids_copy_df)

    time_MCI_df = None
    if "AD" in diagnoses:
        logger.info("Beginning the selection of AD label")
        output_df = stable_selection(bids_df, diagnosis="AD")
        output_df = mod_selection(output_df, missing_mods_dict, modality)
        output_df = apply_restriction(output_df, restriction_path)

        diagnosis_df = output_df[variables_list]
        diagnosis_df.to_csv(path.join(results_path, "AD.tsv"), sep="\t")
        sub_df = (diagnosis_df.reset_index().groupby("participant_id")
                  ["session_id"].nunique())
        logger.info(
            f"Found {len(sub_df)} AD subjects for a total of {len(diagnosis_df)} sessions\n"
        )

    if "BV" in diagnoses:
        logger.info("Beginning the selection of BV label")
        output_df = stable_selection(bids_df, diagnosis="BV")
        output_df = mod_selection(output_df, missing_mods_dict, modality)
        output_df = apply_restriction(output_df, restriction_path)

        diagnosis_df = output_df[variables_list]
        diagnosis_df.to_csv(path.join(results_path, "BV.tsv"), sep="\t")
        sub_df = (diagnosis_df.reset_index().groupby("participant_id")
                  ["session_id"].nunique())
        logger.info(
            f"Found {len(sub_df)} BV subjects for a total of {len(diagnosis_df)} sessions\n"
        )

    if "CN" in diagnoses:
        logger.info("Beginning the selection of CN label")
        output_df = stable_selection(bids_df, diagnosis="CN")
        output_df = mod_selection(output_df, missing_mods_dict, modality)
        output_df = apply_restriction(output_df, restriction_path)

        diagnosis_df = output_df[variables_list]
        diagnosis_df.to_csv(path.join(results_path, "CN.tsv"), sep="\t")
        sub_df = (diagnosis_df.reset_index().groupby("participant_id")
                  ["session_id"].nunique())
        logger.info(
            f"Found {len(sub_df)} CN subjects for a total of {len(diagnosis_df)} sessions\n"
        )

    if "MCI" in diagnoses:
        logger.info("Beginning of the selection of MCI label")
        MCI_df = mci_stability(
            bids_df, 10**4)  # Remove rMCI independently from time horizon
        output_df = diagnosis_removal(MCI_df, diagnosis_list=["rMCI"])
        output_df = mod_selection(output_df, missing_mods_dict, modality)
        output_df = apply_restriction(output_df, restriction_path)

        # Relabelling everything as MCI
        output_df.diagnosis = ["MCI"] * len(output_df)

        diagnosis_df = output_df[variables_list]
        diagnosis_df.to_csv(path.join(results_path, "MCI.tsv"), sep="\t")
        sub_df = (diagnosis_df.reset_index().groupby("participant_id")
                  ["session_id"].nunique())
        logger.info(
            f"Found {len(sub_df)} MCI subjects for a total of {len(diagnosis_df)} sessions\n"
        )

    if "sMCI" in diagnoses:
        logger.info("Beginning of the selection of sMCI label")
        time_MCI_df = mci_stability(bids_df, time_horizon)
        output_df = diagnosis_removal(time_MCI_df,
                                      diagnosis_list=["rMCI", "pMCI"])
        output_df = output_df[output_df.diagnosis == "sMCI"]
        output_df = mod_selection(output_df, missing_mods_dict, modality)
        output_df = apply_restriction(output_df, restriction_path)

        diagnosis_df = output_df[variables_list]
        diagnosis_df.to_csv(path.join(results_path, "sMCI.tsv"), sep="\t")
        sub_df = (diagnosis_df.reset_index().groupby("participant_id")
                  ["session_id"].nunique())
        logger.info(
            f"Found {len(sub_df)} sMCI subjects for a total of {len(diagnosis_df)} sessions\n"
        )

    if "pMCI" in diagnoses:
        logger.info("Beginning of the selection of pMCI label")
        if time_MCI_df is None:
            time_MCI_df = mci_stability(bids_df, time_horizon)
        output_df = time_MCI_df[time_MCI_df.diagnosis == "pMCI"]
        output_df = mod_selection(output_df, missing_mods_dict, modality)
        output_df = apply_restriction(output_df, restriction_path)

        diagnosis_df = output_df[variables_list]
        diagnosis_df.to_csv(path.join(results_path, "pMCI.tsv"), sep="\t")
        sub_df = (diagnosis_df.reset_index().groupby("participant_id")
                  ["session_id"].nunique())
        logger.info(
            f"Found {len(sub_df)} pMCI subjects for a total of {len(diagnosis_df)} sessions\n"
        )
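
A minimal usage sketch for get_labels; the import path and file names below are assumptions chosen for illustration:

# Hypothetical usage sketch; the module path is assumed for illustration.
from clinicadl.tools.tsv.getlabels import get_labels

# Writes one <label>.tsv per requested diagnosis under results_path, keeping
# only the sessions for which the t1w modality is available.
get_labels(
    merged_tsv="merged.tsv",
    missing_mods="missing_mods",
    results_path="labels_lists",
    diagnoses=["AD", "CN", "sMCI", "pMCI"],
    modality="t1w",
    time_horizon=36,
)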
Example #4
def split_diagnoses(
    formatted_data_path,
    n_test=100,
    subset_name="test",
    MCI_sub_categories=True,
    p_age_threshold=0.80,
    p_sex_threshold=0.80,
    categorical_split_variable=None,
    ignore_demographics=False,
    verbose=0,
):
    """
    Performs a single split for each label independently on the subject level.
    The train folder will contain two lists per diagnosis (baseline and longitudinal),
    whereas the test folder will only include the list of baseline sessions.

    The age and sex distributions of the two sets must not differ significantly (age is compared with a T-test and sex with a chi-square test).

    Args:
        formatted_data_path (str): Path to the folder containing data extracted by clinicadl tsvtool getlabels.
        n_test (float):
            If >= 1, number of subjects to put in set with name 'subset_name'.
            If < 1, proportion of subjects to put in set with name 'subset_name'.
            If 0, no training set is created and the whole dataset is considered as one set with name 'subset_name'.
        subset_name (str): Name of the subset that is complementary to train.
        MCI_sub_categories (bool): If True, manages MCI sub-categories to avoid data leakage.
        p_age_threshold (float): The threshold used for the T-test on age distributions.
        p_sex_threshold (float): The threshold used for the chi-square test on sex distributions.
        categorical_split_variable (str): name of a categorical variable to perform a stratified split.
        ignore_demographics (bool): If True the diagnoses are split without taking into account the demographics
            distributions (age, sex).
        verbose (int): level of verbosity.

    Returns:
        writes three files per <label>.tsv file present in formatted_data_path:
            - formatted_data_path/train/<label>.tsv
            - formatted_data_path/train/<label>_baseline.tsv
            - formatted_data_path/<subset_name>/<label>_baseline.tsv
    """
    commandline_to_json(
        {
            "output_dir": formatted_data_path,
            "n_test": n_test,
            "subset_name": subset_name,
            "MCI_sub_categories": MCI_sub_categories,
            "p_age_threshold": p_age_threshold,
            "p_sex_threshold": p_sex_threshold,
            "categorical_split_variable": categorical_split_variable,
            "ignore_demographics": ignore_demographics,
        },
        filename="split.json",
    )

    # Read files
    results_path = formatted_data_path

    train_path = path.join(results_path, "train")
    if path.exists(train_path):
        shutil.rmtree(train_path)
    if n_test > 0:
        os.makedirs(train_path)

    if categorical_split_variable is None:
        categorical_split_variable = "diagnosis"

    test_path = path.join(results_path, subset_name)
    if path.exists(test_path):
        shutil.rmtree(test_path)
    os.makedirs(test_path)

    diagnosis_df_paths = os.listdir(results_path)
    diagnosis_df_paths = [x for x in diagnosis_df_paths if x.endswith(".tsv")]
    diagnosis_df_paths = [
        x for x in diagnosis_df_paths if not x.endswith("_baseline.tsv")
    ]

    MCI_special_treatment = False

    if "MCI.tsv" in diagnosis_df_paths and n_test > 0:
        if MCI_sub_categories:
            diagnosis_df_paths.remove("MCI.tsv")
            MCI_special_treatment = True
        elif "sMCI.tsv" in diagnosis_df_paths or "pMCI.tsv" in diagnosis_df_paths:
            logger.warning(
                "MCI special treatment was deactivated though MCI subgroups were found."
                "Be aware that it may cause data leakage in transfer learning tasks."
            )

    # The baseline sessions must be extracted first, otherwise all the sessions would be mixed together
    for diagnosis_df_path in diagnosis_df_paths:
        diagnosis_df = pd.read_csv(path.join(results_path, diagnosis_df_path),
                                   sep="\t")
        interest_columns = diagnosis_df.columns.values
        diagnosis = diagnosis_df_path.split(".")[0]
        logger.info(f"Running split for diagnosis {diagnosis}")
        if n_test > 0:
            train_df, test_df = create_split(
                diagnosis,
                diagnosis_df,
                categorical_split_variable,
                n_test=n_test,
                p_age_threshold=p_age_threshold,
                p_sex_threshold=p_sex_threshold,
                ignore_demographics=ignore_demographics,
            )
            # Save baseline splits
            train_df.to_csv(
                path.join(train_path, f"{diagnosis}_baseline.tsv"),
                sep="\t",
                index=False,
            )
            test_df.to_csv(path.join(test_path, f"{diagnosis}_baseline.tsv"),
                           sep="\t",
                           index=False)

            long_train_df = retrieve_longitudinal(train_df, diagnosis_df)
            long_train_df.to_csv(path.join(train_path, f"{diagnosis}.tsv"),
                                 sep="\t",
                                 index=False)
            long_test_df = retrieve_longitudinal(test_df, diagnosis_df)
            long_test_df.to_csv(path.join(test_path, f"{diagnosis}.tsv"),
                                sep="\t",
                                index=False)

        else:
            baseline_df = extract_baseline(diagnosis_df)
            test_df = baseline_df[interest_columns]
            test_df.to_csv(path.join(test_path, f"{diagnosis}_baseline.tsv"),
                           sep="\t",
                           index=False)
            long_test_df = retrieve_longitudinal(test_df, diagnosis_df)
            long_test_df.to_csv(path.join(test_path, f"{diagnosis}.tsv"),
                                sep="\t",
                                index=False)

    if MCI_special_treatment:

        # Extraction of MCI subjects without intersection with the sMCI / pMCI train
        diagnosis_df = pd.read_csv(path.join(results_path, "MCI.tsv"),
                                   sep="\t")
        MCI_df = diagnosis_df.set_index(["participant_id", "session_id"])
        baseline_df = extract_baseline(MCI_df, set_index=False)

        if n_test > 1:
            n_test = int(n_test)
        else:
            n_test = int(n_test * len(baseline_df))

        MCI_df, supplementary_diagnoses = remove_sub_labels(
            MCI_df, ["sMCI", "pMCI"], diagnosis_df_paths, results_path)
        if len(supplementary_diagnoses) == 0:
            raise ClinicaDLArgumentError(
                "The MCI_sub_categories flag is not needed as there are no intersections with"
                "MCI subcategories.")

        # Construction of supplementary train
        supplementary_train_df = pd.DataFrame()
        for diagnosis in supplementary_diagnoses:
            sup_baseline_train_df = pd.read_csv(path.join(
                train_path, f"{diagnosis}_baseline.tsv"),
                                                sep="\t")
            supplementary_train_df = pd.concat(
                [supplementary_train_df, sup_baseline_train_df])
            sub_df = (supplementary_train_df.reset_index().groupby(
                "participant_id")["session_id"].nunique())
            logger.debug(
                f"supplementary_train_df {len(sub_df)} subjects, {len(supplementary_diagnoses)} scans"
            )

        supplementary_train_df.reset_index(drop=True, inplace=True)

        # MCI selection
        MCI_df.reset_index(inplace=True)
        baseline_df = extract_baseline(MCI_df)

        train_df, test_df = create_split(
            "MCI",
            baseline_df,
            categorical_split_variable,
            n_test=n_test,
            p_age_threshold=p_age_threshold,
            p_sex_threshold=p_sex_threshold,
            ignore_demographics=ignore_demographics,
            supplementary_train_df=supplementary_train_df,
        )

        # Write selection of MCI
        train_df.to_csv(path.join(train_path, "MCI_baseline.tsv"),
                        sep="\t",
                        index=False)
        test_df.to_csv(path.join(test_path, "MCI_baseline.tsv"),
                       sep="\t",
                       index=False)

        long_train_df = retrieve_longitudinal(train_df, diagnosis_df)
        long_train_df.to_csv(path.join(train_path, "MCI.tsv"),
                             sep="\t",
                             index=False)
        long_test_df = retrieve_longitudinal(test_df, diagnosis_df)
        long_test_df.to_csv(path.join(test_path, "MCI.tsv"),
                            sep="\t",
                            index=False)
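
A hedged sketch of a single train/test split with this function; the import path is an assumption:

# Hypothetical usage sketch; the module path is assumed for illustration.
from clinicadl.tools.tsv.data_split import split_diagnoses

# Holds out 100 baseline subjects per label in a "test" folder while checking
# that the age and sex distributions stay comparable to the train set.
split_diagnoses(
    formatted_data_path="labels_lists",
    n_test=100,
    subset_name="test",
    MCI_sub_categories=True,
    p_age_threshold=0.80,
    p_sex_threshold=0.80,
)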
Example #5
File: generate.py Project: 14thibea/AD-DL
def generate_random_dataset(
    caps_directory: str,
    output_dir: str,
    n_subjects: int,
    tsv_path: Optional[str] = None,
    mean: float = 0,
    sigma: float = 0.5,
    preprocessing: str = "t1-linear",
    multi_cohort: bool = False,
    uncropped_image: bool = False,
    acq_label: Optional[str] = None,
    suvr_reference_region: Optional[str] = None,
):
    """
    Generates a random dataset.

    Creates a random dataset for an intractable classification task from the first
    subject of the tsv file (subjects/sessions other than the first one are
    ignored). The degree of noise can be parameterized.

    Args:
        caps_directory: Path to the (input) CAPS directory.
        output_dir: folder containing the synthetic dataset in (output)
            CAPS format.
        n_subjects: number of subjects in each class of the
            synthetic dataset
        tsv_path: path to tsv file of list of subjects/sessions.
        mean: mean of the gaussian noise
        sigma: standard deviation of the gaussian noise
        preprocessing: preprocessing performed. Must be in ['t1-linear', 't1-extensive'].
        multi_cohort: If True caps_directory is the path to a TSV file linking cohort names and paths.
        uncropped_image: If True the uncropped image of `t1-linear` or `pet-linear` will be used.
        acq_label: name of the tracer when using `pet-linear` preprocessing.
        suvr_reference_region: name of the reference region when using `pet-linear` preprocessing.

    Returns:
        A folder written at the output_dir location (in CAPS format), as well as
        a tsv file describing this output.

    """
    commandline_to_json(
        {
            "output_dir": output_dir,
            "caps_dir": caps_directory,
            "preprocessing": preprocessing,
            "n_subjects": n_subjects,
            "mean": mean,
            "sigma": sigma,
        }
    )
    # Transform caps_directory in dict
    caps_dict = CapsDataset.create_caps_dict(caps_directory, multi_cohort=multi_cohort)

    # Read DataFrame
    data_df = load_and_check_tsv(tsv_path, caps_dict, output_dir)

    # Create subjects dir
    makedirs(join(output_dir, "subjects"), exist_ok=True)

    # Retrieve image of first subject
    participant_id = data_df.loc[0, "participant_id"]
    session_id = data_df.loc[0, "session_id"]
    cohort = data_df.loc[0, "cohort"]

    # Find appropriate preprocessing file type
    file_type = find_file_type(
        preprocessing, uncropped_image, acq_label, suvr_reference_region
    )

    image_paths = clinica_file_reader(
        [participant_id], [session_id], caps_dict[cohort], file_type
    )
    image_nii = nib.load(image_paths[0])
    image = image_nii.get_data()

    # Create output tsv file
    participant_id_list = [f"sub-RAND{i}" for i in range(2 * n_subjects)]
    session_id_list = ["ses-M00"] * 2 * n_subjects
    diagnosis_list = ["AD"] * n_subjects + ["CN"] * n_subjects
    data = np.array([participant_id_list, session_id_list, diagnosis_list])
    data = data.T
    output_df = pd.DataFrame(
        data, columns=["participant_id", "session_id", "diagnosis"]
    )
    output_df["age_bl"] = 60
    output_df["sex"] = "F"
    output_df.to_csv(join(output_dir, "data.tsv"), sep="\t", index=False)

    input_filename = basename(image_paths[0])
    filename_pattern = "_".join(input_filename.split("_")[2::])
    for i in range(2 * n_subjects):
        gauss = np.random.normal(mean, sigma, image.shape)
        participant_id = f"sub-RAND{i}"
        noisy_image = image + gauss
        noisy_image_nii = nib.Nifti1Image(
            noisy_image, header=image_nii.header, affine=image_nii.affine
        )
        noisy_image_nii_path = join(
            output_dir, "subjects", participant_id, "ses-M00", "t1_linear"
        )
        noisy_image_nii_filename = f"{participant_id}_ses-M00_{filename_pattern}"
        makedirs(noisy_image_nii_path, exist_ok=True)
        nib.save(noisy_image_nii, join(noisy_image_nii_path, noisy_image_nii_filename))

    write_missing_mods(output_dir, output_df)
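
A minimal usage sketch for this version; the import path and argument values are assumptions:

# Hypothetical usage sketch; the module path is assumed for illustration.
from clinicadl.generate.generate import generate_random_dataset

# Builds a synthetic CAPS with 2 * n_subjects noisy copies of the first image
# of the CAPS directory, labelled half "AD" and half "CN".
generate_random_dataset(
    caps_directory="caps",
    output_dir="random_caps",
    n_subjects=10,
    mean=0.0,
    sigma=0.5,
    preprocessing="t1-linear",
)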
Example #6
File: generate.py Project: 14thibea/AD-DL
def generate_shepplogan_dataset(
    output_dir: str,
    img_size: int,
    labels_distribution: Dict[str, Tuple[float, float, float]],
    extract_json: str = None,
    samples: int = 100,
    smoothing: bool = True,
):
    """
    Creates a CAPS data set of synthetic data based on the Shepp-Logan phantom.
    Source NIfTI files are not extracted; the slices are directly saved as tensors.

    Args:
        output_dir: path to the CAPS created.
        img_size: size of the square image.
        labels_distribution: gives the proportions of the three subtypes (ordered in a tuple) for each label.
        extract_json: name of the JSON file in which generation details are stored.
        samples: number of samples generated per class.
        smoothing: if True, an additional random smoothing is performed on top of all operations on each image.
    """

    check_and_clean(join(output_dir, "subjects"))
    commandline_to_json(
        {
            "output_dir": output_dir,
            "img_size": img_size,
            "labels_distribution": labels_distribution,
            "samples": samples,
            "smoothing": smoothing,
        }
    )
    columns = ["participant_id", "session_id", "diagnosis", "subtype"]
    data_df = pd.DataFrame(columns=columns)

    for i, label in enumerate(labels_distribution.keys()):
        for j in range(samples):
            participant_id = "sub-CLNC%i%04d" % (i, j)
            session_id = "ses-M00"
            subtype = np.random.choice(
                np.arange(len(labels_distribution[label])), p=labels_distribution[label]
            )
            row_df = pd.DataFrame(
                [[participant_id, session_id, label, subtype]], columns=columns
            )
            data_df = data_df.append(row_df)

            # Image generation
            slice_path = join(
                output_dir,
                "subjects",
                participant_id,
                session_id,
                "deeplearning_prepare_data",
                "slice_based",
                "custom",
                f"{participant_id}_{session_id}_space-SheppLogan_axis-axi_channel-single_slice-0_phantom.pt",
            )
            slice_dir = dirname(slice_path)
            makedirs(slice_dir, exist_ok=True)

            slice_np = generate_shepplogan_phantom(
                img_size, label=subtype, smoothing=smoothing
            )
            slice_tensor = torch.from_numpy(slice_np).float().unsqueeze(0)
            torch.save(slice_tensor, slice_path)

            image_path = join(
                output_dir,
                "subjects",
                participant_id,
                session_id,
                "shepplogan",
                f"{participant_id}_{session_id}_space-SheppLogan_phantom.nii.gz",
            )
            image_dir = dirname(image_path)
            makedirs(image_dir, exist_ok=True)
            with open(image_path, "w") as f:
                f.write("0")

    # Save data
    data_df.to_csv(join(output_dir, "data.tsv"), sep="\t", index=False)

    # Save preprocessing JSON file
    preprocessing_dict = {
        "preprocessing": "custom",
        "mode": "slice",
        "use_uncropped_image": False,
        "prepare_dl": True,
        "extract_json": compute_extract_json(extract_json),
        "slice_direction": 2,
        "slice_mode": "single",
        "discarded_slices": 0,
        "num_slices": 1,
        "file_type": {
            "pattern": f"*_space-SheppLogan_phantom.nii.gz",
            "description": "Custom suffix",
            "needed_pipeline": "shepplogan",
        },
    }
    write_preprocessing(preprocessing_dict, output_dir)
    write_missing_mods(output_dir, data_df)
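
A usage sketch for this newer Shepp-Logan generator; the import path and the JSON name are assumptions:

# Hypothetical usage sketch; the module path is assumed for illustration.
from clinicadl.generate.generate import generate_shepplogan_dataset

# Unlike the older version above, the subtype of each sample is drawn at random
# from the per-label probabilities, so each tuple should sum to 1.
generate_shepplogan_dataset(
    output_dir="shepplogan_caps",
    img_size=128,
    labels_distribution={"AD": (0.05, 0.85, 0.10), "CN": (1.0, 0.0, 0.0)},
    extract_json="extract_shepplogan",
    samples=100,
    smoothing=True,
)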
Example #7
File: generate.py Project: 14thibea/AD-DL
def generate_trivial_dataset(
    caps_directory: str,
    output_dir: str,
    n_subjects: int,
    tsv_path: Optional[str] = None,
    preprocessing: str = "t1-linear",
    mask_path: Optional[str] = None,
    atrophy_percent: float = 60,
    multi_cohort: bool = False,
    uncropped_image: bool = False,
    acq_label: str = "fdg",
    suvr_reference_region: str = "pons",
):
    """
    Generates a fully separable dataset.

    Generates a dataset, based on the images of the CAPS directory, where half
    of each image is processed using a mask to occlude a specific region.
    This procedure creates a fully separable dataset (images with the right
    half processed and images with the left half processed).

    Args:
        caps_directory: path to the CAPS directory.
        output_dir: folder containing the synthetic dataset in CAPS format.
        n_subjects: number of subjects in each class of the synthetic dataset.
        tsv_path: path to tsv file of list of subjects/sessions.
        preprocessing: preprocessing performed. Must be in ['linear', 'extensive'].
        mask_path: path to the extracted masks to generate the two labels.
        atrophy_percent: percentage of atrophy applied.
        multi_cohort: If True caps_directory is the path to a TSV file linking cohort names and paths.
        uncropped_image: If True the uncropped image of `t1-linear` or `pet-linear` will be used.
        acq_label: name of the tracer when using `pet-linear` preprocessing.
        suvr_reference_region: name of the reference region when using `pet-linear` preprocessing.

    Returns:
        Folder structure where images are stored in CAPS format.

    Raises:
        ValueError: if `n_subjects` is higher than the length of the TSV file at `tsv_path`.
    """
    from pathlib import Path

    commandline_to_json(
        {
            "output_dir": output_dir,
            "caps_dir": caps_directory,
            "preprocessing": preprocessing,
            "n_subjects": n_subjects,
            "atrophy_percent": atrophy_percent,
        }
    )

    # Transform caps_directory in dict
    caps_dict = CapsDataset.create_caps_dict(caps_directory, multi_cohort=multi_cohort)

    # Read DataFrame
    data_df = load_and_check_tsv(tsv_path, caps_dict, output_dir)
    data_df = extract_baseline(data_df)

    home = str(Path.home())
    cache_clinicadl = join(home, ".cache", "clinicadl", "ressources", "masks")
    url_aramis = "https://aramislab.paris.inria.fr/files/data/masks/"
    FILE1 = RemoteFileStructure(
        filename="AAL2.tar.gz",
        url=url_aramis,
        checksum="89427970921674792481bffd2de095c8fbf49509d615e7e09e4bc6f0e0564471",
    )
    makedirs(cache_clinicadl, exist_ok=True)

    if n_subjects > len(data_df):
        raise ValueError(
            f"The number of subjects {n_subjects} cannot be higher "
            f"than the number of subjects in the baseline dataset of size {len(data_df)}"
        )

    if mask_path is None:
        if not exists(join(cache_clinicadl, "AAL2")):
            try:
                print("Try to download AAL2 masks")
                mask_path_tar = fetch_file(FILE1, cache_clinicadl)
                tar_file = tarfile.open(mask_path_tar)
                print("File: " + mask_path_tar)
                try:
                    tar_file.extractall(cache_clinicadl)
                    tar_file.close()
                    mask_path = join(cache_clinicadl, "AAL2")
                except RuntimeError:
                    print("Unable to extract downloaded files")
            except IOError as err:
                print("Unable to download required templates:", err)
                raise ValueError(
                    """Unable to download masks, please download them
                                  manually at https://aramislab.paris.inria.fr/files/data/masks/
                                  and provide a valid path."""
                )
        else:
            mask_path = join(cache_clinicadl, "AAL2")

    # Create subjects dir
    makedirs(join(output_dir, "subjects"), exist_ok=True)

    # Output tsv file
    columns = ["participant_id", "session_id", "diagnosis", "age_bl", "sex"]
    output_df = pd.DataFrame(columns=columns)
    diagnosis_list = ["AD", "CN"]

    # Find appropriate preprocessing file type
    file_type = find_file_type(
        preprocessing, uncropped_image, acq_label, suvr_reference_region
    )

    for i in range(2 * n_subjects):
        data_idx = i // 2
        label = i % 2

        participant_id = data_df.loc[data_idx, "participant_id"]
        session_id = data_df.loc[data_idx, "session_id"]
        cohort = data_df.loc[data_idx, "cohort"]
        image_paths = clinica_file_reader(
            [participant_id], [session_id], caps_dict[cohort], file_type
        )
        image_nii = nib.load(image_paths[0])
        image = image_nii.get_data()

        input_filename = basename(image_paths[0])
        filename_pattern = "_".join(input_filename.split("_")[2::])

        trivial_image_nii_dir = join(
            output_dir, "subjects", f"sub-TRIV{i}", session_id, preprocessing
        )
        trivial_image_nii_filename = f"sub-TRIV{i}_{session_id}_{filename_pattern}"

        makedirs(trivial_image_nii_dir, exist_ok=True)

        atlas_to_mask = nib.load(join(mask_path, f"mask-{label + 1}.nii")).get_data()

        # Create atrophied image
        trivial_image = im_loss_roi_gaussian_distribution(
            image, atlas_to_mask, atrophy_percent
        )
        trivial_image_nii = nib.Nifti1Image(trivial_image, affine=image_nii.affine)
        trivial_image_nii.to_filename(
            join(trivial_image_nii_dir, trivial_image_nii_filename)
        )
        print(join(trivial_image_nii_dir, trivial_image_nii_filename))

        # Append row to output tsv
        row = [f"sub-TRIV{i}", session_id, diagnosis_list[label], 60, "F"]
        row_df = pd.DataFrame([row], columns=columns)
        output_df = output_df.append(row_df)

    output_df.to_csv(join(output_dir, "data.tsv"), sep="\t", index=False)

    write_missing_mods(output_dir, output_df)
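
A hedged sketch of how this trivial-dataset generator might be called; the import path is an assumption:

# Hypothetical usage sketch; the module path is assumed for illustration.
from clinicadl.generate.generate import generate_trivial_dataset

# Occludes the left or right half of each image with the AAL2-based masks
# (downloaded automatically when mask_path is None) to create two classes.
generate_trivial_dataset(
    caps_directory="caps",
    output_dir="trivial_caps",
    n_subjects=10,
    preprocessing="t1-linear",
    mask_path=None,
    atrophy_percent=60,
)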
Example #8
File: generate.py Project: mdiazmel/AD-DL
def generate_random_dataset(
    caps_directory,
    output_dir,
    n_subjects,
    tsv_path=None,
    mean=0,
    sigma=0.5,
    preprocessing="t1-linear",
    multi_cohort=False,
):
    """
    Generates a random dataset.

    Creates a random dataset for an intractable classification task from the first
    subject of the tsv file (subjects/sessions other than the first one are
    ignored). The degree of noise can be parameterized.

    Args:
        caps_directory: (str) Path to the (input) CAPS directory.
        output_dir: (str) folder containing the synthetic dataset in (output)
            CAPS format.
        n_subjects: (int) number of subjects in each class of the
            synthetic dataset
        tsv_path: (str) path to tsv file of list of subjects/sessions.
        mean: (float) mean of the gaussian noise
        sigma: (float) standard deviation of the gaussian noise
        preprocessing: (str) preprocessing performed. Must be in ['t1-linear', 't1-extensive'].
        multi_cohort (bool): If True caps_directory is the path to a TSV file linking cohort names and paths.

    Returns:
        A folder written at the output_dir location (in CAPS format), as well as
        a tsv file describing this output.

    """
    commandline_to_json({
        "output_dir": output_dir,
        "caps_dir": caps_directory,
        "preprocessing": preprocessing,
        "n_subjects": n_subjects,
        "mean": mean,
        "sigma": sigma,
    })
    # Transform caps_directory in dict
    caps_dict = CapsDataset.create_caps_dict(caps_directory,
                                             multi_cohort=multi_cohort)

    # Read DataFrame
    data_df = load_and_check_tsv(tsv_path, caps_dict, output_dir)

    # Create subjects dir
    makedirs(join(output_dir, "subjects"), exist_ok=True)

    # Retrieve image of first subject
    participant_id = data_df.loc[0, "participant_id"]
    session_id = data_df.loc[0, "session_id"]
    cohort = data_df.loc[0, "cohort"]

    image_path = find_image_path(caps_dict, participant_id, session_id, cohort,
                                 preprocessing)
    image_nii = nib.load(image_path)
    image = image_nii.get_data()

    # Create output tsv file
    participant_id_list = [f"sub-RAND{i}" for i in range(2 * n_subjects)]
    session_id_list = ["ses-M00"] * 2 * n_subjects
    diagnosis_list = ["AD"] * n_subjects + ["CN"] * n_subjects
    data = np.array([participant_id_list, session_id_list, diagnosis_list])
    data = data.T
    output_df = pd.DataFrame(
        data, columns=["participant_id", "session_id", "diagnosis"])
    output_df["age_bl"] = 60
    output_df["sex"] = "F"
    output_df.to_csv(join(output_dir, "data.tsv"), sep="\t", index=False)

    for i in range(2 * n_subjects):
        gauss = np.random.normal(mean, sigma, image.shape)
        participant_id = f"sub-RAND{i}"
        noisy_image = image + gauss
        noisy_image_nii = nib.Nifti1Image(noisy_image,
                                          header=image_nii.header,
                                          affine=image_nii.affine)
        noisy_image_nii_path = join(output_dir, "subjects", participant_id,
                                    "ses-M00", "t1_linear")
        noisy_image_nii_filename = (participant_id + "_ses-M00" +
                                    FILENAME_TYPE["cropped"] + ".nii.gz")
        makedirs(noisy_image_nii_path, exist_ok=True)
        nib.save(noisy_image_nii,
                 join(noisy_image_nii_path, noisy_image_nii_filename))

    missing_path = join(output_dir, "missing_mods")
    makedirs(missing_path, exist_ok=True)

    sessions = output_df.session_id.unique()
    for session in sessions:
        session_df = output_df[output_df.session_id == session]
        out_df = copy(session_df[["participant_id"]])
        out_df["synthetic"] = [1] * len(out_df)
        out_df.to_csv(join(missing_path, f"missing_mods_{session}.tsv"),
                      sep="\t",
                      index=False)
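
For this older version, a multi-cohort sketch; the import path and file names are assumptions:

# Hypothetical usage sketch; the module path is assumed for illustration.
from clinicadl.tools.data.generate import generate_random_dataset

# With multi_cohort=True, caps_directory is a TSV linking cohort names to CAPS
# paths, and tsv_path restricts the subjects/sessions that are considered.
generate_random_dataset(
    caps_directory="cohorts.tsv",
    output_dir="random_caps",
    n_subjects=5,
    tsv_path="subjects_sessions.tsv",
    preprocessing="t1-linear",
    multi_cohort=True,
)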
Example #9
File: generate.py Project: mdiazmel/AD-DL
def generate_trivial_dataset(
    caps_directory,
    output_dir,
    n_subjects,
    tsv_path=None,
    preprocessing="t1-linear",
    mask_path=None,
    atrophy_percent=60,
    multi_cohort=False,
):
    """
    Generates a fully separable dataset.

    Generates a dataset, based on the images of the CAPS directory, where half
    of each image is processed using a mask to occlude a specific region.
    This procedure creates a fully separable dataset (images with the right
    half processed and images with the left half processed).

    Args:
        caps_directory: (str) path to the CAPS directory.
        output_dir: (str) folder containing the synthetic dataset in CAPS format.
        n_subjects: (int) number of subjects in each class of the synthetic
            dataset.
        tsv_path: (str) path to tsv file of list of subjects/sessions.
        preprocessing: (str) preprocessing performed. Must be in ['linear', 'extensive'].
        mask_path: (str) path to the extracted masks to generate the two labels.
        atrophy_percent: (float) percentage of atrophy applied.
        multi_cohort (bool): If True caps_directory is the path to a TSV file linking cohort names and paths.

    Returns:
        Folder structure where images are stored in CAPS format.

    Raises:
        ValueError: if `n_subjects` is higher than the length of the TSV file at `tsv_path`.
    """
    from pathlib import Path

    commandline_to_json({
        "output_dir": output_dir,
        "caps_dir": caps_directory,
        "preprocessing": preprocessing,
        "n_subjects": n_subjects,
        "atrophy_percent": atrophy_percent,
    })

    # Transform caps_directory in dict
    caps_dict = CapsDataset.create_caps_dict(caps_directory,
                                             multi_cohort=multi_cohort)

    # Read DataFrame
    data_df = load_and_check_tsv(tsv_path, caps_dict, output_dir)
    data_df = extract_baseline(data_df)

    home = str(Path.home())
    cache_clinicadl = join(home, ".cache", "clinicadl", "ressources", "masks")
    url_aramis = "https://aramislab.paris.inria.fr/files/data/masks/"
    FILE1 = RemoteFileStructure(
        filename="AAL2.tar.gz",
        url=url_aramis,
        checksum=
        "89427970921674792481bffd2de095c8fbf49509d615e7e09e4bc6f0e0564471",
    )
    makedirs(cache_clinicadl, exist_ok=True)

    if n_subjects > len(data_df):
        raise ValueError(
            f"The number of subjects {n_subjects} cannot be higher "
            f"than the number of subjects in the baseline dataset of size {len(data_df)}"
        )

    if mask_path is None:
        if not exists(join(cache_clinicadl, "AAL2")):
            try:
                print("Try to download AAL2 masks")
                mask_path_tar = fetch_file(FILE1, cache_clinicadl)
                tar_file = tarfile.open(mask_path_tar)
                print("File: " + mask_path_tar)
                try:
                    tar_file.extractall(cache_clinicadl)
                    tar_file.close()
                    mask_path = join(cache_clinicadl, "AAL2")
                except RuntimeError:
                    print("Unable to extract downloaded files")
            except IOError as err:
                print("Unable to download required templates:", err)
                raise ValueError(
                    """Unable to download masks, please download them
                                  manually at https://aramislab.paris.inria.fr/files/data/masks/
                                  and provide a valid path.""")
        else:
            mask_path = join(cache_clinicadl, "AAL2")

    # Create subjects dir
    makedirs(join(output_dir, "subjects"), exist_ok=True)

    # Output tsv file
    columns = ["participant_id", "session_id", "diagnosis", "age_bl", "sex"]
    output_df = pd.DataFrame(columns=columns)
    diagnosis_list = ["AD", "CN"]

    for i in range(2 * n_subjects):
        data_idx = i // 2
        label = i % 2

        participant_id = data_df.loc[data_idx, "participant_id"]
        session_id = data_df.loc[data_idx, "session_id"]
        cohort = data_df.loc[data_idx, "cohort"]
        filename = f"sub-TRIV{i}_ses-M00" + FILENAME_TYPE["cropped"] + ".nii.gz"
        path_image = join(output_dir, "subjects", f"sub-TRIV{i}", "ses-M00",
                          "t1_linear")

        makedirs(path_image, exist_ok=True)

        image_path = find_image_path(caps_dict, participant_id, session_id,
                                     cohort, preprocessing)
        image_nii = nib.load(image_path)
        image = image_nii.get_data()

        atlas_to_mask = nib.load(join(mask_path,
                                      f"mask-{label + 1}.nii")).get_data()

        # Create atrophied image
        trivial_image = im_loss_roi_gaussian_distribution(
            image, atlas_to_mask, atrophy_percent)
        trivial_image_nii = nib.Nifti1Image(trivial_image,
                                            affine=image_nii.affine)
        trivial_image_nii.to_filename(join(path_image, filename))

        # Append row to output tsv
        row = [f"sub-TRIV{i}", "ses-M00", diagnosis_list[label], 60, "F"]
        row_df = pd.DataFrame([row], columns=columns)
        output_df = output_df.append(row_df)

    output_df.to_csv(join(output_dir, "data.tsv"), sep="\t", index=False)

    missing_path = join(output_dir, "missing_mods")
    makedirs(missing_path, exist_ok=True)

    sessions = output_df.session_id.unique()
    for session in sessions:
        session_df = output_df[output_df.session_id == session]
        out_df = copy(session_df[["participant_id"]])
        out_df["synthetic"] = [1] * len(out_df)
        out_df.to_csv(join(missing_path, f"missing_mods_{session}.tsv"),
                      sep="\t",
                      index=False)
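
A sketch with a user-provided mask folder, which skips the AAL2 download; the import path and folder layout are assumptions:

# Hypothetical usage sketch; the module path and mask folder are assumed.
from clinicadl.tools.data.generate import generate_trivial_dataset

# mask_path is expected to contain mask-1.nii and mask-2.nii, one per class.
generate_trivial_dataset(
    caps_directory="caps",
    output_dir="trivial_caps",
    n_subjects=5,
    preprocessing="t1-linear",
    mask_path="masks/AAL2",
    atrophy_percent=60,
)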