Code example #1
def write_splits(
    diagnosis: str,
    diagnosis_df: pd.DataFrame,
    split_label: str,
    n_splits: int,
    train_path: str,
    test_path: str,
    supplementary_diagnoses: Optional[List[str]] = None,
) -> None:
    """
    Split data at the subject-level in training and test to have equivalent distributions in split_label.
    Writes test and train Dataframes.

    Args:
        diagnosis: diagnosis on which the split is done
        diagnosis_df: DataFrame with columns including ['participant_id', 'session_id', 'diagnosis']
        split_label: label on which the split is done (categorical variables)
        n_splits: Number of splits in the k-fold cross-validation.
        train_path: Path to the training data.
        test_path: Path to the test data.
        supplementary_diagnoses: List of supplementary diagnoses to add to the data.
    """

    baseline_df = extract_baseline(diagnosis_df)
    if split_label is None:
        diagnoses_list = list(baseline_df.diagnosis)
        unique = list(set(diagnoses_list))
        y = np.array([unique.index(x) for x in diagnoses_list])
    else:
        stratification_list = list(baseline_df[split_label])
        unique = list(set(stratification_list))
        y = np.array([unique.index(x) for x in stratification_list])

    splits = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2)

    print(f"Label {diagnosis}")
    for i, indices in enumerate(splits.split(np.zeros(len(y)), y)):
        print(f"Split {i}")
        train_index, test_index = indices

        test_df = baseline_df.iloc[test_index]
        train_df = baseline_df.iloc[train_index]

        if supplementary_diagnoses is not None:
            for supplementary_diagnosis in supplementary_diagnoses:
                sup_train_df = pd.read_csv(
                    path.join(
                        train_path,
                        f"split-{i}",
                        f"{supplementary_diagnosis}_baseline.tsv",
                    ),
                    sep="\t",
                )
                train_df = pd.concat([train_df, sup_train_df])
                sup_test_df = pd.read_csv(
                    path.join(
                        test_path,
                        f"split-{i}",
                        f"{supplementary_diagnosis}_baseline.tsv",
                    ),
                    sep="\t",
                )
                test_df = pd.concat([test_df, sup_test_df])

            train_df.reset_index(inplace=True, drop=True)
            test_df.reset_index(inplace=True, drop=True)

        train_df.to_csv(
            path.join(train_path, f"split-{i}", f"{diagnosis}_baseline.tsv"),
            sep="\t",
            index=False,
        )
        test_df.to_csv(
            path.join(test_path, f"split-{i}", f"{diagnosis}_baseline.tsv"),
            sep="\t",
            index=False,
        )

        long_train_df = retrieve_longitudinal(train_df, diagnosis_df)
        long_train_df.to_csv(
            path.join(train_path, f"split-{i}", f"{diagnosis}.tsv"),
            sep="\t",
            index=False,
        )
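
A minimal usage sketch for write_splits, assuming the project helpers it calls (extract_baseline, retrieve_longitudinal) are importable and that the split-<i> subfolders under both output paths already exist, since to_csv does not create directories. The label file path below is illustrative:

# Hypothetical usage: 5-fold subject-level split for the "AD" label,
# stratified on the diagnosis column itself (split_label=None).
import pandas as pd

diagnosis_df = pd.read_csv("labels/AD.tsv", sep="\t")  # output of getlabels
write_splits(
    diagnosis="AD",
    diagnosis_df=diagnosis_df,
    split_label=None,
    n_splits=5,
    train_path="labels/train",
    test_path="labels/validation",
)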
Code example #2
def create_split(
    diagnosis,
    diagnosis_df,
    split_label,
    n_test,
    p_age_threshold=0.80,
    p_sex_threshold=0.80,
    supplementary_train_df=None,
    ignore_demographics=False,
):
    """
    Split data at the subject-level in training and test set with equivalent age, sex and split_label distributions

    Args:
        diagnosis: (str) diagnosis on which the split is done
        diagnosis_df: DataFrame with columns including ['participant_id', 'session_id', 'diagnosis']
        split_label: (str) label on which the split is done (categorical variables)
        n_test: (float)
            If >= 1, number of subjects to put in the test set.
            If < 1, proportion of subjects to put in the test set.
        p_age_threshold: (float) threshold for the t-test on age.
        p_sex_threshold: (float) threshold for the chi2 test on sex.
        supplementary_train_df: (DataFrame) Add data that must be included in the train set.
        ignore_demographics: (bool) If True, the diagnoses are split without taking
            the demographics distributions (age, sex) into account.

    Returns:
        train_df (DataFrame) subjects in the train set
        test_df (DataFrame) subjects in the test set
    """

    if supplementary_train_df is not None:
        sup_train_sex = [
            sex_dict[x] for x in supplementary_train_df.sex.values
        ]
        sup_train_age = [float(x) for x in supplementary_train_df.age.values]
    else:
        sup_train_sex = []
        sup_train_age = []

    baseline_df = extract_baseline(diagnosis_df)

    if n_test >= 1:
        n_test = int(n_test)
    else:
        n_test = int(n_test * len(baseline_df))

    if split_label not in baseline_df.columns.values:
        raise ClinicaDLArgumentError(
            f"The column {split_label} is missing. "
            f"Please add it using the --variables_of_interest flag in getlabels."
        )

    if not ignore_demographics:
        try:
            sex_label = find_label(baseline_df.columns.values, "sex")
            age_label = find_label(baseline_df.columns.values, "age")
        except ClinicaDLArgumentError:
            raise ClinicaDLArgumentError(
                "This dataset do not have age or sex values. "
                "Please add the flag --ignore_demographics to split "
                "without trying to balance age or sex distributions.")

        sex = list(baseline_df[sex_label].values)
        age = list(baseline_df[age_label].values)
        category = list(baseline_df[split_label].values)
        category = category_conversion(category)
        category = remove_unicity(category)

        flag_selection = True
        n_try = 0

        while flag_selection:

            splits = StratifiedShuffleSplit(n_splits=1, test_size=n_test)
            for train_index, test_index in splits.split(category, category):

                # Compare the demographic variables (age & sex) between test and train
                if len(set(age)) != 1:
                    age_test = [float(age[idx]) for idx in test_index]
                    age_train = [float(age[idx])
                                 for idx in train_index] + sup_train_age
                    _, p_age = ttest_ind(age_test,
                                         age_train,
                                         nan_policy="omit")
                else:
                    p_age = 1

                if len(set(sex)) != 1:
                    sex_test = [sex_dict[sex[idx]] for idx in test_index]
                    sex_train = [sex_dict[sex[idx]]
                                 for idx in train_index] + sup_train_sex
                    _, p_sex = chi2(sex_test, sex_train)
                else:
                    p_sex = 1

                logger.debug(f"p_age={p_age:.2f}, p_sex={p_sex:.4f}")

                if p_sex >= p_sex_threshold and p_age >= p_age_threshold:
                    flag_selection = False
                    test_df = baseline_df.loc[test_index]
                    train_df = baseline_df.loc[train_index]
                    if supplementary_train_df is not None:
                        train_df = pd.concat(
                            [train_df, supplementary_train_df])
                        train_df.reset_index(drop=True, inplace=True)

                n_try += 1
        logger.info(
            f"Split for diagnosis {diagnosis} was found after {n_try} trials.")

    else:
        idx = np.arange(len(baseline_df))
        idx_test = np.random.choice(idx, size=n_test, replace=False)
        idx_test.sort()
        idx_train = complementary_list(idx, idx_test)
        test_df = baseline_df.loc[idx_test]
        train_df = baseline_df.loc[idx_train]

    return train_df, test_df
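
A minimal usage sketch, assuming the module-level helpers used above (sex_dict, find_label, category_conversion, remove_unicity, complementary_list) are defined as in the surrounding project; the label file path is illustrative:

# Hypothetical usage: hold out 100 AD subjects while keeping the age
# (t-test) and sex (chi2) distributions comparable between the two sets.
# The while loop resamples until both p-values exceed their thresholds,
# so stricter thresholds may require more trials.
import pandas as pd

diagnosis_df = pd.read_csv("labels/AD.tsv", sep="\t")
train_df, test_df = create_split(
    "AD",
    diagnosis_df,
    split_label="diagnosis",
    n_test=100,  # >= 1, so interpreted as a subject count
)
print(f"{len(train_df)} train / {len(test_df)} test subjects")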
Code example #3
File: generate.py Project: 14thibea/AD-DL
def generate_trivial_dataset(
    caps_directory: str,
    output_dir: str,
    n_subjects: int,
    tsv_path: Optional[str] = None,
    preprocessing: str = "t1-linear",
    mask_path: Optional[str] = None,
    atrophy_percent: float = 60,
    multi_cohort: bool = False,
    uncropped_image: bool = False,
    acq_label: str = "fdg",
    suvr_reference_region: str = "pons",
):
    """
    Generates a fully separable dataset.

    Generates a dataset, based on the images of the CAPS directory, in which
    each image is processed with a mask that occludes a specific region. Half
    of the images are occluded on the right side and the other half on the
    left side, which makes the resulting dataset fully separable.

    Args:
        caps_directory: path to the CAPS directory.
        output_dir: folder containing the synthetic dataset in CAPS format.
        n_subjects: number of subjects in each class of the synthetic dataset.
        tsv_path: path to tsv file of list of subjects/sessions.
        preprocessing: preprocessing performed. Must be in ['linear', 'extensive'].
        mask_path: path to the extracted masks to generate the two labels.
        atrophy_percent: percentage of atrophy applied.
        multi_cohort: If True caps_directory is the path to a TSV file linking cohort names and paths.
        uncropped_image: If True the uncropped image of `t1-linear` or `pet-linear` will be used.
        acq_label: name of the tracer when using `pet-linear` preprocessing.
        suvr_reference_region: name of the reference region when using `pet-linear` preprocessing.

    Returns:
        Folder structure where images are stored in CAPS format.

    Raises:
        ValueError: if `n_subjects` is higher than the length of the TSV file at `tsv_path`.
    """
    from pathlib import Path

    commandline_to_json(
        {
            "output_dir": output_dir,
            "caps_dir": caps_directory,
            "preprocessing": preprocessing,
            "n_subjects": n_subjects,
            "atrophy_percent": atrophy_percent,
        }
    )

    # Transform caps_directory into a dict
    caps_dict = CapsDataset.create_caps_dict(caps_directory, multi_cohort=multi_cohort)

    # Read DataFrame
    data_df = load_and_check_tsv(tsv_path, caps_dict, output_dir)
    data_df = extract_baseline(data_df)

    home = str(Path.home())
    cache_clinicadl = join(home, ".cache", "clinicadl", "ressources", "masks")
    url_aramis = "https://aramislab.paris.inria.fr/files/data/masks/"
    FILE1 = RemoteFileStructure(
        filename="AAL2.tar.gz",
        url=url_aramis,
        checksum="89427970921674792481bffd2de095c8fbf49509d615e7e09e4bc6f0e0564471",
    )
    makedirs(cache_clinicadl, exist_ok=True)

    if n_subjects > len(data_df):
        raise ValueError(
            f"The number of subjects {n_subjects} cannot be higher "
            f"than the number of subjects in the baseline dataset of size {len(data_df)}"
        )

    if mask_path is None:
        if not exists(join(cache_clinicadl, "AAL2")):
            try:
                print("Try to download AAL2 masks")
                mask_path_tar = fetch_file(FILE1, cache_clinicadl)
                tar_file = tarfile.open(mask_path_tar)
                print("File: " + mask_path_tar)
                try:
                    tar_file.extractall(cache_clinicadl)
                    tar_file.close()
                    mask_path = join(cache_clinicadl, "AAL2")
                except RuntimeError:
                    print("Unable to extract downloaded files")
            except IOError as err:
                print("Unable to download required templates:", err)
                raise ValueError(
                    "Unable to download masks, please download them "
                    "manually at https://aramislab.paris.inria.fr/files/data/masks/ "
                    "and provide a valid path."
                )
        else:
            mask_path = join(cache_clinicadl, "AAL2")

    # Create subjects dir
    makedirs(join(output_dir, "subjects"), exist_ok=True)

    # Output tsv file
    columns = ["participant_id", "session_id", "diagnosis", "age_bl", "sex"]
    output_df = pd.DataFrame(columns=columns)
    diagnosis_list = ["AD", "CN"]

    # Find appropriate preprocessing file type
    file_type = find_file_type(
        preprocessing, uncropped_image, acq_label, suvr_reference_region
    )

    for i in range(2 * n_subjects):
        data_idx = i // 2
        label = i % 2

        participant_id = data_df.loc[data_idx, "participant_id"]
        session_id = data_df.loc[data_idx, "session_id"]
        cohort = data_df.loc[data_idx, "cohort"]
        image_paths = clinica_file_reader(
            [participant_id], [session_id], caps_dict[cohort], file_type
        )
        image_nii = nib.load(image_paths[0])
        image = image_nii.get_fdata()  # get_data() was removed in recent nibabel

        input_filename = basename(image_paths[0])
        filename_pattern = "_".join(input_filename.split("_")[2::])

        trivial_image_nii_dir = join(
            output_dir, "subjects", f"sub-TRIV{i}", session_id, preprocessing
        )
        trivial_image_nii_filename = f"sub-TRIV{i}_{session_id}_{filename_pattern}"

        makedirs(trivial_image_nii_dir, exist_ok=True)

        atlas_to_mask = nib.load(join(mask_path, f"mask-{label + 1}.nii")).get_fdata()

        # Create atrophied image
        trivial_image = im_loss_roi_gaussian_distribution(
            image, atlas_to_mask, atrophy_percent
        )
        trivial_image_nii = nib.Nifti1Image(trivial_image, affine=image_nii.affine)
        trivial_image_nii.to_filename(
            join(trivial_image_nii_dir, trivial_image_nii_filename)
        )
        print(join(trivial_image_nii_dir, trivial_image_nii_filename))

        # Append row to output tsv
        row = [f"sub-TRIV{i}", session_id, diagnosis_list[label], 60, "F"]
        row_df = pd.DataFrame([row], columns=columns)
        output_df = pd.concat([output_df, row_df])  # DataFrame.append was removed in pandas 2

    output_df.to_csv(join(output_dir, "data.tsv"), sep="\t", index=False)

    write_missing_mods(output_dir, output_df)
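
A minimal usage sketch with illustrative paths; when mask_path is None, the AAL2 masks are fetched into ~/.cache/clinicadl automatically:

# Hypothetical usage: build a synthetic, fully separable dataset of
# 2 * 10 subjects from an existing t1-linear CAPS directory.
generate_trivial_dataset(
    caps_directory="/data/CAPS",
    output_dir="/data/CAPS_trivial",
    n_subjects=10,
    tsv_path="labels/CN.tsv",
    preprocessing="t1-linear",
    atrophy_percent=60,
)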
Code example #4
def split_diagnoses(
    formatted_data_path,
    n_test=100,
    subset_name="test",
    MCI_sub_categories=True,
    p_age_threshold=0.80,
    p_sex_threshold=0.80,
    categorical_split_variable=None,
    ignore_demographics=False,
    verbose=0,
):
    """
    Performs a single split for each label independently at the subject level.
    Both the train and the test folders will contain two lists per diagnosis
    (baseline and longitudinal sessions).

    The differences in age and sex distributions between the two sets must be
    non-significant (according to a t-test for age and a chi-square test for sex).

    Args:
        formatted_data_path (str): Path to the folder containing data extracted by clinicadl tsvtool getlabels.
        n_test (float):
            If >= 1, number of subjects to put in set with name 'subset_name'.
            If < 1, proportion of subjects to put in set with name 'subset_name'.
            If 0, no training set is created and the whole dataset is considered as one set with name 'subset_name'.
        subset_name (str): Name of the subset that is complementary to train.
        MCI_sub_categories (bool): If True, manages MCI sub-categories to avoid data leakage.
        p_age_threshold (float): The threshold used for the T-test on age distributions.
        p_sex_threshold (float): The threshold used for the chi-square test on sex distributions.
        categorical_split_variable (str): name of a categorical variable to perform a stratified split.
        ignore_demographics (bool): If True the diagnoses are split without taking into account the demographics
            distributions (age, sex).
        verbose (int): level of verbosity.

    Returns:
        writes four files per <label>.tsv file present in formatted_data_path:
            - formatted_data_path/train/<label>.tsv
            - formatted_data_path/train/<label>_baseline.tsv
            - formatted_data_path/<subset_name>/<label>.tsv
            - formatted_data_path/<subset_name>/<label>_baseline.tsv
    """
    commandline_to_json(
        {
            "output_dir": formatted_data_path,
            "n_test": n_test,
            "subset_name": subset_name,
            "MCI_sub_categories": MCI_sub_categories,
            "p_age_threshold": p_age_threshold,
            "p_sex_threshold": p_sex_threshold,
            "categorical_split_variable": categorical_split_variable,
            "ignore_demographics": ignore_demographics,
        },
        filename="split.json",
    )

    # Read files
    results_path = formatted_data_path

    train_path = path.join(results_path, "train")
    if path.exists(train_path):
        shutil.rmtree(train_path)
    if n_test > 0:
        os.makedirs(train_path)

    if categorical_split_variable is None:
        categorical_split_variable = "diagnosis"

    test_path = path.join(results_path, subset_name)
    if path.exists(test_path):
        shutil.rmtree(test_path)
    os.makedirs(test_path)

    diagnosis_df_paths = os.listdir(results_path)
    diagnosis_df_paths = [x for x in diagnosis_df_paths if x.endswith(".tsv")]
    diagnosis_df_paths = [
        x for x in diagnosis_df_paths if not x.endswith("_baseline.tsv")
    ]

    MCI_special_treatment = False

    if "MCI.tsv" in diagnosis_df_paths and n_test > 0:
        if MCI_sub_categories:
            diagnosis_df_paths.remove("MCI.tsv")
            MCI_special_treatment = True
        elif "sMCI.tsv" in diagnosis_df_paths or "pMCI.tsv" in diagnosis_df_paths:
            logger.warning(
                "MCI special treatment was deactivated though MCI subgroups were found."
                "Be aware that it may cause data leakage in transfer learning tasks."
            )

    # The baseline sessions must be extracted first, otherwise all sessions would be mixed in the split
    for diagnosis_df_path in diagnosis_df_paths:
        diagnosis_df = pd.read_csv(path.join(results_path, diagnosis_df_path),
                                   sep="\t")
        interest_columns = diagnosis_df.columns.values
        diagnosis = diagnosis_df_path.split(".")[0]
        logger.info(f"Running split for diagnosis {diagnosis}")
        if n_test > 0:
            train_df, test_df = create_split(
                diagnosis,
                diagnosis_df,
                categorical_split_variable,
                n_test=n_test,
                p_age_threshold=p_age_threshold,
                p_sex_threshold=p_sex_threshold,
                ignore_demographics=ignore_demographics,
            )
            # Save baseline splits
            train_df.to_csv(
                path.join(train_path, f"{diagnosis}_baseline.tsv"),
                sep="\t",
                index=False,
            )
            test_df.to_csv(path.join(test_path, f"{diagnosis}_baseline.tsv"),
                           sep="\t",
                           index=False)

            long_train_df = retrieve_longitudinal(train_df, diagnosis_df)
            long_train_df.to_csv(path.join(train_path, f"{diagnosis}.tsv"),
                                 sep="\t",
                                 index=False)
            long_test_df = retrieve_longitudinal(test_df, diagnosis_df)
            long_test_df.to_csv(path.join(test_path, f"{diagnosis}.tsv"),
                                sep="\t",
                                index=False)

        else:
            baseline_df = extract_baseline(diagnosis_df)
            test_df = baseline_df[interest_columns]
            test_df.to_csv(path.join(test_path, f"{diagnosis}_baseline.tsv"),
                           sep="\t",
                           index=False)
            long_test_df = retrieve_longitudinal(test_df, diagnosis_df)
            long_test_df.to_csv(path.join(test_path, f"{diagnosis}.tsv"),
                                sep="\t",
                                index=False)

    if MCI_special_treatment:

        # Extraction of MCI subjects without intersection with the sMCI / pMCI train
        diagnosis_df = pd.read_csv(path.join(results_path, "MCI.tsv"),
                                   sep="\t")
        MCI_df = diagnosis_df.set_index(["participant_id", "session_id"])
        baseline_df = extract_baseline(MCI_df, set_index=False)

        if n_test >= 1:
            n_test = int(n_test)
        else:
            n_test = int(n_test * len(baseline_df))

        MCI_df, supplementary_diagnoses = remove_sub_labels(
            MCI_df, ["sMCI", "pMCI"], diagnosis_df_paths, results_path)
        if len(supplementary_diagnoses) == 0:
            raise ClinicaDLArgumentError(
                "The MCI_sub_categories flag is not needed as there are no intersections with"
                "MCI subcategories.")

        # Construction of supplementary train
        supplementary_train_df = pd.DataFrame()
        for diagnosis in supplementary_diagnoses:
            sup_baseline_train_df = pd.read_csv(path.join(
                train_path, f"{diagnosis}_baseline.tsv"),
                                                sep="\t")
            supplementary_train_df = pd.concat(
                [supplementary_train_df, sup_baseline_train_df])
            sub_df = (supplementary_train_df.reset_index().groupby(
                "participant_id")["session_id"].nunique())
            logger.debug(
                f"supplementary_train_df {len(sub_df)} subjects, {len(supplementary_train_df)} scans"
            )

        supplementary_train_df.reset_index(drop=True, inplace=True)

        # MCI selection
        MCI_df.reset_index(inplace=True)
        baseline_df = extract_baseline(MCI_df)

        train_df, test_df = create_split(
            "MCI",
            baseline_df,
            categorical_split_variable,
            n_test=n_test,
            p_age_threshold=p_age_threshold,
            p_sex_threshold=p_sex_threshold,
            ignore_demographics=ignore_demographics,
            supplementary_train_df=supplementary_train_df,
        )

        # Write selection of MCI
        train_df.to_csv(path.join(train_path, "MCI_baseline.tsv"),
                        sep="\t",
                        index=False)
        test_df.to_csv(path.join(test_path, "MCI_baseline.tsv"),
                       sep="\t",
                       index=False)

        long_train_df = retrieve_longitudinal(train_df, diagnosis_df)
        long_train_df.to_csv(path.join(train_path, "MCI.tsv"),
                             sep="\t",
                             index=False)
        long_test_df = retrieve_longitudinal(test_df, diagnosis_df)
        long_test_df.to_csv(path.join(test_path, "MCI.tsv"),
                            sep="\t",
                            index=False)
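
A minimal usage sketch; the folder layout assumes the output of clinicadl tsvtool getlabels, with one <label>.tsv per diagnosis:

# Hypothetical usage: split every <label>.tsv in "labels" into a train set
# and a 100-subject test set per label, handling MCI subgroups to avoid
# leakage between MCI, sMCI and pMCI in later transfer learning.
split_diagnoses(
    "labels",
    n_test=100,
    subset_name="test",
    MCI_sub_categories=True,
)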
Code example #5
File: generate.py Project: mdiazmel/AD-DL
def generate_trivial_dataset(
    caps_directory,
    output_dir,
    n_subjects,
    tsv_path=None,
    preprocessing="t1-linear",
    mask_path=None,
    atrophy_percent=60,
    multi_cohort=False,
):
    """
    Generates a fully separable dataset.

    Generates a dataset, based on the images of the CAPS directory, in which
    each image is processed with a mask that occludes a specific region. Half
    of the images are occluded on the right side and the other half on the
    left side, which makes the resulting dataset fully separable.

    Args:
        caps_directory: (str) path to the CAPS directory.
        output_dir: (str) folder containing the synthetic dataset in CAPS format.
        n_subjects: (int) number of subjects in each class of the synthetic
            dataset.
        tsv_path: (str) path to tsv file of list of subjects/sessions.
        preprocessing: (str) preprocessing performed. Must be in ['linear', 'extensive'].
        mask_path: (str) path to the extracted masks to generate the two labels.
        atrophy_percent: (float) percentage of atrophy applied.
        multi_cohort (bool): If True caps_directory is the path to a TSV file linking cohort names and paths.

    Returns:
        Folder structure where images are stored in CAPS format.

    Raises:
        ValueError: if `n_subjects` is higher than the length of the TSV file at `tsv_path`.
    """
    from pathlib import Path

    commandline_to_json({
        "output_dir": output_dir,
        "caps_dir": caps_directory,
        "preprocessing": preprocessing,
        "n_subjects": n_subjects,
        "atrophy_percent": atrophy_percent,
    })

    # Transform caps_directory into a dict
    caps_dict = CapsDataset.create_caps_dict(caps_directory,
                                             multi_cohort=multi_cohort)

    # Read DataFrame
    data_df = load_and_check_tsv(tsv_path, caps_dict, output_dir)
    data_df = extract_baseline(data_df)

    home = str(Path.home())
    cache_clinicadl = join(home, ".cache", "clinicadl", "ressources", "masks")
    url_aramis = "https://aramislab.paris.inria.fr/files/data/masks/"
    FILE1 = RemoteFileStructure(
        filename="AAL2.tar.gz",
        url=url_aramis,
        checksum="89427970921674792481bffd2de095c8fbf49509d615e7e09e4bc6f0e0564471",
    )
    makedirs(cache_clinicadl, exist_ok=True)

    if n_subjects > len(data_df):
        raise ValueError(
            f"The number of subjects {n_subjects} cannot be higher "
            f"than the number of subjects in the baseline dataset of size {len(data_df)}"
        )

    if mask_path is None:
        if not exists(join(cache_clinicadl, "AAL2")):
            try:
                print("Try to download AAL2 masks")
                mask_path_tar = fetch_file(FILE1, cache_clinicadl)
                tar_file = tarfile.open(mask_path_tar)
                print("File: " + mask_path_tar)
                try:
                    tar_file.extractall(cache_clinicadl)
                    tar_file.close()
                    mask_path = join(cache_clinicadl, "AAL2")
                except RuntimeError:
                    print("Unable to extract downloaded files")
            except IOError as err:
                print("Unable to download required templates:", err)
                raise ValueError(
                    "Unable to download masks, please download them "
                    "manually at https://aramislab.paris.inria.fr/files/data/masks/ "
                    "and provide a valid path.")
        else:
            mask_path = join(cache_clinicadl, "AAL2")

    # Create subjects dir
    makedirs(join(output_dir, "subjects"), exist_ok=True)

    # Output tsv file
    columns = ["participant_id", "session_id", "diagnosis", "age_bl", "sex"]
    output_df = pd.DataFrame(columns=columns)
    diagnosis_list = ["AD", "CN"]

    for i in range(2 * n_subjects):
        data_idx = i // 2
        label = i % 2

        participant_id = data_df.loc[data_idx, "participant_id"]
        session_id = data_df.loc[data_idx, "session_id"]
        cohort = data_df.loc[data_idx, "cohort"]
        filename = f"sub-TRIV{i}_ses-M00" + FILENAME_TYPE["cropped"] + ".nii.gz"
        path_image = join(output_dir, "subjects", f"sub-TRIV{i}", "ses-M00",
                          "t1_linear")

        makedirs(path_image, exist_ok=True)

        image_path = find_image_path(caps_dict, participant_id, session_id,
                                     cohort, preprocessing)
        image_nii = nib.load(image_path)
        image = image_nii.get_fdata()  # get_data() was removed in recent nibabel

        atlas_to_mask = nib.load(join(mask_path,
                                      f"mask-{label + 1}.nii")).get_fdata()

        # Create atrophied image
        trivial_image = im_loss_roi_gaussian_distribution(
            image, atlas_to_mask, atrophy_percent)
        trivial_image_nii = nib.Nifti1Image(trivial_image,
                                            affine=image_nii.affine)
        trivial_image_nii.to_filename(join(path_image, filename))

        # Append row to output tsv
        row = [f"sub-TRIV{i}", "ses-M00", diagnosis_list[label], 60, "F"]
        row_df = pd.DataFrame([row], columns=columns)
        output_df = pd.concat([output_df, row_df])  # DataFrame.append was removed in pandas 2

    output_df.to_csv(join(output_dir, "data.tsv"), sep="\t", index=False)

    missing_path = join(output_dir, "missing_mods")
    makedirs(missing_path, exist_ok=True)

    sessions = output_df.session_id.unique()
    for session in sessions:
        session_df = output_df[output_df.session_id == session]
        out_df = copy(session_df[["participant_id"]])
        out_df["synthetic"] = [1] * len(out_df)
        out_df.to_csv(join(missing_path, f"missing_mods_{session}.tsv"),
                      sep="\t",
                      index=False)
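
Unlike code example #3, which delegates the missing-modality bookkeeping to write_missing_mods, this older variant inlines it. The final block is equivalent to the following self-contained sketch (the helper name write_missing_mods_sketch is chosen here for illustration):

import pandas as pd
from os import makedirs
from os.path import join


def write_missing_mods_sketch(output_dir: str, output_df: pd.DataFrame) -> None:
    # Write one missing_mods_<session>.tsv per session, flagging every
    # synthetic subject as having its synthetic modality present (value 1).
    missing_path = join(output_dir, "missing_mods")
    makedirs(missing_path, exist_ok=True)
    for session in output_df.session_id.unique():
        session_df = output_df[output_df.session_id == session]
        out_df = session_df[["participant_id"]].copy()
        out_df["synthetic"] = 1
        out_df.to_csv(join(missing_path, f"missing_mods_{session}.tsv"),
                      sep="\t", index=False)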