Ejemplo n.º 1
def create_disease_cohorts(output_path, metadata_df, exclude_other_diseases=False):
    """Write one case/control cohort CSV per disease column.

    For every disease column listed below, rows flagged ``1`` are the cases
    and rows flagged ``0`` are the controls handed to ``buildDataSubset``.
    A disease is skipped when it has no positive rows or fewer than
    ``min_positive`` of them.

    Args:
        output_path: Prefix that is string-concatenated with
            ``<disease>.csv`` to form each output file name.
        metadata_df: Table indexable by disease column name; each disease
            column is expected to hold 1/0 flags.
        exclude_other_diseases: When true, rows flagged ``1`` for any *other*
            listed disease are dropped before counting and cohort building.

    Returns:
        None. Cohorts are written to disk as a side effect.
    """
    # Each disease appears once: repeated entries only rebuilt and rewrote
    # the same cohort file.
    disease_list = [
        "acid_reflux", "add_adhd", "asd", "autoimmune", "cancer",
        "cardiovascular_disease", "depression_bipolar_schizophrenia",
        "fungal_overgrowth", "ibd", "ibs", "liver_disease", "lung_disease",
        "mental_illness", "mental_illness_type_depression", "migraine",
        "sibo", "skin_condition", "thyroid", "cdiff",
        "mental_illness_type_ptsd_posttraumatic_stress_disorder",
        "alzheimers", "epilepsy_or_seizure_disorder", "pku",
    ]
    min_positive = 40  # minimum number of positive samples to build a cohort

    for disease in disease_list:
        if exclude_other_diseases:
            other_diseases = [name for name in disease_list if name != disease]
            # Keep only rows that are not flagged for any other disease.
            unaffected = (metadata_df[other_diseases] != 1).all(axis=1)
            # Copy so the helper never operates on a view of the caller's frame.
            disease_df = metadata_df[unaffected].copy()
        else:
            disease_df = metadata_df

        # Count on the frame the cohort is actually built from, so the
        # threshold reflects the samples that remain after exclusion.
        feature_counts = disease_df[disease].value_counts()
        if 1 not in feature_counts.index:
            # No positive samples at all for this disease.
            continue
        n_feature_positive = feature_counts[1]
        n_feature_negative = feature_counts.get(0, 0)

        if n_feature_positive < min_positive:
            print("       Not enough samples", disease, str(n_feature_positive))
            continue

        # NOTE(review): the source had a special case for
        # "mental_illness_type_ptsd_posttraumatic_stress_disorder" whose body
        # is not visible, and named files after an undefined `target_name`.
        # The column name is used as the file stem for every disease here;
        # confirm whether PTSD needs a shorter file name.
        print(disease, str(n_feature_positive), str(n_feature_negative))
        cohort = buildDataSubset(disease_df, disease, 1, 0)
        cohort.to_csv(output_path + str(disease) + ".csv")
def construct_binary_cohorts(output_path, exclude_diseases=False):
    """Write one case/control cohort CSV per binary metadata feature.

    The metadata table and the list of binary feature names come from
    ``process_metadata_population``. For each feature, rows equal to ``1``
    are cases and rows equal to ``0`` are controls passed to
    ``buildDataSubset``; features with no positive rows, or fewer than
    ``min_positive`` of them, are skipped.

    Args:
        output_path: Prefix that is string-concatenated with
            ``<feature>.csv`` to form each output file name.
        exclude_diseases: Forwarded to ``process_metadata_population``.

    Returns:
        None. Cohorts are written to disk as a side effect.
    """
    # NOTE(review): the call arguments were not visible in the source; the
    # keyword below mirrors the call in create_frequency_cohorts -- confirm.
    metadata_df, binary_features = process_metadata_population(
        remove_diseases=exclude_diseases)
    min_positive = 40  # minimum number of positive samples to build a cohort

    for feature in binary_features:
        feature_counts = metadata_df[feature].value_counts()
        if 1 not in feature_counts.index:
            # Skip binary variables with only one value (e.g. sampling location).
            continue
        n_feature_positive = feature_counts[1]
        if n_feature_positive < min_positive:
            print("       Not enough samples", feature, str(n_feature_positive))
            continue
        print(feature, str(n_feature_positive))
        cohort = buildDataSubset(metadata_df, feature, 1, 0)
        cohort.to_csv(output_path + str(feature) + ".csv")
Ejemplo n.º 3
def create_frequency_cohorts(output_path, exclude_diseases=False):
    """Build case/control cohorts for each frequency-coded metadata variable.

    Values are treated as ordinal codes: code 0 is reported as the "never"
    group, code 4 as the "daily" group, and codes 1-4 select the
    rare / occasional / regular / daily file suffix. Code 5 is ignored
    (presumably an "unspecified" answer -- TODO confirm).

    The function works in two passes:

    1. Merge sparse extremes. If the never group has at most
       ``combine_threshold`` samples, code 1 is relabelled 0, and code 2 as
       well if the group is still too small. Symmetrically, the daily group
       absorbs code 3 and then code 2.
    2. For every remaining code above the smallest one (the control), build a
       cohort with ``buildDataSubset`` and write it to
       ``output_path + <variable> + <suffix>``, skipping groups with fewer
       than ``min_group_size`` samples.

    Args:
        output_path: Prefix that is string-concatenated with the variable
            name and file suffix.
        exclude_diseases: Forwarded to ``process_metadata_population`` as
            ``remove_diseases``.

    Returns:
        None. Cohorts are written to disk as a side effect.
    """
    metadata_df, _ = process_metadata_population(remove_diseases=exclude_diseases)
    # NOTE(review): the tail of this list was not visible in the source; more
    # variables may belong here.
    freq_values = [
        'artificial_sweeteners', 'exercise_frequency',
        'fermented_plant_frequency', 'frozen_dessert_frequency',
        'fruit_frequency', 'high_fat_red_meat_frequency',
        'homecooked_meals_frequency', 'meat_eggs_frequency',
        'milk_cheese_frequency', 'probiotic_frequency',
        'ready_to_eat_meals_frequency', 'red_meat_frequency',
        'salted_snacks_frequency', 'seafood_frequency',
        'smoking_frequency', 'sugar_sweetened_drink_frequency',
        'sugary_sweets_frequency',
    ]
    file_names = ["_rare_cohort.csv", "_occasional_cohort.csv",
                  "_regular_cohort.csv", "_daily_cohort.csv"]
    # The same threshold applies whether or not diseases are excluded.
    combine_threshold = 100
    min_group_size = 40

    for var in freq_values:
        # `.get(code, 0)` treats an absent code as an empty group instead of
        # raising KeyError (e.g. code 2 after it has been merged away).
        group_counts = metadata_df[var].value_counts()

        # Combine the never group with rare if it is below the threshold, and
        # with occasional if it is still too small.
        never_count = group_counts.get(0, 0)
        if never_count <= combine_threshold:
            print("      Never group: ", never_count,
                  never_count + group_counts.get(1, 0))
            metadata_df.loc[metadata_df[var] == 1, var] = 0
            group_counts = metadata_df[var].value_counts()
            never_count = group_counts.get(0, 0)
            if never_count <= combine_threshold:
                print("      Never group: ", never_count,
                      never_count + group_counts.get(2, 0))
                metadata_df.loc[metadata_df[var] == 2, var] = 0

        # Combine the daily group with regular if it is below the threshold,
        # and with occasional if it is still too small.
        daily_count = group_counts.get(4, 0)
        if daily_count <= combine_threshold:
            print("      Daily group: ", daily_count,
                  daily_count + group_counts.get(3, 0))
            metadata_df.loc[metadata_df[var] == 3, var] = 4
            group_counts = metadata_df[var].value_counts()
            daily_count = group_counts.get(4, 0)
            if daily_count <= combine_threshold:
                print("      Daily group: ", daily_count,
                      daily_count + group_counts.get(2, 0))
                metadata_df.loc[metadata_df[var] == 2, var] = 4

    for var in freq_values:
        # Missing values are not a frequency group; drop them before sorting.
        frequency_groups = np.sort(metadata_df[var].dropna().unique())
        frequency_groups = frequency_groups[frequency_groups != 5]
        if frequency_groups.size == 0:
            continue
        control_value = frequency_groups[0]
        case_groups = frequency_groups[1:]
        for group in case_groups:
            # int() keeps list indexing valid when the column is float-coded.
            suffix = file_names[int(group) - 1]
            if metadata_df[var].value_counts()[group] < min_group_size:
                print("Not constructing " + str(suffix.split("_")[1]) + " too small")
                continue
            print("Constructing " + suffix.split("_")[1] + "cohort: " + str(group))
            cohort = buildDataSubset(metadata_df, var, group, control_value)
            cohort.to_csv(output_path + var + suffix)
Ejemplo n.º 4
print("AGP Healthy Population size: ", len(agp_healthy_population))

# Reference population: ages 20-69, no antibiotics in the past year, no IBD
# or diabetes flag, and a BMI between 12.5 and 60 (all bounds inclusive).
healthy_population = metadata_df.loc[
    metadata_df["age_years"].between(20.0, 69.0)
    & metadata_df["antibiotic_history"].eq(
        "I have not taken antibiotics in the past year.")
    & metadata_df["ibd"].eq(0)
    & metadata_df["diabetes"].eq(0)
    & metadata_df["bmi"].between(12.5, 60.0)
]

# Each non-normal BMI category is the case value and "Normal" the control
# value handed to buildDataSubset; one CSV is written per category.
print("Constructing Obese cohort with matched normal samples")
obese_cohort = buildDataSubset(
    healthy_population, "bmi_cat", "Obese", "Normal")
obese_cohort.to_csv(
    dir_path + "Feature_Cohorts/Phase_I_Cohorts/Obese_cohort.csv")

print("Constructing Overweight cohort with matched normal samples")
overweight_cohort = buildDataSubset(
    healthy_population, "bmi_cat", "Overweight", "Normal")
overweight_cohort.to_csv(
    dir_path + "Feature_Cohorts/Phase_I_Cohorts/Overweight_cohort.csv")

print("Constructing Underweight cohort with matched normal samples")
underweight_cohort = buildDataSubset(
    healthy_population, "bmi_cat", "Underweight", "Normal")
underweight_cohort.to_csv(
    dir_path + "Feature_Cohorts/Phase_I_Cohorts/Underweight_cohort.csv")

healthy_population = metadata_df[(metadata_df["age_years"] >= 20.0) & 
                                     (metadata_df["age_years"] <= 69.0) &
                                     (metadata_df["antibiotic_history"] == "I have not taken antibiotics in the past year.") &
def create_custom_cohorts(save_path, exclude_diseases=False):
    disease_list = [
        "acid_reflux", "add_adhd", "asd", "autoimmune", "cancer",
        "cardiovascular_disease", "depression_bipolar_schizophrenia",
        "fungal_overgrowth", "ibd", "ibs", "liver_disease", "lung_disease",
        "mental_illness", "mental_illness_type_depression", "migraine", "sibo",
        "skin_condition", "thyroid", "asd", "ibd", "cdiff",
        "skin_condition", "alzheimers", "epilepsy_or_seizure_disorder", "pku"
    metadata_df = pd.read_csv(
    for val in ["diabetes", "age_years", "bmi", "ibd", "antibiotic_history"]:
        metadata_df = metadata_df[
            ~metadata_df[val].isin(["Not provided", "Unspecified", 4, 3, 2])]

    metadata_df["bowel_movement_quality"] = metadata_df[
            "I tend to have diarrhea (watery stool)",
            "I tend to have diarrhea (watery stool) - Type 5, 6 and 7"
        ], 0)
    metadata_df["bowel_movement_quality"] = metadata_df[
            "I tend to have normal formed stool",
            "I tend to have normal formed stool - Type 3 and 4"
        ], 1)
    metadata_df["bowel_movement_quality"] = metadata_df[
            "I tend to be constipated (have difficulty passing stool)",
            "I tend to be constipated (have difficulty passing stool) - Type 1 and 2"
        ], 2)

    metadata_df = metadata_df[metadata_df["country"].isin(
        ['USA', 'United Kingdom', 'Canada'])]
    metadata_df["longitude"] = metadata_df["longitude"].replace(
        ["Unspecified", "Not provided"], -10.0)  #.astype(float)
    metadata_df["latitude"] = metadata_df["latitude"].replace(
        ["Unspecified", "Not provided"], -10.0)  #.astype(float)
    metadata_df["race"] = metadata_df["race"].replace(
        ["Unspecified", "Not provided"], "Other")
    metadata_df['sex'] = metadata_df['sex'].replace(
        ["Unspecified", "Not provided", "unspecified"], "other")
    metadata_df['diet_type'] = metadata_df['diet_type'].replace(
        ["Unspecified", "Not provided"], "Other")
    metadata_df.loc[:, [
        "bmi", "age_years", "weight_kg", "longitude", "latitude"
    ]] = metadata_df.loc[:, [
        "bmi", "age_years", "weight_kg", "longitude", "latitude"

    #Samples with missing geographic locations get the centroid of their country or residence
    usa_missing_geo = metadata_df[(metadata_df["longitude"] == -10.0) & (
        metadata_df["country"] == "USA")].index.values
    uk_missing_geo = metadata_df[(metadata_df["longitude"] == -10.0) & (
        metadata_df["country"] == "United Kingdom")].index.values
    can_missing_geo = metadata_df[(metadata_df["longitude"] == -10.0) & (
        metadata_df["country"] == "Canada")].index.values
    metadata_df.loc[usa_missing_geo, ["longitude", "latitude"]] = (-98.6, 39.8)
    metadata_df.loc[uk_missing_geo, ["longitude", "latitude"]] = (-1.5, 52.6)
    metadata_df.loc[can_missing_geo, ["longitude", "latitude"]] = (-79.4, 43.9)
    metadata_df = metadata_df.loc[metadata_df["bmi"] <= 60, :]
    # Age
    metadata_df_healthy = metadata_df[
        (metadata_df["ibd"] == 0) & (metadata_df["antibiotic_history"].isin(
            ["Year", "I have not taken antibiotics in the past year."])) &
        (metadata_df["diabetes"] == 0) & (metadata_df["bmi"] >= 12.5) &
        (metadata_df["bmi"] <= 40.0)]
    if exclude_diseases:
        for disease in disease_list:
            metadata_df_healthy = metadata_df_healthy[
                metadata_df_healthy[disease] != 1]
    print("Constructing child [6:18] cohort with matched healthy adults")
    metadata_df_healthy_young = metadata_df_healthy[
        metadata_df_healthy["age_years"] <= 80.0]
    metadata_df_healthy_young["age_group"] = metadata_df_healthy_young[
    young = metadata_df_healthy_young[
        (metadata_df_healthy_young["age_years"] <= 18.0)
        & (metadata_df_healthy_young["age_years"] >= 6.0)].index
    old = metadata_df_healthy_young[
        metadata_df_healthy_young["age_years"] >= 20.0].index
    metadata_df_healthy_young.loc[young, "age_group"] = 1
    metadata_df_healthy_young.loc[old, "age_group"] = 0
    young_cohort = buildDataSubset(metadata_df_healthy_young, "age_group", 1,
                                   0, None)
    if len(young_cohort) >= 80:
        young_cohort.to_csv(save_path + "age_6-18.csv")

    print("Constructing toddler [3:5] cohort with matched healthy adults")
    metadata_df_healthy_toddler = metadata_df_healthy[
        metadata_df_healthy["age_years"] <= 80.0]
    metadata_df_healthy_toddler["age_group"] = metadata_df_healthy_toddler[
    infant = metadata_df_healthy_toddler[
        (metadata_df_healthy_toddler["age_years"] <= 5.0)
        & (metadata_df_healthy_toddler["age_years"] >= 3.0)].index
    old = metadata_df_healthy_toddler[
        metadata_df_healthy_toddler["age_years"] >= 20.0].index
    metadata_df_healthy_toddler.loc[infant, "age_group"] = 1
    metadata_df_healthy_toddler.loc[old, "age_group"] = 0
    infant_cohort = buildDataSubset(metadata_df_healthy_toddler, "age_group",
                                    1, 0, None)
    if len(infant_cohort) >= 80:
        infant_cohort.to_csv(save_path + "age_3-5.csv")

    print("Constructing 2 and under cohort with matched healthy adults")
    metadata_df_healthy_infant = metadata_df_healthy[
        metadata_df_healthy["age_years"] <= 80.0]
    metadata_df_healthy_infant["age_group"] = metadata_df_healthy_infant[
    infant = metadata_df_healthy_infant[
        metadata_df_healthy_infant["age_years"] <= 2.0].index
    old = metadata_df_healthy_infant[
        metadata_df_healthy_infant["age_years"] >= 20.0].index
    metadata_df_healthy_infant.loc[infant, "age_group"] = 1
    metadata_df_healthy_infant.loc[old, "age_group"] = 0
    infant_cohort = buildDataSubset(metadata_df_healthy_infant, "age_group", 1,
                                    0, None)
    if len(infant_cohort) >= 80:
        infant_cohort.to_csv(save_path + "age_1-2.csv")

    print("Constructing 65 and over cohort with matched healthy adults")
    metadata_df_healthy_old = metadata_df_healthy[
        metadata_df_healthy["age_years"] >= 20.0]
    metadata_df_healthy_old["age_group"] = metadata_df_healthy_old["age_years"]
    young = metadata_df_healthy_old[
        metadata_df_healthy_old["age_years"] < 70.0].index
    old = metadata_df_healthy_old[
        metadata_df_healthy_old["age_years"] >= 70.0].index
    metadata_df_healthy_old.loc[young, "age_group"] = 0
    metadata_df_healthy_old.loc[old, "age_group"] = 1
    old_cohort = buildDataSubset(metadata_df_healthy_old, "age_group", 1, 0,
    if len(old_cohort) >= 80:
        old_cohort.to_csv(save_path + "age_over70.csv")

    metadata_df_healthy = metadata_df[
        (metadata_df["age_years"] >= 20.0) & (metadata_df["age_years"] <= 80.0)
        & (metadata_df["antibiotic_history"].isin(
            ["Year", "I have not taken antibiotics in the past year."])) &
        (metadata_df["ibd"] == 0) & (metadata_df["diabetes"] == 0) &
        (metadata_df["bmi"] >= 12.5) & (metadata_df["bmi"] <= 40.0)]
    if exclude_diseases:
        for disease in disease_list:
            metadata_df_healthy = metadata_df_healthy[
                metadata_df_healthy[disease] != 1]
    print("Constructing Obese cohort with matched normal samples")
    obese_cohort = buildDataSubset(metadata_df_healthy, "bmi_cat", "Obese",
                                   "Normal", None)
    if len(obese_cohort) >= 80:
        obese_cohort.to_csv(save_path + "Obese_cohort.csv")
    print("Constructing Overweight cohort with matched normal samples")
    overweight_cohort = buildDataSubset(metadata_df_healthy, "bmi_cat",
                                        "Overweight", "Normal", None)
    if len(overweight_cohort) >= 80:
        overweight_cohort.to_csv(save_path + "Overweight_cohort.csv")
    print("Constructing Underweight cohort with matched normal samples")
    underweight_cohort = buildDataSubset(metadata_df_healthy, "bmi_cat",
                                         "Underweight", "Normal", None)
    if len(underweight_cohort) >= 80:
        underweight_cohort.to_csv(save_path + "Underweight_cohort.csv")
    metadata_df_healthy = metadata_df[
        (metadata_df["age_years"] >= 20.0) & (metadata_df["age_years"] <= 80.0)
        & (metadata_df["antibiotic_history"].isin(
            ["Year", "I have not taken antibiotics in the past year."])) &
        (metadata_df["diabetes"] == 0) & (metadata_df["bmi"] >= 12.5) &
        (metadata_df["bmi"] <= 40.0)]
    if exclude_diseases:
        disease_list_ibd = list(set(disease_list) - set(["ibd"]))
        for disease in disease_list_ibd:
            metadata_df_healthy = metadata_df_healthy[
                metadata_df_healthy[disease] != 1]

    print("Constructing IBD cohort with matched healthy samples")
    ibd_cohort = buildDataSubset(metadata_df_healthy, "ibd", 1, 0, None)
    if len(ibd_cohort) >= 80:
        ibd_cohort.to_csv(save_path + "IBD_cohort.csv")

    ##Antibiotic History
    metadata_df_healthy = metadata_df[(metadata_df["age_years"] >= 20.0)
                                      & (metadata_df["age_years"] <= 80.0) &
                                      (metadata_df["ibd"] == 0) &
                                      (metadata_df["diabetes"] == 0) &
                                      (metadata_df["bmi"] >= 12.5) &
                                      (metadata_df["bmi"] <= 40.0)]
    if exclude_diseases:
        for disease in disease_list:
            metadata_df_healthy = metadata_df_healthy[
                metadata_df_healthy[disease] != 1]

    print("Constructing Antiobiotic Year cohort with matched healthy samples")
    antiB_cohort1 = buildDataSubset(
        metadata_df_healthy, "antibiotic_history", "Year",
        "I have not taken antibiotics in the past year.", None)
    if len(antiB_cohort1) >= 80:
        antiB_cohort1.to_csv(save_path + "antiB_Year_cohort.csv")

        "Constructing Antiobiotic 6 Months cohort with matched healthy samples"
    antiB_cohort2 = buildDataSubset(
        metadata_df_healthy, "antibiotic_history", "6 months",
        "I have not taken antibiotics in the past year.", None)
    if len(antiB_cohort2) >= 80:
        antiB_cohort2.to_csv(save_path + "antiB_6Month_cohort.csv")

    print("Constructing Antiobiotic Year cohort with matched healthy samples")
    antiB_cohort3 = buildDataSubset(
        metadata_df_healthy, "antibiotic_history", "Month",
        "I have not taken antibiotics in the past year.", None)
    if len(antiB_cohort3) >= 80:
        antiB_cohort3.to_csv(save_path + "antiB_Month_cohort.csv")

    print("Constructing Antiobiotic Week cohort with matched healthy samples")
    antiB_cohort4 = buildDataSubset(
        metadata_df_healthy, "antibiotic_history", "Week",
        "I have not taken antibiotics in the past year.", None)
    if len(antiB_cohort4) >= 80:
        antiB_cohort4.to_csv(save_path + "antiB_Week_cohort.csv")

    metadata_df_healthy = metadata_df[
        (metadata_df["age_years"] >= 20.0) & (metadata_df["age_years"] <= 80.0)
        & (metadata_df["antibiotic_history"].isin(
            ["Year", "I have not taken antibiotics in the past year."])) &
        (metadata_df["ibd"] == 0) & (metadata_df["bmi"] >= 12.5) &
        (metadata_df["bmi"] <= 60.0)]
    if exclude_diseases:
        disease_list_ibd = list(set(disease_list) - set(["diabetes"]))
        for disease in disease_list_ibd:
            metadata_df_healthy = metadata_df_healthy[
                metadata_df_healthy[disease] != 1]

    print("Constructing Diabetes type II cohort with matched healthy samples")
                            isin(["Not provided", "Unspecified"]),
                            "diabetes_type"] = 0
    diabetes_cohort = buildDataSubset(metadata_df_healthy, "diabetes_type",
                                      "Type II diabetes", 0, None)
    if len(diabetes_cohort) >= 80:
        diabetes_cohort.to_csv(save_path + "diabetes_typeII_cohort.csv")

    metadata_df_healthy = metadata_df[
        (metadata_df["age_years"] >= 20.0) & (metadata_df["age_years"] <= 80.0)
        & (metadata_df["antibiotic_history"].isin(
            ["Year", "I have not taken antibiotics in the past year."])) &
        (metadata_df["ibd"] == 0) & (metadata_df["sex"] != "other") &
        (metadata_df["diabetes"] == 0) & (metadata_df["bmi"] >= 12.5) &
        (metadata_df["bmi"] <= 40.0)]
    if exclude_diseases:
        for disease in disease_list:
            metadata_df_healthy = metadata_df_healthy[
                metadata_df_healthy[disease] != 1]

    gender_healthy_cohort = buildDataSubset(metadata_df_healthy, "sex", "male",
                                            "female", None)
    if len(gender_healthy_cohort) >= 80:
        gender_healthy_cohort.to_csv(save_path + "gender_healthy_cohort.csv")

    ###Contraceptive (just women)
    metadata_df_healthy = metadata_df[
        (metadata_df["age_years"] >= 20.0) & (metadata_df["age_years"] <= 80.0)
        & (metadata_df["antibiotic_history"].isin(
            ["Year", "I have not taken antibiotics in the past year."])) &
        (metadata_df["ibd"] == 0) & (metadata_df["sex"] == "female") &
        (metadata_df["diabetes"] == 0) & (metadata_df["bmi"] >= 12.5) &
        (metadata_df["bmi"] <= 40.0)]
    if exclude_diseases:
        for disease in disease_list:
            metadata_df_healthy = metadata_df_healthy[
                metadata_df_healthy[disease] != 1]

        "Constructing contraceptive pill cohort with matched healthy samples")
    pill_cohort = buildDataSubset(metadata_df_healthy, "contraceptive",
                                  "Yes, I am taking the \"pill\"", "No", None)
    if len(pill_cohort) >= 80:
        pill_cohort.to_csv(save_path + "pill_cohort.csv")

    print("Constructing IUD cohort with matched healthy samples")
    iud_cohort = buildDataSubset(metadata_df_healthy, "contraceptive",
                                 "Yes, I use a hormonal IUD (Mirena)", "No",
    if len(iud_cohort) >= 80:
        iud_cohort.to_csv(save_path + "iud_cohort.csv")

    ####Autism: (just children)
    metadata_df_healthy = metadata_df[
        (metadata_df["age_years"] <= 30.0)
        & (metadata_df["antibiotic_history"].isin(
            ["Year", "I have not taken antibiotics in the past year."])) &
        (metadata_df["ibd"] == 0) & (metadata_df["diabetes"] == 0) &
        (metadata_df["bmi"] >= 12.5) & (metadata_df["bmi"] <= 40.0)]
    if exclude_diseases:
        disease_list_ibd = list(set(disease_list) - set(["asd"]))
        for disease in disease_list_ibd:
            metadata_df_healthy = metadata_df_healthy[
                metadata_df_healthy[disease] != 1]
    print("Constructing Autism cohort with matched healthy samples (under 30)")
    asd_cohort = buildDataSubset(metadata_df_healthy, "asd", 1, 0, None)
    if len(asd_cohort) >= 80:
        asd_cohort.to_csv(save_path + "asd_cohort.csv")

    ####Standard Exclusion Criteria
    metadata_df_healthy = metadata_df[
        (metadata_df["age_years"] >= 20.0) & (metadata_df["age_years"] <= 80.0)
        & (metadata_df["antibiotic_history"].isin(
            ["Year", "I have not taken antibiotics in the past year."])) &
        (metadata_df["ibd"] == 0) & (metadata_df["diabetes"] == 0) &
        (metadata_df["bmi"] >= 12.5) & (metadata_df["bmi"] <= 40.0)]
    if exclude_diseases:
        for disease in disease_list:
            metadata_df_healthy = metadata_df_healthy[
                metadata_df_healthy[disease] != 1]
    alc_cohort = buildDataSubset(metadata_df_healthy, "drinks_per_session",
                                 "1", "I don't drink", None)
    if len(alc_cohort) >= 80:
        alc_cohort.to_csv(save_path + "drinks_1_cohort.csv")
    alc_cohort = buildDataSubset(metadata_df_healthy, "drinks_per_session",
                                 "1-2", "I don't drink", None)
    if len(alc_cohort) >= 80:
        alc_cohort.to_csv(save_path + "drinks_1_2_cohort.csv")
    alc_cohort = buildDataSubset(metadata_df_healthy, "drinks_per_session",
                                 "2-3", "I don't drink", None)
    if len(alc_cohort) >= 80:
        alc_cohort.to_csv(save_path + "drinks_2_3_cohort.csv")
    alc_cohort = buildDataSubset(metadata_df_healthy, "drinks_per_session",
                                 "3-4", "I don't drink", None)
    if len(alc_cohort) >= 80:
        alc_cohort.to_csv(save_path + "drinks_3_4_cohort.csv")
    alc_cohort = buildDataSubset(metadata_df_healthy, "drinks_per_session",
                                 "4+", "I don't drink", None)
    if len(alc_cohort) >= 80:
        alc_cohort.to_csv(save_path + "drinks_4_plus_cohort.csv")
    ###Alcohol_Consumption Frequeny:
    print("Constructing alcohol daily with matched healthy samples")
    alc_cohort = buildDataSubset(metadata_df_healthy, "alcohol_frequency", 4,
                                 0, None)
    if len(alc_cohort) >= 80:
        alc_cohort.to_csv(save_path + "alcohol_daily_cohort.csv")
    print("Constructing alcohol rare cohort with matched healthy samples")
    alc_cohort = buildDataSubset(metadata_df_healthy, "alcohol_frequency", 1,
                                 0, None)
    if len(alc_cohort) >= 80:
        alc_cohort.to_csv(save_path + "alcohol_rare_cohort.csv")
        "Constructing alcohol occasional cohort with matched healthy samples")
    alc_cohort = buildDataSubset(metadata_df_healthy, "alcohol_frequency", 2,
                                 0, None)
    if len(alc_cohort) >= 80:
        alc_cohort.to_csv(save_path + "alcohol_occasional_cohort.csv")
    print("Constructing alcohol regular cohort with matched healthy samples")
    alc_cohort = buildDataSubset(metadata_df_healthy, "alcohol_frequency", 3,
                                 0, None)
    if len(alc_cohort) >= 80:
        alc_cohort.to_csv(save_path + "alcohol_regular_cohort.csv")
    print("Constructing plants cohort with matched normal samples")
    nplant_6_10_cohort = buildDataSubset(metadata_df_healthy,
                                         "types_of_plants", "6 to 10",
                                         "Less than 5", None)
    if len(nplant_6_10_cohort) >= 80:
        nplant_6_10_cohort.to_csv(save_path + "nplant_6_10_cohort.csv")
    print("Constructing plants cohort with matched normal samples")
    nplant_11_20_cohort = buildDataSubset(metadata_df_healthy,
                                          "types_of_plants", "11 to 20",
                                          "Less than 5", None)
    if len(nplant_11_20_cohort) >= 80:
        nplant_11_20_cohort.to_csv(save_path + "nplant_11_20_cohort.csv")
    print("Constructing plants cohort with matched normal samples")
    nplant_21_30_cohort = buildDataSubset(metadata_df_healthy,
                                          "types_of_plants", "21 to 30",
                                          "Less than 5", None)
    if len(nplant_21_30_cohort) >= 80:
        nplant_21_30_cohort.to_csv(save_path + "nplant_21_30_cohort.csv")
    print("Constructing plants cohort with matched normal samples")
    nplant_30_plus_cohort = buildDataSubset(metadata_df_healthy,
                                            "types_of_plants", "More than 30",
                                            "Less than 5", None)
    if len(nplant_30_plus_cohort) >= 80:
        nplant_30_plus_cohort.to_csv(save_path + "nplant_30_plus_cohort.csv")
    print("Constructing no gluten cohort with matched healthy samples")
    no_gluten_cohort = buildDataSubset(
        metadata_df_healthy, "gluten",
        "I do not eat gluten because it makes me feel bad", "No", None)
    if len(no_gluten_cohort) >= 80:
        no_gluten_cohort.to_csv(save_path + "no_gluten_cohort.csv")
    print("Constructing celiac cohort with matched healthy samples")
    celiac_cohort = buildDataSubset(metadata_df_healthy, "gluten",
                                    "I was diagnosed with celiac disease",
                                    "No", None)
    if len(celiac_cohort) >= 80:
        celiac_cohort.to_csv(save_path + "celiac_cohort.csv")
    print("Constructing gluten_allergy cohort with matched healthy samples")
    gluten_alergy_cohort = buildDataSubset(
        metadata_df_healthy, "gluten",
        "I was diagnosed with gluten allergy (anti-gluten IgG), but not celiac disease",
        "No", None)
    if len(gluten_alergy_cohort) >= 80:
        gluten_alergy_cohort.to_csv(save_path + "gluten_alergy_cohort.csv")
    print("Constructing USA immigrant cohort with matched healthy samples")
    metadata_df_healthy["immigrant"] = metadata_df_healthy["country_of_birth"]
    immigrants_us = metadata_df_healthy[
        (metadata_df_healthy["country_of_birth"] != "United States")
        & (metadata_df_healthy["country_residence"] == "United States")].index
    native_us = metadata_df_healthy[
        (metadata_df_healthy["country_of_birth"] == "United States")
        & (metadata_df_healthy["country_residence"] == "United States")].index
    metadata_df_healthy.loc[immigrants_us, "immigrant"] = 1
    metadata_df_healthy.loc[native_us, "immigrant"] = 0
    usa_immigrant_cohort = buildDataSubset(metadata_df_healthy, "immigrant", 1,
                                           0, None)
    if len(usa_immigrant_cohort) >= 80:
        usa_immigrant_cohort.to_csv(save_path + "usa_immigrant_cohort.csv")
    print("Constructing UK immigrant cohort with matched healthy samples")
    metadata_df_healthy["immigrant"] = metadata_df_healthy["country_of_birth"]
    immigrants_uk = metadata_df_healthy[
        (metadata_df_healthy["country_of_birth"] != "United Kingdom")
        & (metadata_df_healthy["country_residence"] == "United Kingdom")].index
    native_uk = metadata_df_healthy[
        (metadata_df_healthy["country_of_birth"] == "United Kingdom")
        & (metadata_df_healthy["country_residence"] == "United Kingdom")].index
    metadata_df_healthy.loc[immigrants_uk, "immigrant"] = 1
    metadata_df_healthy.loc[native_uk, "immigrant"] = 0
    uk_immigrant_cohort = buildDataSubset(metadata_df_healthy, "immigrant", 1,
                                          0, None)
    if len(uk_immigrant_cohort) >= 80:
        uk_immigrant_cohort.to_csv(save_path + "uk_immigrant_cohort.csv")
    print("Constructing diet:  cohort with matched healthy samples")
    omnivore_noRed_cohort = buildDataSubset(
        metadata_df_healthy, "diet_type", "Omnivore but do not eat red meat",
        "Omnivore", None)
    if len(omnivore_noRed_cohort) >= 80:
        omnivore_noRed_cohort.to_csv(save_path + "omnivore_noRed_cohort.csv")
    print("Constructing diet: Pescatarian cohort with matched healthy samples")
    pescatarian_cohort = buildDataSubset(metadata_df_healthy, "diet_type",
                                         "Vegetarian but eat seafood",
                                         "Omnivore", None)
    if len(pescatarian_cohort) >= 80:
        pescatarian_cohort.to_csv(save_path + "pescatarian_cohort.csv")
    print("Constructing diet: Vegetarian cohort with matched healthy samples")
    vegetarian_cohort = buildDataSubset(metadata_df_healthy, "diet_type",
                                        "Vegetarian", "Omnivore", None)
    if len(vegetarian_cohort) >= 80:
        vegetarian_cohort.to_csv(save_path + "vegetarian_cohort.csv")
    print("Constructing diet: Vegan cohort with matched healthy samples")
    vegan_cohort = buildDataSubset(metadata_df_healthy, "diet_type", "Vegan",
                                   "Omnivore", None)
    if len(vegan_cohort) >= 80:
        vegan_cohort.to_csv(save_path + "vegan_cohort.csv")
    print("Constructing BMF 1 cohort with matched healthy samples")
    bmf_less1_cohort = buildDataSubset(metadata_df_healthy,
                                       "Less than one", "One", None)
    if len(bmf_less1_cohort) >= 80:
        bmf_less1_cohort.to_csv(save_path + "bmf_less1_cohort.csv")
    print("Constructing BMF 2 cohort with matched healthy samples")
    bmf_2_cohort = buildDataSubset(metadata_df_healthy,
                                   "bowel_movement_frequency", "Two", "One",
    if len(bmf_2_cohort) >= 80:
        bmf_2_cohort.to_csv(save_path + "bmf_2_cohort.csv")
    print("Constructing BMF 3 cohort with matched healthy samples")
    bmf_3_cohort = buildDataSubset(metadata_df_healthy,
                                   "bowel_movement_frequency", "Three", "One",
    if len(bmf_3_cohort) >= 80:
        bmf_3_cohort.to_csv(save_path + "bmf_3_cohort.csv")
    print("Constructing BMF 4 cohort with matched healthy samples")
    bmf_4_cohort = buildDataSubset(metadata_df_healthy,
                                   "bowel_movement_frequency", "Four", "One",
    if len(bmf_4_cohort) >= 80:
        bmf_4_cohort.to_csv(save_path + "bmf_4_cohort.csv")
    print("Constructing BMF 5 cohort with matched healthy samples")
    bmf_5_cohort = buildDataSubset(metadata_df_healthy,
                                   "bowel_movement_frequency", "Five or more",
                                   "One", None)
    if len(bmf_5_cohort) >= 80:
        bmf_5_cohort.to_csv(save_path + "bmf_5_cohort.csv")
    print("Constructing BMQ Solid cohort with matched healthy samples")
    bmq_solid_cohort = buildDataSubset(metadata_df_healthy,
                                       "bowel_movement_quality", 2, 1, None)
    if len(bmq_solid_cohort) >= 80:
        bmq_solid_cohort.to_csv(save_path + "bmq_solid_cohort.csv")
    print("Constructing BMQ loose cohort with matched healthy samples")
    bmq_loose_cohort = buildDataSubset(metadata_df_healthy,
                                       "bowel_movement_quality", 0, 1, None)
    if len(bmq_loose_cohort) >= 80:
        bmq_loose_cohort.to_csv(save_path + "bmq_loose_cohort.csv")
    print("Constructing weight increase cohort with matched healthy samples")
    weigth_increase_cohort = buildDataSubset(metadata_df_healthy,
                                             "Increased more than 10 pounds",
                                             "Remained stable", None)
    if len(weigth_increase_cohort) >= 80:
        weigth_increase_cohort.to_csv(save_path + "weigth_increase_cohort.csv")
    print("Constructing weight decrease cohort with matched healthy samples")
    weigth_decrease_cohort = buildDataSubset(metadata_df_healthy,
                                             "Decreased more than 10 pounds",
                                             "Remained stable", None)
    if len(weigth_decrease_cohort) >= 80:
        weigth_decrease_cohort.to_csv(save_path + "weigth_decrease_cohort.csv")
    print("Constructing country cohort with matched healthy samples")
    country_cohort = buildDataSubset(metadata_df_healthy, "country", "USA",
                                     "United Kingdom", None)
    if len(country_cohort) >= 80:
        country_cohort.to_csv(save_path + "country_cohort.csv")

    metadata_df_healthy = metadata_df[
        (metadata_df["age_years"] >= 20.0) & (metadata_df["age_years"] <= 80.0)
        & (metadata_df["antibiotic_history"].isin(
            ["Year", "I have not taken antibiotics in the past year."])) &
        (metadata_df["ibd"] == 0) & (metadata_df["diabetes"] == 0) &
        (metadata_df["bmi"] >= 12.5) & (metadata_df["bmi"] <= 40.0)]
    if exclude_diseases:
        for disease in disease_list:
            metadata_df_healthy = metadata_df_healthy[
                metadata_df_healthy[disease] != 1]
    metadata_df_healthy = metadata_df_healthy[
        isin(["Unspecified", "2-Jan", "3-Feb", "Not provided"])]
    metadata_df_healthy["drinks_per_session"] = metadata_df_healthy[
            "I don't drink": 0,
            "1": 1,
            "1-2": 2,
            "2-3": 3,
            "3-4": 4,
            "4+": 5
    metadata_df_healthy["average_drinks_per_week"] = metadata_df_healthy[
            str) + "_" + metadata_df_healthy["alcohol_frequency"].astype(str)
    drink_map = {
        '0_0': "non_drinker",
        '0_1': "0",
        '0_2': "0",
        '0_4': "0",
        '1_0': "0",
        '1_1': "light_drinker",
        '1_2': "light_drinker",
        '1_3': "mild_drinker",
        '1_4': "mild_drinker",
        '2_0': "0",
        '2_1': "light_drinker",
        '2_2': "light_drinker",
        '2_3': "mild_drinker",
        '2_4': "heavy_drinker",
        '3_0': "0",
        '3_1': "light_drinker",
        '3_2': "mild_drinker",
        '3_3': "heavy_drinker",
        '3_4': "heavy_drinker",
        '4_1': "light_drinker",
        '4_2': "mild_drinker",
        '4_3': "heavy_drinker",
        '4_4': "heavy_drinker",
        '5_1': "light_drinker",
        '5_2': "mild_drinker",
        '5_3': "heavy_drinker",
        '5_4': "heavy_drinker"
    metadata_df_healthy["average_drinks_per_week"] = metadata_df_healthy[

    drinker_type_cohort = buildDataSubset(metadata_df_healthy,
                                          "light_drinker", "non_drinker", None)
    if len(drinker_type_cohort) >= 80:
        drinker_type_cohort.to_csv(save_path + "light_drinker_type_cohort.csv")

    drinker_type_cohort = buildDataSubset(metadata_df_healthy,
                                          "mild_drinker", "non_drinker", None)
    if len(drinker_type_cohort) >= 80:
        drinker_type_cohort.to_csv(save_path + "mild_drinker_type_cohort.csv")

    drinker_type_cohort = buildDataSubset(metadata_df_healthy,
                                          "heavy_drinker", "non_drinker", None)
    if len(drinker_type_cohort) >= 80:
        drinker_type_cohort.to_csv(save_path + "heavy_drinker_type_cohort.csv")