# Example 1
def get_days(days_datetime):
    """Return the ``days`` component of a timedelta-like object."""
    day_count = days_datetime.days
    return day_count


# Number of PCA components to keep for the OTU feature table.
n_components = 10

# When True, skip recomputation and use previously recorded results.
use_recorded = False

# Directory the script was launched from (first sys.path entry).
script_dir = sys.path[0]
# The path joins below reference SCRIPT_DIR, which was never defined in
# this snippet (only lowercase ``script_dir`` was) and would raise a
# NameError — alias it, keeping ``script_dir`` for any other users.
SCRIPT_DIR = script_dir

if not use_recorded:

    OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, 'ronies_Data',
                                      'saliva_samples_231018.csv'),
                         os.path.join(
                             SCRIPT_DIR, 'ronies_Data',
                             'saliva_samples_mapping_file_231018.csv'),
                         from_QIIME=True)
    preproccessed_data = preprocess_data(OtuMf.otu_file,
                                         visualize_data=False,
                                         taxnomy_level=6)
    otu_after_pca_wo_taxonomy, _ = apply_pca(preproccessed_data,
                                             n_components=n_components,
                                             visualize=False)
    # otu_after_pca = OtuMf.add_taxonomy_col_to_new_otu_data(otu_after_pca_wo_taxonomy)
    # merged_data_after_pca = OtuMf.merge_mf_with_new_otu_data(otu_after_pca_wo_taxonomy)
    # merged_data_with_age = otu_after_pca_wo_taxonomy.join(OtuMf.mapping_file['age_in_days'])
    # merged_data_with_age = merged_data_with_age[merged_data_with_age.age_in_days.notnull()] # remove NaN days
    # merged_data_with_age_group = otu_after_pca_wo_taxonomy.join(OtuMf.mapping_file[['age_group', 'age_in_days','MouseNumber']])
    # merged_data_with_age_group = merged_data_with_age_group[merged_data_with_age_group.age_group.notnull()] # remove NaN days
# Example 2
from sklearn import metrics, svm
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import csv
from plot_clustergram import *
# Output CSV for model results, and the input OTU / mapping tables
# (absolute Windows paths from the original author's machine).
csvfile = 'C:/Users/Anna/Documents/xgboost_gvhd_saliva.csv'
otu = 'C:/Users/Anna/Documents/otu_saliva_GVHD.csv'
mapping = 'C:/Users/Anna/Documents/mapping_saliva_GVHD.csv'
# Result-CSV column headers — presumably XGBoost hyper-parameters plus
# train/test AUC scores (TODO confirm against the writer code below).
headers = [
    'ms', 'ne', 'learning rate', 'regularization', 'auc test', 'auc train'
]
# with open(csvfile, "w") as output:
#     writer = csv.writer(output, delimiter=',', lineterminator='\n')
#     writer.writerow(headers)

# Load the OTU table with its mapping file and preprocess to taxonomy level 6.
OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
print(OtuMf.otu_file.shape)
preproccessed_data = preprocess_data(OtuMf.otu_file,
                                     visualize_data=False,
                                     taxnomy_level=6)
print(preproccessed_data.shape)
mapping_file = OtuMf.mapping_file
# Parse every date-bearing column into pandas datetimes.
for date_col in ('DATE', 'Date_Of_Transplantation', 'Date_of_engraftmen',
                 'aGVHD1_Stat', 'cGVHD_start'):
    mapping_file[date_col] = pd.to_datetime(OtuMf.mapping_file[date_col])
# Missing acute-GVHD onset dates become a far-future sentinel date.
end = pd.to_datetime('2020-01-01')
mapping_file['aGVHD1_Stat'] = mapping_file['aGVHD1_Stat'].fillna(end)
# Example 3
def allergies(perform_distance=False, level=3):
    """Load the allergy OTU/mapping data restricted to Milk vs. Peanut.

    Returns ``(features, labels)`` where ``labels`` is the ``AllergyType``
    series mapped Milk -> 1, Peanut -> 0.  With ``perform_distance=True``
    the features are per-taxonomy-group PCA components (columns grouped by
    the taxonomy component at ``level``); otherwise the raw preprocessed
    OTU table is returned.
    """
    otu = 'C:/Users/Anna/Documents/allergy_otu.csv'
    mapping = 'C:/Users/Anna/Documents/allergy_mf.csv'
    OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
    preproccessed_data = preprocess_data(OtuMf.otu_file,
                                         visualize_data=False,
                                         taxnomy_level=7)
    preproccessed_data = preproccessed_data.join(
        OtuMf.mapping_file[['AllergyType', 'SuccessDescription']], how='inner')
    # Keep only the two allergy classes of interest.
    preproccessed_data = preproccessed_data.loc[
        (preproccessed_data['AllergyType'] == 'Milk') |
        (preproccessed_data['AllergyType'] == 'Peanut')]
    preproccessed_data = preproccessed_data.drop(
        ['AllergyType', 'SuccessDescription'], axis=1)
    mapping_file = OtuMf.mapping_file.loc[
        (OtuMf.mapping_file['AllergyType'] == 'Milk') |
        (OtuMf.mapping_file['AllergyType'] == 'Peanut')]
    # NOTE: this dict was defined twice in the original; duplicate removed.
    mapping_disease = {'Milk': 1, 'Peanut': 0}
    mapping_file['AllergyType'] = mapping_file['AllergyType'].map(
        mapping_disease)
    mapping_file = mapping_file['AllergyType']

    if perform_distance:
        # Drop constant columns — they carry no variance for PCA.
        cols = [
            col for col in preproccessed_data.columns
            if len(preproccessed_data[col].unique()) != 1
        ]
        # Group column names by the taxonomy component at ``level`` (1-based);
        # columns whose taxonomy is shallower than ``level`` fall into 'else'.
        dict_bact = {'else': []}
        for col in preproccessed_data[cols]:
            col_name = preproccessed_data[col].name.split(';')
            bact_level = level - 1
            if len(col_name) > bact_level:
                if col_name[bact_level] in dict_bact:
                    dict_bact[col_name[bact_level]].append(
                        preproccessed_data[col].name)
                else:
                    dict_bact[col_name[bact_level]] = [
                        preproccessed_data[col].name
                    ]
            else:
                dict_bact['else'].append(preproccessed_data[col].name)
            print(col_name[-1])

        new_df = pd.DataFrame(index=preproccessed_data.index)
        col = 0
        for key, values in dict_bact.items():
            new_data = preproccessed_data[values]
            pca = PCA(n_components=round(new_data.shape[1] / 2) + 1)
            pca.fit(new_data)
            # Count components needed to pass 50% explained variance, at
            # least one.  ``explained`` replaces the original local named
            # ``sum``, which shadowed the builtin.
            explained = 0
            num_comp = 0
            for (i, component) in enumerate(pca.explained_variance_ratio_):
                if explained <= 0.5:
                    explained += component
                else:
                    num_comp = i
                    break
            if num_comp == 0:
                num_comp += 1
            otu_after_pca_new, pca_components = apply_pca(
                new_data, n_components=num_comp)
            for j in range(otu_after_pca_new.shape[1]):
                new_df[col + j] = otu_after_pca_new[j]
            col += num_comp
        return new_df, mapping_file
    else:
        return preproccessed_data, mapping_file
def gvhd(perform_distance=True,level =3):
    """Load the saliva GVHD OTU/mapping data.

    Keeps, per patient, the latest sample taken after transplantation and
    before any GVHD onset, then returns ``(features, mapping_file)`` where
    ``mapping_file`` carries the MTX flag and a binary ``disease`` label
    (1 = no GVHD, 0 = acute and/or chronic GVHD).  With
    ``perform_distance=True`` the features are per-taxonomy-group PCA
    components (columns grouped by the taxonomy component at ``level``).
    """
    otu = 'C:/Users/Anna/Documents/otu_saliva_GVHD.csv'
    mapping = 'C:/Users/Anna/Documents/mapping_saliva_GVHD.csv'
    OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
    preproccessed_data = preprocess_data(OtuMf.otu_file, taxnomy_level=7)
    mapping_file = OtuMf.mapping_file
    # Parse all date columns; a missing GVHD onset date is replaced by a
    # far-future sentinel, i.e. treated as "never occurred".
    mapping_file['DATE'] = pd.to_datetime(OtuMf.mapping_file['DATE'])
    mapping_file['Date_Of_Transplantation'] = pd.to_datetime(OtuMf.mapping_file['Date_Of_Transplantation'])
    mapping_file['Date_of_engraftmen'] = pd.to_datetime(OtuMf.mapping_file['Date_of_engraftmen'])
    mapping_file['aGVHD1_Stat'] = pd.to_datetime(OtuMf.mapping_file['aGVHD1_Stat'])
    mapping_file['cGVHD_start'] = pd.to_datetime(OtuMf.mapping_file['cGVHD_start'])
    end = pd.to_datetime('2020-01-01')
    mapping_file['aGVHD1_Stat'] = mapping_file['aGVHD1_Stat'].fillna(end)
    mapping_file['cGVHD_start'] = mapping_file['cGVHD_start'].fillna(end)
    # Keep samples taken strictly after transplantation and strictly before
    # either GVHD onset, ordered per patient by date.
    mapping_file = mapping_file[(mapping_file['DATE'] > mapping_file['Date_Of_Transplantation']) & (
                mapping_file['DATE'] < mapping_file['aGVHD1_Stat']) & (mapping_file['DATE'] < mapping_file['cGVHD_start'])].sort_values(['Personal_ID', 'DATE'])

    # Keep only the last (latest-dated) sample per patient, re-indexed by
    # its sample id.
    mapping_file = mapping_file.reset_index()
    mapping_file = mapping_file.sort_values("DATE").groupby("Personal_ID", as_index=False).last().set_index('#SampleID')

    # NOTE: 'aGVHD1 ' and 'cGVHD ' carry a trailing space in the source
    # CSV headers — keep the keys byte-identical.
    mapping_file = mapping_file[['MTX', 'aGVHD1 ', 'cGVHD ']]
    mapping_file = mapping_file.fillna('No')
    # preproccessed_data = preproccessed_data.join(mapping_file[['MTX', 'aGVHD1 ', 'cGVHD ']], how='inner')
    # preproccessed_data = preproccessed_data.fillna('No')

    # Encode Yes/No flags as 1/0, then collapse the two GVHD flags into a
    # single label: disease == 1 only when neither form of GVHD occurred.
    mapping_yes_no = {'Yes': 1, 'No': 0}
    mapping_file['aGVHD1 '] = mapping_file['aGVHD1 '].map(mapping_yes_no)
    mapping_file['cGVHD '] = mapping_file['cGVHD '].map(mapping_yes_no)
    mapping_file['MTX'] = mapping_file['MTX'].map(mapping_yes_no)
    mapping_file["disease"] = mapping_file["aGVHD1 "].map(str) + '_' + mapping_file["cGVHD "].map(str)
    mapping_diseases = {'0_0': 1, '1_0': 0, '0_1': 0, '1_1': 0}
    mapping_file["disease"] = mapping_file["disease"].map(mapping_diseases)
    mapping_file = mapping_file.drop(['aGVHD1 ', 'cGVHD '], axis=1)
    # Restrict features to samples present in the filtered mapping, then
    # strip the label columns back out of the feature table.
    preproccessed_data = preproccessed_data.join(mapping_file, how='inner')
    preproccessed_data = preproccessed_data.drop(['MTX','disease'], axis =1)
    if perform_distance:
        # Drop constant columns — they carry no variance for PCA.
        cols = [col for col in preproccessed_data.columns if len(preproccessed_data[col].unique()) != 1]
        # Group column names by the taxonomy component at ``level``
        # (1-based); shallower taxonomies fall into the 'else' bucket.
        dict_bact = {'else': []}
        for col in preproccessed_data[cols]:
            col_name = preproccessed_data[col].name.split(';')
            bact_level = level - 1
            if len(col_name) > bact_level:
                if col_name[bact_level] in dict_bact:
                    dict_bact[col_name[bact_level]].append(preproccessed_data[col].name)
                else:
                    dict_bact[col_name[bact_level]] = [preproccessed_data[col].name]
            else:
                dict_bact['else'].append(preproccessed_data[col].name)
            print(col_name[-1])

        new_df = pd.DataFrame(index=preproccessed_data.index)
        col = 0
        for key, values in dict_bact.items():
            new_data = preproccessed_data[values]
            pca = PCA(n_components=round(new_data.shape[1] / 2) + 1)
            pca.fit(new_data)
            # Count components needed to pass 50% explained variance
            # (at least one); note ``sum`` shadows the builtin here.
            sum = 0
            num_comp = 0
            for (i, component) in enumerate(pca.explained_variance_ratio_):
                if sum <= 0.5:
                    sum += component
                else:
                    num_comp = i
                    break
            if num_comp == 0:
                num_comp += 1
            otu_after_pca_new, pca_components = apply_pca(new_data, n_components=num_comp)
            for j in range(otu_after_pca_new.shape[1]):
                new_df[col + j] = otu_after_pca_new[j]
            col += num_comp
        return new_df, mapping_file
    else:
        return preproccessed_data, mapping_file
def ibd(perform_distance=True, level=3):
    """Load the IBD OTU/mapping data with controls excluded.

    Samples are averaged per (CD_or_UC, preg_trimester, P-ID) group and
    each row is centred by subtracting its trimester's mean.  Labels map
    CD -> 1, UC -> 0.  With ``perform_distance=True`` returns
    ``(pca_features, labels, taxonomy-group -> component-column dict,
    taxonomy values)``; otherwise ``(features, labels, {})``.
    """
    otu = 'C:/Users/Anna/Documents/otu_IBD3.csv'
    mapping = 'C:/Users/Anna/Documents/mapping_IBD3.csv'
    OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
    preproccessed_data = preprocess_data(OtuMf.otu_file,
                                         visualize_data=False,
                                         taxnomy_level=7)
    preproccessed_data = preproccessed_data.join(
        OtuMf.mapping_file[['CD_or_UC', 'preg_trimester', 'P-ID']],
        how='inner')
    # Exclude control samples, then average replicate samples per
    # (diagnosis, trimester, patient) combination.
    preproccessed_data = preproccessed_data.loc[(preproccessed_data['CD_or_UC']
                                                 != 'control')]
    preproccessed_data = preproccessed_data.groupby(
        ['CD_or_UC', 'preg_trimester', 'P-ID'], as_index=False).mean()
    # Per-trimester feature means, used to centre each sample below.
    new_set2 = preproccessed_data.groupby(['preg_trimester']).mean()
    # Subtract the trimester mean from each row's feature columns (the
    # first three columns are the group keys, hence the 3: slice).
    for i in range(0, len(preproccessed_data)):
        month = preproccessed_data['preg_trimester'][i]
        preproccessed_data.iloc[i:i + 1, 3:preproccessed_data.shape[1]] = (
            preproccessed_data.iloc[i:i + 1,
                                    3:preproccessed_data.shape[1]].values -
            new_set2.loc[month:month, :].values)

    preproccessed_data = preproccessed_data.drop(
        ['preg_trimester', 'P-ID', 'CD_or_UC'], axis=1)
    mapping_file = OtuMf.mapping_file.loc[(OtuMf.mapping_file['CD_or_UC'] !=
                                           'control')]
    mapping_disease = {'CD': 1, 'UC': 0}
    mapping_file['CD_or_UC'] = mapping_file['CD_or_UC'].map(mapping_disease)
    mapping_file = mapping_file['CD_or_UC']
    mapping_file = mapping_file.reset_index()
    if perform_distance:
        # Drop constant columns — they carry no variance for PCA.
        cols = [
            col for col in preproccessed_data.columns
            if len(preproccessed_data[col].unique()) != 1
        ]
        # Group column names by the taxonomy component at ``level``
        # (1-based); shallower taxonomies fall into the 'else' bucket.
        dict_bact = {'else': []}
        for col in preproccessed_data[cols]:
            col_name = preproccessed_data[col].name.split(';')
            bact_level = level - 1
            if len(col_name) > bact_level:
                #if ",".join(col_name[0:bact_level+1]) in dict_bact:
                #    dict_bact[",".join(col_name[0:bact_level+1])].append(preproccessed_data[col].name)
                #else:
                #    dict_bact[",".join(col_name[0:bact_level+1])] = [preproccessed_data[col].name]
                if col_name[bact_level] in dict_bact:
                    dict_bact[col_name[bact_level]].append(
                        preproccessed_data[col].name)
                else:
                    dict_bact[col_name[bact_level]] = [
                        preproccessed_data[col].name
                    ]

            else:
                dict_bact['else'].append(preproccessed_data[col].name)
            print(col_name[-1])

        new_df = pd.DataFrame(index=preproccessed_data.index)
        col = 0
        # new_dict records which output columns each taxonomy group
        # contributed, so callers can map components back to groups.
        new_dict = {}
        for key, values in dict_bact.items():
            new_dict[key] = []
            new_data = preproccessed_data[values]
            pca = PCA(n_components=round(new_data.shape[1] / 2) + 1)
            pca.fit(new_data)
            # Count components needed to pass 50% explained variance
            # (at least one); note ``sum`` shadows the builtin here.
            sum = 0
            num_comp = 0
            for (i, component) in enumerate(pca.explained_variance_ratio_):
                if sum <= 0.5:
                    sum += component
                else:
                    num_comp = i
                    break
            if num_comp == 0:
                num_comp += 1
            otu_after_pca_new, pca_components = apply_pca(
                new_data, n_components=num_comp)
            for j in range(otu_after_pca_new.shape[1]):
                new_df[col + j] = otu_after_pca_new[j]
                new_dict[key].append(col + j)
            col += num_comp
        # Fourth value: the 'taxonomy' row of the transposed OTU table —
        # presumably one taxonomy string per OTU column (verify upstream).
        return new_df, mapping_file, new_dict, OtuMf.otu_file.T[
            'taxonomy'].values
    else:
        return preproccessed_data, mapping_file, {}
def psc(perform_distance=True, level=3):
    """Load the PSC OTU/mapping data.

    Returns ``(features, mapping_file)`` where ``mapping_file`` carries
    age/BMI/lifestyle covariates plus ``DiagnosisGroup`` coded as
    0 = control, 1 = cirrhosis/HCC, 2 = PSC (with or without IBD).
    With ``perform_distance=True`` the features are per-taxonomy-group
    PCA components (columns grouped by the taxonomy component at
    ``level``); otherwise the raw preprocessed OTU table is returned.
    """
    otu = 'C:/Users/Anna/Desktop/docs/otu_psc2.csv'
    mapping = 'C:/Users/Anna/Desktop/docs/mapping_psc.csv'
    OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
    print('using padp')
    preproccessed_data = preprocess_data(OtuMf.otu_file,
                                         visualize_data=False,
                                         taxnomy_level=7)
    mapping_file = OtuMf.mapping_file

    # Collapse diagnoses into three groups.  NOTE: 'Cirrhosis ' carries a
    # trailing space in the source CSV — keep the key byte-identical.
    mapping_disease = {
        'Control': 0,
        'Cirrhosis ': 1,
        'HCC': 1,
        'PSC+IBD': 2,
        'PSC': 2
    }
    mapping_file['DiagnosisGroup'] = mapping_file['DiagnosisGroup'].map(
        mapping_disease)
    # Normalise mixed yes/no and '0'/'1' string encodings to integers.
    # (Renamed from the original misspelled local ``mappin_boolean``.)
    mapping_boolean = {'yes': 1, 'no': 0, 'Control': 0, '0': 0, '1': 1}
    mapping_file['FattyLiver'] = mapping_file['FattyLiver'].map(
        mapping_boolean)
    mapping_file['RegularExercise'] = mapping_file['RegularExercise'].map(
        mapping_boolean)
    mapping_file['Smoking'] = mapping_file['Smoking'].map(mapping_boolean)
    mapping_file = mapping_file[[
        'Age', 'BMI', 'FattyLiver', 'RegularExercise', 'Smoking',
        'DiagnosisGroup'
    ]]

    if perform_distance:
        # Drop constant columns — they carry no variance for PCA.
        cols = [
            col for col in preproccessed_data.columns
            if len(preproccessed_data[col].unique()) != 1
        ]
        # Group column names by the taxonomy component at ``level`` (1-based);
        # columns whose taxonomy is shallower than ``level`` fall into 'else'.
        dict_bact = {'else': []}
        for col in preproccessed_data[cols]:
            col_name = preproccessed_data[col].name.split(';')
            bact_level = level - 1
            if len(col_name) > bact_level:
                if col_name[bact_level] in dict_bact:
                    dict_bact[col_name[bact_level]].append(
                        preproccessed_data[col].name)
                else:
                    dict_bact[col_name[bact_level]] = [
                        preproccessed_data[col].name
                    ]
            else:
                dict_bact['else'].append(preproccessed_data[col].name)
            print(col_name[-1])

        new_df = pd.DataFrame(index=preproccessed_data.index)
        col = 0
        for key, values in dict_bact.items():
            new_data = preproccessed_data[values]
            pca = PCA(n_components=round(new_data.shape[1] / 2) + 1)
            pca.fit(new_data)
            # Count components needed to pass 50% explained variance, at
            # least one.  ``explained`` replaces the original local named
            # ``sum``, which shadowed the builtin.
            explained = 0
            num_comp = 0
            for (i, component) in enumerate(pca.explained_variance_ratio_):
                if explained <= 0.5:
                    explained += component
                else:
                    num_comp = i
                    break
            if num_comp == 0:
                num_comp += 1
            otu_after_pca_new, pca_components = apply_pca(
                new_data, n_components=num_comp)
            for j in range(otu_after_pca_new.shape[1]):
                new_df[col + j] = otu_after_pca_new[j]
            col += num_comp
        return new_df, mapping_file
    else:
        return preproccessed_data, mapping_file
def allergies(perform_distance=False, level=3):
    """Load the allergy OTU/mapping data (all allergy types).

    NOTE(review): this redefines the ``allergies`` defined earlier in this
    file; at import time only this later definition is in effect.  Unlike
    the earlier version it does not filter to Milk/Peanut (that filter is
    commented out) and it reads relative CSV paths.

    Returns ``(features, labels)`` where ``labels`` holds ``AllergyType``
    (Milk -> 1, Peanut -> 0, others NaN) and ``SuccessDescription``
    ('A1' -> 1, others 0).  ``level`` and ``perform_distance`` are unused
    here — the distance/PCA branch is commented out below.
    """
    otu = 'allergy_otu.csv'
    mapping = 'allergy_mf.csv'
    OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
    preproccessed_data = preprocess_data(OtuMf.otu_file,
                                         visualize_data=False,
                                         taxnomy_level=7)
    preproccessed_data = preproccessed_data.join(
        OtuMf.mapping_file[['AllergyType', 'SuccessDescription']], how='inner')
    #preproccessed_data = preproccessed_data.loc[(preproccessed_data['AllergyType'] == 'Milk') | ((preproccessed_data['AllergyType'] == 'Peanut'))]
    preproccessed_data = preproccessed_data.drop(
        ['AllergyType', 'SuccessDescription'], axis=1)
    #mapping_file = OtuMf.mapping_file.loc[(OtuMf.mapping_file['AllergyType']  == 'Milk') | (OtuMf.mapping_file['AllergyType']  == 'Peanut')]
    mapping_file = OtuMf.mapping_file
    mapping_disease = {'Milk': 1, 'Peanut': 0}
    # 'Con' presumably marks control samples — TODO confirm with the
    # mapping file.  The derived 'Health' column is written onto the
    # shared mapping file (a side effect on OtuMf.mapping_file) but is
    # not part of this function's return value.
    mapping_health = {'Con': 1}
    mapping_success = {'A1': 1}
    mapping_file['Health'] = mapping_file['AllergyType'].map(mapping_health)
    mapping_file['AllergyType'] = mapping_file['AllergyType'].map(
        mapping_disease)
    mapping_file['SuccessDescription'] = mapping_file[
        'SuccessDescription'].map(mapping_success)
    mapping_file[['Health', 'SuccessDescription'
                  ]] = mapping_file[['Health',
                                     'SuccessDescription']].fillna(value=0)

    mapping_file = mapping_file[['AllergyType', 'SuccessDescription']]

    # if perform_distance:
    #     cols = [col for col in preproccessed_data.columns if len(preproccessed_data[col].unique()) != 1]
    #     dict_bact = {'else': []}
    #     for col in preproccessed_data[cols]:
    #         col_name = preproccessed_data[col].name.split(';')
    #         bact_level = level - 1
    #         if len(col_name) > bact_level:
    #             if col_name[bact_level] in dict_bact:
    #                 dict_bact[col_name[bact_level]].append(preproccessed_data[col].name)
    #             else:
    #                 dict_bact[col_name[bact_level]] = [preproccessed_data[col].name]
    #         else:
    #             dict_bact['else'].append(preproccessed_data[col].name)
    #         print(col_name[-1])
    #
    #     new_df = pd.DataFrame(index=preproccessed_data.index)
    #     col = 0
    #     for key, values in dict_bact.items():
    #         new_data = preproccessed_data[values]
    #         pca = PCA(n_components=round(new_data.shape[1] / 2) + 1)
    #         pca.fit(new_data)
    #         sum = 0
    #         num_comp = 0
    #         for (i, component) in enumerate(pca.explained_variance_ratio_):
    #             if sum <= 0.5:
    #                 sum += component
    #             else:
    #                 num_comp = i
    #                 break
    #         if num_comp == 0:
    #             num_comp += 1
    #         otu_after_pca_new, pca_components = apply_pca(new_data, n_components=num_comp)
    #         for j in range(otu_after_pca_new.shape[1]):
    #             new_df[col + j] = otu_after_pca_new[j]
    #         col += num_comp
    #     return new_df, mapping_file
    # else:
    return preproccessed_data, mapping_file