# Script: build the GVHD saliva dataset.
# Loads the OTU table + mapping file, restricts samples to the window after
# transplantation and before any GVHD onset, keeps the last sample per
# patient, and joins the MTX / aGVHD / cGVHD targets onto the OTU features.
# NOTE(review): relies on OtuMfHandler, preprocess_data and pd being provided
# by the surrounding project (not imported in this chunk) — confirm against
# the full file.
import math
import seaborn as sns; sns.set(color_codes=True)
import operator
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold,LeaveOneOut, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics, svm
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Input files (absolute Windows paths — machine-specific).
otu = 'C:/Users/Anna/Documents/otu_saliva_GVHD.csv'
mapping = 'C:/Users/Anna/Documents/mapping_saliva_GVHD.csv'
max_num_of_pcas = 20

OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
preproccessed_data = preprocess_data(OtuMf.otu_file, taxnomy_level=7)
mapping_file = OtuMf.mapping_file

# Parse every date column so they can be compared chronologically.
mapping_file['DATE'] = pd.to_datetime(OtuMf.mapping_file['DATE'])
mapping_file['Date_Of_Transplantation'] = pd.to_datetime(OtuMf.mapping_file['Date_Of_Transplantation'])
mapping_file['Date_of_engraftmen'] = pd.to_datetime(OtuMf.mapping_file['Date_of_engraftmen'])
mapping_file['aGVHD1_Stat'] = pd.to_datetime(OtuMf.mapping_file['aGVHD1_Stat'])
mapping_file['cGVHD_start'] = pd.to_datetime(OtuMf.mapping_file['cGVHD_start'])

# Patients who never developed GVHD have no onset date; give them a far-future
# sentinel so the "sample taken before onset" filter below keeps their samples.
end = pd.to_datetime('2020-01-01')
mapping_file['aGVHD1_Stat'] = mapping_file['aGVHD1_Stat'].fillna(end)
mapping_file['cGVHD_start'] = mapping_file['cGVHD_start'].fillna(end)

# Keep only samples taken after transplantation and before either GVHD onset.
mapping_file = mapping_file[(mapping_file['DATE']>mapping_file['Date_Of_Transplantation']) & (mapping_file['DATE']<mapping_file['aGVHD1_Stat']) & (mapping_file['DATE']<mapping_file['cGVHD_start'])].sort_values(['Personal_ID', 'DATE'])
mapping_file = mapping_file.reset_index()

# One row per patient: the chronologically last qualifying sample.
mapping_file = mapping_file.sort_values("DATE").groupby("Personal_ID", as_index=False).last().set_index('#SampleID')

# Attach the targets. The column names 'aGVHD1 ' / 'cGVHD ' really do contain
# a trailing space — they match the CSV header as-is; do not "fix" them.
preproccessed_data = preproccessed_data.join(mapping_file[['MTX', 'aGVHD1 ', 'cGVHD ']], how ='inner')
preproccessed_data = preproccessed_data.fillna('No')
import csv from plot_clustergram import * csvfile = 'C:/Users/Anna/Documents/xgboost_gvhd_saliva.csv' otu = 'C:/Users/Anna/Documents/otu_saliva_GVHD.csv' mapping = 'C:/Users/Anna/Documents/mapping_saliva_GVHD.csv' headers = [ 'ms', 'ne', 'learning rate', 'regularization', 'auc test', 'auc train' ] # with open(csvfile, "w") as output: # writer = csv.writer(output, delimiter=',', lineterminator='\n') # writer.writerow(headers) OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False) print(OtuMf.otu_file.shape) preproccessed_data = preprocess_data(OtuMf.otu_file, visualize_data=False, taxnomy_level=6) print(preproccessed_data.shape) mapping_file = OtuMf.mapping_file mapping_file['DATE'] = pd.to_datetime(OtuMf.mapping_file['DATE']) mapping_file['Date_Of_Transplantation'] = pd.to_datetime( OtuMf.mapping_file['Date_Of_Transplantation']) mapping_file['Date_of_engraftmen'] = pd.to_datetime( OtuMf.mapping_file['Date_of_engraftmen']) mapping_file['aGVHD1_Stat'] = pd.to_datetime(OtuMf.mapping_file['aGVHD1_Stat']) mapping_file['cGVHD_start'] = pd.to_datetime(OtuMf.mapping_file['cGVHD_start']) end = pd.to_datetime('2020-01-01') mapping_file['aGVHD1_Stat'] = mapping_file['aGVHD1_Stat'].fillna(end) mapping_file['cGVHD_start'] = mapping_file['cGVHD_start'].fillna(end) mapping_file = mapping_file[ (mapping_file['DATE'] > mapping_file['Date_Of_Transplantation'])
def ibd(perform_distance=True, level=3):
    """Load and preprocess the IBD cohort (CD vs. UC; controls removed).

    Samples are averaged per (disease, trimester, patient) and centered
    against the mean profile of their pregnancy trimester.

    Parameters
    ----------
    perform_distance : bool
        When True, bucket bacteria columns by the taxonomy name at
        ``level`` and reduce each bucket with PCA; when False, return the
        raw preprocessed table.
    level : int
        1-based taxonomy level used for the bucketing.

    Returns
    -------
    perform_distance=True:
        (per-bucket PCA features, tag frame,
        {bucket name: [output column indices]}, raw taxonomy strings).
    perform_distance=False:
        (preprocessed data, tag frame, {}).
    NOTE(review): the two branches return different tuple arities (4 vs. 3)
    — callers must know which mode they used.
    """
    otu = 'C:/Users/Anna/Documents/otu_IBD3.csv'
    mapping = 'C:/Users/Anna/Documents/mapping_IBD3.csv'
    OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
    preproccessed_data = preprocess_data(OtuMf.otu_file,
                                         visualize_data=False,
                                         taxnomy_level=7)
    # Keep disease samples only and average replicates per
    # (disease, trimester, patient).
    preproccessed_data = preproccessed_data.join(
        OtuMf.mapping_file[['CD_or_UC', 'preg_trimester', 'P-ID']],
        how='inner')
    preproccessed_data = preproccessed_data.loc[(
        preproccessed_data['CD_or_UC'] != 'control')]
    preproccessed_data = preproccessed_data.groupby(
        ['CD_or_UC', 'preg_trimester', 'P-ID'], as_index=False).mean()
    # Per-trimester mean profile used for centering below.
    new_set2 = preproccessed_data.groupby(['preg_trimester']).mean()
    # Subtract the trimester mean from every sample row; columns 0-2 are the
    # groupby keys, columns 3.. are the bacteria values.
    for i in range(0, len(preproccessed_data)):
        month = preproccessed_data['preg_trimester'][i]
        preproccessed_data.iloc[i:i + 1, 3:preproccessed_data.shape[1]] = (
            preproccessed_data.iloc[i:i + 1,
                                    3:preproccessed_data.shape[1]].values -
            new_set2.loc[month:month, :].values)
    preproccessed_data = preproccessed_data.drop(
        ['preg_trimester', 'P-ID', 'CD_or_UC'], axis=1)
    # Binary tag: CD -> 1, UC -> 0.
    mapping_file = OtuMf.mapping_file.loc[(
        OtuMf.mapping_file['CD_or_UC'] != 'control')]
    mapping_disease = {'CD': 1, 'UC': 0}
    mapping_file['CD_or_UC'] = mapping_file['CD_or_UC'].map(mapping_disease)
    mapping_file = mapping_file['CD_or_UC']
    mapping_file = mapping_file.reset_index()
    if perform_distance:
        # Drop constant columns — they carry no signal.
        cols = [
            col for col in preproccessed_data.columns
            if len(preproccessed_data[col].unique()) != 1
        ]
        # Bucket every bacteria column by its taxonomy name at the requested
        # level; columns whose taxonomy path is too shallow go to 'else'.
        dict_bact = {'else': []}
        for col in preproccessed_data[cols]:
            col_name = preproccessed_data[col].name.split(';')
            bact_level = level - 1  # 0-based index into the taxonomy path
            if len(col_name) > bact_level:
                #if ",".join(col_name[0:bact_level+1]) in dict_bact:
                #    dict_bact[",".join(col_name[0:bact_level+1])].append(preproccessed_data[col].name)
                #else:
                #    dict_bact[",".join(col_name[0:bact_level+1])] = [preproccessed_data[col].name]
                if col_name[bact_level] in dict_bact:
                    dict_bact[col_name[bact_level]].append(
                        preproccessed_data[col].name)
                else:
                    dict_bact[col_name[bact_level]] = [
                        preproccessed_data[col].name
                    ]
            else:
                dict_bact['else'].append(preproccessed_data[col].name)
            print(col_name[-1])
        new_df = pd.DataFrame(index=preproccessed_data.index)
        col = 0  # running index of the next output feature column
        new_dict = {}  # bucket name -> output column indices produced for it
        for key, values in dict_bact.items():
            new_dict[key] = []
            new_data = preproccessed_data[values]
            # Fit with a generous n_components, then keep just enough
            # components to pass 50% explained variance.
            pca = PCA(n_components=round(new_data.shape[1] / 2) + 1)
            pca.fit(new_data)
            sum = 0  # NOTE(review): shadows the builtin `sum`
            num_comp = 0
            for (i, component) in enumerate(pca.explained_variance_ratio_):
                if sum <= 0.5:
                    sum += component
                else:
                    num_comp = i
                    break
            # NOTE(review): if the 0.5 threshold is never crossed the loop
            # ends with num_comp == 0, which is then forced to 1 — confirm
            # this fallback is intended.
            if num_comp == 0:
                num_comp += 1
            otu_after_pca_new, pca_components = apply_pca(
                new_data, n_components=num_comp)
            for j in range(otu_after_pca_new.shape[1]):
                new_df[col + j] = otu_after_pca_new[j]
                new_dict[key].append(col + j)
            col += num_comp
        return new_df, mapping_file, new_dict, OtuMf.otu_file.T[
            'taxonomy'].values
    else:
        return preproccessed_data, mapping_file, {}
def allergies(perform_distance=False, level=3):
    """Load and preprocess the allergy cohort (Milk vs. Peanut only).

    Parameters
    ----------
    perform_distance : bool
        When True, bucket bacteria columns by the taxonomy name at
        ``level`` and reduce each bucket with PCA; when False, return the
        raw preprocessed table.
    level : int
        1-based taxonomy level used for the bucketing.

    Returns
    -------
    perform_distance=True:
        (per-bucket PCA features, tag series,
        {bucket name: [output column indices]}, raw taxonomy strings).
    perform_distance=False:
        (preprocessed data, tag series, {}).
    NOTE(review): the two branches return different tuple arities (4 vs. 3).
    """
    otu = 'allergy_otu.csv'
    mapping = 'allergy_mf.csv'
    OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
    preproccessed_data = preprocess_data(OtuMf.otu_file,
                                         visualize_data=False,
                                         taxnomy_level=7)
    # Join the metadata only to filter rows; the columns themselves are
    # dropped again right after.
    preproccessed_data = preproccessed_data.join(
        OtuMf.mapping_file[['AllergyType', 'SuccessDescription']],
        how='inner')
    preproccessed_data = preproccessed_data.loc[
        (preproccessed_data['AllergyType'] == 'Milk') |
        ((preproccessed_data['AllergyType'] == 'Peanut'))]
    preproccessed_data = preproccessed_data.drop(
        ['AllergyType', 'SuccessDescription'], axis=1)
    # Binary tag: Milk -> 1, Peanut -> 0 (same row filter as above).
    mapping_file = OtuMf.mapping_file.loc[
        (OtuMf.mapping_file['AllergyType'] == 'Milk') |
        (OtuMf.mapping_file['AllergyType'] == 'Peanut')]
    mapping_disease = {'Milk': 1, 'Peanut': 0}
    mapping_file['AllergyType'] = mapping_file['AllergyType'].map(
        mapping_disease)
    mapping_file = mapping_file['AllergyType']
    if perform_distance:
        # Drop constant columns — they carry no signal.
        cols = [
            col for col in preproccessed_data.columns
            if len(preproccessed_data[col].unique()) != 1
        ]
        # Bucket every bacteria column by its taxonomy name at the requested
        # level; columns whose taxonomy path is too shallow go to 'else'.
        dict_bact = {'else': []}
        for col in preproccessed_data[cols]:
            col_name = preproccessed_data[col].name.split(';')
            bact_level = level - 1  # 0-based index into the taxonomy path
            if len(col_name) > bact_level:
                if col_name[bact_level] in dict_bact:
                    dict_bact[col_name[bact_level]].append(
                        preproccessed_data[col].name)
                else:
                    dict_bact[col_name[bact_level]] = [
                        preproccessed_data[col].name
                    ]
            else:
                dict_bact['else'].append(preproccessed_data[col].name)
            print(col_name[-1])
        new_df = pd.DataFrame(index=preproccessed_data.index)
        col = 0  # running index of the next output feature column
        new_dict = {}  # bucket name -> output column indices produced for it
        for key, values in dict_bact.items():
            new_dict[key] = []
            new_data = preproccessed_data[values]
            # Fit with a generous n_components, then keep just enough
            # components to pass 50% explained variance.
            pca = PCA(n_components=round(new_data.shape[1] / 2) + 1)
            pca.fit(new_data)
            sum = 0  # NOTE(review): shadows the builtin `sum`
            num_comp = 0
            for (i, component) in enumerate(pca.explained_variance_ratio_):
                if sum <= 0.5:
                    sum += component
                else:
                    num_comp = i
                    break
            # NOTE(review): if the 0.5 threshold is never crossed, num_comp
            # stays 0 and is forced to 1 — confirm this fallback is intended.
            if num_comp == 0:
                num_comp += 1
            otu_after_pca_new, pca_components = apply_pca(
                new_data, n_components=num_comp)
            for j in range(otu_after_pca_new.shape[1]):
                new_df[col + j] = otu_after_pca_new[j]
                new_dict[key].append(col + j)
            col += num_comp
        return new_df, mapping_file, new_dict, OtuMf.otu_file.T[
            'taxonomy'].values
    else:
        return preproccessed_data, mapping_file, {}
def gvhd(perform_distance=True, level=3):
    """Load and preprocess the GVHD saliva cohort.

    Keeps, per patient, the last sample taken after transplantation and
    before any GVHD onset, and builds a binary "disease" tag where 1 means
    no GVHD of either kind (aGVHD/cGVHD string combination '0_0') and 0
    means any GVHD.

    Parameters
    ----------
    perform_distance : bool
        When True, bucket bacteria columns by the taxonomy name at
        ``level`` and reduce each bucket with PCA; when False, return the
        raw preprocessed table.
    level : int
        1-based taxonomy level used for the bucketing.

    Returns
    -------
    perform_distance=True:
        (per-bucket PCA features, tag frame,
        {bucket name: [output column indices]}, raw taxonomy strings).
    perform_distance=False:
        (preprocessed data, tag frame, {}, []).
    """
    otu = 'C:/Users/Anna/Documents/otu_saliva_GVHD.csv'
    mapping = 'C:/Users/Anna/Documents/mapping_saliva_GVHD.csv'
    OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
    preproccessed_data = preprocess_data(OtuMf.otu_file, taxnomy_level=7)
    mapping_file = OtuMf.mapping_file
    # Parse every date column so they can be compared chronologically.
    mapping_file['DATE'] = pd.to_datetime(OtuMf.mapping_file['DATE'])
    mapping_file['Date_Of_Transplantation'] = pd.to_datetime(
        OtuMf.mapping_file['Date_Of_Transplantation'])
    mapping_file['Date_of_engraftmen'] = pd.to_datetime(
        OtuMf.mapping_file['Date_of_engraftmen'])
    mapping_file['aGVHD1_Stat'] = pd.to_datetime(
        OtuMf.mapping_file['aGVHD1_Stat'])
    mapping_file['cGVHD_start'] = pd.to_datetime(
        OtuMf.mapping_file['cGVHD_start'])
    # Patients with no GVHD onset get a far-future sentinel so the
    # "before onset" filter keeps their samples.
    end = pd.to_datetime('2020-01-01')
    mapping_file['aGVHD1_Stat'] = mapping_file['aGVHD1_Stat'].fillna(end)
    mapping_file['cGVHD_start'] = mapping_file['cGVHD_start'].fillna(end)
    # Keep samples taken after transplantation and before either onset.
    mapping_file = mapping_file[
        (mapping_file['DATE'] > mapping_file['Date_Of_Transplantation'])
        & (mapping_file['DATE'] < mapping_file['aGVHD1_Stat'])
        & (mapping_file['DATE'] < mapping_file['cGVHD_start'])].sort_values(
            ['Personal_ID', 'DATE'])
    mapping_file = mapping_file.reset_index()
    # One row per patient: the chronologically last qualifying sample.
    mapping_file = mapping_file.sort_values("DATE").groupby(
        "Personal_ID", as_index=False).last().set_index('#SampleID')
    # 'aGVHD1 ' / 'cGVHD ' genuinely contain a trailing space — they match
    # the CSV header as-is; do not "fix" them.
    mapping_file = mapping_file[['MTX', 'aGVHD1 ', 'cGVHD ']]
    mapping_file = mapping_file.fillna('No')
    # preproccessed_data = preproccessed_data.join(mapping_file[['MTX', 'aGVHD1 ', 'cGVHD ']], how='inner')
    # preproccessed_data = preproccessed_data.fillna('No')
    mapping_yes_no = {'Yes': 1, 'No': 0}
    mapping_file['aGVHD1 '] = mapping_file['aGVHD1 '].map(mapping_yes_no)
    mapping_file['cGVHD '] = mapping_file['cGVHD '].map(mapping_yes_no)
    mapping_file['MTX'] = mapping_file['MTX'].map(mapping_yes_no)
    # Combine both flags into a single string key, then collapse to a binary
    # tag: '0_0' (no GVHD at all) -> 1, anything else -> 0.
    mapping_file["disease"] = mapping_file["aGVHD1 "].map(
        str) + '_' + mapping_file["cGVHD "].map(str)
    mapping_diseases = {'0_0': 1, '1_0': 0, '0_1': 0, '1_1': 0}
    mapping_file["disease"] = mapping_file["disease"].map(mapping_diseases)
    mapping_file = mapping_file.drop(['aGVHD1 ', 'cGVHD '], axis=1)
    # Inner join restricts the OTU rows to the selected samples; the joined
    # metadata columns themselves are dropped again immediately.
    preproccessed_data = preproccessed_data.join(mapping_file, how='inner')
    preproccessed_data = preproccessed_data.drop(['MTX', 'disease'], axis=1)
    if perform_distance:
        # Drop constant columns — they carry no signal.
        cols = [
            col for col in preproccessed_data.columns
            if len(preproccessed_data[col].unique()) != 1
        ]
        # Bucket every bacteria column by its taxonomy name at the requested
        # level; columns whose taxonomy path is too shallow go to 'else'.
        dict_bact = {'else': []}
        for col in preproccessed_data[cols]:
            col_name = preproccessed_data[col].name.split(';')
            bact_level = level - 1  # 0-based index into the taxonomy path
            if len(col_name) > bact_level:
                if col_name[bact_level] in dict_bact:
                    dict_bact[col_name[bact_level]].append(
                        preproccessed_data[col].name)
                else:
                    dict_bact[col_name[bact_level]] = [
                        preproccessed_data[col].name
                    ]
            else:
                dict_bact['else'].append(preproccessed_data[col].name)
            print(col_name[-1])
        new_df = pd.DataFrame(index=preproccessed_data.index)
        col = 0  # running index of the next output feature column
        new_dict = {}  # bucket name -> output column indices produced for it
        for key, values in dict_bact.items():
            new_dict[key] = []
            new_data = preproccessed_data[values]
            # Fit with a generous n_components, then keep just enough
            # components to pass 50% explained variance.
            pca = PCA(n_components=round(new_data.shape[1] / 2) + 1)
            pca.fit(new_data)
            sum = 0  # NOTE(review): shadows the builtin `sum`
            num_comp = 0
            for (i, component) in enumerate(pca.explained_variance_ratio_):
                if sum <= 0.5:
                    sum += component
                else:
                    num_comp = i
                    break
            # NOTE(review): if the 0.5 threshold is never crossed, num_comp
            # stays 0 and is forced to 1 — confirm this fallback is intended.
            if num_comp == 0:
                num_comp += 1
            otu_after_pca_new, pca_components = apply_pca(
                new_data, n_components=num_comp)
            for j in range(otu_after_pca_new.shape[1]):
                new_df[col + j] = otu_after_pca_new[j]
                new_dict[key].append(col + j)
            col += num_comp
        return new_df, mapping_file, new_dict, OtuMf.otu_file.T[
            'taxonomy'].values
    else:
        return preproccessed_data, mapping_file, {}, []
def microbiome_preprocess(max_pca,
                          tax_list,
                          tag_list,
                          old_preprocess=True,
                          rho_pca_plots=False,
                          evaluate=False,
                          algo="svm",
                          method="fold"):
    """Run OTU preprocessing + PCA for every (taxonomy level, tag) pair
    and write the resulting feature CSVs to per-tag folders.

    Parameters
    ----------
    max_pca : int
        Number of PCA components to keep.
    tax_list : iterable of int
        Taxonomy levels to process.
    tag_list : iterable of str
        Tag (task) names; used to locate "<tag>_tag.csv" inputs and to
        name output folders/files.
    old_preprocess : bool
        True -> legacy OtuMfHandler/preprocess_data pipeline;
        False -> the newer CreateOtuAndMappingFiles pipeline.
    rho_pca_plots : bool
        New pipeline only: also emit rho/PCA diagnostic plots.
    evaluate : bool
        After preprocessing, compare tax levels / PCA counts via
        microbiome_preprocess_evaluation.
    algo, method : str
        Passed through to the evaluation step.

    Side effects only (writes CSV files / folders); returns None.
    """
    for tax in tax_list:
        for tag in tag_list:
            if old_preprocess:
                otu_file = "otu_id.csv"
                tag_file = tag + "_tag.csv"
                OtuMf = OtuMfHandler(otu_file,
                                     tag_file,
                                     from_QIIME=False,
                                     id_col='ID',
                                     taxonomy_col='taxonomy')
                preproccessed_data = preprocess_data(
                    OtuMf.otu_file,
                    preform_z_scoring=True,
                    visualize_data=False,
                    taxnomy_level=tax,
                    preform_taxnomy_group=True)
                otu_after_pca_wo_taxonomy, pca_obj, _ = apply_pca(
                    preproccessed_data, n_components=max_pca, visualize=False)
                folder = tag + "_tax_" + str(tax) + "_csv_files"
                otu_name = "old_processed_otu_" + tag + "_tax_" + str(
                    tax) + ".csv"
                # Re-index by sample ID before writing out.
                otu_after_pca_wo_taxonomy[
                    "ID"] = otu_after_pca_wo_taxonomy.index
                otu_after_pca_wo_taxonomy = otu_after_pca_wo_taxonomy.set_index(
                    "ID")
                if not os.path.exists(folder):
                    os.mkdir(folder)
                otu_after_pca_wo_taxonomy.to_csv(os.path.join(
                    folder, otu_name))
            else:  # yoel new Preprocess
                # parameters for Preprocess
                preprocess_prms = {
                    'taxonomy_level': tax,
                    'taxnomy_group': 'mean',
                    'epsilon': 0.1,
                    'normalization': 'log',
                    'z_scoring': 'row',
                    'norm_after_rel': '',
                    'std_to_delete': 0,
                    'pca': max_pca
                }
                mapping_file = CreateOtuAndMappingFiles(
                    "otu.csv", tag + "_tag.csv")
                mapping_file.preprocess(preprocess_params=preprocess_prms,
                                        visualize=False)
                if rho_pca_plots:
                    folder = "preprocess_plots_" + tag + "_tag_tax_" + str(
                        tax) + "_pca_" + str(max_pca)
                    mapping_file.rhos_and_pca_calculation(
                        tag, preprocess_prms['taxonomy_level'],
                        preprocess_prms['pca'], os.path.join(folder, "rhos"),
                        os.path.join(folder, "pca"))
                otu_path, tag_path, pca_path = mapping_file.csv_to_learn(
                    tag + '_task', tag + "_tax_" + str(tax) + "_csv_files",
                    tax, max_pca)
                print(otu_path)
    # compere tax level and number of pca component using certain svm model and compere results
    # NOTE(review): indentation reconstructed — this evaluation step is
    # placed after both loops (it spans all tax/tag options); confirm
    # against the original file.
    if evaluate:
        microbiome_preprocess_evaluation(pca_options=list(range(2, max_pca)),
                                         tax_options=tax_list,
                                         tag_options=tag_list,
                                         old_preprocess=old_preprocess,
                                         algo=algo,
                                         method=method)
def psc(perform_distance=True, level=3):
    """Load and preprocess the PSC cohort.

    Builds a 3-class diagnosis tag (0 = Control, 1 = Cirrhosis/HCC,
    2 = PSC / PSC+IBD) plus binary lifestyle covariates.

    Parameters
    ----------
    perform_distance : bool
        When True, bucket bacteria columns by the taxonomy name at
        ``level`` and reduce each bucket with PCA; when False, return the
        raw preprocessed table.
    level : int
        1-based taxonomy level used for the bucketing.

    Returns
    -------
    (features, tag frame) in both branches. Unlike the sibling loaders
    (ibd/gvhd), this variant does not build a bucket->columns dict.
    """
    otu = 'C:/Users/Anna/Desktop/docs/otu_psc2.csv'
    mapping = 'C:/Users/Anna/Desktop/docs/mapping_psc.csv'
    OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
    print('using padp')
    preproccessed_data = preprocess_data(OtuMf.otu_file,
                                         visualize_data=False,
                                         taxnomy_level=7)
    mapping_file = OtuMf.mapping_file
    # 'Cirrhosis ' genuinely contains a trailing space — matches the CSV
    # values as-is; do not "fix" it.
    mapping_disease = {
        'Control': 0,
        'Cirrhosis ': 1,
        'HCC': 1,
        'PSC+IBD': 2,
        'PSC': 2
    }
    mapping_file['DiagnosisGroup'] = mapping_file['DiagnosisGroup'].map(
        mapping_disease)
    # Normalize the mixed yes/no / 0/1 / 'Control' encodings to 0/1.
    mappin_boolean = {'yes': 1, 'no': 0, 'Control': 0, '0': 0, '1': 1}
    mapping_file['FattyLiver'] = mapping_file['FattyLiver'].map(mappin_boolean)
    mapping_file['RegularExercise'] = mapping_file['RegularExercise'].map(
        mappin_boolean)
    mapping_file['Smoking'] = mapping_file['Smoking'].map(mappin_boolean)
    mapping_file = mapping_file[[
        'Age', 'BMI', 'FattyLiver', 'RegularExercise', 'Smoking',
        'DiagnosisGroup'
    ]]
    if perform_distance:
        # Drop constant columns — they carry no signal.
        cols = [
            col for col in preproccessed_data.columns
            if len(preproccessed_data[col].unique()) != 1
        ]
        # Bucket every bacteria column by its taxonomy name at the requested
        # level; columns whose taxonomy path is too shallow go to 'else'.
        dict_bact = {'else': []}
        for col in preproccessed_data[cols]:
            col_name = preproccessed_data[col].name.split(';')
            bact_level = level - 1  # 0-based index into the taxonomy path
            if len(col_name) > bact_level:
                if col_name[bact_level] in dict_bact:
                    dict_bact[col_name[bact_level]].append(
                        preproccessed_data[col].name)
                else:
                    dict_bact[col_name[bact_level]] = [
                        preproccessed_data[col].name
                    ]
            else:
                dict_bact['else'].append(preproccessed_data[col].name)
            print(col_name[-1])
        new_df = pd.DataFrame(index=preproccessed_data.index)
        col = 0  # running index of the next output feature column
        for key, values in dict_bact.items():
            new_data = preproccessed_data[values]
            # Fit with a generous n_components, then keep just enough
            # components to pass 50% explained variance.
            pca = PCA(n_components=round(new_data.shape[1] / 2) + 1)
            pca.fit(new_data)
            sum = 0  # NOTE(review): shadows the builtin `sum`
            num_comp = 0
            for (i, component) in enumerate(pca.explained_variance_ratio_):
                if sum <= 0.5:
                    sum += component
                else:
                    num_comp = i
                    break
            # NOTE(review): if the 0.5 threshold is never crossed, num_comp
            # stays 0 and is forced to 1 — confirm this fallback is intended.
            if num_comp == 0:
                num_comp += 1
            otu_after_pca_new, pca_components = apply_pca(
                new_data, n_components=num_comp)
            for j in range(otu_after_pca_new.shape[1]):
                new_df[col + j] = otu_after_pca_new[j]
            col += num_comp
        return new_df, mapping_file
    else:
        return preproccessed_data, mapping_file
from sklearn.linear_model import LogisticRegression from xgboost import XGBClassifier # otu = 'C:/Users/Anna/Desktop/docs/otu_psc2.csv' # mapping = 'C:/Users/Anna/Desktop/docs/mapping_psc.csv' # otu = 'C:/Users/Anna/Documents/otu_IBD3.csv' # mapping = 'C:/Users/Anna/Documents/mapping_IBD3.csv' otu = 'C:/Users/Anna/Documents/otu_saliva_GVHD.csv' mapping = 'C:/Users/Anna/Documents/mapping_saliva_GVHD.csv' OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False) preproccessed_data = preprocess_data(OtuMf.otu_file, visualize_data=False, taxnomy_level=7, preform_z_scoring=False, preform_log=True) mapping_file = OtuMf.mapping_file mapping_file['DATE'] = pd.to_datetime(OtuMf.mapping_file['DATE']) mapping_file['Date_Of_Transplantation'] = pd.to_datetime( OtuMf.mapping_file['Date_Of_Transplantation']) mapping_file['Date_of_engraftmen'] = pd.to_datetime( OtuMf.mapping_file['Date_of_engraftmen']) mapping_file['aGVHD1_Stat'] = pd.to_datetime(OtuMf.mapping_file['aGVHD1_Stat']) mapping_file['cGVHD_start'] = pd.to_datetime(OtuMf.mapping_file['cGVHD_start']) end = pd.to_datetime('2020-01-01') mapping_file['aGVHD1_Stat'] = mapping_file['aGVHD1_Stat'].fillna(end) mapping_file['cGVHD_start'] = mapping_file['cGVHD_start'].fillna(end) mapping_file = mapping_file[
def allergies(perform_distance=False, level=3):
    """Load and preprocess the allergy cohort — variant that keeps ALL
    allergy types (the Milk/Peanut row filter is commented out) and builds
    two binary tags: AllergyType (Milk=1, Peanut=0, others NaN) and
    SuccessDescription (A1=1, everything else 0).

    NOTE(review): this redefines allergies() from earlier in the file; if
    both chunks live in one module, this definition shadows the first.
    NOTE(review): ``perform_distance`` and ``level`` are currently unused —
    the whole distance/PCA branch is commented out below, so both branches
    return (preprocessed data, tag frame).
    NOTE(review): a 'Health' column (Con=1) is computed and zero-filled but
    then discarded by the final column selection — confirm intent.
    """
    otu = 'allergy_otu.csv'
    mapping = 'allergy_mf.csv'
    OtuMf = OtuMfHandler(otu, mapping, from_QIIME=False)
    preproccessed_data = preprocess_data(OtuMf.otu_file,
                                         visualize_data=False,
                                         taxnomy_level=7)
    # Join the metadata only to align rows; the columns are dropped again.
    preproccessed_data = preproccessed_data.join(
        OtuMf.mapping_file[['AllergyType', 'SuccessDescription']],
        how='inner')
    #preproccessed_data = preproccessed_data.loc[(preproccessed_data['AllergyType'] == 'Milk') | ((preproccessed_data['AllergyType'] == 'Peanut'))]
    preproccessed_data = preproccessed_data.drop(
        ['AllergyType', 'SuccessDescription'], axis=1)
    #mapping_file = OtuMf.mapping_file.loc[(OtuMf.mapping_file['AllergyType'] == 'Milk') | (OtuMf.mapping_file['AllergyType'] == 'Peanut')]
    mapping_file = OtuMf.mapping_file
    mapping_disease = {'Milk': 1, 'Peanut': 0}
    mapping_health = {'Con': 1}
    mapping_success = {'A1': 1}
    mapping_file['Health'] = mapping_file['AllergyType'].map(mapping_health)
    mapping_file['AllergyType'] = mapping_file['AllergyType'].map(
        mapping_disease)
    mapping_file['SuccessDescription'] = mapping_file[
        'SuccessDescription'].map(mapping_success)
    # Unmapped values became NaN above; treat them as the 0 class.
    mapping_file[['Health', 'SuccessDescription'
                  ]] = mapping_file[['Health',
                                     'SuccessDescription']].fillna(value=0)
    mapping_file = mapping_file[['AllergyType', 'SuccessDescription']]
    # if perform_distance:
    #     cols = [col for col in preproccessed_data.columns if len(preproccessed_data[col].unique()) != 1]
    #     dict_bact = {'else': []}
    #     for col in preproccessed_data[cols]:
    #         col_name = preproccessed_data[col].name.split(';')
    #         bact_level = level - 1
    #         if len(col_name) > bact_level:
    #             if col_name[bact_level] in dict_bact:
    #                 dict_bact[col_name[bact_level]].append(preproccessed_data[col].name)
    #             else:
    #                 dict_bact[col_name[bact_level]] = [preproccessed_data[col].name]
    #         else:
    #             dict_bact['else'].append(preproccessed_data[col].name)
    #         print(col_name[-1])
    #
    #     new_df = pd.DataFrame(index=preproccessed_data.index)
    #     col = 0
    #     for key, values in dict_bact.items():
    #         new_data = preproccessed_data[values]
    #         pca = PCA(n_components=round(new_data.shape[1] / 2) + 1)
    #         pca.fit(new_data)
    #         sum = 0
    #         num_comp = 0
    #         for (i, component) in enumerate(pca.explained_variance_ratio_):
    #             if sum <= 0.5:
    #                 sum += component
    #             else:
    #                 num_comp = i
    #                 break
    #         if num_comp == 0:
    #             num_comp += 1
    #         otu_after_pca_new, pca_components = apply_pca(new_data, n_components=num_comp)
    #         for j in range(otu_after_pca_new.shape[1]):
    #             new_df[col + j] = otu_after_pca_new[j]
    #         col += num_comp
    #     return new_df, mapping_file
    # else:
    return preproccessed_data, mapping_file