def main_preprocess(tax, site, trimester, preprocess_prms):
    """Preprocess the GDM data of one body site and trimester.

    Derives the task name and the input/output locations from ``site`` and
    ``trimester``, runs the preprocessing followed by the rhos/PCA
    calculation, and finally exports the tables through ``csv_to_learn``.

    Args:
        tax: Taxonomy level; embedded in the task name and forwarded to
            ``csv_to_learn``.
        site: Body-site label (string) embedded in file and folder names.
        trimester: Trimester label (string) embedded in file and folder names.
        preprocess_prms: Preprocessing parameters. The keys
            ``'taxonomy_level'`` and ``'pca'`` are read here as well.

    Returns:
        The three values produced by ``csv_to_learn``:
        ``(otu_path, mapping_path, pca_path)``.
    """
    main_task = ('GDM_taxonomy_level_' + str(tax) + '_' + site
                 + '_trimester_' + trimester)

    # Every input/output name shares the "<trimester>_<site>" suffix.
    cohort = trimester + '_' + site
    results_dir = 'pregnancy_diabetes_' + cohort

    bactria_as_feature_file = 'GDM_OTU_rmv_dup.csv'
    samples_data_file = 'GDM_tag_rmv_dup_' + cohort + '.csv'
    rhos_folder = os.path.join(results_dir, 'rhos')
    pca_folder = os.path.join(results_dir, 'pca')

    mapping_file = CreateOtuAndMappingFiles(bactria_as_feature_file,
                                            samples_data_file)
    mapping_file.preprocess(preprocess_params=preprocess_prms,
                            visualize=False)
    mapping_file.rhos_and_pca_calculation(
        main_task,
        preprocess_prms['taxonomy_level'],
        preprocess_prms['pca'],
        rhos_folder,
        pca_folder,
    )

    output_dir = os.path.join(os.getcwd(), results_dir)
    otu_path, mapping_path, pca_path = mapping_file.csv_to_learn(
        main_task, output_dir, tax)
    return otu_path, mapping_path, pca_path
import pandas as pd
import pickle
from LearningMethods.create_otu_and_mapping_files import CreateOtuAndMappingFiles
from pathlib import Path

# Input tables of experiment 1.
otu_path = Path('../data/exp1/used_data/basic_data/otu.csv')
mapping_path = Path('../data/exp1/used_data/basic_data/mapping_table.csv')

# Artifacts written by this script.
decomposed_table_path = Path(
    '../data/exp1/used_data/basic_data/decomposed_table.csv')
decomposed_otumf_path = Path(
    '../data/exp1/used_data/otumf_data/decomposed_otumf')

otumf = CreateOtuAndMappingFiles(otu_path, mapping_path)
otumf.to_correspond(left_index=True, right_index=True)

tax = 6

# Preprocessing parameters; the 'pca' entry requests a 2-component PCA.
preprocess_prms = {
    'taxonomy_level': tax,
    'taxnomy_group': 'sub PCA',
    'epsilon': 0.1,
    'normalization': 'log',
    'z_scoring': 'row',
    'norm_after_rel': '',
    'std_to_delete': 0,
    'pca': (2, 'PCA'),
}
otumf.preprocess(preprocess_prms, visualize=False)

# Save the processed feature table as CSV and the whole object as a pickle.
otumf.otu_features_df.to_csv(decomposed_table_path)
with decomposed_otumf_path.open('wb') as otumf_file:
    pickle.dump(otumf, otumf_file)
import sys
import os
from LearningMethods.create_otu_and_mapping_files import CreateOtuAndMappingFiles
from pathlib import Path

# Preprocessing parameters for the example dataset.
preprocess_prms = {
    'taxonomy_level': 6,
    'taxnomy_group': 'sub PCA',
    'epsilon': 0.1,
    'normalization': 'log',
    'z_scoring': 'row',
    'norm_after_rel': 'No',
    'std_to_delete': 0,
    'pca': (0, 'PCA'),
}

otu_file = Path('Example_datasets/OTU.csv')
tag_file = Path('Example_datasets/Tag.csv')
task_name = 'Example_datasets'

mapping_file = CreateOtuAndMappingFiles(otu_file, tag_file)
mapping_file.preprocess(preprocess_params=preprocess_prms, visualize=True)

# NOTE: the rhos/PCA plotting step (rhos_and_pca_calculation) is not run in
# this example.

# Export the processed tables into "<cwd>/<task_name>".
output_dir = os.path.join(os.getcwd(), task_name)
otu_path, mapping_path, pca_path = mapping_file.csv_to_learn(
    task_name, output_dir, preprocess_prms['taxonomy_level'])

print('CSV files are ready after MIPMLP')
for label, produced_path in (('OTU file', otu_path),
                             ('mapping file', mapping_path),
                             ('PCA object file', pca_path)):
    print(label, produced_path)
from integration_tools.utils.data.data_classes import DualDataset from integration_tools.utils.transforms.transforms_classes import ToTensor import pickle from pathlib import Path from torchvision import transforms from LearningMethods.create_otu_and_mapping_files import CreateOtuAndMappingFiles import pandas as pd otu_path = Path('../data/data_used/basic_data/otu_features.csv') tag_path = Path('../data/data_used/basic_data/tag_df.csv') metabolomics_path = Path('../data/data_used/basic_data/metabolomics_table.csv') # Create the Otu object metabolomics_table = pd.read_csv(metabolomics_path, index_col=0) otumf = CreateOtuAndMappingFiles(otu_path, tag_path) tax = 5 preprocess_prms = { 'taxonomy_level': tax, 'taxnomy_group': 'sub PCA', 'epsilon': 0.5, 'normalization': 'log', 'z_scoring': 'row', 'norm_after_rel': '', 'std_to_delete': 0, 'pca': (0, 'PCA') } otumf.preprocess(preprocess_prms, visualize=False) otumf.otu_features_df.to_csv( Path('../data/data_used/basic_data/preprocessed_otu.csv')) with open(Path('../data/data_used/otumf_data/otumf'), 'wb') as otumf_file:
def microbiome_preprocess(max_pca, tax_list, tag_list, old_preprocess=True,
                          rho_pca_plots=False, evaluate=False, algo="svm",
                          method="fold"):
    """Preprocess the OTU table for every (taxonomy level, tag) combination.

    For each pair, either the old pipeline (``OtuMfHandler`` +
    ``preprocess_data`` + ``apply_pca``) or the new one
    (``CreateOtuAndMappingFiles``) is run, and the resulting table is written
    under the folder ``"<tag>_tax_<tax>_csv_files"``.

    Args:
        max_pca: Number of PCA components. Used as ``n_components`` in the
            old pipeline and as the ``'pca'`` preprocessing parameter (and
            fourth ``csv_to_learn`` argument) in the new one. When
            ``evaluate`` is set, ``range(2, max_pca)`` is evaluated.
        tax_list: Taxonomy levels to iterate over.
        tag_list: Tag names (strings); ``"<tag>_tag.csv"`` is the tag file.
        old_preprocess: Choose the old pipeline (default) or the new one.
        rho_pca_plots: New pipeline only - also run
            ``rhos_and_pca_calculation`` into a ``preprocess_plots_...``
            folder.
        evaluate: Run ``microbiome_preprocess_evaluation`` once all
            combinations have been processed.
        algo: Forwarded to ``microbiome_preprocess_evaluation``.
        method: Forwarded to ``microbiome_preprocess_evaluation``.
    """
    for tax in tax_list:
        for tag in tag_list:
            tag_file = tag + "_tag.csv"
            csv_folder = tag + "_tax_" + str(tax) + "_csv_files"

            if old_preprocess:
                OtuMf = OtuMfHandler("otu_id.csv", tag_file,
                                     from_QIIME=False, id_col='ID',
                                     taxonomy_col='taxonomy')
                preproccessed_data = preprocess_data(
                    OtuMf.otu_file,
                    preform_z_scoring=True,
                    visualize_data=False,
                    taxnomy_level=tax,
                    preform_taxnomy_group=True)
                otu_after_pca_wo_taxonomy, _, _ = apply_pca(
                    preproccessed_data, n_components=max_pca,
                    visualize=False)

                # Expose the sample index as an explicitly named "ID" index.
                otu_after_pca_wo_taxonomy["ID"] = \
                    otu_after_pca_wo_taxonomy.index
                otu_after_pca_wo_taxonomy = \
                    otu_after_pca_wo_taxonomy.set_index("ID")

                # makedirs(exist_ok=True) avoids the check-then-create race
                # of "if not exists: mkdir" and also copes with nested paths.
                os.makedirs(csv_folder, exist_ok=True)
                otu_name = ("old_processed_otu_" + tag + "_tax_" + str(tax)
                            + ".csv")
                otu_after_pca_wo_taxonomy.to_csv(
                    os.path.join(csv_folder, otu_name))
            else:
                # New preprocessing pipeline and its parameters.
                preprocess_prms = {
                    'taxonomy_level': tax,
                    'taxnomy_group': 'mean',
                    'epsilon': 0.1,
                    'normalization': 'log',
                    'z_scoring': 'row',
                    'norm_after_rel': '',
                    'std_to_delete': 0,
                    'pca': max_pca,
                }
                mapping_file = CreateOtuAndMappingFiles("otu.csv", tag_file)
                mapping_file.preprocess(preprocess_params=preprocess_prms,
                                        visualize=False)

                if rho_pca_plots:
                    plots_folder = ("preprocess_plots_" + tag + "_tag_tax_"
                                    + str(tax) + "_pca_" + str(max_pca))
                    mapping_file.rhos_and_pca_calculation(
                        tag,
                        preprocess_prms['taxonomy_level'],
                        preprocess_prms['pca'],
                        os.path.join(plots_folder, "rhos"),
                        os.path.join(plots_folder, "pca"))

                otu_path, _, _ = mapping_file.csv_to_learn(
                    tag + '_task', csv_folder, tax, max_pca)
                print(otu_path)

    # Compare taxonomy levels and numbers of PCA components using the chosen
    # model and report the results.
    if evaluate:
        microbiome_preprocess_evaluation(
            pca_options=list(range(2, max_pca)),
            tax_options=tax_list,
            tag_options=tag_list,
            old_preprocess=old_preprocess,
            algo=algo,
            method=method)
from LearningMethods.create_otu_and_mapping_files import CreateOtuAndMappingFiles from pathlib import Path from integration_tools.utils.data.data_classes import DualDataset from integration_tools.utils.transforms.transforms_classes import ToTensor from torchvision import transforms import pickle import os from datetime import datetime otu_paths = [Path('../data/data_used/basic_tables/stool_otu.csv'), Path('../data/data_used/basic_tables/saliva_otu.csv')] mapping_paths = [Path('../data/data_used/basic_tables/stool_mapping_table.csv'), Path('../data/data_used/basic_tables/saliva_mapping_table.csv')] otumf_names_list = ['stool_otumf','saliva_otumf'] tax = 6 otumf_list = [CreateOtuAndMappingFiles(otu_path, mapping_path) for otu_path, mapping_path in zip(otu_paths, mapping_paths)] preprocess_prms = {'taxonomy_level': tax, 'taxnomy_group': 'sub PCA', 'epsilon': 0.1, 'normalization': 'log', 'z_scoring': 'row', 'norm_after_rel': '', 'std_to_delete': 0, 'pca': (0, 'PCA')} for otumf, otumf_name in zip(otumf_list, otumf_names_list): otumf.to_correspond(left_index=True ,right_index=True, how='inner') otumf.preprocess(preprocess_prms, False) with open(os.path.join('../data/data_used/otumf_objects/', otumf_name), 'wb') as otu_file: pickle.dump(otumf, otu_file) entities_dataset = DualDataset.from_sources(otumf_list[0].otu_features_df, otumf_list[1].otu_features_df, matching_info_source0=otumf_list[0].extra_features_df[['subjid', 'DATE']], matching_info_source1=otumf_list[1].extra_features_df[['subjid', 'DATE']], matching_fn=matching_fn,
from LearningMethods.create_otu_and_mapping_files import CreateOtuAndMappingFiles
from pathlib import Path
import pickle

otu_path = Path('../data/data_used/otu.csv')
mapping_table_path = Path('../data/data_used/mapping_table.csv')

tax = 6

# Preprocessing parameters; the 'pca' entry requests a 3-component PCA.
preprocess_prms = {
    'taxonomy_level': tax,
    'taxnomy_group': 'mean',
    'epsilon': 1,
    'normalization': 'log',
    'z_scoring': 'row',
    'norm_after_rel': '',
    'std_to_delete': 0,
    'pca': (3, 'PCA'),
}

otuMf = CreateOtuAndMappingFiles(otu_path, mapping_table_path)
otuMf.to_correspond(left_index=True, right_index=True, how='inner')
otuMf.preprocess(preprocess_prms, visualize=False)

# Rename the feature columns to "Axis <n>"; the axis number starts from 1,
# hence the "+ 1" on every existing column label.
otuMf.otu_features_df.columns = [
    '{} {}'.format('Axis', str(column + 1))
    for column in otuMf.otu_features_df.columns
]

# Pickle the whole object (after the rename), then dump the auxiliary tables.
pickle_path = Path(f'../data/data_used/otuMF_{tax}')
with pickle_path.open('wb') as otu_file:
    pickle.dump(otuMf, otu_file)

before_pca_csv = Path('../data/data_used/preprocessed_otu_before_pca.csv')
otuMf.otu_features_df_b_pca.to_csv(before_pca_csv)

extra_features_csv = Path('../data/data_used/extra_features_df.csv')
otuMf.extra_features_df.to_csv(extra_features_csv)
from LearningMethods.create_otu_and_mapping_files import CreateOtuAndMappingFiles
import pickle

bactria_as_feature_file = '../data/data_used/table-with-taxonomy.csv'
samples_data_file = '../data/data_used/samples_metadata.csv'

tax = 6
preprocess_prms = {
    'taxonomy_level': tax,
    'taxnomy_group': 'sub PCA',
    'epsilon': 0.1,
    'normalization': 'log',
    'z_scoring': 'row',
    'norm_after_rel': '',
    'std_to_delete': 0,
    'pca': (0, 'PCA'),
}

otuMf = CreateOtuAndMappingFiles(bactria_as_feature_file, samples_data_file)

# Clean-up steps applied before preprocessing, in this order:
# de-duplicate on the listed columns, apply the {'Type': 'Mother'} condition,
# then align the two tables on their indexes (inner join).
duplicate_keys = ['womanno.', 'trimester', 'body_site']
otuMf.remove_duplicates(duplicate_keys)
otuMf.conditional_identification({'Type': 'Mother'})
otuMf.to_correspond(left_index=True, right_index=True, how='inner')

otuMf.preprocess(preprocess_prms, visualize=False)

# Persist the processed object for later stages.
with open('../data/data_used/otuMF', 'wb') as otu_file:
    pickle.dump(otuMf, otu_file)
def main_pipeline():
    """Run preprocessing and learning for the prognostic PTSD task.

    Iterates over taxonomy levels 5-6 and PCA sizes 2-3. For every
    combination the data is preprocessed, the rhos/PCA calculation is run,
    the CSV files are exported into the current working directory, and
    ``main`` is called with a parameter dictionary describing the SVM, XGB
    and NN runs. After each run the working directory is moved one level up.
    """
    main_task = 'prognostic_PTSD_task_tax_level_'
    bactria_as_feature_file = '../sderot_anxiety/PTSD_data.csv'
    samples_data_file = '../sderot_anxiety/PTSD_tag.csv'
    rhos_folder = os.path.join('..', 'sderot_anxiety', 'rhos')
    pca_folder = os.path.join('..', 'sderot_anxiety', 'pca')

    taxonomy_range = [5, 6]
    pca_range = range(2, 4)
    box_c_range = [pow(10, exponent) for exponent in range(-3, -1)]

    # Learning settings shared by every run.
    folder = 'sderot_anxiety'
    k_fold = 17
    test_size = 0.2

    for tax in taxonomy_range:
        for pca in pca_range:
            task = main_task + str(tax) + '_pca_' + str(pca)

            # Parameters for the preprocessing step.
            preprocess_prms = {
                'taxonomy_level': tax,
                'taxnomy_group': 'mean',
                'epsilon': 0.1,
                'normalization': 'log',
                'z_scoring': 'row',
                'norm_after_rel': '',
                'std_to_delete': 0,
                'pca': pca,
            }

            mapping_file = CreateOtuAndMappingFiles(bactria_as_feature_file,
                                                    samples_data_file)
            mapping_file.preprocess(preprocess_params=preprocess_prms,
                                    visualize=False)
            mapping_file.rhos_and_pca_calculation(
                task,
                preprocess_prms['taxonomy_level'],
                preprocess_prms['pca'],
                rhos_folder,
                pca_folder)
            otu_path, mapping_path, pca_path = mapping_file.csv_to_learn(
                task, os.getcwd(), tax)

            # Learning configuration. For each model: a single option per
            # parameter means a single run, otherwise a grid search.
            names = ["no anxiety", "anxiety"]

            svm_params = {
                'kernel': ['linear'],
                'gamma': ['scale'],
                'C': box_c_range,
                "create_coeff_plots": True,
                "CLASSES_NAMES": names,
                "K_FOLD": k_fold,
                "TEST_SIZE": test_size,
                "TASK_TITLE": task,
            }
            xgb_params = {
                'learning_rate': [0.1],
                'objective': ['binary:logistic'],
                'n_estimators': [1000],
                'max_depth': [7],
                'min_child_weight': [1],
                'gamma': [1],
                "create_coeff_plots": True,
                "CLASSES_NAMES": names,
                "K_FOLD": k_fold,
                "TEST_SIZE": test_size,
                "TASK_TITLE": "sderot_anxiety",
            }
            nn_params = {
                "hid_dim_0": 120,
                "hid_dim_1": 160,
                "reg": 0.68,
                "lr": 0.001,
                "test_size": 0.1,
                "batch_size": 32,
                "shuffle": 1,
                "num_workers": 4,
                "epochs": 150,
                "optimizer": 'SGD',
                "loss": 'MSE',
                "model": 'tanh_b',
            }
            nni_params = {"result_type": 'auc'}

            p_dict = {
                "TASK_TITLE": task,
                "FOLDER_TITLE": main_task,
                "TAX_LEVEL": str(tax),
                "CLASSES_NAMES": names,
                "SVM": True,
                "SVM_params": svm_params,
                "XGB": True,
                "XGB_params": xgb_params,
                "NN": True,
                "NN_params": nn_params,
                "NNI": False,
                "NNI_params": nni_params,
                # Global cross-validation settings; might need to differ per
                # model in the future.
                "K_FOLD": k_fold,
                "TEST_SIZE": test_size,
            }

            main(folder, otu_path, mapping_path, pca_path, p_dict)
            os.chdir('..')
def read_file(self, title, bactria_as_feature_file, samples_data_file,
              preprocess_prms, tax):
    """Load, preprocess and index a longitudinal microbiome data set.

    Preprocesses the OTU/mapping files, exports them through
    ``csv_to_learn``, builds the lookup tables between sample ids, features,
    patients and time points, writes the bacteria names to
    ``<self.load_and_save_path>/bacteria.txt`` and stores everything on
    ``self``.

    Args:
        title: Data-set title; used for the task name (``title + '_task'``)
            and the ``../Datasets/<title>`` output folder.
        bactria_as_feature_file: OTU table passed to
            ``CreateOtuAndMappingFiles``.
        samples_data_file: Sample metadata passed to
            ``CreateOtuAndMappingFiles``.
        preprocess_prms: Parameters forwarded to ``preprocess``.
        tax: Taxonomy level; forwarded to ``csv_to_learn`` and stored as
            ``self.data_set_tax_path``.
    """
    otu_and_mapping_file = CreateOtuAndMappingFiles(
        bactria_as_feature_file, samples_data_file)
    otu_and_mapping_file.preprocess(preprocess_params=preprocess_prms,
                                    visualize=False)
    preproccessed_data = otu_and_mapping_file.otu_features_df
    tag_file = otu_and_mapping_file.tags_df.join(
        otu_and_mapping_file.extra_features_df, how='outer')

    # The returned paths are not needed here; the call is kept for whatever
    # it exports (presumably the learning CSVs - verify against
    # CreateOtuAndMappingFiles.csv_to_learn).
    otu_and_mapping_file.csv_to_learn(
        title + '_task', os.path.join('..', "Datasets", title),
        tax=tax, pca_n=otu_and_mapping_file.pca)

    # Work with string sample ids on both tables.
    preproccessed_data.index = [
        str(sample_id) for sample_id in preproccessed_data.index]
    tag_file.index = [str(sample_id) for sample_id in tag_file.index]

    data_ids_list = preproccessed_data.index.tolist()
    index_to_id_map = dict(enumerate(data_ids_list))
    id_to_features_map = dict(zip(data_ids_list, preproccessed_data.values))
    self._index_to_id_map = index_to_id_map
    self._id_to_features_map = id_to_features_map

    # Keep only the samples that also have metadata. A set makes each
    # membership test O(1) instead of scanning the whole tag id list.
    tag_ids = set(tag_file.index)
    samples_with_tags = [
        sample_id for sample_id in data_ids_list if sample_id in tag_ids]
    self._ids_list = samples_with_tags

    patient_column = 'Patient'
    id_to_patient_map = {
        sample: tag_file.loc[sample, patient_column]
        for sample in samples_with_tags
    }

    # Create the (sorted) series of sample ids for each patient.
    child_to_ids_map = {
        child: [] for child in set(id_to_patient_map.values())
    }
    for sample, child in id_to_patient_map.items():
        child_to_ids_map[child].append(sample)
    for serie in child_to_ids_map.values():
        serie.sort()

    time_column = 'Time'
    id_to_time_map = {
        sample: tag_file.loc[sample, time_column]
        for sample in samples_with_tags
    }
    periods = sorted(set(id_to_time_map.values()))
    period_to_index = {period: i for i, period in enumerate(periods)}

    # NOTE(review): the last character of every sample id is dropped here -
    # presumably a per-sample suffix; confirm against the id format.
    ids_list = [sample_id[:-1] for sample_id in id_to_features_map]
    features_list = [
        list(features) for features in id_to_features_map.values()]

    bacteria_path = os.path.join(self.load_and_save_path, "bacteria.txt")
    with open(bacteria_path, "w") as b_file:
        b_file.writelines(
            bacterium + "\n" for bacterium in otu_and_mapping_file.bacteria)

    self.mapping_file = tag_file
    self.ids_list = ids_list
    self.features_list = features_list
    self.data_set_tax_path = tax
    self.bacteria = otu_and_mapping_file.bacteria
    self.period_column = time_column
    self.id_to_period_map = id_to_time_map
    self.id_to_participant_map = id_to_patient_map
    self.id_to_features_map = id_to_features_map
    self.participant_to_ids_map = child_to_ids_map
    self.period_to_index = period_to_index
def _append_result_row(results_file, row):
    """Append ``row`` to the results CSV, creating the file with its header first if missing."""
    if not results_file.exists():
        pd.DataFrame(columns=[
            'Taxonomy level', 'taxonomy group', 'normalization 1',
            'normalization 2', 'Dim Red', 'TRAIN-AUC', 'TEST-AUC'
        ]).to_csv(results_file, index=False)
    results = pd.read_csv(results_file)
    results.loc[len(results)] = row
    results.to_csv(results_file, index=False)


def parallel_pipeline(arg, task):
    """Preprocess one parameter combination, train the models, log the AUCs.

    Args:
        arg: Indexable parameter combination:
            ``arg[0]`` taxonomy level (4, 5 or 6; anything ``int()`` accepts),
            ``arg[1]`` taxonomy grouping method (also passed to ``learn``),
            ``arg[2]`` pair ``(normalization, z_scoring)``,
            ``arg[3]`` PCA setting; ``arg[3][1]`` is logged as 'Dim Red'.
        task: Task name. ``<task>_OTU.csv`` and ``<task>_Tag.csv`` are the
            inputs; the result CSVs are written under the ``<task>`` folder.
    """
    tax_dict = ['four', 'five', 'six']
    k_fold = 10

    # Convert once and reuse: the original indexed tax_dict with the raw
    # arg[0], which raises TypeError when the level arrives as a string.
    tax_level = int(arg[0])

    # Parameters for the preprocessing step.
    preprocess_prms = {
        'taxonomy_level': tax_level,
        'taxnomy_group': arg[1],
        'epsilon': 0.1,
        'normalization': arg[2][0],
        'z_scoring': arg[2][1],
        # NOTE(review): same value as 'z_scoring' - confirm this is intended.
        'norm_after_rel': arg[2][1],
        'std_to_delete': 0,
        'pca': arg[3],
    }

    main_task = task + '_taxonomy_level_' + str(arg[0])
    bactria_as_feature_file = task + '_OTU.csv'
    samples_data_file = task + '_Tag.csv'

    mapping_file = CreateOtuAndMappingFiles(bactria_as_feature_file,
                                            samples_data_file)
    mapping_file.preprocess(preprocess_params=preprocess_prms,
                            visualize=False)
    # The rhos/PCA plotting step (rhos_and_pca_calculation) is disabled here.
    otu_path, mapping_path, pca_path = mapping_file.csv_to_learn(
        main_task, os.path.join(os.getcwd(), task), arg[0])

    # Create X, y, split the data and learn.
    X, y = read_otu_and_mapping_files(otu_path, mapping_path)
    X_trains, X_tests, y_trains, y_tests = split(X, y, 0.2, k_fold)
    (SVM_train_auc, SVM_test_auc,
     XGB_train_auc, XGB_test_auc,
     NN_train_auc, NN_test_auc) = learn(
        X_trains, X_tests, y_trains, y_tests, k_fold, arg[1])

    print('taxonomy ' + str(arg[0]) + ' ' + str(arg[1]) + ' ' +
          str(arg[2][0]) + ' ' + str(arg[2][1]) + ' PCA ' +
          str(mapping_file.pca_comp) + ':')
    print("SVM Train AUC: " + str(SVM_train_auc))
    print("SVM Test AUC: " + str(SVM_test_auc))
    print("XGBOOST Train AUC: " + str(XGB_train_auc))
    print("XGBOOST Test AUC: " + str(XGB_test_auc))
    print("NN Train AUC: " + str(NN_train_auc))
    print("NN Test AUC: " + str(NN_test_auc))

    # One results file per model; every run appends a single row to each.
    run_description = [
        tax_dict[tax_level - 4], arg[1], arg[2][0], arg[2][1], arg[3][1]
    ]
    model_results = (
        ("/all_svm_results_ica.csv", SVM_train_auc, SVM_test_auc),
        ("/all_xgboost_results_ica.csv", XGB_train_auc, XGB_test_auc),
        ("/all_nn_results_ica.csv", NN_train_auc, NN_test_auc),
    )
    for file_suffix, train_auc, test_auc in model_results:
        _append_result_row(Path(task + file_suffix),
                           run_description + [train_auc, test_auc])