Example #1
import os

from LearningMethods.create_otu_and_mapping_files import CreateOtuAndMappingFiles


def main_preprocess(tax, site, trimester, preprocess_prms):
    main_task = 'GDM_taxonomy_level_' + str(
        tax) + '_' + site + '_trimester_' + trimester
    bactria_as_feature_file = 'GDM_OTU_rmv_dup.csv'

    samples_data_file = 'GDM_tag_rmv_dup_' + trimester + '_' + site + '.csv'
    rhos_folder = os.path.join('pregnancy_diabetes_' + trimester + '_' + site,
                               'rhos')
    pca_folder = os.path.join('pregnancy_diabetes_' + trimester + '_' + site,
                              'pca')

    mapping_file = CreateOtuAndMappingFiles(bactria_as_feature_file,
                                            samples_data_file)
    mapping_file.preprocess(preprocess_params=preprocess_prms, visualize=False)
    mapping_file.rhos_and_pca_calculation(main_task,
                                          preprocess_prms['taxonomy_level'],
                                          preprocess_prms['pca'], rhos_folder,
                                          pca_folder)
    otu_path, mapping_path, pca_path = mapping_file.csv_to_learn(
        main_task,
        os.path.join(os.getcwd(),
                     'pregnancy_diabetes_' + trimester + '_' + site), tax)
    return otu_path, mapping_path, pca_path
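A hypothetical invocation of the function above; the site and trimester labels are placeholders, and the parameter dictionary mirrors the ones used in the later examples:

preprocess_prms = {'taxonomy_level': 6, 'taxnomy_group': 'sub PCA', 'epsilon': 0.1,
                   'normalization': 'log', 'z_scoring': 'row', 'norm_after_rel': '',
                   'std_to_delete': 0, 'pca': (2, 'PCA')}
# 'SALIVA' and 'T1' are placeholder site/trimester values (assumptions).
otu_path, mapping_path, pca_path = main_preprocess(6, 'SALIVA', 'T1', preprocess_prms)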
Example #2
import pickle
from pathlib import Path
from LearningMethods.create_otu_and_mapping_files import CreateOtuAndMappingFiles
otu_path = Path('../data/exp1/used_data/basic_data/otu.csv')
mapping_path = Path('../data/exp1/used_data/basic_data/mapping_table.csv')

otumf = CreateOtuAndMappingFiles(otu_path, mapping_path)
otumf.to_correspond(left_index=True, right_index=True)
tax = 6

# Preprocess the data and decompose it to 2 dimensions with PCA
preprocess_prms = {
    'taxonomy_level': tax,
    'taxnomy_group': 'sub PCA',
    'epsilon': 0.1,
    'normalization': 'log',
    'z_scoring': 'row',
    'norm_after_rel': '',
    'std_to_delete': 0,
    'pca': (2, 'PCA')
}
otumf.preprocess(preprocess_prms, visualize=False)
otumf.otu_features_df.to_csv(
    Path('../data/exp1/used_data/basic_data/decomposed_table.csv'))
with open(Path('../data/exp1/used_data/otumf_data/decomposed_otumf'),
          'wb') as otumf_file:
    pickle.dump(otumf, otumf_file)
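A short follow-up sketch, assuming the pickle written above; it reloads the object and inspects the decomposed table:

import pickle
from pathlib import Path

with open(Path('../data/exp1/used_data/otumf_data/decomposed_otumf'), 'rb') as otumf_file:
    otumf_loaded = pickle.load(otumf_file)
print(otumf_loaded.otu_features_df.shape)  # (n_samples, 2) after the 2-component PCA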
Example #3
import os
from LearningMethods.create_otu_and_mapping_files import CreateOtuAndMappingFiles
from pathlib import Path
preprocess_prms = {
    'taxonomy_level': 6,
    'taxnomy_group': 'sub PCA',
    'epsilon': 0.1,
    'normalization': 'log',
    'z_scoring': 'row',
    'norm_after_rel': 'No',
    'std_to_delete': 0,
    'pca': (0, 'PCA')
}

otu_file = Path('Example_datasets/OTU.csv')
tag_file = Path('Example_datasets/Tag.csv')
task_name = 'Example_datasets'

mapping_file = CreateOtuAndMappingFiles(otu_file, tag_file)
mapping_file.preprocess(preprocess_params=preprocess_prms, visualize=True)
# mapping_file.rhos_and_pca_calculation(main_task, preprocess_prms['taxonomy_level'], preprocess_prms['pca'], rhos_folder, pca_folder)
otu_path, mapping_path, pca_path = mapping_file.csv_to_learn(
    task_name, os.path.join(os.getcwd(), task_name),
    preprocess_prms['taxonomy_level'])
print('CSV files are ready after MIPMLP')
print('OTU file', otu_path)
print('mapping file', mapping_path)
print('PCA object file', pca_path)
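A minimal consumption sketch for the returned paths, assuming csv_to_learn writes plain CSVs with the sample ID in the first column (an assumption about the file layout):

import pandas as pd

otu_df = pd.read_csv(otu_path, index_col=0)           # preprocessed OTU features
mapping_df = pd.read_csv(mapping_path, index_col=0)   # tags aligned to the same samples
print(otu_df.shape, mapping_df.shape)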
Example #4
from integration_tools.utils.data.data_classes import DualDataset
from integration_tools.utils.transforms.transforms_classes import ToTensor
import pickle
from pathlib import Path
from torchvision import transforms
from LearningMethods.create_otu_and_mapping_files import CreateOtuAndMappingFiles
import pandas as pd

otu_path = Path('../data/data_used/basic_data/otu_features.csv')
tag_path = Path('../data/data_used/basic_data/tag_df.csv')
metabolomics_path = Path('../data/data_used/basic_data/metabolomics_table.csv')
# Load the metabolomics table
metabolomics_table = pd.read_csv(metabolomics_path, index_col=0)

otumf = CreateOtuAndMappingFiles(otu_path, tag_path)
tax = 5

preprocess_prms = {
    'taxonomy_level': tax,
    'taxnomy_group': 'sub PCA',
    'epsilon': 0.5,
    'normalization': 'log',
    'z_scoring': 'row',
    'norm_after_rel': '',
    'std_to_delete': 0,
    'pca': (0, 'PCA')
}
otumf.preprocess(preprocess_prms, visualize=False)
otumf.otu_features_df.to_csv(
    Path('../data/data_used/basic_data/preprocessed_otu.csv'))
with open(Path('../data/data_used/otumf_data/otumf'), 'wb') as otumf_file:
    pickle.dump(otumf, otumf_file)
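The DualDataset/ToTensor imports above suggest the truncated original went on to pair the OTU and metabolomics tables; a minimal pairing sketch in plain pandas (an illustration, not the original's DualDataset code):

# Keep only samples present in both tables; suffixes guard against column clashes.
paired = otumf.otu_features_df.join(metabolomics_table, how='inner',
                                    lsuffix='_otu', rsuffix='_metab')
print(paired.shape)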
Example #5
import os

from LearningMethods.create_otu_and_mapping_files import CreateOtuAndMappingFiles
# OtuMfHandler, preprocess_data, apply_pca and microbiome_preprocess_evaluation
# are project helpers; their module paths are not shown in this excerpt.


def microbiome_preprocess(max_pca,
                          tax_list,
                          tag_list,
                          old_preprocess=True,
                          rho_pca_plots=False,
                          evaluate=False,
                          algo="svm",
                          method="fold"):
    for tax in tax_list:
        for tag in tag_list:
            if old_preprocess:
                otu_file = "otu_id.csv"
                tag_file = tag + "_tag.csv"
                OtuMf = OtuMfHandler(otu_file,
                                     tag_file,
                                     from_QIIME=False,
                                     id_col='ID',
                                     taxonomy_col='taxonomy')

                preproccessed_data = preprocess_data(
                    OtuMf.otu_file,
                    preform_z_scoring=True,
                    visualize_data=False,
                    taxnomy_level=tax,
                    preform_taxnomy_group=True)

                otu_after_pca_wo_taxonomy, pca_obj, _ = apply_pca(
                    preproccessed_data, n_components=max_pca, visualize=False)
                folder = tag + "_tax_" + str(tax) + "_csv_files"
                otu_name = "old_processed_otu_" + tag + "_tax_" + str(
                    tax) + ".csv"
                otu_after_pca_wo_taxonomy[
                    "ID"] = otu_after_pca_wo_taxonomy.index
                otu_after_pca_wo_taxonomy = otu_after_pca_wo_taxonomy.set_index(
                    "ID")
                if not os.path.exists(folder):
                    os.mkdir(folder)
                otu_after_pca_wo_taxonomy.to_csv(os.path.join(
                    folder, otu_name))

            else:  # new preprocessing pipeline (yoel)
                # parameters for preprocessing
                preprocess_prms = {
                    'taxonomy_level': tax,
                    'taxnomy_group': 'mean',
                    'epsilon': 0.1,
                    'normalization': 'log',
                    'z_scoring': 'row',
                    'norm_after_rel': '',
                    'std_to_delete': 0,
                    'pca': max_pca
                }

                mapping_file = CreateOtuAndMappingFiles(
                    "otu.csv", tag + "_tag.csv")
                mapping_file.preprocess(preprocess_params=preprocess_prms,
                                        visualize=False)

                if rho_pca_plots:
                    folder = "preprocess_plots_" + tag + "_tag_tax_" + str(
                        tax) + "_pca_" + str(max_pca)
                    mapping_file.rhos_and_pca_calculation(
                        tag, preprocess_prms['taxonomy_level'],
                        preprocess_prms['pca'], os.path.join(folder, "rhos"),
                        os.path.join(folder, "pca"))

                otu_path, tag_path, pca_path = mapping_file.csv_to_learn(
                    tag + '_task', tag + "_tax_" + str(tax) + "_csv_files",
                    tax, max_pca)
                print(otu_path)

    # compare taxonomy levels and numbers of PCA components using a fixed SVM model and compare the results
    if evaluate:
        microbiome_preprocess_evaluation(pca_options=list(range(2, max_pca)),
                                         tax_options=tax_list,
                                         tag_options=tag_list,
                                         old_preprocess=old_preprocess,
                                         algo=algo,
                                         method=method)
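A hypothetical driver for the function above; the tag name is a placeholder:

microbiome_preprocess(max_pca=10,
                      tax_list=[5, 6],
                      tag_list=['health'],  # placeholder tag name
                      old_preprocess=False)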
Example #6
from LearningMethods.create_otu_and_mapping_files import CreateOtuAndMappingFiles
from pathlib import Path
from integration_tools.utils.data.data_classes import DualDataset
from integration_tools.utils.transforms.transforms_classes import ToTensor
from torchvision import transforms
import pickle
import os
from datetime import datetime

otu_paths = [Path('../data/data_used/basic_tables/stool_otu.csv'),
             Path('../data/data_used/basic_tables/saliva_otu.csv')]
mapping_paths = [Path('../data/data_used/basic_tables/stool_mapping_table.csv'),
                 Path('../data/data_used/basic_tables/saliva_mapping_table.csv')]
otumf_names_list = ['stool_otumf', 'saliva_otumf']
tax = 6
otumf_list = [CreateOtuAndMappingFiles(otu_path, mapping_path) for otu_path, mapping_path in
              zip(otu_paths, mapping_paths)]

preprocess_prms = {'taxonomy_level': tax, 'taxnomy_group': 'sub PCA', 'epsilon': 0.1, 'normalization': 'log',
                   'z_scoring': 'row', 'norm_after_rel': '', 'std_to_delete': 0, 'pca': (0, 'PCA')}

for otumf, otumf_name in zip(otumf_list, otumf_names_list):
    otumf.to_correspond(left_index=True, right_index=True, how='inner')
    otumf.preprocess(preprocess_prms, False)
    with open(os.path.join('../data/data_used/otumf_objects/', otumf_name), 'wb') as otu_file:
        pickle.dump(otumf, otu_file)
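The call below passes matching_fn, which this excerpt never defines; a minimal sketch of one plausible pairing rule (the signature and semantics are assumptions, not the library's documented contract):

# Hypothetical: treat two samples as corresponding when both their subject ID
# and collection date match.
def matching_fn(info0, info1):
    return info0['subjid'] == info1['subjid'] and info0['DATE'] == info1['DATE']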

# The full call is truncated in this excerpt; only the arguments shown are kept
# (the ToTensor/transforms imports above hint that a transform is also passed).
entities_dataset = DualDataset.from_sources(otumf_list[0].otu_features_df, otumf_list[1].otu_features_df,
                                            matching_info_source0=otumf_list[0].extra_features_df[['subjid', 'DATE']],
                                            matching_info_source1=otumf_list[1].extra_features_df[['subjid', 'DATE']],
                                            matching_fn=matching_fn)
Example #7
from LearningMethods.create_otu_and_mapping_files import CreateOtuAndMappingFiles
from pathlib import Path
import pickle
otu_path = Path('../data/data_used/otu.csv')
mapping_table_path = Path('../data/data_used/mapping_table.csv')
otuMf = CreateOtuAndMappingFiles(otu_path, mapping_table_path)
tax = 6
otuMf.to_correspond(left_index=True, right_index=True, how='inner')
preprocess_prms = {
    'taxonomy_level': tax,
    'taxnomy_group': 'mean',
    'epsilon': 1,
    'normalization': 'log',
    'z_scoring': 'row',
    'norm_after_rel': '',
    'std_to_delete': 0,
    'pca': (3, 'PCA')
}

otuMf.preprocess(preprocess_prms, visualize=False)
# Rename the PCA columns so axis numbering starts from 1
otuMf.otu_features_df.columns = list(
    map(lambda x: '{} {}'.format('Axis', str(x + 1)),
        otuMf.otu_features_df.columns))

with open(Path(f'../data/data_used/otuMF_{tax}'), 'wb') as otu_file:
    pickle.dump(otuMf, otu_file)

otuMf.otu_features_df_b_pca.to_csv(
    Path('../data/data_used/preprocessed_otu_before_pca.csv'))
otuMf.extra_features_df.to_csv(Path('../data/data_used/extra_features_df.csv'))
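A follow-up sketch comparing feature counts before and after the 3-component PCA, assuming the CSV written above:

import pandas as pd

before = pd.read_csv('../data/data_used/preprocessed_otu_before_pca.csv', index_col=0)
print(before.shape[1], '->', otuMf.otu_features_df.shape[1])  # n_taxa -> 3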
Example #8
from LearningMethods.create_otu_and_mapping_files import CreateOtuAndMappingFiles
import pickle
bactria_as_feature_file = '../data/data_used/table-with-taxonomy.csv'
samples_data_file = '../data/data_used/samples_metadata.csv'
tax = 6

preprocess_prms = {
    'taxonomy_level': tax,
    'taxnomy_group': 'sub PCA',
    'epsilon': 0.1,
    'normalization': 'log',
    'z_scoring': 'row',
    'norm_after_rel': '',
    'std_to_delete': 0,
    'pca': (0, 'PCA')
}
otuMf = CreateOtuAndMappingFiles(bactria_as_feature_file, samples_data_file)
otuMf.remove_duplicates(['womanno.', 'trimester', 'body_site'])
otuMf.conditional_identification({'Type': 'Mother'})
otuMf.to_correspond(left_index=True, right_index=True, how='inner')
otuMf.preprocess(preprocess_prms, visualize=False)

with open('../data/data_used/otuMF', 'wb') as otu_file:
    pickle.dump(otuMf, otu_file)
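A hypothetical downstream step: reload the pickled object and split it into features and tags for learning (attribute names as used in Example #10):

import pickle

with open('../data/data_used/otuMF', 'rb') as otu_file:
    otuMf_loaded = pickle.load(otu_file)
X = otuMf_loaded.otu_features_df  # preprocessed features
y = otuMf_loaded.tags_df          # sample tags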
Example #9
import os

from LearningMethods.create_otu_and_mapping_files import CreateOtuAndMappingFiles
# main(folder, otu_path, mapping_path, pca_path, p_dict) is the project's
# learning entry point; its module path is not shown in this excerpt.


def main_pipeline():
    main_task = 'prognostic_PTSD_task_tax_level_'
    bactria_as_feature_file = '../sderot_anxiety/PTSD_data.csv'
    samples_data_file = '../sderot_anxiety/PTSD_tag.csv'
    rhos_folder = os.path.join('..', 'sderot_anxiety', 'rhos')
    pca_folder = os.path.join('..', 'sderot_anxiety', 'pca')

    taxonomy_range = [5, 6]
    pca_range = range(2, 4)
    box_c_range = [pow(10, i) for i in range(-3, -1)]
    # parameters for preprocessing
    for tax in taxonomy_range:
        for pca in pca_range:
            task = main_task + str(tax) + '_pca_' + str(pca)
            preprocess_prms = {
                'taxonomy_level': tax,
                'taxnomy_group': 'mean',
                'epsilon': 0.1,
                'normalization': 'log',
                'z_scoring': 'row',
                'norm_after_rel': '',
                'std_to_delete': 0,
                'pca': pca
            }
            mapping_file = CreateOtuAndMappingFiles(bactria_as_feature_file,
                                                    samples_data_file)
            mapping_file.preprocess(preprocess_params=preprocess_prms,
                                    visualize=False)
            mapping_file.rhos_and_pca_calculation(
                task, preprocess_prms['taxonomy_level'],
                preprocess_prms['pca'], rhos_folder, pca_folder)
            otu_path, mapping_path, pca_path = mapping_file.csv_to_learn(
                task, os.getcwd(), tax)

            # run svm learning
            folder = 'sderot_anxiety'
            k_fold = 17
            test_size = 0.2
            names = ["no anxiety", "anxiety"]
            # get params dictionary from file / create it here
            p_dict = {
                "TASK_TITLE": task,
                "FOLDER_TITLE": main_task,
                "TAX_LEVEL": str(tax),
                "CLASSES_NAMES": names,
                "SVM": True,
                "SVM_params": {
                    'kernel': ['linear'],
                    'gamma': ['scale'],
                    'C': box_c_range,
                    "create_coeff_plots": True,
                    "CLASSES_NAMES": names,
                    "K_FOLD": k_fold,
                    "TEST_SIZE": test_size,
                    "TASK_TITLE": task
                },
                # if single option for each param -> single run, otherwise -> grid search.
                "XGB": True,
                "XGB_params": {
                    'learning_rate': [0.1],
                    'objective': ['binary:logistic'],
                    'n_estimators': [1000],
                    'max_depth': [7],
                    'min_child_weight': [1],
                    'gamma': [1],
                    "create_coeff_plots": True,
                    "CLASSES_NAMES": names,
                    "K_FOLD": k_fold,
                    "TEST_SIZE": test_size,
                    "TASK_TITLE": "sderot_anxiety"
                },  # if single option for each param -> single run, otherwise -> grid search.
                "NN": True,
                "NN_params": {
                    "hid_dim_0": 120,
                    "hid_dim_1": 160,
                    "reg": 0.68,
                    "lr": 0.001,
                    "test_size": 0.1,
                    "batch_size": 32,
                    "shuffle": 1,
                    "num_workers": 4,
                    "epochs": 150,
                    "optimizer": 'SGD',
                    "loss": 'MSE',
                    "model": 'tanh_b'
                },  # if single option for each param -> single run, otherwise -> grid search.
                "NNI": False,
                "NNI_params": {
                    "result_type": 'auc'
                },
                # enter to model params?  might want to change for different models..
                "K_FOLD": k_fold,
                "TEST_SIZE": test_size,
                #  ...... add whatever
            }
            main(folder, otu_path, mapping_path, pca_path, p_dict)
            os.chdir('..')
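A one-line driver, assuming the PTSD CSVs exist at the relative paths given in the function:

if __name__ == '__main__':
    main_pipeline()  # sweeps tax in {5, 6} x pca in {2, 3}; the SVM C grid is [0.001, 0.01]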
Example #10
    # Method excerpt from a dataset-loader class; assumes `os` is imported and
    # CreateOtuAndMappingFiles is available at module level.
    def read_file(self, title, bactria_as_feature_file, samples_data_file,
                  preprocess_prms, tax):
        otu_and_mapping_file = CreateOtuAndMappingFiles(
            bactria_as_feature_file, samples_data_file)
        otu_and_mapping_file.preprocess(preprocess_params=preprocess_prms,
                                        visualize=False)
        preproccessed_data = otu_and_mapping_file.otu_features_df
        tag_file = otu_and_mapping_file.tags_df.join(
            otu_and_mapping_file.extra_features_df, how='outer')

        otu_path, tag_path, pca_path = otu_and_mapping_file.csv_to_learn(
            title + '_task',
            os.path.join('..', "Datasets", title),
            tax=tax,
            pca_n=otu_and_mapping_file.pca)
        preproccessed_data.index = [str(id) for id in preproccessed_data.index]
        tag_file.index = [str(id) for id in tag_file.index]

        index_to_id_map = {}
        id_to_features_map = {}
        for i, row in enumerate(preproccessed_data.values):
            id_to_features_map[preproccessed_data.index[i]] = row
            index_to_id_map[i] = preproccessed_data.index[i]

        self._index_to_id_map = index_to_id_map
        self._id_to_features_map = id_to_features_map

        data_ids_list = preproccessed_data.index.tolist()
        tag_ids_list = tag_file.index.tolist()
        ids_list = [id for id in data_ids_list if id in tag_ids_list]
        self._ids_list = ids_list

        patient_column = 'Patient'
        id_to_patient_map = {}
        for sample in ids_list:
            child_num = tag_file.loc[sample, patient_column]
            id_to_patient_map[sample] = child_num

        # create time series for each child
        child_to_ids_map = {
            child: []
            for child in set(id_to_patient_map.values())
        }
        for key, val in id_to_patient_map.items():
            child_to_ids_map[val].append(key)
        for serie in child_to_ids_map.values():
            serie.sort()

        time_column = 'Time'
        id_to_time_map = {}
        for sample in ids_list:
            period = tag_file.loc[sample, time_column]
            id_to_time_map[sample] = period
        periods = list(set(id_to_time_map.values()))
        periods.sort()
        period_to_index = {p: i for i, p in enumerate(periods)}

        ids_list = []
        features_list = []
        for key, val in id_to_features_map.items():
            ids_list.append(key[:-1])  # drop the trailing character of the sample ID
            features_list.append(list(val))

        with open(os.path.join(self.load_and_save_path, "bacteria.txt"),
                  "w") as b_file:
            for b in otu_and_mapping_file.bacteria:
                b_file.write(b + "\n")

        self.mapping_file = tag_file
        self.ids_list = ids_list
        self.features_list = features_list
        self.data_set_tax_path = tax
        self.bacteria = otu_and_mapping_file.bacteria
        self.period_column = time_column
        self.id_to_period_map = id_to_time_map
        self.id_to_participant_map = id_to_patient_map
        self.id_to_features_map = id_to_features_map
        self.participant_to_ids_map = child_to_ids_map
        self.period_to_index = period_to_index
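The patient-to-samples grouping above can be exercised standalone; a toy sketch mirroring that logic:

# Toy data: sample IDs mapped to patient IDs, as in the method above.
id_to_patient_map = {'s1': 'p1', 's3': 'p2', 's2': 'p1'}
child_to_ids_map = {child: [] for child in set(id_to_patient_map.values())}
for key, val in id_to_patient_map.items():
    child_to_ids_map[val].append(key)
for serie in child_to_ids_map.values():
    serie.sort()
print(child_to_ids_map)  # e.g. {'p1': ['s1', 's2'], 'p2': ['s3']} (key order may vary)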
Example #11
import os

import pandas as pd
from pathlib import Path

from LearningMethods.create_otu_and_mapping_files import CreateOtuAndMappingFiles
# read_otu_and_mapping_files, split and learn are project helpers; their exact
# module paths are not shown in this excerpt.


def parallel_pipeline(arg, task):
    tax_dict = ['four', 'five', 'six']  # taxonomy-level names, indexed by arg[0] - 4
    k_fold = 10
    # parameters for preprocessing
    preprocess_prms = {
        'taxonomy_level': int(arg[0]),
        'taxnomy_group': arg[1],
        'epsilon': 0.1,
        'normalization': arg[2][0],
        'z_scoring': arg[2][1],
        'norm_after_rel': arg[2][1],
        'std_to_delete': 0,
        'pca': arg[3]
    }

    main_task = task + '_taxonomy_level_' + str(arg[0])
    bactria_as_feature_file = task + '_OTU.csv'
    samples_data_file = task + '_Tag.csv'

    rhos_folder = os.path.join(task, 'rhos')
    pca_folder = os.path.join(task, 'pca')

    mapping_file = CreateOtuAndMappingFiles(bactria_as_feature_file,
                                            samples_data_file)
    mapping_file.preprocess(preprocess_params=preprocess_prms, visualize=False)
    #mapping_file.rhos_and_pca_calculation(main_task, preprocess_prms['taxonomy_level'], preprocess_prms['pca'],
    #                                     rhos_folder, pca_folder)
    otu_path, mapping_path, pca_path = mapping_file.csv_to_learn(
        main_task, os.path.join(os.getcwd(), task), arg[0])

    # create X, y
    X, y = read_otu_and_mapping_files(otu_path, mapping_path)
    # split data
    X_trains, X_tests, y_trains, y_tests = split(X, y, 0.2, k_fold)
    # learn
    SVM_train_auc, SVM_test_auc, XGB_train_auc, XGB_test_auc, NN_train_auc, NN_test_auc = learn(
        X_trains, X_tests, y_trains, y_tests, k_fold, arg[1])
    print('taxonomy ' + str(arg[0]) + ' ' + str(arg[1]) + ' ' +
          str(arg[2][0]) + ' ' + str(arg[2][1]) + ' PCA ' +
          str(mapping_file.pca_comp) + ':')
    print("SVM Train AUC: " + str(SVM_train_auc))
    print("SVM Test AUC: " + str(SVM_test_auc))
    print("XGBOOST Train AUC: " + str(XGB_train_auc))
    print("XGBOOST Test AUC: " + str(XGB_test_auc))
    print("NN Train AUC: " + str(NN_train_auc))
    print("NN Test AUC: " + str(NN_test_auc))

    svm_results_file = Path(task + "/all_svm_results_ica.csv")
    if not svm_results_file.exists():
        all_svm_results = pd.DataFrame(columns=[
            'Taxonomy level', 'taxonomy group', 'normalization 1',
            'normalization 2', 'Dim Red', 'TRAIN-AUC', 'TEST-AUC'
        ])
        all_svm_results.to_csv(svm_results_file, index=False)
    all_svm_results = pd.read_csv(svm_results_file)
    all_svm_results.loc[len(all_svm_results)] = [
        tax_dict[arg[0] - 4], arg[1], arg[2][0], arg[2][1], arg[3][1],
        SVM_train_auc, SVM_test_auc
    ]
    all_svm_results.to_csv(svm_results_file, index=False)

    xgboost_results_file = Path(task + "/all_xgboost_results_ica.csv")
    if not xgboost_results_file.exists():
        all_xgboost_results = pd.DataFrame(columns=[
            'Taxonomy level', 'taxonomy group', 'normalization 1',
            'normalization 2', 'Dim Red', 'TRAIN-AUC', 'TEST-AUC'
        ])
        all_xgboost_results.to_csv(xgboost_results_file, index=False)
    all_xgboost_results = pd.read_csv(xgboost_results_file)
    all_xgboost_results.loc[len(all_xgboost_results)] = [
        tax_dict[arg[0] - 4], arg[1], arg[2][0], arg[2][1], arg[3][1],
        XGB_train_auc, XGB_test_auc
    ]
    all_xgboost_results.to_csv(xgboost_results_file, index=False)

    nn_results_file = Path(task + "/all_nn_results_ica.csv")
    if not nn_results_file.exists():
        all_nn_results = pd.DataFrame(columns=[
            'Taxonomy level', 'taxonomy group', 'normalization 1',
            'normalization 2', 'Dim Red', 'TRAIN-AUC', 'TEST-AUC'
        ])
        all_nn_results.to_csv(nn_results_file, index=False)
    all_nn_results = pd.read_csv(nn_results_file)
    all_nn_results.loc[len(all_nn_results)] = [
        tax_dict[arg[0] - 4], arg[1], arg[2][0], arg[2][1], arg[3][1],
        NN_train_auc, NN_test_auc
    ]
    all_nn_results.to_csv(nn_results_file, index=False)
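An illustrative call; the tuple layout matches how arg is unpacked above (taxonomy level, grouping, (normalization, z-scoring), PCA spec), and the values are placeholders:

arg = (6, 'mean', ('log', 'row'), (5, 'PCA'))  # placeholder settings
parallel_pipeline(arg, 'my_task')  # expects my_task_OTU.csv and my_task_Tag.csv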