Example #1
import pandas as pd
from sklearn.decomposition import PCA


# apply_pca is a project helper used throughout these examples; it is assumed
# to return the PCA-transformed data plus the fitted PCA object.
def distance_learning(perform_distance, level, preproccessed_data,
                      mapping_file):
    if perform_distance:
        cols = [
            col for col in preproccessed_data.columns
            if preproccessed_data[col].nunique() != 1
        ]
        dict_bact = {'else': []}
        for col in preproccessed_data[cols]:
            col_name = preproccessed_data[col].name.split(';')
            bact_level = level - 1
            if len(col_name) > bact_level:
                if col_name[bact_level] in dict_bact:
                    dict_bact[col_name[bact_level]].append(
                        preproccessed_data[col].name)
                else:
                    dict_bact[col_name[bact_level]] = [
                        preproccessed_data[col].name
                    ]
            else:
                dict_bact['else'].append(preproccessed_data[col].name)
            #print(dict_bact)

        new_df = pd.DataFrame(index=preproccessed_data.index)
        col = 0
        for key, values in dict_bact.items():
            if values:
                new_data = preproccessed_data[values]
                pca = PCA(n_components=min(
                    round(new_data.shape[1] / 2) + 1, new_data.shape[0]))
                pca.fit(new_data)
                # accumulate explained variance until it passes 50%;
                # 'explained_sum' avoids shadowing the built-in sum()
                explained_sum = 0
                num_comp = 0
                for i, component in enumerate(pca.explained_variance_ratio_):
                    if explained_sum <= 0.5:
                        explained_sum += component
                    else:
                        num_comp = i
                        break
                else:
                    # every component was needed to pass the threshold
                    num_comp = len(pca.explained_variance_ratio_)
                if num_comp == 0:
                    num_comp = 1
                # new
                otu_after_pca_new, pca_obj, pca_str = apply_pca(
                    new_data, n_components=num_comp)
                # old
                # otu_after_pca_new, pca_components = apply_pca(new_data, n_components=num_comp)
                for j in range(otu_after_pca_new.shape[1]):
                    if key == 'else':
                        # suffix with the component index so each component
                        # gets its own column instead of overwriting 'else;'
                        new_df['else;' + str(j)] = otu_after_pca_new[j]
                    else:
                        new_df[str(values[0][0:values[0].find(key) + len(key)])
                               + '_' + str(j)] = otu_after_pca_new[j]
                col += num_comp
        return new_df, mapping_file
    else:
        return preproccessed_data, mapping_file
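
Note on Example #1: the component-selection loop above picks the smallest number of leading components whose cumulative explained variance passes 50%. The same idea as a standalone sketch, assuming only numpy and scikit-learn:

import numpy as np
from sklearn.decomposition import PCA

def n_components_for_variance(data, threshold=0.5):
    # Fit a full PCA, then count the leading components needed for the
    # cumulative explained variance to reach the threshold.
    pca = PCA(n_components=min(data.shape))
    pca.fit(data)
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    return int(np.searchsorted(cumulative, threshold) + 1)
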
Example #2
def get_days(days_datetime):
    return days_datetime.days


n_components = 20

OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, 'saliva_samples_231018.csv'),
                     os.path.join(SCRIPT_DIR,
                                  'saliva_samples_mapping_file_231018.csv'),
                     from_QIIME=True)
preproccessed_data = preprocess_data(OtuMf.otu_file,
                                     visualize_data=True,
                                     taxnomy_level=5)
otu_after_pca_wo_taxonomy, _ = apply_pca(preproccessed_data,
                                         n_components=n_components,
                                         visualize=False)
# otu_after_pca = OtuMf.add_taxonomy_col_to_new_otu_data(otu_after_pca_wo_taxonomy)
# merged_data_after_pca = OtuMf.merge_mf_with_new_otu_data(otu_after_pca_wo_taxonomy)
# merged_data_with_age = otu_after_pca_wo_taxonomy.join(OtuMf.mapping_file['age_in_days'])
# merged_data_with_age = merged_data_with_age[merged_data_with_age.age_in_days.notnull()] # remove NaN days
# merged_data_with_age_group = otu_after_pca_wo_taxonomy.join(OtuMf.mapping_file[['age_group', 'age_in_days','MouseNumber']])
# merged_data_with_age_group = merged_data_with_age_group[merged_data_with_age_group.age_group.notnull()] # remove NaN days

# OtuMf.mapping_file.apply(lambda x: -999 if x['Mucositis_Start'] is None else (datetime.datetime.strptime(x['DATE'], '%d/%m/%Y') - datetime.datetime.strptime(x['Mucositis_Start'], '%d/%m/%Y')).days)

OtuMf.mapping_file['DATE_datetime'] = OtuMf.mapping_file['DATE'].apply(
    get_datetime)
OtuMf.mapping_file['Mocosities_start_datetime'] = OtuMf.mapping_file[
    'Mucositis_Start'].apply(get_datetime)
OtuMf.mapping_file['TIME_BEFORE_MOCO_START'] = OtuMf.mapping_file[
    'Mocosities_start_datetime'] - OtuMf.mapping_file['DATE_datetime']
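
Example #2 leans on a get_datetime helper that is not shown. A plausible sketch, consistent with how it is called here and in Example #5 (which passes time_format='%m/%d/%y') and with the 1900-01-01 sentinel checked in Example #8 — an assumption, not the project's actual code:

import datetime

def get_datetime(date_str, time_format='%d/%m/%Y'):
    # Parse a date string; fall back to the 1900-01-01 sentinel used by the
    # other examples when the value is missing or malformed.
    try:
        return datetime.datetime.strptime(date_str, time_format)
    except (TypeError, ValueError):
        return datetime.datetime.strptime('01/01/1900', '%d/%m/%Y')
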
Example #3
    def _read_file(self, title, bactria_as_feature_file, samples_data_file,
                   allow_printing, perform_anna_preprocess, visualize_pre,
                   re_arange):
        sample = "SALIVA"

        OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, bactria_as_feature_file),
                             os.path.join(SCRIPT_DIR, samples_data_file),
                             from_QIIME=False, id_col='ID',
                             taxonomy_col='taxonomy')
        #rare_bacteria = self.find_rare_bacteria(OtuMf)
        #OtuMf = self.drop_rare_bacteria(rare_bacteria, OtuMf)
        OtuMf = self.remove_duplicate(OtuMf)
        OtuMf = self.rearange_data(OtuMf, re_arange)
        # OtuMf.otu_file.T.to_csv("GDM_OTU_rmv_dup_arrange.csv")
        OtuMf.mapping_file.to_csv("GDM_tag_rmv_dup.csv")
        # return mapping_file
        # NOTE: this early return short-circuits the method; everything below
        # is unreachable until this line is removed.
        return

        if perform_anna_preprocess:
            preproccessed_data = preprocess_data(OtuMf.otu_file.T, visualize_data=False, taxonomy_col='taxonomy',
                                                 taxnomy_level=8)
            mapping_file = OtuMf.mapping_file['Control_GDM']
            mapping_disease = {'Control': 0,
                               'GDM': 1}
            mapping_file = mapping_file.map(mapping_disease)
            preproccessed_data, mapping_file = distance_learning(perform_distance=True, level=4,
                                                                 preproccessed_data=preproccessed_data,
                                                                 mapping_file=mapping_file)
            self._preproccessed_data = preproccessed_data
            self._preproccessed_data.to_csv('anna_pca_old_loader.csv')
        else:
            if re_arange != 0:
                OtuMf = self.rearange_data(OtuMf, re_arange)
            preproccessed_data = preprocess_data(OtuMf.otu_file.T, visualize_data=visualize_pre, taxnomy_level=self._taxnomy_level,
                                                 taxonomy_col='taxonomy', preform_taxnomy_group=True, std_to_delete = 0.25)
        self.OtuMf = OtuMf
        self._preproccessed_data = preproccessed_data
        
        otu_after_pca_wo_taxonomy, pca_obj, _ = apply_pca(preproccessed_data, n_components=n_components,
                                                          visualize=False)
        self._pca_obj = pca_obj
        
        # This line ignores the PCA computed above; disable it if the PCA
        # output is needed.
        otu_after_pca_wo_taxonomy = self._preproccessed_data

        index_to_id_map = {}
        id_to_features_map = {}
        for i, row in enumerate(otu_after_pca_wo_taxonomy.values):
            id_to_features_map[otu_after_pca_wo_taxonomy.index[i]] = row
            index_to_id_map[i] = otu_after_pca_wo_taxonomy.index[i]
        self._index_to_id_map = index_to_id_map
        self._id_to_features_map = id_to_features_map

        ids_whole_list = otu_after_pca_wo_taxonomy.index.tolist()

        # ------------------------------------ each TASK creates different tag map --------------------------------
        id_to_tag_map = {}

        tag_map = {'Control': 0, 'GDM': 1}
        if sample == "both":
            T1_ids = [id for id in ids_whole_list if OtuMf.mapping_file["trimester"][id] == '1']
        else:
            T1_ids = [id for id in ids_whole_list if int(OtuMf.mapping_file["trimester"][id]) == 1 and OtuMf.mapping_file["body_site"][id] == sample]
        counter_GDM = 0
        counter_Control = 0
        for id in T1_ids:
            id_to_tag_map[id] = tag_map[OtuMf.mapping_file["Control_GDM"][id]]
        self._ids_list = T1_ids
        self._id_to_tag_map = id_to_tag_map

        # -------------------------------------------- weights !--------------------------------------------
        # calculate weights
        y = list(self._id_to_tag_map.values())
        classes_sum = [np.sum(np.array(y) == unique_class) for unique_class in
                       np.unique(np.array(y))]
        classes_ratio = [1 - (a / sum(classes_sum)) for a in classes_sum]
        weights = [classes_ratio[a] for a in np.array(y)]
        self._weight_map = {i: classes_ratio[i] for i in range(len(classes_ratio))}

        # return the list of features and the list of ids in the same order
        self._feature_list = [self._id_to_features_map[id] for id in self._ids_list]
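
The class-weight arithmetic at the end of Example #3 (each class weighted by one minus its share of the samples) recurs in Examples #7 and #9. Isolated as a helper, assuming numpy only:

import numpy as np

def class_weights(y):
    # Weight each class by 1 - (its share of the samples), so rarer
    # classes receive larger weights.
    y = np.asarray(y)
    classes, counts = np.unique(y, return_counts=True)
    ratios = 1 - counts / counts.sum()
    return {int(c): float(r) for c, r in zip(classes, ratios)}

# e.g. class_weights([0, 0, 0, 1]) -> {0: 0.25, 1: 0.75}
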
Example #4
    def _read_file(self, TITLE, PRINT, REG, WEIGHTS, ANNA_PREPROCESS):
        bactria_as_feature_file = 'feature-table_Allergy_cleaned_taxa_290119_updated_in_140219.csv'

        features = pd.read_csv(bactria_as_feature_file, header=1)
        cols = list(features.columns)
        # remove non-numeric values
        cols.remove('Feature ID')
        cols.remove('Taxonomy')

        if REG:
            self.reg(features, cols)
        # get single\multiple information
        multiple_samples_info_path = 'mf_merge_ok84_ok93_ok66_69_TreeNuts_controls_271118_040219 post     MG17 07.05.19.csv'
        multiple_samples_info_df = pd.read_csv(multiple_samples_info_path)
        single_or_multiple_list = multiple_samples_info_df[
            'Michael_4_Single_Multiple']
        single_or_multiple_id_list = multiple_samples_info_df['SampleCode']
        single_or_multiple_map = {}
        for id, s_or_m in zip(single_or_multiple_id_list,
                              single_or_multiple_list):
            single_or_multiple_map[id] = s_or_m
        ids_list_wo_multiple = [
            key for key, val in single_or_multiple_map.items()
            if val == 'Single'
        ]
        ids_of_multiple = [
            key for key, val in single_or_multiple_map.items()
            if val == 'Multiple'
        ]
        id_to_single_or_multiple_allergy_map = {}
        for id in ids_list_wo_multiple:
            id_to_single_or_multiple_allergy_map[id] = 0
        for id in ids_of_multiple:
            id_to_single_or_multiple_allergy_map[id] = 1
        self.id_to_single_or_multiple_allergy_map = id_to_single_or_multiple_allergy_map

        # mf_merge_ok84_ok93_ok66_69_TreeNuts_controls_271118_040219 post     MG17 07.05.19.xlsx
        samples_data_file = 'mf_merge_ok84_ok93_ok66_69_merged_by_RestoredSampleCode_as_ID_290119.csv'

        OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, bactria_as_feature_file),
                             os.path.join(SCRIPT_DIR, samples_data_file),
                             from_QIIME=True,
                             id_col='Feature ID',
                             taxonomy_col='Taxonomy')

        if ANNA_PREPROCESS:
            preproccessed_data = preprocess_data(OtuMf.otu_file,
                                                 visualize_data=False,
                                                 taxonomy_col='Taxonomy',
                                                 taxnomy_level=6)
            # if we want to remove certain type of data according to the features
            # preproccessed_data = preproccessed_data.join(OtuMf.mapping_file[['AllergyType', 'SuccessDescription']], how='inner')
            # preproccessed_data = preproccessed_data.loc[
            #    (preproccessed_data['AllergyType'] == 'Milk') | ((preproccessed_data['AllergyType'] == 'Peanut'))]
            # preproccessed_data = preproccessed_data.drop(['AllergyType', 'SuccessDescription'], axis=1)
            # mapping_file = OtuMf.mapping_file.loc[(OtuMf.mapping_file['AllergyType'] == 'Milk') | (OtuMf.mapping_file['AllergyType'] == 'Peanut')]

            mapping_file = OtuMf.mapping_file['AllergyType']
            mapping_disease = {
                'Milk': 0,
                'Tree_nut': 1,  # 'Cashew' + 'Hazelnut' + 'Walnut'
                'Peanut': 2,
                'Sesame': 3
            }
            mapping_file = mapping_file.map(mapping_disease)
            preproccessed_data, mapping_file = distance_learning(
                perform_distance=True,
                level=3,
                preproccessed_data=preproccessed_data,
                mapping_file=mapping_file)
            self._preproccessed_data = preproccessed_data

        else:
            preproccessed_data = preprocess_data(OtuMf.otu_file,
                                                 visualize_data=False,
                                                 taxnomy_level=6,
                                                 taxonomy_col='Taxonomy',
                                                 preform_taxnomy_group=True)

            self._preproccessed_data = preproccessed_data
            # drow_data(preproccessed_data)
            # otu_after_pca_wo_taxonomy, _, _ = apply_pca(data_after_log_zcore, n_components=40, visualize=False)

        otu_after_pca_wo_taxonomy, pca_obj, _ = apply_pca(
            preproccessed_data, n_components=n_components, visualize=False)
        control = otu_after_pca_wo_taxonomy.index[0:62]  # 'Con'
        self._pca_obj = pca_obj
        # if we want to remove the healthy samples that are used for control
        # otu_after_pca_wo_taxonomy = otu_after_pca_wo_taxonomy.drop(preproccessed_data.index[0:62])

        index_to_id_map = {}
        id_to_features_map = {}
        for i, row in enumerate(otu_after_pca_wo_taxonomy.values):
            id_to_features_map[otu_after_pca_wo_taxonomy.index[i]] = row
            index_to_id_map[i] = otu_after_pca_wo_taxonomy.index[i]

        self._index_to_id_map = index_to_id_map
        self._id_to_features_map = id_to_features_map

        success_tag_column = 'SuccessDescription'
        stages_column = 'TreatmentTimePoint'
        allergan_column = 'AllergyType'
        code_column = 'ParticipentCode'
        ids_list_w_con = otu_after_pca_wo_taxonomy.index.tolist()
        ids_list_wo_con = otu_after_pca_wo_taxonomy.index.drop(
            otu_after_pca_wo_taxonomy.index[0:62])

        self._ids_list_w_con = ids_list_w_con
        self._ids_list_wo_con = ids_list_wo_con
        self._ids_list_wo_multiple = [
            id for id in ids_list_wo_multiple if id in ids_list_w_con
        ]

        stages = []

        # ##### separate samples by allergic and healthy==>'Con'
        id_to_binary_health_tag_map = {}
        for sample in ids_list_w_con:
            if sample.startswith('Con'):
                id_to_binary_health_tag_map[sample] = 1
            else:
                id_to_binary_health_tag_map[sample] = 0

        self._id_to_binary_health_tag_map = id_to_binary_health_tag_map

        # ##### separate samples by stage, success of treatment and allergen type
        id_to_success_tag_map = {}
        id_to_stage_map = {}
        id_to_binary_success_tag_map = {}
        id_to_allergy_type_tag_map = {}
        id_to_allergy_number_type_tag_map = {}
        id_to_milk_allergy_tag_map = {}
        allergan_types = set()

        tag_to_allergy_type_map = {
            0: 'Milk',
            1: 'Tree_nut',  # 'Cashew' + 'Hazelnut' + 'Walnut'
            2: 'Peanut',
            3: 'Sesame'
        }  # removed 'Egg' samples

        allergy_type_to_instances_map = {
            'Milk': 0,
            'Tree_nut': 0,
            'Peanut': 0,
            'Sesame': 0
        }  # 'Non': 9 samples, 'Egg': 35 samples
        """
        nuts_samples_list = []
        for sample in ids_list_wo_con:
             a = OtuMf.mapping_file.loc[sample, allergan_column]
             if a == 'Nuts':
                nuts_samples_list.append(sample)
        with open("nuts_samples.txt", "w") as file:
            for l in nuts_samples_list:
                 file.write(l + "\n")
    """
        non_allergy_type_ids = []
        egg_allergy_type_ids = []
        for sample in ids_list_wo_con:
            s = OtuMf.mapping_file.loc[sample, stages_column]
            # stages
            stages.append(s)
            id_to_stage_map[sample] = s

            # success
            t = OtuMf.mapping_file.loc[sample, success_tag_column]
            id_to_success_tag_map[sample] = t
            # save tags from k-classes as success(A1)->1 and failure(the rest)->0
            if t == 'A1':
                id_to_binary_success_tag_map[sample] = 1
            else:
                id_to_binary_success_tag_map[sample] = 0

            # allergy type
            a = OtuMf.mapping_file.loc[sample, allergan_column]
            allergan_types.add(a)
            id_to_allergy_type_tag_map[sample] = a

            if a == 'Milk' or a == 'Milk_suspected' or a == 'milk':
                id_to_allergy_number_type_tag_map[sample] = 0
                id_to_milk_allergy_tag_map[sample] = 1
                allergy_type_to_instances_map[
                    'Milk'] = allergy_type_to_instances_map.get('Milk') + 1
            elif a == 'Cashew' or a == 'Cashew ' or a == 'Hazelnut' or a == 'Walnut' or a == 'Nuts':
                id_to_allergy_number_type_tag_map[sample] = 1
                id_to_milk_allergy_tag_map[sample] = 0
                allergy_type_to_instances_map[
                    'Tree_nut'] = allergy_type_to_instances_map.get(
                        'Tree_nut') + 1
            elif a == 'Peanut':
                id_to_allergy_number_type_tag_map[sample] = 2
                id_to_milk_allergy_tag_map[sample] = 0
                allergy_type_to_instances_map[
                    'Peanut'] = allergy_type_to_instances_map.get('Peanut') + 1
            elif a == 'Sesame':
                id_to_allergy_number_type_tag_map[sample] = 3
                id_to_milk_allergy_tag_map[sample] = 0
                allergy_type_to_instances_map[
                    'Sesame'] = allergy_type_to_instances_map.get('Sesame') + 1
            elif a == 'Egg':
                egg_allergy_type_ids.append(sample)
                # id_to_allergy_number_type_tag_map[sample] = 1
                # id_to_milk_allergy_tag_map[sample] = 0
                # allergy_type_to_instances_map['Egg'] = allergy_type_to_instances_map.get('Egg') + 1
            elif a == 'Non':
                non_allergy_type_ids.append(sample)
                # id_to_allergy_number_type_tag_map[sample] = None
                # id_to_milk_allergy_tag_map[sample] = None
                # allergy_type_to_instances_map['Non'] = allergy_type_to_instances_map.get('Non') + 1
            else:
                print("error in allergy type " + str(sample))

        # compute the stage-0 ids once, after the loop has filled id_to_stage_map
        stage_0_ids = [
            key for key in id_to_stage_map
            if id_to_stage_map[key] == '0_before'
        ]
        self._stage_0_ids = stage_0_ids

        self._id_wo_non_and_egg_allergy_type_list = [
            x for x in self._ids_list_wo_con
            if x not in non_allergy_type_ids + egg_allergy_type_ids
        ]
        self._tag_to_allergy_type_map = tag_to_allergy_type_map
        self._allergy_type_to_instances_map = allergy_type_to_instances_map
        self._id_to_success_tag_map = id_to_success_tag_map
        self._id_to_stage_map = id_to_stage_map
        self._id_to_binary_success_tag_map = id_to_binary_success_tag_map
        self._id_to_allergy_type_tag_map = id_to_allergy_type_tag_map
        self._id_to_allergy_number_type_tag_map = id_to_allergy_number_type_tag_map
        self._id_to_milk_allergy_tag_map = id_to_milk_allergy_tag_map

        self._ids_list_wo_multiple = [
            id for id in ids_list_wo_multiple
            if id in id_to_allergy_number_type_tag_map.keys()
        ]

        # -------------------------------------------- weights !--------------------------------------------
        # calculate weights for types of allergy
        if WEIGHTS:
            total_sum = sum(list(allergy_type_to_instances_map.values()))
            types = list(allergy_type_to_instances_map.keys())
            allergy_type_to_weight_map = {}
            for t in types:
                allergy_type_to_weight_map[
                    t] = total_sum / allergy_type_to_instances_map[t]

            # normalize
            max_weight = max(list(allergy_type_to_weight_map.values()))
            for t in types:
                allergy_type_to_weight_map[t] = allergy_type_to_weight_map.get(
                    t) / max_weight

            # calculate weights for milk vs. other types of allergy
            milk_vs_other_allergy_weight_map = {
                'Other':
                total_sum /
                (total_sum - allergy_type_to_instances_map.get("Milk")),
                'Milk':
                total_sum / allergy_type_to_instances_map.get("Milk")
            }
            # normalize
            max_weight = max(list(milk_vs_other_allergy_weight_map.values()))
            for t in ['Other', 'Milk']:
                milk_vs_other_allergy_weight_map[
                    t] = milk_vs_other_allergy_weight_map.get(t) / max_weight

            # calculate weights for healthy and allergic
            healthy_vs_allergic_weight_map = {
                'Allergic': (len(ids_list_w_con)) / len(ids_list_wo_con),
                'Healthy': (len(ids_list_w_con)) /
                (len(ids_list_w_con) - len(ids_list_wo_con))
            }

            # normalize
            max_weight = max(list(healthy_vs_allergic_weight_map.values()))
            for t in ['Allergic', 'Healthy']:
                healthy_vs_allergic_weight_map[
                    t] = healthy_vs_allergic_weight_map.get(t) / max_weight

            # calculate weights for responding and not (success)
            no_response = list(id_to_binary_success_tag_map.values()).count(0)
            yes_response = list(id_to_binary_success_tag_map.values()).count(1)

            responding_vs_not_weight_map = {
                'No': len(ids_list_wo_con) / no_response,
                'Yes': len(ids_list_wo_con) / yes_response
            }

            # normalize
            max_weight = max(list(responding_vs_not_weight_map.values()))
            for t in ['No', 'Yes']:
                responding_vs_not_weight_map[
                    t] = responding_vs_not_weight_map.get(t) / max_weight

            # calculate weights for responding and not (prognostic)
            tags = []
            for i in stage_0_ids:
                tags.append(id_to_binary_success_tag_map.get(i))

            no_response = tags.count(0)
            yes_response = tags.count(1)

            prognostic_responding_vs_not_weight_map = {
                'No': len(stage_0_ids) / no_response,
                'Yes': len(stage_0_ids) / yes_response
            }

            # normalize
            max_weight = max(
                list(prognostic_responding_vs_not_weight_map.values()))
            for t in ['No', 'Yes']:
                prognostic_responding_vs_not_weight_map[
                    t] = prognostic_responding_vs_not_weight_map.get(
                        t) / max_weight

            self._allergy_type_to_weight_map = allergy_type_to_weight_map
            self._milk_vs_other_allergy_weight_map = milk_vs_other_allergy_weight_map
            self._healthy_vs_allergic_weight_map = healthy_vs_allergic_weight_map
            self._responding_vs_not_weight_map = responding_vs_not_weight_map
            self._prognostic_responding_vs_not_weight_map = prognostic_responding_vs_not_weight_map
        """    # count tags in all vs. stage_0
        all_tags = list(id_to_binary_success_tag_map.values())
        print("tags total len: " + str(len(all_tags)) + " pos tags: " + str(all_tags.count(1))
              + " percent: " + str(all_tags.count(1)/len(all_tags)))
        stage_0_tags = [id_to_binary_success_tag_map[id] for id in stage_0_ids if id in id_to_binary_success_tag_map.keys()]
        print("stage 0 tags total len: " + str(len(stage_0_tags)) + " pos tags: " + str(stage_0_tags.count(1))
              + " percent: " + str(stage_0_tags.count(1)/len(stage_0_tags)))
        """

        # return the list of features and the list of ids in the same order
        feature_list = [id_to_features_map[id] for id in ids_list_w_con]
        return ids_list_w_con, ids_list_wo_con, feature_list
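
Every weight map in Example #4 is normalized the same way, dividing through by the largest weight so the maximum becomes 1.0. A small helper sketch of that step:

def normalize_weights(weight_map):
    # Scale a {label: weight} map so its largest weight is exactly 1.0.
    max_weight = max(weight_map.values())
    return {label: w / max_weight for label, w in weight_map.items()}

# e.g. normalize_weights({'No': 2.0, 'Yes': 4.0}) -> {'No': 0.5, 'Yes': 1.0}
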
Example #5
import os

import pandas as pd

# OtuMfHandler, preprocess_data, apply_pca, get_datetime, get_days and
# SCRIPT_DIR are project helpers assumed to be in scope, as elsewhere in
# these examples. Note: DataFrame.append below requires pandas < 2.0.
def prepare_data(n_components=20):
    OtuMf = OtuMfHandler(
        os.path.join(
            SCRIPT_DIR,
            'feature-table_Allergy_cleaned_taxa_290119_updated_in_140219.csv'),
        os.path.join(
            SCRIPT_DIR,
            'mf_merge_ok84_ok93_ok66_69_merged_by_RestoredSampleCode_as_ID_290119.csv'
        ),
        from_QIIME=True,
        id_col='Feature ID',
        taxonomy_col='Taxonomy')
    preproccessed_data = preprocess_data(OtuMf.otu_file,
                                         visualize_data=True,
                                         taxnomy_level=5,
                                         taxonomy_col='Taxonomy',
                                         preform_taxnomy_group=True)
    otu_after_pca_wo_taxonomy, _, _ = apply_pca(preproccessed_data,
                                                n_components=n_components,
                                                visualize=True)

    ######## Pre process (Remove control group) ########
    column_to_use_for_filter = 'AllergyTypeData131118'
    OtuMf.mapping_file = OtuMf.mapping_file.loc[
        OtuMf.mapping_file[column_to_use_for_filter] != 'Con']

    ######## get date of sample in date format ########
    date_of_sample_col = 'Date'
    OtuMf.mapping_file['Date_of_sample'] = OtuMf.mapping_file[
        date_of_sample_col].apply(get_datetime, time_format='%m/%d/%y')

    ######## remove invalid subjects (those who had samples with no dates or bad dates) ########
    # bad dates
    tmp = OtuMf.mapping_file.loc[OtuMf.mapping_file['Date_of_sample'].isin(
        ['1800-01-01', '1900-01-01'])]
    patients_with_bad_date = tmp['PatientNumber210119'].unique()
    # remove bad dates
    OtuMf.mapping_file = OtuMf.mapping_file.loc[
        ~OtuMf.mapping_file['PatientNumber210119'].isin(
            patients_with_bad_date)]

    ######## Calculate time for event ########
    OtuMf.mapping_file['time_for_the_event'] = 9999
    col_to_group_by = 'PatientNumber210119'
    data_grouped = OtuMf.mapping_file.groupby(col_to_group_by)

    for subject_id, subject_data in data_grouped:
        if any(subject_data['SuccessDescription'] == 'A1'):  # Uncensored
            date_of_event = subject_data['Date_of_sample'].max()
            time_for_the_event = date_of_event - subject_data['Date_of_sample']
            tmp_df = OtuMf.mapping_file.loc[subject_data.index]
            tmp_df['time_for_the_event'] = time_for_the_event.apply(get_days)
            OtuMf.mapping_file.update(tmp_df)
        else:  # Censored
            pass

    ######## Filter alergies ########
    # allergy types ['Sesame', 'Peanut', 'Egg', 'Non', 'Walnut', 'Milk', 'Cashew', 'Hazelnut']
    # OtuMf.mapping_file['AllergyTypeData131118'].value_counts()
    # Peanut    134
    # Milk    112
    # Sesame    80
    # Walnut    72
    # Egg    28
    # Cashew    18
    # Hazelnut    9
    # Non    9
    allergy_to_use = ['Peanut']
    OtuMf.mapping_file = OtuMf.mapping_file[
        OtuMf.mapping_file['AllergyTypeData131118'].isin(allergy_to_use)]

    ######## Create inputs ########

    # create groups
    col_to_group_by = 'PatientNumber210119'
    data_grouped = OtuMf.mapping_file.groupby(col_to_group_by)
    censored_data = {}
    not_censored = pd.DataFrame()
    y_for_deep = pd.DataFrame()
    x_for_deep = pd.DataFrame()
    x_for_deep_censored = pd.DataFrame()
    y_for_deep_censored = pd.DataFrame()

    def calculate_y_for_deep_per_row(row):
        a = row.sort_values()
        return a.index[0]

    for subject_id, subject_data in data_grouped:
        if 9999 in subject_data['time_for_the_event'].values:  # censored
            tmp_data = subject_data.join(otu_after_pca_wo_taxonomy)
            tmp_data_only_valid = tmp_data.loc[tmp_data[0].notnull()]
            if not tmp_data_only_valid.empty:
                x_for_deep_censored = x_for_deep_censored.append(subject_data)

                tmp_data_only_valid.sort_values(by='Date_of_sample',
                                                ascending=True,
                                                inplace=True)
                tmp_data_only_valid['relative_start_date'] = (
                    tmp_data_only_valid['Date_of_sample'] -
                    tmp_data_only_valid['Date_of_sample'].iloc[0]
                ).apply(get_days)
                tmp_data_only_valid['relative_max_date'] = (
                    tmp_data_only_valid['Date_of_sample'].iloc[-1] -
                    tmp_data_only_valid['Date_of_sample']).apply(get_days)
                tmp_data_only_valid['delta_time'] = -1
                tmp_data_only_valid['mse_coeff'] = 0
                tmp_data_only_valid['time_sense_coeff'] = 1
                y_for_deep_censored = y_for_deep_censored.append(
                    tmp_data_only_valid[[
                        'relative_start_date', 'delta_time',
                        'relative_max_date', 'mse_coeff', 'time_sense_coeff'
                    ]])

                # get only the last sample
                censored_data[subject_id] = tmp_data_only_valid.loc[
                    tmp_data_only_valid['relative_max_date'] == min(
                        tmp_data_only_valid['relative_max_date'])]

        else:  # not censored
            before_event_mask = subject_data['time_for_the_event'] > 0
            before_event_subjects = subject_data.loc[before_event_mask]
            if not before_event_subjects.empty:
                not_censored = not_censored.append(before_event_subjects)

                x_for_deep = x_for_deep.append(before_event_subjects)
                before_event_subjects.sort_values(by='time_for_the_event',
                                                  ascending=False,
                                                  inplace=True)
                before_event_subjects[
                    'relative_start_date'] = before_event_subjects[
                        'time_for_the_event'].iloc[0] - before_event_subjects[
                            'time_for_the_event']
                before_event_subjects[
                    'relative_max_date'] = before_event_subjects[
                        'time_for_the_event']
                before_event_subjects['delta_time'] = before_event_subjects[
                    'time_for_the_event']
                before_event_subjects['mse_coeff'] = 1
                before_event_subjects['time_sense_coeff'] = 0
                y_for_deep = y_for_deep.append(before_event_subjects[[
                    'relative_start_date', 'delta_time', 'relative_max_date',
                    'mse_coeff', 'time_sense_coeff'
                ]])

    x_for_deep = x_for_deep.join(otu_after_pca_wo_taxonomy)
    x_for_deep = x_for_deep.loc[x_for_deep[0].notnull()]
    y_for_deep = y_for_deep.loc[x_for_deep.index]

    x_for_deep_censored = x_for_deep_censored.join(otu_after_pca_wo_taxonomy)
    x_for_deep_censored = x_for_deep_censored.loc[
        x_for_deep_censored[0].notnull()]
    y_for_deep_censored = y_for_deep_censored.loc[x_for_deep_censored.index]

    return x_for_deep, y_for_deep, x_for_deep_censored, y_for_deep_censored, censored_data, not_censored, otu_after_pca_wo_taxonomy, OtuMf
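
In the uncensored branch of Example #5, the targets are plain arithmetic on time_for_the_event once the samples are sorted descending. A toy illustration with pandas:

import pandas as pd

t = pd.Series([30, 14, 7], name='time_for_the_event')  # sorted descending
relative_start_date = t.iloc[0] - t  # days since the subject's first sample: 0, 16, 23
relative_max_date = t                # days remaining until the event
print(relative_start_date.tolist(), relative_max_date.tolist())
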
Example #6
import math

import pandas as pd

# use_spearmanr, OtuMfHandler, preprocess_data and apply_pca are project
# helpers assumed to be in scope.
def predict_get_spearman_value(test_set, regressor):
    test_df = pd.DataFrame(test_set['age_in_days'])
    test_df['predicted'] = regressor.predict(
        test_set.loc[:, test_set.columns != 'age_in_days'])
    spearman_values = use_spearmanr(test_set['age_in_days'].values,
                                    test_df['predicted'].values)
    return test_df, spearman_values
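
predict_get_spearman_value assumes a use_spearmanr helper. A minimal sketch wrapping scipy's spearmanr (an assumption about the helper, not the project's actual code):

from scipy.stats import spearmanr

def use_spearmanr(y_true, y_pred):
    # Spearman rank correlation between true and predicted values.
    rho, p_value = spearmanr(y_true, y_pred)
    return {'rho': rho, 'pvalue': p_value}
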


if __name__ == "__main__":
    OtuMf = OtuMfHandler('aging_otu_table.csv', 'mf.csv', from_QIIME=True)
    preproccessed_data = preprocess_data(OtuMf.otu_file,
                                         visualize_data=True,
                                         taxnomy_level=5)
    otu_after_pca_wo_taxonomy, _ = apply_pca(preproccessed_data,
                                             n_components=80)
    # otu_after_pca = OtuMf.add_taxonomy_col_to_new_otu_data(otu_after_pca_wo_taxonomy)
    # merged_data_after_pca = OtuMf.merge_mf_with_new_otu_data(otu_after_pca_wo_taxonomy)
    merged_data_with_age = otu_after_pca_wo_taxonomy.join(
        OtuMf.mapping_file['age_in_days'])
    merged_data_with_age = merged_data_with_age[
        merged_data_with_age.age_in_days.notnull()]  # remove NaN days

    # create train set and test set
    merged_data_with_age = merged_data_with_age.sample(frac=1)
    train_size = math.ceil(merged_data_with_age.shape[0] * 0.85)
    train_set = merged_data_with_age.iloc[0:train_size]
    test_set = merged_data_with_age.iloc[train_size:]  # start at train_size so no row is skipped

    train_x_data = train_set.loc[:, train_set.columns != 'age_in_days']
    train_y_values = train_set['age_in_days']
Example #7
    def _read_file(self, title, bactria_as_feature_file, samples_data_file,
                   allow_printing, perform_anna_preprocess):
        features = pd.read_csv(bactria_as_feature_file, header=1)
        cols = list(features.columns)
        # remove non-numeric values
        cols.remove('Feature ID')
        cols.remove('Taxonomy')

        OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, bactria_as_feature_file),
                             os.path.join(SCRIPT_DIR, samples_data_file),
                             from_QIIME=True,
                             id_col='Feature ID',
                             taxonomy_col='Taxonomy')

        if perform_anna_preprocess:
            preproccessed_data = preprocess_data(OtuMf.otu_file,
                                                 visualize_data=False,
                                                 taxonomy_col='Taxonomy',
                                                 taxnomy_level=6)
            mapping_file = OtuMf.mapping_file['XXXXX']
            mapping_disease = {
                'a': 0,
                'b': 1,  # 'Cashew' + 'Hazelnut' + 'Walnut'
                'c': 2,
                'd': 3
            }
            mapping_file = mapping_file.map(mapping_disease)
            preproccessed_data, mapping_file = distance_learning(
                perform_distance=True,
                level=self._taxnomy_level,
                preproccessed_data=preproccessed_data,
                mapping_file=mapping_file)
            self._preproccessed_data = preproccessed_data
        else:
            preproccessed_data = preprocess_data(
                OtuMf.otu_file,
                visualize_data=False,
                taxnomy_level=self._taxnomy_level,
                taxonomy_col='Taxonomy',
                preform_taxnomy_group=True)

            self._preproccessed_data = preproccessed_data
            # drow_data(preproccessed_data)
            # otu_after_pca_wo_taxonomy, _, _ = apply_pca(data_after_log_zcore, n_components=40, visualize=False)

        otu_after_pca_wo_taxonomy, pca_obj, _ = apply_pca(
            preproccessed_data, n_components=n_components, visualize=False)
        self._pca_obj = pca_obj

        index_to_id_map = {}
        id_to_features_map = {}
        for i, row in enumerate(otu_after_pca_wo_taxonomy.values):
            id_to_features_map[otu_after_pca_wo_taxonomy.index[i]] = row
            index_to_id_map[i] = otu_after_pca_wo_taxonomy.index[i]
        ids_who_has_features = list(id_to_features_map.keys())

        self._index_to_id_map = index_to_id_map
        self._id_to_features_map = id_to_features_map
        ids_list = otu_after_pca_wo_taxonomy.index.tolist()

        sample_id_to_sample_code_map = {}
        sample_ids = list(OtuMf.mapping_file.index)  # optionally: if not s.startswith("Con")
        sample_code = list(OtuMf.mapping_file["SampleCode"])  # optionally: if s != "Control"
        for id, code in zip(sample_ids, sample_code):
            if not id.startswith("Con"):
                sample_id_to_sample_code_map[id] = code

        # ------------------------------------ each TASK creates different tag map --------------------------------
        before_ids = []
        id_to_tag_map = {}

        for id in OtuMf.mapping_file.index:
            before = OtuMf.mapping_file.loc[id, "TreatmentPoint"]
            if before == "before":
                code = sample_id_to_sample_code_map[id]
                if code in id_to_features_map:
                    before_ids.append(id)
                else:
                    print(code + " not in id_to_features_map")
            elif before == "Control":
                before_ids.append(id)

        code_list = []
        for id in before_ids:
            if id in sample_id_to_sample_code_map.keys():
                code_list.append(sample_id_to_sample_code_map[id])
            else:
                code_list.append(id)

        # HEALTH_BEFORE_TREATMENT_TASK
        if self._task == "health_before_treatment_task":
            for id, code in zip(before_ids, code_list):
                before = OtuMf.mapping_file.loc[id, "TreatmentPoint"]
                if before == "before":
                    if code in id_to_features_map.keys():
                        id_to_tag_map[code] = 1
                elif before == "Control":
                    id_to_tag_map[id] = 0

            self._ids_list = list(id_to_tag_map.keys())
            """
            # before_ids.remove("382954")
            # before_ids.remove("386137")
            # before_ids.remove("386100")
                    if self._task == "health_before_treatment_task":
            for id in OtuMf.mapping_file.index:
                before = OtuMf.mapping_file.loc[id, "TreatmentPoint"]
                if before == "before":
                    code = sample_id_to_sample_code_map[id]
                    if code in id_to_features_map.keys():
                        before_ids.append(code)
                        id_to_tag_map[code] = 1
                    else:
                        print(code + " not in id_to_features_map")

                elif before == "Control":
                        before_ids.append(id)
                        id_to_tag_map[id] = 0
                else:
                    print(before + " error")
            """

        # ALLERGY_TYPE_BEFORE_TREATMENT_TASK
        elif self._task == "allergy_type_before_treatment_task":
            tag_to_allergy_type_map = {
                0: 'Milk',
                1: 'Tree_nut',  # 'Cashew' + 'Hazelnut' + 'Walnut'
                2: 'Peanut',
                3: 'Sesame'
            }  # removed 'Egg' samples
            for sample, code in zip(before_ids, code_list):
                a = OtuMf.mapping_file.loc[sample, 'AllergyType']
                if a == 'Milk' or a == 'Milk_suspected' or a == 'milk':
                    id_to_tag_map[code] = 0
                elif a == 'Cashew' or a == 'Cashew ' or a == 'Hazelnut' or a == 'Walnut' or a == 'Nuts':
                    id_to_tag_map[code] = 1
                elif a == 'Peanut':
                    id_to_tag_map[code] = 2
                elif a == 'Sesame':
                    id_to_tag_map[code] = 3
            self._ids_list = [
                id for id in code_list if not id.startswith("Con")
            ]

        self._id_to_tag_map = id_to_tag_map

        # -------------------------------------------- weights !--------------------------------------------
        # calculate weights
        y = list(id_to_tag_map.values())
        classes_sum = [
            np.sum(np.array(y) == unique_class)
            for unique_class in np.unique(np.array(y))
        ]
        classes_ratio = [1 - (a / sum(classes_sum)) for a in classes_sum]
        weights = [classes_ratio[a] for a in np.array(y)]
        self._weight_map = {
            i: classes_ratio[i]
            for i in range(len(classes_ratio))
        }

        # return the list of features and the list of ids in the same order
        feature_list = []
        for id in self._ids_list:
            if id in sample_id_to_sample_code_map.keys():
                feature_list.append(
                    id_to_features_map[sample_id_to_sample_code_map[id]])
            else:
                feature_list.append(id_to_features_map[id])
        self._feature_list = feature_list
Example #8
import datetime
import os

import pandas as pd

# OtuMfHandler, preprocess_data, apply_pca, get_datetime, get_days and
# SCRIPT_DIR are project helpers assumed to be in scope; DataFrame.append
# below requires pandas < 2.0.
def prepare_data(n_components=20, preform_z_scoring=True, taxnomy_level=6):
    OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, 'saliva_samples_231018.csv'),
                         os.path.join(
                             SCRIPT_DIR,
                             'saliva_samples_mapping_file_231018.csv'),
                         from_QIIME=True)
    preproccessed_data = preprocess_data(OtuMf.otu_file,
                                         preform_z_scoring,
                                         visualize_data=True,
                                         taxnomy_level=taxnomy_level,
                                         preform_taxnomy_group=True)
    otu_after_pca_wo_taxonomy, _, _ = apply_pca(preproccessed_data,
                                                n_components=n_components,
                                                visualize=True)

    ######## Pre process (Remove control group) ########
    OtuMf.mapping_file['DATE_datetime'] = OtuMf.mapping_file['DATE'].apply(
        get_datetime)
    OtuMf.mapping_file['Mocosities_start_datetime'] = OtuMf.mapping_file[
        'Mucositis_Start'].apply(get_datetime)
    OtuMf.mapping_file['TIME_BEFORE_MOCO_START'] = OtuMf.mapping_file[
        'Mocosities_start_datetime'] - OtuMf.mapping_file['DATE_datetime']

    OtuMf.mapping_file['time_for_the_event'] = OtuMf.mapping_file[
        'TIME_BEFORE_MOCO_START'].apply(get_days)

    # use .loc so the sentinel assignment hits the frame itself rather than
    # a chained-indexing copy
    OtuMf.mapping_file.loc[
        OtuMf.mapping_file['Mocosities_start_datetime'] ==
        datetime.datetime.strptime('01/01/1900', '%d/%m/%Y'),
        'time_for_the_event'] = 9999
    # create groups
    data_grouped = OtuMf.mapping_file.groupby('Personal_ID')
    censored_data = {}
    not_censored = pd.DataFrame()
    dilated_df = pd.DataFrame()
    y_for_deep = pd.DataFrame()
    x_for_deep = pd.DataFrame()
    x_for_deep_censored = pd.DataFrame()
    y_for_deep_censored = pd.DataFrame()

    for subject_id, subject_data in data_grouped:
        if 9999 in subject_data['time_for_the_event'].values:  # censored
            tmp_data = subject_data.join(otu_after_pca_wo_taxonomy)
            tmp_data_only_valid = tmp_data.loc[tmp_data[0].notnull()]
            if not tmp_data_only_valid.empty:
                x_for_deep_censored = x_for_deep_censored.append(subject_data)

                tmp_data_only_valid[
                    'time_before_moco_start_days'] = tmp_data_only_valid[
                        'TIME_BEFORE_MOCO_START'].apply(get_days)
                tmp_data_only_valid.sort_values(
                    by='time_before_moco_start_days',
                    ascending=False,
                    inplace=True)
                tmp_data_only_valid[
                    'relative_start_date'] = tmp_data_only_valid[
                        'time_before_moco_start_days'].iloc[
                            0] - tmp_data_only_valid[
                                'time_before_moco_start_days']
                tmp_data_only_valid['relative_max_date'] = (
                    tmp_data_only_valid['relative_start_date'].iloc[-1] -
                    tmp_data_only_valid['relative_start_date'])
                tmp_data_only_valid['delta_time'] = -1
                tmp_data_only_valid['mse_coeff'] = 0
                tmp_data_only_valid['time_sense_coeff'] = 1
                y_for_deep_censored = y_for_deep_censored.append(
                    tmp_data_only_valid[[
                        'relative_start_date', 'delta_time',
                        'relative_max_date', 'mse_coeff', 'time_sense_coeff'
                    ]])

                # get only the last sample
                censored_data[subject_id] = tmp_data_only_valid.loc[
                    tmp_data_only_valid['TIME_BEFORE_MOCO_START'] == min(
                        tmp_data_only_valid['TIME_BEFORE_MOCO_START'])]

        else:  # not censored
            before_event_mask = subject_data['time_for_the_event'] > 0
            before_event_subjects = subject_data.loc[before_event_mask]
            if not before_event_subjects.empty:
                not_censored = not_censored.append(before_event_subjects)
                dilated_df = dilated_df.append(before_event_subjects)

                x_for_deep = x_for_deep.append(before_event_subjects)
                before_event_subjects[
                    'time_before_moco_start_days'] = before_event_subjects[
                        'TIME_BEFORE_MOCO_START'].apply(get_days)
                before_event_subjects.sort_values(
                    by='time_before_moco_start_days',
                    ascending=False,
                    inplace=True)
                before_event_subjects[
                    'relative_start_date'] = before_event_subjects[
                        'time_before_moco_start_days'].iloc[
                            0] - before_event_subjects[
                                'time_before_moco_start_days']
                before_event_subjects['relative_max_date'] = (
                    before_event_subjects['relative_start_date'] +
                    before_event_subjects['time_before_moco_start_days'])
                before_event_subjects['delta_time'] = before_event_subjects[
                    'time_for_the_event']
                before_event_subjects['mse_coeff'] = 1
                before_event_subjects['time_sense_coeff'] = 0
                y_for_deep = y_for_deep.append(before_event_subjects[[
                    'relative_start_date', 'delta_time', 'relative_max_date',
                    'mse_coeff', 'time_sense_coeff'
                ]])

    x_for_deep = x_for_deep.join(otu_after_pca_wo_taxonomy)
    x_for_deep = x_for_deep.loc[x_for_deep[0].notnull()]
    y_for_deep = y_for_deep.loc[x_for_deep.index]

    x_for_deep_censored = x_for_deep_censored.join(otu_after_pca_wo_taxonomy)
    x_for_deep_censored = x_for_deep_censored.loc[
        x_for_deep_censored[0].notnull()]
    y_for_deep_censored = y_for_deep_censored.loc[x_for_deep_censored.index]

    return (x_for_deep, y_for_deep, x_for_deep_censored, y_for_deep_censored,
            censored_data, not_censored, otu_after_pca_wo_taxonomy, OtuMf,
            preproccessed_data)
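
Example #8 flags censored subjects with a 9999-day sentinel and branches on it group by group. The same check in isolation, assuming pandas:

import pandas as pd

df = pd.DataFrame({'Personal_ID': [1, 1, 2],
                   'time_for_the_event': [12, 5, 9999]})
for subject_id, subject_data in df.groupby('Personal_ID'):
    censored = 9999 in subject_data['time_for_the_event'].values
    print(subject_id, 'censored' if censored else 'uncensored')
# prints: 1 uncensored, then 2 censored
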
Example #9
    def _read_file(self, title, bactria_as_feature_file, samples_data_file,
                   allow_printing, perform_anna_preprocess):
        features = pd.read_csv(bactria_as_feature_file, header=1)
        cols = list(features.columns)
        # remove non-numeric values
        cols.remove('Feature ID')
        cols.remove('Taxonomy')

        OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, bactria_as_feature_file),
                             os.path.join(SCRIPT_DIR, samples_data_file),
                             from_QIIME=True,
                             id_col='Feature ID',
                             taxonomy_col='Taxonomy')

        if perform_anna_preprocess:
            preproccessed_data = preprocess_data(OtuMf.otu_file,
                                                 visualize_data=False,
                                                 taxonomy_col='Taxonomy',
                                                 taxnomy_level=6)
            mapping_file = OtuMf.mapping_file['XXXXX']
            mapping_disease = {
                'a': 0,
                'b': 1,  # 'Cashew' + 'Hazelnut' + 'Walnut'
                'c': 2,
                'd': 3
            }
            mapping_file = mapping_file.map(mapping_disease)
            preproccessed_data, mapping_file = distance_learning(
                perform_distance=True,
                level=self._taxnomy_level,
                preproccessed_data=preproccessed_data,
                mapping_file=mapping_file)
            self._preproccessed_data = preproccessed_data
        else:
            preproccessed_data = preprocess_data(
                OtuMf.otu_file,
                visualize_data=False,
                taxnomy_level=self._taxnomy_level,
                taxonomy_col='Taxonomy',
                preform_taxnomy_group=True)

            self._preproccessed_data = preproccessed_data
            # drow_data(preproccessed_data)
            # otu_after_pca_wo_taxonomy, _, _ = apply_pca(data_after_log_zcore, n_components=40, visualize=False)

        otu_after_pca_wo_taxonomy, pca_obj, _ = apply_pca(
            preproccessed_data, n_components=n_components, visualize=False)
        self._pca_obj = pca_obj

        index_to_id_map = {}
        id_to_features_map = {}
        for i, row in enumerate(otu_after_pca_wo_taxonomy.values):
            id_to_features_map[otu_after_pca_wo_taxonomy.index[i]] = row
            index_to_id_map[i] = otu_after_pca_wo_taxonomy.index[i]

        self._index_to_id_map = index_to_id_map
        self._id_to_features_map = id_to_features_map
        ids_list = otu_after_pca_wo_taxonomy.index.tolist()
        ids_list_wo_con = otu_after_pca_wo_taxonomy.index.drop(
            otu_after_pca_wo_taxonomy.index[0:62])

        if self._task == "health task":
            self._ids_list = ids_list
            id_to_tag_map = {}
            for sample in ids_list:
                if sample.startswith('Con'):
                    id_to_tag_map[sample] = 1
                else:
                    id_to_tag_map[sample] = 0
            self._id_to_tag_map = id_to_tag_map

        if self._task == "prognostic task":
            treatment_point_column = 'TreatmentPoint'
            before_treatment_ids = []
            for sample in ids_list_wo_con:
                s = OtuMf.mapping_file.loc[sample, treatment_point_column]
                if s == "before":
                    before_treatment_ids.append(sample)

            self._ids_list = list(before_treatment_ids)
            success_column = 'SuccessDescription'
            id_to_tag_map = {}
            for sample in before_treatment_ids:
                t = OtuMf.mapping_file.loc[sample, success_column]
                id_to_tag_map[sample] = t
                if t == 'A1':
                    id_to_tag_map[sample] = 1
                else:
                    id_to_tag_map[sample] = 0
            self._id_to_tag_map = id_to_tag_map

        if self._task == "diagnostics task":
            self._ids_list = list(ids_list_wo_con)
            success_column = 'SuccessDescription'
            id_to_tag_map = {}
            for sample in ids_list_wo_con:
                t = OtuMf.mapping_file.loc[sample, success_column]
                id_to_tag_map[sample] = t
                if t == 'A1':
                    id_to_tag_map[sample] = 1
                else:
                    id_to_tag_map[sample] = 0
            self._id_to_tag_map = id_to_tag_map

        # -------------------------------------------- weights !--------------------------------------------
        # calculate weights
        y = list(id_to_tag_map.values())
        classes_sum = [
            np.sum(np.array(y) == unique_class)
            for unique_class in np.unique(np.array(y))
        ]
        classes_ratio = [1 - (a / sum(classes_sum)) for a in classes_sum]
        weights = [classes_ratio[a] for a in np.array(y)]
        self._weight_map = {
            i: classes_ratio[i]
            for i in range(len(classes_ratio))
        }

        # return the list of features and the list of ids in the same order
        feature_list = [id_to_features_map[id] for id in ids_list]
        self._feature_list = feature_list
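
The "A1 means success, anything else means failure" binarization appears in both the prognostic and diagnostics tasks of Example #9, and again in Example #4. As a one-line helper:

def binary_success_tag(success_description):
    # 1 for a full success ('A1'), 0 for any other outcome.
    return 1 if success_description == 'A1' else 0
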