def distance_learning(perform_distance, level, preproccessed_data, mapping_file):
    """Replace groups of taxa that share a taxonomy rank with PCA components.

    Column names are treated as ``;``-separated taxonomy strings. Columns are
    grouped by the entry at position ``level - 1``; columns whose name has
    fewer entries go to the ``'else'`` group. Each non-empty group is reduced
    with ``apply_pca`` and the resulting components become the columns of the
    returned frame.

    Args:
        perform_distance: When falsy, the inputs are returned untouched.
        level: 1-based position of the taxonomy entry used for grouping.
        preproccessed_data: DataFrame whose column names are taxonomy strings.
        mapping_file: Passed through unchanged.

    Returns:
        ``(new_df, mapping_file)`` where ``new_df`` shares the index of
        ``preproccessed_data``. Columns are named ``<taxonomy prefix>_<j>``
        for regular groups and ``else;_<j>`` for the ``'else'`` group.

    Note:
        The ``'else'`` group used to write every component to the single
        column ``'else;'``, so only its last component survived. Each
        component now gets its own ``else;_<j>`` column.
    """
    if not perform_distance:
        return preproccessed_data, mapping_file

    # Constant columns are left out of every group.
    informative_cols = [
        col for col in preproccessed_data.columns
        if preproccessed_data[col].nunique() != 1
    ]

    rank_index = level - 1
    groups = {'else': []}
    for col in informative_cols:
        ranks = col.split(';')
        group_key = ranks[rank_index] if len(ranks) > rank_index else 'else'
        groups.setdefault(group_key, []).append(col)

    new_df = pd.DataFrame(index=preproccessed_data.index)
    for key, members in groups.items():
        if not members:
            continue
        group_data = preproccessed_data[members]

        # Probe PCA: used only to decide how many components to keep.
        pca = PCA(n_components=min(round(group_data.shape[1] / 2) + 1,
                                   group_data.shape[0]))
        pca.fit(group_data)

        # Keep as many leading components as were accumulated before the
        # running explained-variance total first exceeded 0.5.
        # NOTE(review): if the total only exceeds 0.5 on the last probed
        # component (or never), this falls back to 1 component - confirm
        # that is the intended behaviour.
        explained = 0
        num_comp = 0
        for i, ratio in enumerate(pca.explained_variance_ratio_):
            if explained <= 0.5:
                explained += ratio
            else:
                num_comp = i
                break
        if num_comp == 0:
            num_comp = 1

        reduced, _, _ = apply_pca(group_data, n_components=num_comp)

        if key == 'else':
            prefix = 'else;'
        else:
            # Taxonomy string of the first member, cut right after the
            # grouping entry.
            first = members[0]
            prefix = str(first[0:first.find(key) + len(key)])

        for j in range(reduced.shape[1]):
            new_df[prefix + '_' + str(j)] = reduced[j]

    return new_df, mapping_file
def get_days(days_datetime): return days_datetime.days n_components = 20 OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, 'saliva_samples_231018.csv'), os.path.join(SCRIPT_DIR, 'saliva_samples_mapping_file_231018.csv'), from_QIIME=True) preproccessed_data = preprocess_data(OtuMf.otu_file, visualize_data=True, taxnomy_level=5) otu_after_pca_wo_taxonomy, _ = apply_pca(preproccessed_data, n_components=n_components, visualize=False) # otu_after_pca = OtuMf.add_taxonomy_col_to_new_otu_data(otu_after_pca_wo_taxonomy) # merged_data_after_pca = OtuMf.merge_mf_with_new_otu_data(otu_after_pca_wo_taxonomy) # merged_data_with_age = otu_after_pca_wo_taxonomy.join(OtuMf.mapping_file['age_in_days']) # merged_data_with_age = merged_data_with_age[merged_data_with_age.age_in_days.notnull()] # remove NaN days # merged_data_with_age_group = otu_after_pca_wo_taxonomy.join(OtuMf.mapping_file[['age_group', 'age_in_days','MouseNumber']]) # merged_data_with_age_group = merged_data_with_age_group[merged_data_with_age_group.age_group.notnull()] # remove NaN days # OtuMf.mapping_file.apply(lambda x: -999 if x['Mucositis_Start'] is None else (datetime.datetime.strptime(x['DATE'], '%d/%m/%Y') - datetime.datetime.strptime(x['Mucositis_Start'], '%d/%m/%Y')).days) OtuMf.mapping_file['DATE_datetime'] = OtuMf.mapping_file['DATE'].apply( get_datetime) OtuMf.mapping_file['Mocosities_start_datetime'] = OtuMf.mapping_file[ 'Mucositis_Start'].apply(get_datetime) OtuMf.mapping_file['TIME_BEFORE_MOCO_START'] = OtuMf.mapping_file[
def _read_file(self, title, bactria_as_feature_file, samples_data_file, allow_printing, perform_anna_preprocess, visualize_pre, re_arange):
    """Load the OTU table and mapping file and write the mapping file to CSV.

    The OTU/mapping handler is built from the two file names (resolved against
    ``SCRIPT_DIR``), passed through ``self.remove_duplicate`` and
    ``self.rearange_data``, and its mapping file is written to
    ``"GDM_tag_rmv_dup.csv"``.

    NOTE(review): as reconstructed here, the bare ``return`` right after the
    CSV export ends the method, so the preprocessing, tagging and weighting
    code below it is unreachable. The source layout was ambiguous at that
    point - confirm whether the early return is intentional.

    Args:
        title: Not used in this method.
        bactria_as_feature_file: OTU table file name, joined to ``SCRIPT_DIR``.
        samples_data_file: Mapping file name, joined to ``SCRIPT_DIR``.
        allow_printing: Not used in this method.
        perform_anna_preprocess: Selects the ``distance_learning`` branch.
        visualize_pre: Forwarded as ``visualize_data`` in the other branch.
        re_arange: Forwarded to ``self.rearange_data``.
    """
    sample = "SALIVA"
    OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, bactria_as_feature_file),
                         os.path.join(SCRIPT_DIR, samples_data_file),
                         from_QIIME=False, id_col='ID', taxonomy_col='taxonomy')
    #rare_bacteria = self.find_rare_bacteria(OtuMf)
    #OtuMf = self.drop_rare_bacteria(rare_bacteria, OtuMf)
    OtuMf = self.remove_duplicate(OtuMf)
    OtuMf = self.rearange_data(OtuMf, re_arange)
    #OtuMf.otu_file.T.to_csv("GDM_OTU_rmv_dup_arrange.csv")
    OtuMf.mapping_file.to_csv("GDM_tag_rmv_dup.csv")
    #returnmapping_file
    return
    # ---- everything below is unreachable while the return above is live ----
    if perform_anna_preprocess:
        preproccessed_data = preprocess_data(OtuMf.otu_file.T, visualize_data=False, taxonomy_col='taxonomy', taxnomy_level=8)
        mapping_file = OtuMf.mapping_file['Control_GDM']
        mapping_disease = {'Control': 0, 'GDM': 1}
        mapping_file = mapping_file.map(mapping_disease)
        preproccessed_data, mapping_file = distance_learning(perform_distance=True, level=4, preproccessed_data=preproccessed_data, mapping_file=mapping_file)
        self._preproccessed_data = preproccessed_data
        self._preproccessed_data.to_csv('anna_pca_old_loader.csv')
    else:
        # NOTE(review): rearange_data was already applied once above, so this
        # applies it a second time when re_arange != 0 - confirm.
        if re_arange != 0:
            OtuMf = self.rearange_data(OtuMf, re_arange)
        preproccessed_data = preprocess_data(OtuMf.otu_file.T, visualize_data=visualize_pre, taxnomy_level=self._taxnomy_level, taxonomy_col='taxonomy', preform_taxnomy_group=True, std_to_delete = 0.25)
    self.OtuMf = OtuMf
    self._preproccessed_data = preproccessed_data
    # n_components is not a parameter here; it is read from an enclosing scope.
    otu_after_pca_wo_taxonomy, pca_obj, _ = apply_pca(preproccessed_data, n_components=n_components, visualize=False)
    self._pca_obj = pca_obj
    # The next line discards the PCA output computed above and uses the
    # preprocessed data as features instead; remove it if PCA is needed.
    otu_after_pca_wo_taxonomy = self._preproccessed_data
    index_to_id_map = {}
    id_to_features_map = {}
    # Map each sample id to its feature row, and each row position to its id.
    for i, row in enumerate(otu_after_pca_wo_taxonomy.values):
        id_to_features_map[otu_after_pca_wo_taxonomy.index[i]] = row
        index_to_id_map[i] = otu_after_pca_wo_taxonomy.index[i]
    self._index_to_id_map = index_to_id_map
    self._id_to_features_map = id_to_features_map
    ids_whole_list = otu_after_pca_wo_taxonomy.index.tolist()
    # ------------------------------------ each TASK creates different tag map --------------------------------
    id_to_tag_map = {}
    tag_map = {'Control': 0, 'GDM': 1}
    # `sample` is hard-coded to "SALIVA" above, so only the else branch runs:
    # keep ids whose trimester is 1 and whose body_site equals `sample`.
    if sample == "both":
        T1_ids = [id for id in ids_whole_list if OtuMf.mapping_file["trimester"][id] == '1']
    else:
        T1_ids = [id for id in ids_whole_list if int(OtuMf.mapping_file["trimester"][id]) == 1 and OtuMf.mapping_file["body_site"][id] == sample]
    # The two counters below are assigned but never read.
    counter_GDM = 0
    counter_Control = 0
    for id in T1_ids:
        id_to_tag_map[id] = tag_map[OtuMf.mapping_file["Control_GDM"][id]]
    self._ids_list = T1_ids
    self._id_to_tag_map = id_to_tag_map
    # -------------------------------------------- weights !--------------------------------------------
    # calculate weights
    y = list(self._id_to_tag_map.values())
    # Number of samples per distinct tag, in np.unique (sorted) order.
    classes_sum = [np.sum(np.array(y) == unique_class) for unique_class in np.unique(np.array(y))]
    # Weight of a class = 1 - its share of the samples.
    classes_ratio = [1 - (a / sum(classes_sum)) for a in classes_sum]
    # `weights` is computed but never used afterwards.
    weights = [classes_ratio[a] for a in np.array(y)]
    # Keys are positions in the sorted unique-tag list, not the tag values.
    self._weight_map = {i: classes_ratio[i] for i in range(len(classes_ratio))}
    # return the list of features and the list of ids in the same order
    self._feature_list = [self._id_to_features_map[id] for id in self._ids_list]
def _read_file(self, TITLE, PRINT, REG, WEIGHTS, ANNA_PREPROCESS):
    """Load the allergy OTU table and mapping file and build per-sample maps.

    Populates feature maps, id lists and tag maps on ``self`` (stage, success,
    allergy type, milk-vs-other, healthy-vs-allergic, single-vs-multiple) and,
    when ``WEIGHTS`` is truthy, the matching max-normalised class-weight maps.

    Args:
        TITLE: Not used in this method.
        PRINT: Not used in this method.
        REG: When truthy, the feature CSV is read and handed to ``self.reg``.
        WEIGHTS: When truthy, class-weight maps are computed and stored.
        ANNA_PREPROCESS: Selects the ``distance_learning`` preprocessing
            branch instead of taxonomy grouping.

    Returns:
        ``(ids_list_w_con, ids_list_wo_con, feature_list)``; ``feature_list``
        is ordered like ``ids_list_w_con``.

    Raises:
        ZeroDivisionError: With ``WEIGHTS`` set, if any counted class is empty.
    """
    bactria_as_feature_file = 'feature-table_Allergy_cleaned_taxa_290119_updated_in_140219.csv'

    if REG:
        # The raw table is only needed by self.reg, so it is read only here.
        features = pd.read_csv(bactria_as_feature_file, header=1)
        cols = list(features.columns)
        # remove non-numeric values
        cols.remove('Feature ID')
        cols.remove('Taxonomy')
        self.reg(features, cols)

    # get single/multiple information
    multiple_samples_info_path = 'mf_merge_ok84_ok93_ok66_69_TreeNuts_controls_271118_040219 post MG17 07.05.19.csv'
    multiple_samples_info_df = pd.read_csv(multiple_samples_info_path)
    single_or_multiple_map = dict(
        zip(multiple_samples_info_df['SampleCode'],
            multiple_samples_info_df['Michael_4_Single_Multiple']))
    ids_list_wo_multiple = [
        key for key, val in single_or_multiple_map.items() if val == 'Single'
    ]
    ids_of_multiple = [
        key for key, val in single_or_multiple_map.items() if val == 'Multiple'
    ]
    id_to_single_or_multiple_allergy_map = {
        sample_id: 0 for sample_id in ids_list_wo_multiple
    }
    id_to_single_or_multiple_allergy_map.update(
        (sample_id, 1) for sample_id in ids_of_multiple)
    self.id_to_single_or_multiple_allergy_map = id_to_single_or_multiple_allergy_map

    samples_data_file = 'mf_merge_ok84_ok93_ok66_69_merged_by_RestoredSampleCode_as_ID_290119.csv'
    OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, bactria_as_feature_file),
                         os.path.join(SCRIPT_DIR, samples_data_file),
                         from_QIIME=True,
                         id_col='Feature ID',
                         taxonomy_col='Taxonomy')

    if ANNA_PREPROCESS:
        preproccessed_data = preprocess_data(OtuMf.otu_file,
                                             visualize_data=False,
                                             taxonomy_col='Taxonomy',
                                             taxnomy_level=6)
        mapping_file = OtuMf.mapping_file['AllergyType']
        mapping_disease = {
            'Milk': 0,
            'Tree_nut': 1,  # 'Cashew' + 'Hazelnut' + 'Walnut'
            'Peanut': 2,
            'Sesame': 3
        }
        mapping_file = mapping_file.map(mapping_disease)
        preproccessed_data, mapping_file = distance_learning(
            perform_distance=True,
            level=3,
            preproccessed_data=preproccessed_data,
            mapping_file=mapping_file)
    else:
        preproccessed_data = preprocess_data(OtuMf.otu_file,
                                             visualize_data=False,
                                             taxnomy_level=6,
                                             taxonomy_col='Taxonomy',
                                             preform_taxnomy_group=True)
    self._preproccessed_data = preproccessed_data

    # n_components is not a parameter here; it is read from an enclosing scope.
    otu_after_pca_wo_taxonomy, pca_obj, _ = apply_pca(
        preproccessed_data, n_components=n_components, visualize=False)
    self._pca_obj = pca_obj

    sample_index = otu_after_pca_wo_taxonomy.index
    id_to_features_map = dict(zip(sample_index, otu_after_pca_wo_taxonomy.values))
    index_to_id_map = dict(enumerate(sample_index))
    self._index_to_id_map = index_to_id_map
    self._id_to_features_map = id_to_features_map

    success_tag_column = 'SuccessDescription'
    stages_column = 'TreatmentTimePoint'
    allergan_column = 'AllergyType'

    ids_list_w_con = sample_index.tolist()
    # The first 62 rows are dropped as the control ('Con') samples.
    ids_list_wo_con = sample_index.drop(sample_index[0:62])
    self._ids_list_w_con = ids_list_w_con
    self._ids_list_wo_con = ids_list_wo_con

    # ##### separate samples by allergic and healthy==>'Con'
    id_to_binary_health_tag_map = {
        sample: 1 if sample.startswith('Con') else 0
        for sample in ids_list_w_con
    }
    self._id_to_binary_health_tag_map = id_to_binary_health_tag_map

    # ##### separate samples by stage, success of treatment and allergen type
    id_to_success_tag_map = {}
    id_to_stage_map = {}
    id_to_binary_success_tag_map = {}
    id_to_allergy_type_tag_map = {}
    id_to_allergy_number_type_tag_map = {}
    id_to_milk_allergy_tag_map = {}
    tag_to_allergy_type_map = {
        0: 'Milk',
        1: 'Tree_nut',  # 'Cashew' + 'Hazelnut' + 'Walnut'
        2: 'Peanut',
        3: 'Sesame'
    }  # removed 'Egg' samples
    allergy_type_to_instances_map = {
        'Milk': 0,
        'Tree_nut': 0,
        'Peanut': 0,
        'Sesame': 0
    }  # 'Non': 9 samples, 'Egg': 35 samples

    milk_labels = ('Milk', 'Milk_suspected', 'milk')
    tree_nut_labels = ('Cashew', 'Cashew ', 'Hazelnut', 'Walnut', 'Nuts')

    def _register_allergy(sample, tag, group):
        """Record the numeric allergy tag, the milk flag and the group count."""
        id_to_allergy_number_type_tag_map[sample] = tag
        id_to_milk_allergy_tag_map[sample] = 1 if group == 'Milk' else 0
        allergy_type_to_instances_map[group] += 1

    non_allergy_type_ids = []
    egg_allergy_type_ids = []
    for sample in ids_list_wo_con:
        # stage
        id_to_stage_map[sample] = OtuMf.mapping_file.loc[sample, stages_column]

        # success: save tags from k-classes as success(A1)->1 and failure(the rest)->0
        t = OtuMf.mapping_file.loc[sample, success_tag_column]
        id_to_success_tag_map[sample] = t
        id_to_binary_success_tag_map[sample] = 1 if t == 'A1' else 0

        # allergy type
        a = OtuMf.mapping_file.loc[sample, allergan_column]
        id_to_allergy_type_tag_map[sample] = a
        if a in milk_labels:
            _register_allergy(sample, 0, 'Milk')
        elif a in tree_nut_labels:
            _register_allergy(sample, 1, 'Tree_nut')
        elif a == 'Peanut':
            _register_allergy(sample, 2, 'Peanut')
        elif a == 'Sesame':
            _register_allergy(sample, 3, 'Sesame')
        elif a == 'Egg':
            egg_allergy_type_ids.append(sample)
        elif a == 'Non':
            non_allergy_type_ids.append(sample)
        else:
            print("error in allergy type " + str(sample))

    # Built once after the loop: the old code rebuilt this list on every
    # iteration and left it undefined when there were no non-control samples.
    stage_0_ids = [
        key for key, stage in id_to_stage_map.items() if stage == '0_before'
    ]
    self._stage_0_ids = stage_0_ids

    excluded_ids = set(non_allergy_type_ids) | set(egg_allergy_type_ids)
    self._id_wo_non_and_egg_allergy_type_list = [
        x for x in self._ids_list_wo_con if x not in excluded_ids
    ]
    self._tag_to_allergy_type_map = tag_to_allergy_type_map
    self._allergy_type_to_instances_map = allergy_type_to_instances_map
    self._id_to_success_tag_map = id_to_success_tag_map
    self._id_to_stage_map = id_to_stage_map
    self._id_to_binary_success_tag_map = id_to_binary_success_tag_map
    self._id_to_allergy_type_tag_map = id_to_allergy_type_tag_map
    self._id_to_allergy_number_type_tag_map = id_to_allergy_number_type_tag_map
    self._id_to_milk_allergy_tag_map = id_to_milk_allergy_tag_map
    self._ids_list_wo_multiple = [
        id for id in ids_list_wo_multiple
        if id in id_to_allergy_number_type_tag_map
    ]

    # -------------------------------------------- weights !--------------------------------------------
    if WEIGHTS:

        def _normalized(weight_map):
            """Scale the weights so that the largest one equals 1."""
            max_weight = max(weight_map.values())
            return {key: val / max_weight for key, val in weight_map.items()}

        # weights for types of allergy
        total_sum = sum(allergy_type_to_instances_map.values())
        allergy_type_to_weight_map = _normalized({
            t: total_sum / count
            for t, count in allergy_type_to_instances_map.items()
        })

        # weights for milk vs. other types of allergy
        milk_count = allergy_type_to_instances_map["Milk"]
        milk_vs_other_allergy_weight_map = _normalized({
            'Other': total_sum / (total_sum - milk_count),
            'Milk': total_sum / milk_count
        })

        # weights for healthy and allergic
        n_all = len(ids_list_w_con)
        n_allergic = len(ids_list_wo_con)
        healthy_vs_allergic_weight_map = _normalized({
            'Allergic': n_all / n_allergic,
            'Healthy': n_all / (n_all - n_allergic)
        })

        # weights for responding and not (success)
        success_tags = list(id_to_binary_success_tag_map.values())
        responding_vs_not_weight_map = _normalized({
            'No': n_allergic / success_tags.count(0),
            'Yes': n_allergic / success_tags.count(1)
        })

        # weights for responding and not (prognostic): stage-0 samples only
        stage_0_tags = [id_to_binary_success_tag_map.get(i) for i in stage_0_ids]
        prognostic_responding_vs_not_weight_map = _normalized({
            'No': len(stage_0_ids) / stage_0_tags.count(0),
            'Yes': len(stage_0_ids) / stage_0_tags.count(1)
        })

        self._allergy_type_to_weight_map = allergy_type_to_weight_map
        self._milk_vs_other_allergy_weight_map = milk_vs_other_allergy_weight_map
        self._healthy_vs_allergic_weight_map = healthy_vs_allergic_weight_map
        self._responding_vs_not_weight_map = responding_vs_not_weight_map
        self._prognostic_responding_vs_not_weight_map = prognostic_responding_vs_not_weight_map

    # return the list of features and the list of ids in the same order
    feature_list = [id_to_features_map[id] for id in ids_list_w_con]
    return ids_list_w_con, ids_list_wo_con, feature_list
def prepare_data(n_components=20):
    """Build censored / uncensored time-to-event inputs from the allergy data.

    Loads the OTU table and mapping file, reduces the preprocessed OTU data
    with ``apply_pca``, drops control rows and subjects with placeholder
    dates, computes ``time_for_the_event`` per subject (9999 marks subjects
    with no ``'A1'`` sample), keeps only the allergies in ``allergy_to_use``
    and assembles per-subject x / y frames.

    Args:
        n_components: Number of PCA components passed to ``apply_pca``.

    Returns:
        Tuple ``(x_for_deep, y_for_deep, x_for_deep_censored,
        y_for_deep_censored, censored_data, not_censored,
        otu_after_pca_wo_taxonomy, OtuMf)``. ``censored_data`` maps a subject
        id to the rows holding that subject's smallest ``relative_max_date``.
    """
    OtuMf = OtuMfHandler(
        os.path.join(
            SCRIPT_DIR,
            'feature-table_Allergy_cleaned_taxa_290119_updated_in_140219.csv'),
        os.path.join(
            SCRIPT_DIR,
            'mf_merge_ok84_ok93_ok66_69_merged_by_RestoredSampleCode_as_ID_290119.csv'
        ),
        from_QIIME=True,
        id_col='Feature ID',
        taxonomy_col='Taxonomy')
    preproccessed_data = preprocess_data(OtuMf.otu_file,
                                         visualize_data=True,
                                         taxnomy_level=5,
                                         taxonomy_col='Taxonomy',
                                         preform_taxnomy_group=True)
    otu_after_pca_wo_taxonomy, _, _ = apply_pca(preproccessed_data,
                                                n_components=n_components,
                                                visualize=True)

    allergy_col = 'AllergyTypeData131118'
    patient_col = 'PatientNumber210119'
    y_columns = [
        'relative_start_date', 'delta_time', 'relative_max_date',
        'mse_coeff', 'time_sense_coeff'
    ]

    def _stack(frames):
        """Concatenate the collected frames, or return an empty frame."""
        return pd.concat(frames) if frames else pd.DataFrame()

    ######## Pre process (Remove control group) ########
    # .copy() so the column assignments below act on an independent frame.
    OtuMf.mapping_file = OtuMf.mapping_file.loc[
        OtuMf.mapping_file[allergy_col] != 'Con'].copy()

    ######## get date of sample in date format ########
    OtuMf.mapping_file['Date_of_sample'] = OtuMf.mapping_file['Date'].apply(
        get_datetime, time_format='%m/%d/%y')

    ######## remove invalid subjects (those who had samples with no dates or bad dates) ########
    bad_date_rows = OtuMf.mapping_file.loc[
        OtuMf.mapping_file['Date_of_sample'].isin(['1800-01-01', '1900-01-01'])]
    patients_with_bad_date = bad_date_rows[patient_col].unique()
    OtuMf.mapping_file = OtuMf.mapping_file.loc[
        ~OtuMf.mapping_file[patient_col].isin(patients_with_bad_date)].copy()

    ######## Calculate time for event ########
    OtuMf.mapping_file['time_for_the_event'] = 9999
    for subject_id, subject_data in OtuMf.mapping_file.groupby(patient_col):
        if any(subject_data['SuccessDescription'] == 'A1'):
            # Uncensored: the event date is the subject's latest sample date.
            date_of_event = subject_data['Date_of_sample'].max()
            time_for_the_event = date_of_event - subject_data['Date_of_sample']
            tmp_df = OtuMf.mapping_file.loc[subject_data.index].copy()
            tmp_df['time_for_the_event'] = time_for_the_event.apply(get_days)
            OtuMf.mapping_file.update(tmp_df)
        # Censored subjects keep the 9999 marker.

    ######## Filter allergies ########
    # allergy types ['Sesame', 'Peanut', 'Egg', 'Non', 'Walnut', 'Milk', 'Cashew', 'Hazelnut']
    allergy_to_use = ['Peanut']
    OtuMf.mapping_file = OtuMf.mapping_file[
        OtuMf.mapping_file[allergy_col].isin(allergy_to_use)]

    ######## Create inputs ########
    censored_data = {}
    not_censored_frames = []
    x_frames = []
    y_frames = []
    x_censored_frames = []
    y_censored_frames = []

    for subject_id, subject_data in OtuMf.mapping_file.groupby(patient_col):
        if 9999 in subject_data['time_for_the_event'].values:
            # censored
            tmp_data = subject_data.join(otu_after_pca_wo_taxonomy)
            # Keep only samples that have PCA features (column 0 present).
            tmp_data_only_valid = tmp_data.loc[tmp_data[0].notnull()]
            if tmp_data_only_valid.empty:
                continue
            x_censored_frames.append(subject_data)
            tmp_data_only_valid = tmp_data_only_valid.sort_values(
                by='Date_of_sample', ascending=True)
            sample_dates = tmp_data_only_valid['Date_of_sample']
            tmp_data_only_valid['relative_start_date'] = (
                sample_dates - sample_dates.iloc[0]).apply(get_days)
            tmp_data_only_valid['relative_max_date'] = (
                sample_dates.iloc[-1] - sample_dates).apply(get_days)
            tmp_data_only_valid['delta_time'] = -1
            tmp_data_only_valid['mse_coeff'] = 0
            tmp_data_only_valid['time_sense_coeff'] = 1
            y_censored_frames.append(tmp_data_only_valid[y_columns])
            # get only the last sample
            censored_data[subject_id] = tmp_data_only_valid.loc[
                tmp_data_only_valid['relative_max_date'] == min(
                    tmp_data_only_valid['relative_max_date'])]
        else:
            # not censored
            before_event_subjects = subject_data.loc[
                subject_data['time_for_the_event'] > 0]
            if before_event_subjects.empty:
                continue
            # The x / not_censored frames receive the rows as they are here;
            # the y columns are added to a separate sorted frame below.
            not_censored_frames.append(before_event_subjects)
            x_frames.append(before_event_subjects)
            ordered = before_event_subjects.sort_values(
                by='time_for_the_event', ascending=False)
            ordered['relative_start_date'] = (
                ordered['time_for_the_event'].iloc[0] -
                ordered['time_for_the_event'])
            ordered['relative_max_date'] = ordered['time_for_the_event']
            ordered['delta_time'] = ordered['time_for_the_event']
            ordered['mse_coeff'] = 1
            ordered['time_sense_coeff'] = 0
            y_frames.append(ordered[y_columns])

    not_censored = _stack(not_censored_frames)

    x_for_deep = _stack(x_frames).join(otu_after_pca_wo_taxonomy)
    x_for_deep = x_for_deep.loc[x_for_deep[0].notnull()]
    y_for_deep = _stack(y_frames).loc[x_for_deep.index]

    x_for_deep_censored = _stack(x_censored_frames).join(
        otu_after_pca_wo_taxonomy)
    x_for_deep_censored = x_for_deep_censored.loc[
        x_for_deep_censored[0].notnull()]
    y_for_deep_censored = _stack(y_censored_frames).loc[
        x_for_deep_censored.index]

    return x_for_deep, y_for_deep, x_for_deep_censored, y_for_deep_censored, censored_data, not_censored, otu_after_pca_wo_taxonomy, OtuMf
def predict_get_spearman_value(test_set, regressor):
    """Predict ``age_in_days`` for ``test_set`` and correlate with the truth.

    Args:
        test_set: DataFrame with an ``age_in_days`` column; every other
            column is passed to the regressor as a feature.
        regressor: Object exposing ``predict(features)``.

    Returns:
        ``(test_df, spearman_values)`` where ``test_df`` holds the
        ``age_in_days`` and ``predicted`` columns and ``spearman_values`` is
        whatever ``use_spearmanr`` returns for those two arrays.
    """
    test_df = pd.DataFrame(test_set['age_in_days'])
    feature_columns = test_set.columns != 'age_in_days'
    test_df['predicted'] = regressor.predict(test_set.loc[:, feature_columns])
    spearman_values = use_spearmanr(test_set['age_in_days'].values,
                                    test_df['predicted'].values)
    return test_df, spearman_values


if __name__ == "__main__":
    OtuMf = OtuMfHandler('aging_otu_table.csv', 'mf.csv', from_QIIME=True)
    preproccessed_data = preprocess_data(OtuMf.otu_file,
                                         visualize_data=True,
                                         taxnomy_level=5)
    # NOTE(review): this unpacks two values from apply_pca - confirm that
    # matches the apply_pca this script imports.
    otu_after_pca_wo_taxonomy, _ = apply_pca(preproccessed_data,
                                             n_components=80)

    merged_data_with_age = otu_after_pca_wo_taxonomy.join(
        OtuMf.mapping_file['age_in_days'])
    merged_data_with_age = merged_data_with_age[
        merged_data_with_age.age_in_days.notnull()]  # remove NaN days

    # create train set and test set (shuffled 85% / 15% split)
    merged_data_with_age = merged_data_with_age.sample(frac=1)
    train_size = math.ceil(merged_data_with_age.shape[0] * 0.85)
    train_set = merged_data_with_age.iloc[0:train_size]
    # Start exactly where the train slice ends; `train_size + 1` silently
    # dropped one sample from both sets.
    test_set = merged_data_with_age.iloc[train_size:]

    train_x_data = train_set.loc[:, train_set.columns != 'age_in_days']
    train_y_values = train_set['age_in_days']
def _read_file(self, title, bactria_as_feature_file, samples_data_file, allow_printing, perform_anna_preprocess):
    """Load OTU data and build the tag map for a before-treatment task.

    Sets ``_preproccessed_data``, ``_pca_obj``, ``_index_to_id_map``,
    ``_id_to_features_map``, ``_ids_list``, ``_id_to_tag_map``,
    ``_weight_map`` and ``_feature_list`` on ``self``. The tag map depends on
    ``self._task`` (``"health_before_treatment_task"`` or
    ``"allergy_type_before_treatment_task"``).

    Args:
        title: Not used in this method.
        bactria_as_feature_file: OTU table file name, joined to ``SCRIPT_DIR``.
        samples_data_file: Mapping file name, joined to ``SCRIPT_DIR``.
        allow_printing: Not used in this method.
        perform_anna_preprocess: Selects the ``distance_learning``
            preprocessing branch instead of taxonomy grouping.
    """
    OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, bactria_as_feature_file),
                         os.path.join(SCRIPT_DIR, samples_data_file),
                         from_QIIME=True,
                         id_col='Feature ID',
                         taxonomy_col='Taxonomy')

    if perform_anna_preprocess:
        preproccessed_data = preprocess_data(OtuMf.otu_file,
                                             visualize_data=False,
                                             taxonomy_col='Taxonomy',
                                             taxnomy_level=6)
        # NOTE(review): 'XXXXX' and the 'a'..'d' labels look like template
        # placeholders - confirm the real column / label names.
        mapping_file = OtuMf.mapping_file['XXXXX']
        mapping_disease = {
            'a': 0,
            'b': 1,
            'c': 2,
            'd': 3
        }
        mapping_file = mapping_file.map(mapping_disease)
        preproccessed_data, mapping_file = distance_learning(
            perform_distance=True,
            level=self._taxnomy_level,
            preproccessed_data=preproccessed_data,
            mapping_file=mapping_file)
    else:
        preproccessed_data = preprocess_data(
            OtuMf.otu_file,
            visualize_data=False,
            taxnomy_level=self._taxnomy_level,
            taxonomy_col='Taxonomy',
            preform_taxnomy_group=True)
    self._preproccessed_data = preproccessed_data

    # n_components is not a parameter here; it is read from an enclosing scope.
    otu_after_pca_wo_taxonomy, pca_obj, _ = apply_pca(
        preproccessed_data, n_components=n_components, visualize=False)
    self._pca_obj = pca_obj

    sample_index = otu_after_pca_wo_taxonomy.index
    id_to_features_map = dict(zip(sample_index, otu_after_pca_wo_taxonomy.values))
    self._index_to_id_map = dict(enumerate(sample_index))
    self._id_to_features_map = id_to_features_map

    # Mapping-file id -> SampleCode for every id not starting with "Con".
    sample_id_to_sample_code_map = {
        sample_id: code
        for sample_id, code in zip(OtuMf.mapping_file.index,
                                   OtuMf.mapping_file["SampleCode"])
        if not sample_id.startswith("Con")
    }

    # ------------------------------------ each TASK creates different tag map --------------------------------
    # Keep "before" samples whose code has features, plus all "Control" rows.
    before_ids = []
    for sample_id in OtuMf.mapping_file.index:
        point = OtuMf.mapping_file.loc[sample_id, "TreatmentPoint"]
        if point == "before":
            code = sample_id_to_sample_code_map[sample_id]
            if code in id_to_features_map:
                before_ids.append(sample_id)
            else:
                # Report the code of *this* sample (a stale loop variable
                # from an earlier loop used to be printed here).
                print(f"{code} not in id_to_features_map")
        elif point == "Control":
            before_ids.append(sample_id)

    # Ids without a SampleCode entry stand in for their own code.
    code_list = [
        sample_id_to_sample_code_map.get(sample_id, sample_id)
        for sample_id in before_ids
    ]

    id_to_tag_map = {}
    # HEALTH_BEFORE_TREATMENT_TASK
    if self._task == "health_before_treatment_task":
        for sample_id, code in zip(before_ids, code_list):
            point = OtuMf.mapping_file.loc[sample_id, "TreatmentPoint"]
            if point == "before":
                if code in id_to_features_map:
                    id_to_tag_map[code] = 1
            elif point == "Control":
                id_to_tag_map[sample_id] = 0
        self._ids_list = list(id_to_tag_map.keys())

    # ALLERGY_TYPE_BEFORE_TREATMENT_TASK
    # tags: 0 Milk, 1 Tree_nut ('Cashew' + 'Hazelnut' + 'Walnut'), 2 Peanut,
    # 3 Sesame; other allergy types get no tag.
    elif self._task == "allergy_type_before_treatment_task":
        milk_labels = ('Milk', 'Milk_suspected', 'milk')
        tree_nut_labels = ('Cashew', 'Cashew ', 'Hazelnut', 'Walnut', 'Nuts')
        for sample_id, code in zip(before_ids, code_list):
            a = OtuMf.mapping_file.loc[sample_id, 'AllergyType']
            if a in milk_labels:
                id_to_tag_map[code] = 0
            elif a in tree_nut_labels:
                id_to_tag_map[code] = 1
            elif a == 'Peanut':
                id_to_tag_map[code] = 2
            elif a == 'Sesame':
                id_to_tag_map[code] = 3
        self._ids_list = [
            code for code in code_list if not code.startswith("Con")
        ]

    self._id_to_tag_map = id_to_tag_map

    # -------------------------------------------- weights !--------------------------------------------
    # Weight of a class = 1 - its share of the tagged samples. Keys are the
    # positions of the classes in sorted order, not the tag values.
    y = list(id_to_tag_map.values())
    _, class_counts = np.unique(np.array(y), return_counts=True)
    total = class_counts.sum()
    self._weight_map = {
        i: 1 - (count / total) for i, count in enumerate(class_counts)
    }

    # return the list of features and the list of ids in the same order
    feature_list = []
    for sample_id in self._ids_list:
        if sample_id in sample_id_to_sample_code_map:
            feature_list.append(
                id_to_features_map[sample_id_to_sample_code_map[sample_id]])
        else:
            # Was a bare lookup whose result was thrown away, so these
            # samples never reached the feature list.
            feature_list.append(id_to_features_map[sample_id])
    self._feature_list = feature_list
def prepare_data(n_components=20, preform_z_scoring=True, taxnomy_level=6):
    """Build censored / uncensored time-to-event inputs from the saliva data.

    Loads the saliva OTU table and mapping file, reduces the preprocessed OTU
    data with ``apply_pca``, derives ``time_for_the_event`` from the
    ``Mucositis_Start`` and ``DATE`` columns (9999 marks rows whose start
    date parses to 01/01/1900) and assembles per-subject x / y frames grouped
    by ``Personal_ID``.

    Args:
        n_components: Number of PCA components passed to ``apply_pca``.
        preform_z_scoring: Forwarded positionally to ``preprocess_data``.
        taxnomy_level: Forwarded to ``preprocess_data``.

    Returns:
        Tuple ``(x_for_deep, y_for_deep, x_for_deep_censored,
        y_for_deep_censored, censored_data, not_censored,
        otu_after_pca_wo_taxonomy, OtuMf, preproccessed_data)``.
    """
    OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, 'saliva_samples_231018.csv'),
                         os.path.join(
                             SCRIPT_DIR,
                             'saliva_samples_mapping_file_231018.csv'),
                         from_QIIME=True)
    preproccessed_data = preprocess_data(OtuMf.otu_file,
                                         preform_z_scoring,
                                         visualize_data=True,
                                         taxnomy_level=taxnomy_level,
                                         preform_taxnomy_group=True)
    otu_after_pca_wo_taxonomy, _, _ = apply_pca(preproccessed_data,
                                                n_components=n_components,
                                                visualize=True)

    y_columns = [
        'relative_start_date', 'delta_time', 'relative_max_date',
        'mse_coeff', 'time_sense_coeff'
    ]
    days_col = 'time_before_moco_start_days'

    def _stack(frames):
        """Concatenate the collected frames, or return an empty frame."""
        return pd.concat(frames) if frames else pd.DataFrame()

    ######## Derive the time-to-event columns ########
    OtuMf.mapping_file['DATE_datetime'] = OtuMf.mapping_file['DATE'].apply(
        get_datetime)
    OtuMf.mapping_file['Mocosities_start_datetime'] = OtuMf.mapping_file[
        'Mucositis_Start'].apply(get_datetime)
    OtuMf.mapping_file['TIME_BEFORE_MOCO_START'] = OtuMf.mapping_file[
        'Mocosities_start_datetime'] - OtuMf.mapping_file['DATE_datetime']
    OtuMf.mapping_file['time_for_the_event'] = OtuMf.mapping_file[
        'TIME_BEFORE_MOCO_START'].apply(get_days)
    # Single .loc assignment instead of chained indexing, which may write to
    # a temporary copy.
    no_event_mask = (OtuMf.mapping_file['Mocosities_start_datetime'] ==
                     datetime.datetime.strptime('01/01/1900', '%d/%m/%Y'))
    OtuMf.mapping_file.loc[no_event_mask, 'time_for_the_event'] = 9999

    # create groups
    censored_data = {}
    not_censored_frames = []
    x_frames = []
    y_frames = []
    x_censored_frames = []
    y_censored_frames = []

    for subject_id, subject_data in OtuMf.mapping_file.groupby('Personal_ID'):
        if 9999 in subject_data['time_for_the_event'].values:
            # censored
            tmp_data = subject_data.join(otu_after_pca_wo_taxonomy)
            # Keep only samples that have PCA features (column 0 present).
            tmp_data_only_valid = tmp_data.loc[tmp_data[0].notnull()]
            if tmp_data_only_valid.empty:
                continue
            x_censored_frames.append(subject_data)
            tmp_data_only_valid = tmp_data_only_valid.assign(**{
                days_col:
                tmp_data_only_valid['TIME_BEFORE_MOCO_START'].apply(get_days)
            }).sort_values(by=days_col, ascending=False)
            tmp_data_only_valid['relative_start_date'] = (
                tmp_data_only_valid[days_col].iloc[0] -
                tmp_data_only_valid[days_col])
            # .iloc[-1] picks the last row by position; plain [-1] depends on
            # the index type.
            tmp_data_only_valid['relative_max_date'] = (
                tmp_data_only_valid['relative_start_date'].iloc[-1] -
                tmp_data_only_valid['relative_start_date'])
            tmp_data_only_valid['delta_time'] = -1
            tmp_data_only_valid['mse_coeff'] = 0
            tmp_data_only_valid['time_sense_coeff'] = 1
            y_censored_frames.append(tmp_data_only_valid[y_columns])
            # get only the last sample
            censored_data[subject_id] = tmp_data_only_valid.loc[
                tmp_data_only_valid['TIME_BEFORE_MOCO_START'] == min(
                    tmp_data_only_valid['TIME_BEFORE_MOCO_START'])]
        else:
            # not censored
            before_event_subjects = subject_data.loc[
                subject_data['time_for_the_event'] > 0]
            if before_event_subjects.empty:
                continue
            # The x / not_censored frames receive the rows as they are here;
            # the derived columns go on a separate frame (`assign` returns a
            # new object) so they do not leak into the collected rows.
            not_censored_frames.append(before_event_subjects)
            x_frames.append(before_event_subjects)
            ordered = before_event_subjects.assign(**{
                days_col:
                before_event_subjects['TIME_BEFORE_MOCO_START'].apply(get_days)
            }).sort_values(by=days_col, ascending=False)
            ordered['relative_start_date'] = (ordered[days_col].iloc[0] -
                                              ordered[days_col])
            ordered['relative_max_date'] = (ordered['relative_start_date'] +
                                            ordered[days_col])
            ordered['delta_time'] = ordered['time_for_the_event']
            ordered['mse_coeff'] = 1
            ordered['time_sense_coeff'] = 0
            y_frames.append(ordered[y_columns])

    not_censored = _stack(not_censored_frames)

    x_for_deep = _stack(x_frames).join(otu_after_pca_wo_taxonomy)
    x_for_deep = x_for_deep.loc[x_for_deep[0].notnull()]
    y_for_deep = _stack(y_frames).loc[x_for_deep.index]

    x_for_deep_censored = _stack(x_censored_frames).join(
        otu_after_pca_wo_taxonomy)
    x_for_deep_censored = x_for_deep_censored.loc[
        x_for_deep_censored[0].notnull()]
    y_for_deep_censored = _stack(y_censored_frames).loc[
        x_for_deep_censored.index]

    return x_for_deep, y_for_deep, x_for_deep_censored, y_for_deep_censored, censored_data, not_censored,\
           otu_after_pca_wo_taxonomy, OtuMf, preproccessed_data
def _read_file(self, title, bactria_as_feature_file, samples_data_file,
               allow_printing, perform_anna_preprocess):
    """Load OTU data, reduce it with PCA and build the per-task label maps.

    Both file names are resolved relative to ``SCRIPT_DIR`` and loaded
    through ``OtuMfHandler``. The OTU table is preprocessed (optionally
    followed by ``distance_learning``), projected with ``apply_pca`` using
    the module-level ``n_components``, and the samples are tagged according
    to ``self._task``.

    Args:
        title: Not used by this method; the task is read from ``self._task``.
        bactria_as_feature_file: OTU/features CSV name, joined to SCRIPT_DIR.
        samples_data_file: Mapping-file CSV name, joined to SCRIPT_DIR.
        allow_printing: Not used by this method.
        perform_anna_preprocess: When true, preprocess at taxonomy level 6
            and then run ``distance_learning`` at ``self._taxnomy_level``;
            otherwise preprocess directly at ``self._taxnomy_level`` with
            taxonomy grouping.

    Sets:
        ``_preproccessed_data``, ``_pca_obj``, ``_index_to_id_map``,
        ``_id_to_features_map``, ``_ids_list``, ``_id_to_tag_map``,
        ``_weight_map`` and ``_feature_list``.

    Raises:
        ValueError: If ``self._task`` is not "health task",
            "prognostic task" or "diagnostics task".
    """
    otu_mf = OtuMfHandler(os.path.join(SCRIPT_DIR, bactria_as_feature_file),
                          os.path.join(SCRIPT_DIR, samples_data_file),
                          from_QIIME=True,
                          id_col='Feature ID',
                          taxonomy_col='Taxonomy')

    if perform_anna_preprocess:
        preproccessed_data = preprocess_data(otu_mf.otu_file,
                                             visualize_data=False,
                                             taxonomy_col='Taxonomy',
                                             taxnomy_level=6)
        mapping_file = otu_mf.mapping_file['XXXXX']
        mapping_disease = {
            'a': 0,
            'b': 1,  # 'Cashew' + 'Hazelnut' + 'Walnut'
            'c': 2,
            'd': 3,
        }
        mapping_file = mapping_file.map(mapping_disease)
        # The returned mapping file is not used further in this method.
        preproccessed_data, mapping_file = distance_learning(
            perform_distance=True,
            level=self._taxnomy_level,
            preproccessed_data=preproccessed_data,
            mapping_file=mapping_file)
    else:
        preproccessed_data = preprocess_data(
            otu_mf.otu_file,
            visualize_data=False,
            taxnomy_level=self._taxnomy_level,
            taxonomy_col='Taxonomy',
            preform_taxnomy_group=True)
    self._preproccessed_data = preproccessed_data

    # n_components is a module-level setting, not a parameter of this method.
    otu_after_pca_wo_taxonomy, pca_obj, _ = apply_pca(
        preproccessed_data, n_components=n_components, visualize=False)
    self._pca_obj = pca_obj

    sample_index = otu_after_pca_wo_taxonomy.index
    # Row position -> sample id, and sample id -> PCA feature row.
    index_to_id_map = dict(enumerate(sample_index))
    id_to_features_map = dict(zip(sample_index,
                                  otu_after_pca_wo_taxonomy.values))
    self._index_to_id_map = index_to_id_map
    self._id_to_features_map = id_to_features_map

    ids_list = sample_index.tolist()
    # NOTE(review): the first 62 rows are dropped by position; judging by the
    # name they are presumably the control ('Con') samples -- confirm
    # against the input file ordering.
    ids_list_wo_con = sample_index.drop(sample_index[0:62])

    def success_tags(sample_ids):
        # Tag 1 when the sample's SuccessDescription is 'A1', otherwise 0.
        return {
            sample: 1 if otu_mf.mapping_file.loc[
                sample, 'SuccessDescription'] == 'A1' else 0
            for sample in sample_ids
        }

    task = self._task
    if task == "health task":
        self._ids_list = ids_list
        # Samples whose id starts with 'Con' get tag 1, all others 0.
        id_to_tag_map = {
            sample: 1 if sample.startswith('Con') else 0
            for sample in ids_list
        }
    elif task == "prognostic task":
        # Keep only the samples whose TreatmentPoint is "before".
        before_treatment_ids = [
            sample for sample in ids_list_wo_con
            if otu_mf.mapping_file.loc[sample, 'TreatmentPoint'] == "before"
        ]
        self._ids_list = list(before_treatment_ids)
        id_to_tag_map = success_tags(before_treatment_ids)
    elif task == "diagnostics task":
        self._ids_list = list(ids_list_wo_con)
        id_to_tag_map = success_tags(ids_list_wo_con)
    else:
        # Previously an unknown task surfaced later as an UnboundLocalError.
        raise ValueError("unsupported task: {!r}".format(task))
    self._id_to_tag_map = id_to_tag_map

    # Class weights: 1 - (class frequency), keyed by the class label itself
    # so the map stays correct even when only one class is present.
    labels = np.array(list(id_to_tag_map.values()))
    classes, counts = np.unique(labels, return_counts=True)
    total = counts.sum()
    self._weight_map = {
        int(label): 1 - (count / total)
        for label, count in zip(classes, counts)
    }

    # NOTE(review): the feature list follows the order of *all* sample ids
    # (matching _index_to_id_map), not the task-filtered self._ids_list --
    # confirm this is what consumers expect.
    self._feature_list = [
        id_to_features_map[sample_id] for sample_id in ids_list
    ]