def get_datetime(date_str, time_format='%d/%m/%Y'):
    try:
        return datetime.datetime.strptime(date_str, time_format)
    except Exception:
        warnings.warn(f'{date_str} is not a valid date, sample will be ignored')
        date_str = '01/01/1800'
    return datetime.datetime.strptime(date_str, '%d/%m/%Y')


def get_days(days_datetime):
    return days_datetime.days


n_components = 20
use_recorded = False

if not use_recorded:
    OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, 'feature-table_Allergy_cleaned_taxa_290119_updated_in_140219.csv'),
                         os.path.join(SCRIPT_DIR, 'mf_merge_ok84_ok93_ok66_69_merged_by_RestoredSampleCode_as_ID_290119.csv'),
                         from_QIIME=True, id_col='Feature ID', taxonomy_col='Taxonomy')
    preproccessed_data = preprocess_data(OtuMf.otu_file, visualize_data=False, taxnomy_level=5,
                                         taxonomy_col='Taxonomy', preform_taxnomy_group=True)
    otu_after_pca_wo_taxonomy, _, _ = apply_pca(preproccessed_data, n_components=n_components, visualize=False)

    ######## Pre process (Remove control group) ########
    column_to_use_for_filter = 'AllergyTypeData131118'
    OtuMf.mapping_file = OtuMf.mapping_file.loc[OtuMf.mapping_file['AllergyTypeData131118'] != 'Con']

    ######## get date of sample in date format ########
    date_of_sample_col = 'Date'
    OtuMf.mapping_file['Date_of_sample'] = OtuMf.mapping_file[date_of_sample_col].apply(get_datetime,
                                                                                        time_format='%m/%d/%y')

    ######## remove invalid subjects (those who had samples with no dates or bad dates) ########
    # bad dates
def _read_file(self, TITLE, PRINT, REG):
    bactria_as_feature_file = 'feature-table_Allergy_cleaned_taxa_290119_updated_in_140219.csv'
    features = pd.read_csv(bactria_as_feature_file, header=1)
    cols = list(features.columns)
    # remove non-numeric values
    cols.remove('Feature ID')
    cols.remove('Taxonomy')
    if REG:
        self.reg(features, cols)
    samples_data_file = 'mf_merge_ok84_ok93_ok66_69_merged_by_RestoredSampleCode_as_ID_290119.csv'
    OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, bactria_as_feature_file),
                         os.path.join(SCRIPT_DIR, samples_data_file),
                         from_QIIME=True, id_col='Feature ID', taxonomy_col='Taxonomy')
    preproccessed_data = preprocess_data(OtuMf.otu_file, visualize_data=False, taxnomy_level=6,
                                         taxonomy_col='Taxonomy', preform_taxnomy_group=True)
    self._preproccessed_data = preproccessed_data
    # drow_data(preproccessed_data)
    # otu_after_pca_wo_taxonomy, _, _ = apply_pca(data_after_log_zcore, n_components=40, visualize=False)
    otu_after_pca_wo_taxonomy, pca_obj, _ = apply_pca(preproccessed_data, n_components=n_components, visualize=False)
    control = otu_after_pca_wo_taxonomy.index[0:62]  # 'Con'
    self._pca_obj = pca_obj
    # if we want to remove the healthy samples that are used for control
    # otu_after_pca_wo_taxonomy = otu_after_pca_wo_taxonomy.drop(preproccessed_data.index[0:62])

    index_to_id_map = {}
    id_to_features_map = {}
    for i, row in enumerate(otu_after_pca_wo_taxonomy.values):
        id_to_features_map[otu_after_pca_wo_taxonomy.index[i]] = row
        index_to_id_map[i] = otu_after_pca_wo_taxonomy.index[i]
    self._index_to_id_map = index_to_id_map
    self._id_to_features_map = id_to_features_map

    success_tag_column = 'SuccessDescription'
    stages_column = 'TreatmentTimePoint'
    allergan_column = 'AllergyType'
    code_column = 'ParticipentCode'
    ids_list_w_con = otu_after_pca_wo_taxonomy.index.tolist()
    ids_list_wo_con = otu_after_pca_wo_taxonomy.index.drop(otu_after_pca_wo_taxonomy.index[0:62])
    self._ids_list_w_con = ids_list_w_con
    self._ids_list_wo_con = ids_list_wo_con
    stages = []

    # ##### separate samples by allergic and healthy ==> 'Con'
    id_to_binary_health_tag_map = {}
    for sample in ids_list_w_con:
        if sample.startswith('Con'):
            id_to_binary_health_tag_map[sample] = 1
        else:
            id_to_binary_health_tag_map[sample] = 0
    self._id_to_binary_health_tag_map = id_to_binary_health_tag_map

    # ##### separate samples by stage, success of treatment and allergen type
    id_to_success_tag_map = {}
    id_to_stage_map = {}
    id_to_binary_success_tag_map = {}
    id_to_allergy_type_tag_map = {}
    id_to_allergy_number_type_tag_map = {}
    id_to_milk_allergy_tag_map = {}
    allergan_types = set()
    tag_to_allergy_type_map = {
        0: 'Milk',
        1: 'Tree_nut',  # 'Cashew' + 'Hazelnut' + 'Walnut'
        2: 'Peanut',
        3: 'Sesame'
    }  # removed 'Egg' samples
    allergy_type_to_instances_map = {
        'Milk': 0,
        'Tree_nut': 0,
        'Peanut': 0,
        'Sesame': 0
    }  # 'Non': 9 samples, 'Egg': 35 samples
    """
    nuts_samples_list = []
    for sample in ids_list_wo_con:
        a = OtuMf.mapping_file.loc[sample, allergan_column]
        if a == 'Nuts':
            nuts_samples_list.append(sample)
    with open("nuts_samples.txt", "w") as file:
        for l in nuts_samples_list:
            file.write(l + "\n")
    """
    non_allergy_type_ids = []
    egg_allergy_type_ids = []
    for sample in ids_list_wo_con:
        # stages
        s = OtuMf.mapping_file.loc[sample, stages_column]
        stages.append(s)
        id_to_stage_map[sample] = s
        stage_0_ids = [key for key in id_to_stage_map if id_to_stage_map[key] == '0_before']
        self._stage_0_ids = stage_0_ids

        # success
        t = OtuMf.mapping_file.loc[sample, success_tag_column]
        id_to_success_tag_map[sample] = t
        # save tags from k-classes as success(A1)->1 and failure(the rest)->0
        if t == 'A1':
            id_to_binary_success_tag_map[sample] = 1
        else:
            id_to_binary_success_tag_map[sample] = 0

        # allergy type
        a = OtuMf.mapping_file.loc[sample, allergan_column]
        allergan_types.add(a)
        id_to_allergy_type_tag_map[sample] = a
        if a == 'Milk' or a == 'Milk_suspected' or a == 'milk':
            id_to_allergy_number_type_tag_map[sample] = 0
            id_to_milk_allergy_tag_map[sample] = 1
            allergy_type_to_instances_map['Milk'] = allergy_type_to_instances_map.get('Milk') + 1
        elif a == 'Cashew' or a == 'Cashew ' or a == 'Hazelnut' or a == 'Walnut' or a == 'Nuts':
            id_to_allergy_number_type_tag_map[sample] = 1
            id_to_milk_allergy_tag_map[sample] = 0
            allergy_type_to_instances_map['Tree_nut'] = allergy_type_to_instances_map.get('Tree_nut') + 1
        elif a == 'Peanut':
            id_to_allergy_number_type_tag_map[sample] = 2
            id_to_milk_allergy_tag_map[sample] = 0
            allergy_type_to_instances_map['Peanut'] = allergy_type_to_instances_map.get('Peanut') + 1
        elif a == 'Sesame':
            id_to_allergy_number_type_tag_map[sample] = 3
            id_to_milk_allergy_tag_map[sample] = 0
            allergy_type_to_instances_map['Sesame'] = allergy_type_to_instances_map.get('Sesame') + 1
        elif a == 'Egg':
            egg_allergy_type_ids.append(sample)
            # id_to_allergy_number_type_tag_map[sample] = 1
            # id_to_milk_allergy_tag_map[sample] = 0
            # allergy_type_to_instances_map['Egg'] = allergy_type_to_instances_map.get('Egg') + 1
        elif a == 'Non':
            non_allergy_type_ids.append(sample)
            # id_to_allergy_number_type_tag_map[sample] = None
            # id_to_milk_allergy_tag_map[sample] = None
            # allergy_type_to_instances_map['Non'] = allergy_type_to_instances_map.get('Non') + 1
        else:
            print("error in allergy type " + str(sample))

    # -------------------------------------------- weights ! --------------------------------------------
    # calculate weights for types of allergy
    total_sum = sum(list(allergy_type_to_instances_map.values()))
    types = list(allergy_type_to_instances_map.keys())
    allergy_type_to_weight_map = {}
    for t in types:
        allergy_type_to_weight_map[t] = total_sum / allergy_type_to_instances_map[t]
    # normalize
    max_weight = max(list(allergy_type_to_weight_map.values()))
    for t in types:
        allergy_type_to_weight_map[t] = allergy_type_to_weight_map.get(t) / max_weight

    # calculate weights for milk vs. other types of allergy
    milk_vs_other_allergy_weight_map = {
        'Other': total_sum / (total_sum - allergy_type_to_instances_map.get("Milk")),
        'Milk': total_sum / allergy_type_to_instances_map.get("Milk")
    }
    # normalize
    max_weight = max(list(milk_vs_other_allergy_weight_map.values()))
    for t in ['Other', 'Milk']:
        milk_vs_other_allergy_weight_map[t] = milk_vs_other_allergy_weight_map.get(t) / max_weight

    # calculate weights for healthy and allergic
    healthy_vs_allergic_weight_map = {
        'Allergic': len(ids_list_w_con) / len(ids_list_wo_con),
        'Healthy': len(ids_list_w_con) / (len(ids_list_w_con) - len(ids_list_wo_con))
    }
    # normalize
    max_weight = max(list(healthy_vs_allergic_weight_map.values()))
    for t in ['Allergic', 'Healthy']:
        healthy_vs_allergic_weight_map[t] = healthy_vs_allergic_weight_map.get(t) / max_weight

    # calculate weights for responding and not (success)
    no_response = list(id_to_binary_success_tag_map.values()).count(0)
    yes_response = list(id_to_binary_success_tag_map.values()).count(1)
    responding_vs_not_weight_map = {
        'No': len(ids_list_wo_con) / no_response,
        'Yes': len(ids_list_wo_con) / yes_response
    }
    # normalize
    max_weight = max(list(responding_vs_not_weight_map.values()))
    for t in ['No', 'Yes']:
        responding_vs_not_weight_map[t] = responding_vs_not_weight_map.get(t) / max_weight

    # calculate weights for responding and not (prognostic)
    tags = []
    for i in stage_0_ids:
        tags.append(id_to_binary_success_tag_map.get(i))
    no_response = tags.count(0)
    yes_response = tags.count(1)
    prognostic_responding_vs_not_weight_map = {
        'No': len(stage_0_ids) / no_response,
        'Yes': len(stage_0_ids) / yes_response
    }
    # normalize
    max_weight = max(list(prognostic_responding_vs_not_weight_map.values()))
    for t in ['No', 'Yes']:
        prognostic_responding_vs_not_weight_map[t] = prognostic_responding_vs_not_weight_map.get(t) / max_weight

    self._id_wo_non_and_egg_allergy_type_list = [
        x for x in self._ids_list_wo_con
        if x not in non_allergy_type_ids + egg_allergy_type_ids
    ]
    self._tag_to_allergy_type_map = tag_to_allergy_type_map
    self._allergy_type_to_instances_map = allergy_type_to_instances_map
    self._allergy_type_to_weight_map = allergy_type_to_weight_map
    self._milk_vs_other_allergy_weight_map = milk_vs_other_allergy_weight_map
    self._healthy_vs_allergic_weight_map = healthy_vs_allergic_weight_map
    self._responding_vs_not_weight_map = responding_vs_not_weight_map
    self._prognostic_responding_vs_not_weight_map = prognostic_responding_vs_not_weight_map
    self._id_to_success_tag_map = id_to_success_tag_map
    self._id_to_stage_map = id_to_stage_map
    self._id_to_binary_success_tag_map = id_to_binary_success_tag_map
    self._id_to_allergy_type_tag_map = id_to_allergy_type_tag_map
    self._id_to_allergy_number_type_tag_map = id_to_allergy_number_type_tag_map
    self._id_to_milk_allergy_tag_map = id_to_milk_allergy_tag_map
    """
    # count tags in all vs. stage_0
    all_tags = list(id_to_binary_success_tag_map.values())
    print("tags total len: " + str(len(all_tags)) + " pos tags: " + str(all_tags.count(1))
          + " percent: " + str(all_tags.count(1) / len(all_tags)))
    stage_0_tags = [id_to_binary_success_tag_map[id] for id in stage_0_ids
                    if id in id_to_binary_success_tag_map.keys()]
    print("stage 0 tags total len: " + str(len(stage_0_tags)) + " pos tags: " + str(stage_0_tags.count(1))
          + " percent: " + str(stage_0_tags.count(1) / len(stage_0_tags)))
    """
    # return the list of features and the list of ids in the same order
    feature_list = [id_to_features_map[id] for id in ids_list_w_con]
    return ids_list_w_con, ids_list_wo_con, feature_list
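
# A standalone sketch of the class-weighting scheme used in _read_file above:
# every class receives an inverse-frequency weight (total / count), and all
# weights are then divided by the largest one, so the rarest class ends up with
# weight 1.0. The function name and the example counts are illustrative only,
# not part of the original code.
def inverse_frequency_weights(instances_map):
    total = sum(instances_map.values())
    weights = {cls: total / count for cls, count in instances_map.items()}
    max_weight = max(weights.values())
    return {cls: w / max_weight for cls, w in weights.items()}


# Example: inverse_frequency_weights({'Milk': 100, 'Sesame': 25})
# -> {'Milk': 0.25, 'Sesame': 1.0}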
def get_datetime(date_str):
    if pd.isnull(date_str):
        date_str = '01/01/1900'
    return datetime.datetime.strptime(date_str, '%d/%m/%Y')


def get_days(days_datetime):
    return days_datetime.days


n_components = 20
use_recorded = True

if not use_recorded:
    OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, 'saliva_samples_231018.csv'),
                         os.path.join(SCRIPT_DIR, 'saliva_samples_mapping_file_231018.csv'),
                         from_QIIME=True)
    preproccessed_data = preprocess_data(OtuMf.otu_file, visualize_data=False, taxnomy_level=5)
    otu_after_pca_wo_taxonomy, _ = apply_pca(preproccessed_data, n_components=n_components, visualize=False)
    # otu_after_pca = OtuMf.add_taxonomy_col_to_new_otu_data(otu_after_pca_wo_taxonomy)
    # merged_data_after_pca = OtuMf.merge_mf_with_new_otu_data(otu_after_pca_wo_taxonomy)
    # merged_data_with_age = otu_after_pca_wo_taxonomy.join(OtuMf.mapping_file['age_in_days'])
    # merged_data_with_age = merged_data_with_age[merged_data_with_age.age_in_days.notnull()]  # remove NaN days
    # merged_data_with_age_group = otu_after_pca_wo_taxonomy.join(OtuMf.mapping_file[['age_group', 'age_in_days', 'MouseNumber']])
    # merged_data_with_age_group = merged_data_with_age_group[merged_data_with_age_group.age_group.notnull()]  # remove NaN days
    # OtuMf.mapping_file.apply(lambda x: -999 if x['Mucositis_Start'] is None else (datetime.datetime.strptime(x['DATE'], '%d/%m/%Y') - datetime.datetime.strptime(x['Mucositis_Start'], '%d/%m/%Y')).days)

    OtuMf.mapping_file['DATE_datetime'] = OtuMf.mapping_file['DATE'].apply(get_datetime)
    OtuMf.mapping_file['Mocosities_start_datetime'] = OtuMf.mapping_file['Mucositis_Start'].apply(get_datetime)
    OtuMf.mapping_file['TIME_BEFORE_MOCO_START'] = OtuMf.mapping_file['Mocosities_start_datetime'] - OtuMf.mapping_file['DATE_datetime']
def prepare_data(n_components=20):
    OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, 'saliva_samples_231018.csv'),
                         os.path.join(SCRIPT_DIR, 'saliva_samples_mapping_file_231018.csv'),
                         from_QIIME=True)
    preproccessed_data = preprocess_data(OtuMf.otu_file, visualize_data=True, taxnomy_level=5,
                                         preform_taxnomy_group=True)
    otu_after_pca_wo_taxonomy, _, _ = apply_pca(preproccessed_data, n_components=n_components, visualize=True)

    ######## Pre process (Remove control group) ########
    OtuMf.mapping_file['DATE_datetime'] = OtuMf.mapping_file['DATE'].apply(get_datetime)
    OtuMf.mapping_file['Mocosities_start_datetime'] = OtuMf.mapping_file['Mucositis_Start'].apply(get_datetime)
    OtuMf.mapping_file['TIME_BEFORE_MOCO_START'] = OtuMf.mapping_file['Mocosities_start_datetime'] - OtuMf.mapping_file['DATE_datetime']
    OtuMf.mapping_file['time_for_the_event'] = OtuMf.mapping_file['TIME_BEFORE_MOCO_START'].apply(get_days)
    OtuMf.mapping_file.loc[OtuMf.mapping_file['Mocosities_start_datetime'] ==
                           datetime.datetime.strptime('01/01/1900', '%d/%m/%Y'), 'time_for_the_event'] = 9999

    # create groups
    data_grouped = OtuMf.mapping_file.groupby('Personal_ID')
    censored_data = {}
    not_censored = pd.DataFrame()
    dilated_df = pd.DataFrame()
    y_for_deep = pd.DataFrame()
    x_for_deep = pd.DataFrame()
    x_for_deep_censored = pd.DataFrame()
    y_for_deep_censored = pd.DataFrame()

    for subject_id, subject_data in data_grouped:
        if 9999 in subject_data['time_for_the_event'].values:
            # censored
            tmp_data = subject_data.join(otu_after_pca_wo_taxonomy)
            tmp_data_only_valid = tmp_data.loc[tmp_data[0].notnull()]
            if not tmp_data_only_valid.empty:
                x_for_deep_censored = x_for_deep_censored.append(subject_data)
                tmp_data_only_valid['time_before_moco_start_days'] = tmp_data_only_valid['TIME_BEFORE_MOCO_START'].apply(get_days)
                tmp_data_only_valid.sort_values(by='time_before_moco_start_days', ascending=False, inplace=True)
                tmp_data_only_valid['relative_start_date'] = (
                    tmp_data_only_valid['time_before_moco_start_days'].iloc[0]
                    - tmp_data_only_valid['time_before_moco_start_days'])
                tmp_data_only_valid['relative_max_date'] = (
                    tmp_data_only_valid['relative_start_date'].iloc[-1]
                    - tmp_data_only_valid['relative_start_date'])
                tmp_data_only_valid['delta_time'] = -1
                tmp_data_only_valid['mse_coeff'] = 0
                tmp_data_only_valid['time_sense_coeff'] = 1
                y_for_deep_censored = y_for_deep_censored.append(
                    tmp_data_only_valid[['relative_start_date', 'delta_time', 'relative_max_date',
                                         'mse_coeff', 'time_sense_coeff']])
                # get only the last sample
                censored_data[subject_id] = tmp_data_only_valid.loc[
                    tmp_data_only_valid['TIME_BEFORE_MOCO_START'] == min(tmp_data_only_valid['TIME_BEFORE_MOCO_START'])]
        else:
            # not censored
            before_event_mask = subject_data['time_for_the_event'] > 0
            before_event_subjects = subject_data.loc[before_event_mask]
            if not before_event_subjects.empty:
                not_censored = not_censored.append(before_event_subjects)
                dilated_df = dilated_df.append(before_event_subjects)
                x_for_deep = x_for_deep.append(before_event_subjects)
                before_event_subjects['time_before_moco_start_days'] = before_event_subjects['TIME_BEFORE_MOCO_START'].apply(get_days)
                before_event_subjects.sort_values(by='time_before_moco_start_days', ascending=False, inplace=True)
                before_event_subjects['relative_start_date'] = (
                    before_event_subjects['time_before_moco_start_days'].iloc[0]
                    - before_event_subjects['time_before_moco_start_days'])
                before_event_subjects['relative_max_date'] = (
                    before_event_subjects['relative_start_date']
                    + before_event_subjects['time_before_moco_start_days'])
                before_event_subjects['delta_time'] = before_event_subjects['time_for_the_event']
                before_event_subjects['mse_coeff'] = 1
                before_event_subjects['time_sense_coeff'] = 0
                y_for_deep = y_for_deep.append(
                    before_event_subjects[['relative_start_date', 'delta_time', 'relative_max_date',
                                           'mse_coeff', 'time_sense_coeff']])

    x_for_deep = x_for_deep.join(otu_after_pca_wo_taxonomy)
    x_for_deep = x_for_deep.loc[x_for_deep[0].notnull()]
    y_for_deep = y_for_deep.loc[x_for_deep.index]
    x_for_deep_censored = x_for_deep_censored.join(otu_after_pca_wo_taxonomy)
    x_for_deep_censored = x_for_deep_censored.loc[x_for_deep_censored[0].notnull()]
    y_for_deep_censored = y_for_deep_censored.loc[x_for_deep_censored.index]

    return x_for_deep, y_for_deep, x_for_deep_censored, y_for_deep_censored, censored_data, not_censored, otu_after_pca_wo_taxonomy, OtuMf
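
# Hedged usage sketch, not from the original file: prepare_data() hands back the
# uncensored deep-learning inputs/targets, their censored counterparts, the last
# valid sample per censored subject, and the PCA-reduced OTU table. The
# mse_coeff / time_sense_coeff columns appear to act as per-row loss switches:
# uncensored rows carry mse_coeff=1 (regression loss on delta_time), while
# censored rows carry time_sense_coeff=1 and delta_time=-1 (ordering term only).
if __name__ == '__main__':
    (x_for_deep, y_for_deep, x_for_deep_censored, y_for_deep_censored,
     censored_data, not_censored, otu_after_pca_wo_taxonomy, OtuMf) = prepare_data(n_components=20)
    print(x_for_deep.shape, y_for_deep.shape)
    print(len(censored_data), 'censored subjects')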
def get_days(days_datetime):
    return days_datetime.days


n_components = 10
use_recorded = False
script_dir = sys.path[0]

if not use_recorded:
    OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, 'ronies_Data', 'saliva_samples_231018.csv'),
                         os.path.join(SCRIPT_DIR, 'ronies_Data', 'saliva_samples_mapping_file_231018.csv'),
                         from_QIIME=True)
    preproccessed_data = preprocess_data(OtuMf.otu_file, visualize_data=False, taxnomy_level=6)
    otu_after_pca_wo_taxonomy, _ = apply_pca(preproccessed_data, n_components=n_components, visualize=False)
    # otu_after_pca = OtuMf.add_taxonomy_col_to_new_otu_data(otu_after_pca_wo_taxonomy)
    # merged_data_after_pca = OtuMf.merge_mf_with_new_otu_data(otu_after_pca_wo_taxonomy)
    # merged_data_with_age = otu_after_pca_wo_taxonomy.join(OtuMf.mapping_file['age_in_days'])
    # merged_data_with_age = merged_data_with_age[merged_data_with_age.age_in_days.notnull()]  # remove NaN days
    # merged_data_with_age_group = otu_after_pca_wo_taxonomy.join(OtuMf.mapping_file[['age_group', 'age_in_days', 'MouseNumber']])
    # merged_data_with_age_group = merged_data_with_age_group[merged_data_with_age_group.age_group.notnull()]  # remove NaN days
def prepare_data(n_components=20):
    OtuMf = OtuMfHandler(
        os.path.join(SCRIPT_DIR, 'feature-table_Allergy_cleaned_taxa_290119_updated_in_140219.csv'),
        os.path.join(SCRIPT_DIR, 'mf_merge_ok84_ok93_ok66_69_merged_by_RestoredSampleCode_as_ID_290119.csv'),
        from_QIIME=True, id_col='Feature ID', taxonomy_col='Taxonomy')
    preproccessed_data = preprocess_data(OtuMf.otu_file, visualize_data=False, taxnomy_level=5,
                                         taxonomy_col='Taxonomy', preform_taxnomy_group=True)
    otu_after_pca_wo_taxonomy, _, _ = apply_pca(preproccessed_data, n_components=n_components, visualize=False)

    ######## Pre process (Remove control group) ########
    column_to_use_for_filter = 'AllergyTypeData131118'
    OtuMf.mapping_file = OtuMf.mapping_file.loc[OtuMf.mapping_file['AllergyTypeData131118'] != 'Con']

    ######## get date of sample in date format ########
    date_of_sample_col = 'Date'
    OtuMf.mapping_file['Date_of_sample'] = OtuMf.mapping_file[date_of_sample_col].apply(get_datetime,
                                                                                        time_format='%m/%d/%y')

    ######## remove invalid subjects (those who had samples with no dates or bad dates) ########
    # bad dates
    tmp = OtuMf.mapping_file.loc[OtuMf.mapping_file['Date_of_sample'].isin(['1800-01-01', '1900-01-01'])]
    patients_with_bad_date = tmp['PatientNumber210119'].unique()
    # remove bad dates
    OtuMf.mapping_file = OtuMf.mapping_file.loc[
        ~OtuMf.mapping_file['PatientNumber210119'].isin(patients_with_bad_date)]

    ######## Calculate time for event ########
    OtuMf.mapping_file['time_for_the_event'] = 9999
    col_to_group_by = 'PatientNumber210119'
    data_grouped = OtuMf.mapping_file.groupby(col_to_group_by)
    for subject_id, subject_data in data_grouped:
        if any(subject_data['SuccessDescription'] == 'A1'):  # Uncensored
            date_of_event = subject_data['Date_of_sample'].max()
            time_for_the_event = date_of_event - subject_data['Date_of_sample']
            tmp_df = OtuMf.mapping_file.loc[subject_data.index]
            tmp_df['time_for_the_event'] = time_for_the_event.apply(get_days)
            OtuMf.mapping_file.update(tmp_df)
        else:  # Censored
            pass

    ######## Filter allergies ########
    # allergy types: ['Sesame', 'Peanut', 'Egg', 'Non', 'Walnut', 'Milk', 'Cashew', 'Hazelnut']
    # OtuMf.mapping_file['AllergyTypeData131118'].value_counts()
    # Peanut      134
    # Milk        112
    # Sesame       80
    # Walnut       72
    # Egg          28
    # Cashew       18
    # Hazelnut      9
    # Non           9
    allergy_to_use = ['Peanut']
    OtuMf.mapping_file = OtuMf.mapping_file[OtuMf.mapping_file['AllergyTypeData131118'].isin(allergy_to_use)]

    ######## Create inputs ########
    # create groups
    col_to_group_by = 'PatientNumber210119'
    data_grouped = OtuMf.mapping_file.groupby(col_to_group_by)
    censored_data = {}
    not_censored = pd.DataFrame()
    y_for_deep = pd.DataFrame()
    x_for_deep = pd.DataFrame()
    x_for_deep_censored = pd.DataFrame()
    y_for_deep_censored = pd.DataFrame()

    def calculate_y_for_deep_per_row(row):
        a = row.sort_values()
        return a.index[0]

    for subject_id, subject_data in data_grouped:
        if 9999 in subject_data['time_for_the_event'].values:
            # censored
            tmp_data = subject_data.join(otu_after_pca_wo_taxonomy)
            tmp_data_only_valid = tmp_data.loc[tmp_data[0].notnull()]
            if not tmp_data_only_valid.empty:
                x_for_deep_censored = x_for_deep_censored.append(subject_data)
                tmp_data_only_valid.sort_values(by='Date_of_sample', ascending=True, inplace=True)
                tmp_data_only_valid['relative_start_date'] = (
                    tmp_data_only_valid['Date_of_sample'] - tmp_data_only_valid['Date_of_sample'].iloc[0]).apply(get_days)
                tmp_data_only_valid['relative_max_date'] = (
                    tmp_data_only_valid['Date_of_sample'].iloc[-1] - tmp_data_only_valid['Date_of_sample']).apply(get_days)
                tmp_data_only_valid['delta_time'] = -1
                tmp_data_only_valid['mse_coeff'] = 0
                tmp_data_only_valid['time_sense_coeff'] = 1
                y_for_deep_censored = y_for_deep_censored.append(
                    tmp_data_only_valid[['relative_start_date', 'delta_time', 'relative_max_date',
                                         'mse_coeff', 'time_sense_coeff']])
                # get only the last sample
                censored_data[subject_id] = tmp_data_only_valid.loc[
                    tmp_data_only_valid['relative_max_date'] == min(tmp_data_only_valid['relative_max_date'])]
        else:
            # not censored
            before_event_mask = subject_data['time_for_the_event'] > 0
            before_event_subjects = subject_data.loc[before_event_mask]
            if not before_event_subjects.empty:
                not_censored = not_censored.append(before_event_subjects)
                x_for_deep = x_for_deep.append(before_event_subjects)
                before_event_subjects.sort_values(by='time_for_the_event', ascending=False, inplace=True)
                before_event_subjects['relative_start_date'] = (
                    before_event_subjects['time_for_the_event'].iloc[0] - before_event_subjects['time_for_the_event'])
                before_event_subjects['relative_max_date'] = before_event_subjects['time_for_the_event']
                before_event_subjects['delta_time'] = before_event_subjects['time_for_the_event']
                before_event_subjects['mse_coeff'] = 1
                before_event_subjects['time_sense_coeff'] = 0
                y_for_deep = y_for_deep.append(
                    before_event_subjects[['relative_start_date', 'delta_time', 'relative_max_date',
                                           'mse_coeff', 'time_sense_coeff']])

    x_for_deep = x_for_deep.join(otu_after_pca_wo_taxonomy)
    x_for_deep = x_for_deep.loc[x_for_deep[0].notnull()]
    y_for_deep = y_for_deep.loc[x_for_deep.index]
    x_for_deep_censored = x_for_deep_censored.join(otu_after_pca_wo_taxonomy)
    x_for_deep_censored = x_for_deep_censored.loc[x_for_deep_censored[0].notnull()]
    y_for_deep_censored = y_for_deep_censored.loc[x_for_deep_censored.index]

    return x_for_deep, y_for_deep, x_for_deep_censored, y_for_deep_censored, censored_data, not_censored, otu_after_pca_wo_taxonomy, OtuMf
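
# Small, self-contained illustration (assumed semantics, not repo code) of the
# censoring convention above: 'time_for_the_event' is initialised to the
# sentinel 9999 and overwritten only for subjects with an 'A1' success record,
# so any subject whose group still contains 9999 is routed to the censored branch.
import pandas as pd

demo = pd.DataFrame({
    'PatientNumber210119': [1, 1, 2, 2],
    'time_for_the_event': [30, 0, 9999, 9999],  # subject 2 never reached the event
})
for subject_id, subject_data in demo.groupby('PatientNumber210119'):
    is_censored = 9999 in subject_data['time_for_the_event'].values
    print(subject_id, 'censored' if is_censored else 'uncensored')
# -> 1 uncensored / 2 censored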
from sklearn import svm
# from sklearn.svm import SV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
import xgboost as xgb
import datetime

from gvhd.show_data import calc_results
from gvhd.calculate_distances import calculate_distance
from gvhd.cluster_time_events import cluster_based_on_time

n_components = 20

OtuMf = OtuMfHandler('otu_IBD_table.csv', 'metadata_ok94_ok59.csv', from_QIIME=True)
preproccessed_data = preprocess_data(OtuMf.otu_file, visualize_data=True, taxnomy_level=5, preform_taxnomy_group=False)
otu_after_pca_wo_taxonomy, _ = apply_pca(preproccessed_data, n_components=n_components, visualize=False)
# otu_after_pca = OtuMf.add_taxonomy_col_to_new_otu_data(otu_after_pca_wo_taxonomy)
# merged_data_after_pca = OtuMf.merge_mf_with_new_otu_data(otu_after_pca_wo_taxonomy)
# merged_data_with_age = otu_after_pca_wo_taxonomy.join(OtuMf.mapping_file['age_in_days'])
# merged_data_with_age = merged_data_with_age[merged_data_with_age.age_in_days.notnull()]  # remove NaN days
# merged_data_with_age_group = otu_after_pca_wo_taxonomy.join(OtuMf.mapping_file[['age_group', 'age_in_days', 'MouseNumber']])
# merged_data_with_age_group = merged_data_with_age_group[merged_data_with_age_group.age_group.notnull()]  # remove NaN days
        # trailing arguments of a two-series bar-plot call
        x_label='Bacteria',
        y1_data=means, y1_color='#539caf', y1_label='Means',
        y2_data=normal_means, y2_color='#7663b0', y2_label='Normalized means',
        title='before_and_after_z_score_per_person_bar_plot')

# samples_data_file = 'mf_merge_ok84_ok93_ok66_69_merged_by_RestoredSampleCode_as_ID_290119.csv'
OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, bactria_as_feature_file),
                     os.path.join(SCRIPT_DIR, samples_data_file),
                     from_QIIME=True, id_col='Feature ID', taxonomy_col='Taxonomy')
preproccessed_data = preprocess_data(OtuMf.otu_file, visualize_data=False, taxnomy_level=5,
                                     taxonomy_col='Taxonomy', preform_taxnomy_group=True)
otu_after_pca_wo_taxonomy, _, _ = apply_pca(preproccessed_data, n_components=n_components, visualize=False)

# Remove control group == 'Con'
tag_column = 'SuccessDescription'
X = otu_after_pca_wo_taxonomy.index.tolist()
    plt.title(r'$1-\rho$ vs params')
    plt.xlabel('sample #')
    plt.ylabel(r'$1-\rho$ value')


def predict_get_spearman_value(test_set, regressor):
    test_df = pd.DataFrame(test_set['age_in_days'])
    test_df['predicted'] = regressor.predict(test_set.loc[:, test_set.columns != 'age_in_days'])
    spearman_values = use_spearmanr(test_set['age_in_days'].values, test_df['predicted'].values)
    return test_df, spearman_values


if __name__ == "__main__":
    OtuMf = OtuMfHandler('aging_otu_table.csv', 'mf.csv', from_QIIME=True)
    preproccessed_data = preprocess_data(OtuMf.otu_file, visualize_data=True, taxnomy_level=5)
    otu_after_pca_wo_taxonomy, _ = apply_pca(preproccessed_data, n_components=80)
    # otu_after_pca = OtuMf.add_taxonomy_col_to_new_otu_data(otu_after_pca_wo_taxonomy)
    # merged_data_after_pca = OtuMf.merge_mf_with_new_otu_data(otu_after_pca_wo_taxonomy)
    merged_data_with_age = otu_after_pca_wo_taxonomy.join(OtuMf.mapping_file['age_in_days'])
    merged_data_with_age = merged_data_with_age[merged_data_with_age.age_in_days.notnull()]  # remove NaN days

    # create train set and test set
    merged_data_with_age = merged_data_with_age.sample(frac=1)
    train_size = math.ceil(merged_data_with_age.shape[0] * 0.85)
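
# use_spearmanr above is a repo helper whose body is not shown in this excerpt.
# A minimal stand-in consistent with how it is called (two value arrays in, a
# correlation result out) could wrap scipy.stats.spearmanr; this sketch is an
# assumption, not the original implementation.
from scipy.stats import spearmanr


def use_spearmanr_sketch(x, y):
    # returns the rank correlation coefficient and its p-value
    rho, p_value = spearmanr(x, y)
    return {'rho': rho, 'pvalue': p_value}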
def _read_file(self, title, bactria_as_feature_file, samples_data_file, allow_printing, perform_anna_preprocess):
    features = pd.read_csv(bactria_as_feature_file, header=1)
    cols = list(features.columns)
    # remove non-numeric values
    cols.remove('Feature ID')
    cols.remove('Taxonomy')
    OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, bactria_as_feature_file),
                         os.path.join(SCRIPT_DIR, samples_data_file),
                         from_QIIME=True, id_col='Feature ID', taxonomy_col='Taxonomy')
    preproccessed_data = preprocess_data(OtuMf.otu_file, visualize_data=False, taxnomy_level=self._taxnomy_level,
                                         taxonomy_col='Taxonomy', preform_taxnomy_group=True)
    self._preproccessed_data = preproccessed_data
    # drow_data(preproccessed_data)
    # otu_after_pca_wo_taxonomy, _, _ = apply_pca(data_after_log_zcore, n_components=40, visualize=False)
    otu_after_pca_wo_taxonomy, pca_obj, _ = apply_pca(preproccessed_data, n_components=n_components, visualize=False)
    self._pca_obj = pca_obj

    index_to_id_map = {}
    id_to_features_map = {}
    for i, row in enumerate(otu_after_pca_wo_taxonomy.values):
        id_to_features_map[otu_after_pca_wo_taxonomy.index[i]] = row
        index_to_id_map[i] = otu_after_pca_wo_taxonomy.index[i]
    self._index_to_id_map = index_to_id_map
    self._id_to_features_map = id_to_features_map

    ids_list = otu_after_pca_wo_taxonomy.index.tolist()
    self._ids_list = ids_list

    XXXXX_column = 'SuccessDescription'
    id_to_tag_map = {}
    for sample in ids_list:
        t = OtuMf.mapping_file.loc[sample, XXXXX_column]
        # save tags as success ('A1') -> 1, anything else -> 0
        if t == 'A1':
            id_to_tag_map[sample] = 1
        else:
            id_to_tag_map[sample] = 0
    self._id_to_tag_map = id_to_tag_map

    # -------------------------------------------- weights ! --------------------------------------------
    # calculate weights
    y = list(id_to_tag_map.values())
    classes_sum = [np.sum(np.array(y) == unique_class) for unique_class in np.unique(np.array(y))]
    classes_ratio = [1 - (a / sum(classes_sum)) for a in classes_sum]
    weights = [classes_ratio[a] for a in np.array(y)]
    self._weight_map = classes_ratio

    # return the list of features and the list of ids in the same order
    feature_list = [id_to_features_map[id] for id in ids_list]
    self._feature_list = feature_list
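
# Worked numeric example (values are illustrative) of the class weighting just
# above: classes_sum counts each label, classes_ratio maps a label to
# 1 - count/total, and every sample inherits its label's ratio. Indexing
# classes_ratio by the label itself works because the labels are 0/1 and
# np.unique returns them in sorted order.
import numpy as np

y = [0, 0, 0, 1]                                                    # three failures, one success
classes_sum = [np.sum(np.array(y) == c) for c in np.unique(y)]      # [3, 1]
classes_ratio = [1 - (a / sum(classes_sum)) for a in classes_sum]   # [0.25, 0.75]
weights = [classes_ratio[a] for a in np.array(y)]                   # [0.25, 0.25, 0.25, 0.75]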
def get_days(days_datetime):
    return days_datetime.days


n_components = 20
taxnomy_level = 3
n = n_components
file_name = f'combined_only_report_n_comps_{n_components}_taxonomy_level_{taxnomy_level}_using_ronies_and_{n}_pca'
use_recorded = False
script_dir = sys.path[0]

if not use_recorded:
    # only the last of these three handlers takes effect; the first two are
    # earlier datasets left in place and immediately overwritten
    OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, 'ronies_Data', 'saliva_samples_231018.csv'),
                         os.path.join(SCRIPT_DIR, 'ronies_Data', 'saliva_samples_mapping_file_231018.csv'),
                         from_QIIME=True)
    OtuMf = OtuMfHandler(
        os.path.join(SCRIPT_DIR, 'learn dataset', 'mucositis_first_table_260219.csv'),
        os.path.join(SCRIPT_DIR, 'learn dataset', 'mucositis_mapping_file_first_250219_numbers.csv'),
        from_QIIME=True)
    OtuMf = OtuMfHandler(
        os.path.join(SCRIPT_DIR, 'combined_data', 'dataset', 'mucositis_combine_table_260219.csv'),
        os.path.join(SCRIPT_DIR, 'combined_data', 'dataset', 'mucositis_mapping_file_combine_250219_numbers.csv'),