Python preprocess_data Examples

Programming Language: Python

Namespace/Package Name: infra_functions.preprocess

Method/Function: preprocess_data

Examples at hotexamples.com: 8

Python preprocess_data - 8 examples found. These are the top rated real world Python examples of infra_functions.preprocess.preprocess_data extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

    def _read_file(self, TITLE, PRINT, REG):
        """Load the allergy OTU table, reduce it with PCA and build tag/weight maps.

        The method reads the hard-coded feature table and mapping file, runs
        ``preprocess_data`` and ``apply_pca`` on the OTU table, and stores on
        ``self`` a collection of lookup dictionaries (sample id -> features,
        stage, success tag, allergy type, ...) together with normalised class
        weights for several classification tasks.

        Args:
            TITLE: Not used by this method.
            PRINT: Not used by this method.
            REG: When truthy, ``self.reg(features, cols)`` is called on the raw
                feature table before any further processing.

        Returns:
            A tuple ``(ids_list_w_con, ids_list_wo_con, feature_list)``: all
            sample ids, the sample ids with the first 62 rows dropped, and the
            PCA feature rows ordered like ``ids_list_w_con``.

        Raises:
            ZeroDivisionError: If any class used for weighting has no samples
                (e.g. no 'Milk' samples, or no successful stage-0 samples).

        Note:
            ``n_components`` is not a parameter; it is read from the enclosing
            (module) scope.
        """
        bactria_as_feature_file = 'feature-table_Allergy_cleaned_taxa_290119_updated_in_140219.csv'
        samples_data_file = 'mf_merge_ok84_ok93_ok66_69_merged_by_RestoredSampleCode_as_ID_290119.csv'
        # The first 62 rows of the PCA output are treated as the control
        # ('Con') group -- presumably a property of this specific data file.
        n_control_samples = 62

        # NOTE(review): this read is relative to the current working directory,
        # whereas OtuMfHandler below reads the same file from SCRIPT_DIR.
        features = pd.read_csv(bactria_as_feature_file, header=1)
        cols = list(features.columns)
        # remove non-numeric values
        cols.remove('Feature ID')
        cols.remove('Taxonomy')

        if REG:
            self.reg(features, cols)

        OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, bactria_as_feature_file),
                             os.path.join(SCRIPT_DIR, samples_data_file),
                             from_QIIME=True,
                             id_col='Feature ID',
                             taxonomy_col='Taxonomy')

        preproccessed_data = preprocess_data(OtuMf.otu_file,
                                             visualize_data=False,
                                             taxnomy_level=6,
                                             taxonomy_col='Taxonomy',
                                             preform_taxnomy_group=True)
        self._preproccessed_data = preproccessed_data

        otu_after_pca_wo_taxonomy, pca_obj, _ = apply_pca(
            preproccessed_data, n_components=n_components, visualize=False)
        self._pca_obj = pca_obj

        index_to_id_map = {}
        id_to_features_map = {}
        for i, row in enumerate(otu_after_pca_wo_taxonomy.values):
            sample_id = otu_after_pca_wo_taxonomy.index[i]
            id_to_features_map[sample_id] = row
            index_to_id_map[i] = sample_id

        self._index_to_id_map = index_to_id_map
        self._id_to_features_map = id_to_features_map

        success_tag_column = 'SuccessDescription'
        stages_column = 'TreatmentTimePoint'
        allergan_column = 'AllergyType'

        ids_list_w_con = otu_after_pca_wo_taxonomy.index.tolist()
        ids_list_wo_con = otu_after_pca_wo_taxonomy.index.drop(
            otu_after_pca_wo_taxonomy.index[0:n_control_samples])

        self._ids_list_w_con = ids_list_w_con
        self._ids_list_wo_con = ids_list_wo_con

        # ##### separate samples by allergic and healthy==>'Con'
        id_to_binary_health_tag_map = {
            sample: 1 if sample.startswith('Con') else 0
            for sample in ids_list_w_con
        }
        self._id_to_binary_health_tag_map = id_to_binary_health_tag_map

        # ##### separate samples by stage, success of treatment and allergen type
        id_to_success_tag_map = {}
        id_to_stage_map = {}
        id_to_binary_success_tag_map = {}
        id_to_allergy_type_tag_map = {}
        id_to_allergy_number_type_tag_map = {}
        id_to_milk_allergy_tag_map = {}

        tag_to_allergy_type_map = {
            0: 'Milk',
            1: 'Tree_nut',  # 'Cashew' + 'Hazelnut' + 'Walnut'
            2: 'Peanut',
            3: 'Sesame'
        }  # removed 'Egg' samples

        allergy_type_to_instances_map = {
            'Milk': 0,
            'Tree_nut': 0,
            'Peanut': 0,
            'Sesame': 0
        }  # 'Non' and 'Egg' samples are collected separately and not counted

        # Raw mapping-file labels (including known spelling variants) mapped
        # to the numeric allergy tag used in tag_to_allergy_type_map.
        allergy_label_to_tag = {
            'Milk': 0,
            'Milk_suspected': 0,
            'milk': 0,
            'Cashew': 1,
            'Cashew ': 1,
            'Hazelnut': 1,
            'Walnut': 1,
            'Nuts': 1,
            'Peanut': 2,
            'Sesame': 3,
        }

        non_allergy_type_ids = []
        egg_allergy_type_ids = []
        for sample in ids_list_wo_con:
            # stage
            id_to_stage_map[sample] = OtuMf.mapping_file.loc[sample, stages_column]

            # success: k-class tag collapsed to success('A1')->1, rest->0
            t = OtuMf.mapping_file.loc[sample, success_tag_column]
            id_to_success_tag_map[sample] = t
            id_to_binary_success_tag_map[sample] = 1 if t == 'A1' else 0

            # allergy type
            a = OtuMf.mapping_file.loc[sample, allergan_column]
            id_to_allergy_type_tag_map[sample] = a

            if a in allergy_label_to_tag:
                tag = allergy_label_to_tag[a]
                id_to_allergy_number_type_tag_map[sample] = tag
                id_to_milk_allergy_tag_map[sample] = 1 if tag == 0 else 0
                allergy_type_to_instances_map[tag_to_allergy_type_map[tag]] += 1
            elif a == 'Egg':
                # Egg samples get no numeric/milk tag; only their ids are kept.
                egg_allergy_type_ids.append(sample)
            elif a == 'Non':
                # 'Non' samples get no numeric/milk tag; only their ids are kept.
                non_allergy_type_ids.append(sample)
            else:
                print("error in allergy type " + str(sample))

        # Computed once after the loop (it only depends on the finished
        # id_to_stage_map), so it is also defined when there are no samples.
        stage_0_ids = [
            key for key in id_to_stage_map
            if id_to_stage_map[key] == '0_before'
        ]
        self._stage_0_ids = stage_0_ids

        # -------------------------------------------- weights !--------------------------------------------
        def _normalized(weight_map):
            """Return a copy of weight_map with every weight divided by the maximum."""
            max_weight = max(weight_map.values())
            return {
                key: weight / max_weight
                for key, weight in weight_map.items()
            }

        # Each raw weight is total / class_count, so rarer classes weigh more.
        # weights for types of allergy
        total_sum = sum(allergy_type_to_instances_map.values())
        allergy_type_to_weight_map = _normalized({
            allergy_type: total_sum / count
            for allergy_type, count in allergy_type_to_instances_map.items()
        })

        # weights for milk vs. other types of allergy
        milk_count = allergy_type_to_instances_map['Milk']
        milk_vs_other_allergy_weight_map = _normalized({
            'Other': total_sum / (total_sum - milk_count),
            'Milk': total_sum / milk_count
        })

        # weights for healthy and allergic
        n_all = len(ids_list_w_con)
        n_allergic = len(ids_list_wo_con)
        healthy_vs_allergic_weight_map = _normalized({
            'Allergic': n_all / n_allergic,
            'Healthy': n_all / (n_all - n_allergic)
        })

        # weights for responding and not (success)
        success_tags = list(id_to_binary_success_tag_map.values())
        no_response = success_tags.count(0)
        yes_response = success_tags.count(1)
        responding_vs_not_weight_map = _normalized({
            'No': n_allergic / no_response,
            'Yes': n_allergic / yes_response
        })

        # weights for responding and not (prognostic): stage-0 samples only
        stage_0_tags = [id_to_binary_success_tag_map.get(i) for i in stage_0_ids]
        no_response = stage_0_tags.count(0)
        yes_response = stage_0_tags.count(1)
        prognostic_responding_vs_not_weight_map = _normalized({
            'No': len(stage_0_ids) / no_response,
            'Yes': len(stage_0_ids) / yes_response
        })

        # Build the exclusion set once instead of re-concatenating two lists
        # for every element.
        excluded_ids = set(non_allergy_type_ids) | set(egg_allergy_type_ids)
        self._id_wo_non_and_egg_allergy_type_list = [
            x for x in self._ids_list_wo_con if x not in excluded_ids
        ]
        self._tag_to_allergy_type_map = tag_to_allergy_type_map
        self._allergy_type_to_instances_map = allergy_type_to_instances_map
        self._allergy_type_to_weight_map = allergy_type_to_weight_map
        self._milk_vs_other_allergy_weight_map = milk_vs_other_allergy_weight_map
        self._healthy_vs_allergic_weight_map = healthy_vs_allergic_weight_map
        self._responding_vs_not_weight_map = responding_vs_not_weight_map
        self._prognostic_responding_vs_not_weight_map = prognostic_responding_vs_not_weight_map
        self._id_to_success_tag_map = id_to_success_tag_map
        self._id_to_stage_map = id_to_stage_map
        self._id_to_binary_success_tag_map = id_to_binary_success_tag_map
        self._id_to_allergy_type_tag_map = id_to_allergy_type_tag_map
        self._id_to_allergy_number_type_tag_map = id_to_allergy_number_type_tag_map
        self._id_to_milk_allergy_tag_map = id_to_milk_allergy_tag_map

        # return the list of features and the list of ids in the same order
        feature_list = [
            id_to_features_map[sample_id] for sample_id in ids_list_w_con
        ]
        return ids_list_w_con, ids_list_wo_con, feature_list

Example #2

Show file

File: prepare_data.py Project: dafnamagid/microbiome

def prepare_data(n_components = 20):
    """Build censored / uncensored time-to-event inputs from the saliva data set.

    Loads the saliva OTU table and mapping file, reduces the OTU table with
    ``preprocess_data`` + ``apply_pca``, derives the number of days between each
    sample and the mucositis start date, and splits the subjects (grouped by
    ``Personal_ID``) into censored ones (any sample flagged with the 9999
    sentinel) and uncensored ones.

    Args:
        n_components: Number of PCA components passed to ``apply_pca``.

    Returns:
        A tuple ``(x_for_deep, y_for_deep, x_for_deep_censored,
        y_for_deep_censored, censored_data, not_censored,
        otu_after_pca_wo_taxonomy, OtuMf)``. The ``x_*`` frames are mapping-file
        rows joined with the PCA components (rows without PCA data dropped),
        the ``y_*`` frames hold the target columns for the same rows,
        ``censored_data`` maps a subject id to its last valid sample(s) and
        ``not_censored`` holds the pre-event rows of uncensored subjects.
    """
    y_columns = ['relative_start_date', 'delta_time', 'relative_max_date', 'mse_coeff', 'time_sense_coeff']

    def _concat_or_empty(frames):
        """Concatenate the collected frames; an empty DataFrame if there are none."""
        return pd.concat(frames) if frames else pd.DataFrame()

    OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, 'saliva_samples_231018.csv'),
                         os.path.join(SCRIPT_DIR,
                                      'saliva_samples_mapping_file_231018.csv'), from_QIIME=True)
    preproccessed_data = preprocess_data(OtuMf.otu_file, visualize_data=True, taxnomy_level=5,
                                         preform_taxnomy_group=True)
    otu_after_pca_wo_taxonomy, _, _ = apply_pca(preproccessed_data, n_components=n_components, visualize=True)

    ######## Derive time-to-event columns on the mapping file ########
    mapping = OtuMf.mapping_file
    mapping['DATE_datetime'] = mapping['DATE'].apply(get_datetime)
    mapping['Mocosities_start_datetime'] = mapping['Mucositis_Start'].apply(get_datetime)
    mapping['TIME_BEFORE_MOCO_START'] = mapping['Mocosities_start_datetime'] - mapping['DATE_datetime']
    mapping['time_for_the_event'] = mapping['TIME_BEFORE_MOCO_START'].apply(get_days)

    # A start date of 01/01/1900 marks the sample as censored (9999).
    # NOTE(review): presumably get_datetime returns that date for missing
    # values -- confirm against its definition.
    # .loc replaces the original chained assignment, which pandas does not
    # guarantee to write through to the frame.
    no_event_mask = mapping['Mocosities_start_datetime'] == datetime.datetime.strptime('01/01/1900', '%d/%m/%Y')
    mapping.loc[no_event_mask, 'time_for_the_event'] = 9999

    # Frames are collected in lists and concatenated once at the end;
    # DataFrame.append was removed in pandas 2.0 and copied on every call.
    censored_data = {}
    not_censored_frames = []
    x_frames = []
    y_frames = []
    x_censored_frames = []
    y_censored_frames = []

    for subject_id, subject_data in mapping.groupby('Personal_ID'):
        if 9999 in subject_data['time_for_the_event'].values:  # censored
            tmp_data = subject_data.join(otu_after_pca_wo_taxonomy)
            # keep only samples that have PCA features; copy so that adding
            # columns below does not act on a slice of tmp_data
            valid = tmp_data.loc[tmp_data[0].notnull()].copy()
            if valid.empty:
                continue
            x_censored_frames.append(subject_data)

            valid['time_before_moco_start_days'] = valid['TIME_BEFORE_MOCO_START'].apply(get_days)
            valid = valid.sort_values(by='time_before_moco_start_days', ascending=False)
            valid['relative_start_date'] = (valid['time_before_moco_start_days'].iloc[0]
                                            - valid['time_before_moco_start_days'])
            # .iloc[-1]: the last row after sorting (positional, not a label)
            valid['relative_max_date'] = valid['relative_start_date'].iloc[-1] - valid['relative_start_date']
            valid['delta_time'] = -1
            valid['mse_coeff'] = 0
            valid['time_sense_coeff'] = 1
            y_censored_frames.append(valid[y_columns])

            # get only the last sample
            censored_data[subject_id] = valid.loc[
                valid['TIME_BEFORE_MOCO_START'] == min(valid['TIME_BEFORE_MOCO_START'])]

        else:  # not censored
            before_event_subjects = subject_data.loc[subject_data['time_for_the_event'] > 0]
            if before_event_subjects.empty:
                continue
            # x / not_censored receive the rows without the helper columns
            # that are added below, so those are computed on a separate copy.
            not_censored_frames.append(before_event_subjects)
            x_frames.append(before_event_subjects)

            timeline = before_event_subjects.copy()
            timeline['time_before_moco_start_days'] = timeline['TIME_BEFORE_MOCO_START'].apply(get_days)
            timeline = timeline.sort_values(by='time_before_moco_start_days', ascending=False)
            timeline['relative_start_date'] = (timeline['time_before_moco_start_days'].iloc[0]
                                               - timeline['time_before_moco_start_days'])
            timeline['relative_max_date'] = timeline['relative_start_date'] + timeline['time_before_moco_start_days']
            timeline['delta_time'] = timeline['time_for_the_event']
            timeline['mse_coeff'] = 1
            timeline['time_sense_coeff'] = 0
            y_frames.append(timeline[y_columns])

    not_censored = _concat_or_empty(not_censored_frames)

    # Attach PCA features and drop rows without them (column 0 is the first
    # PCA component); y is re-indexed to the surviving x rows.
    x_for_deep = _concat_or_empty(x_frames).join(otu_after_pca_wo_taxonomy)
    x_for_deep = x_for_deep.loc[x_for_deep[0].notnull()]
    y_for_deep = _concat_or_empty(y_frames).loc[x_for_deep.index]

    x_for_deep_censored = _concat_or_empty(x_censored_frames).join(otu_after_pca_wo_taxonomy)
    x_for_deep_censored = x_for_deep_censored.loc[x_for_deep_censored[0].notnull()]
    y_for_deep_censored = _concat_or_empty(y_censored_frames).loc[x_for_deep_censored.index]

    return x_for_deep, y_for_deep, x_for_deep_censored, y_for_deep_censored, censored_data, not_censored, otu_after_pca_wo_taxonomy, OtuMf

Example #3

Show file

                date_str = '01/01/1800'
                return datetime.datetime.strptime(date_str, '%d/%m/%Y')


def get_days(days_datetime):
    """Return the ``days`` attribute of *days_datetime* (e.g. a ``timedelta``)."""
    whole_days = days_datetime.days
    return whole_days


# Number of PCA components requested from apply_pca below.
n_components = 20
# When False, the data set is rebuilt from the raw CSV files in the block below.
use_recorded = False

if not use_recorded:
    # Load the allergy OTU table and its mapping file, then preprocess the OTU
    # table and reduce it with PCA.
    OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, 'feature-table_Allergy_cleaned_taxa_290119_updated_in_140219.csv'),
                         os.path.join(SCRIPT_DIR, 'mf_merge_ok84_ok93_ok66_69_merged_by_RestoredSampleCode_as_ID_290119.csv'),
                         from_QIIME=True, id_col='Feature ID', taxonomy_col='Taxonomy')
    preproccessed_data = preprocess_data(OtuMf.otu_file, visualize_data=False, taxnomy_level=5, taxonomy_col='Taxonomy',
                                         preform_taxnomy_group=True)
    otu_after_pca_wo_taxonomy, _, _ = apply_pca(preproccessed_data, n_components=n_components, visualize=False)

    ######## Pre-process: remove the control group (rows tagged 'Con') ########
    # NOTE(review): column_to_use_for_filter is assigned but the filter below
    # repeats the column name literally instead of using it.
    column_to_use_for_filter = 'AllergyTypeData131118'
    OtuMf.mapping_file = OtuMf.mapping_file.loc[OtuMf.mapping_file['AllergyTypeData131118'] != 'Con']

    ######## Convert the sample date column to datetime values ########
    date_of_sample_col = 'Date'
    OtuMf.mapping_file['Date_of_sample'] = OtuMf.mapping_file[date_of_sample_col].apply(get_datetime, time_format='%m/%d/%y')

    ######## Remove invalid subjects (those who had samples with no dates or bad dates) ########
    # Rows whose date equals one of the two placeholder dates are "bad";
    # presumably get_datetime emits these placeholders for unparsable or
    # missing dates -- confirm against its definition.
    tmp = OtuMf.mapping_file.loc[OtuMf.mapping_file['Date_of_sample'].isin(['1800-01-01', '1900-01-01'])]
    # Every patient with at least one bad-dated sample.
    patients_with_bad_date = tmp['PatientNumber210119'].unique()
    # remove bad dates

Example #4

Show file

File: analysis_using_tf_my_loss.py Project: dafnamagid/microbiome

    return datetime.datetime.strptime(date_str, '%d/%m/%Y')


def get_days(days_datetime):
    """Return the ``days`` attribute of *days_datetime* (e.g. a ``timedelta``)."""
    whole_days = days_datetime.days
    return whole_days


# Number of PCA components requested from apply_pca below.
n_components = 20

# With use_recorded set to True the rebuild block below is skipped.
use_recorded = True

if not use_recorded:

    # Load the saliva OTU table and its mapping file, preprocess the OTU table
    # and reduce it with PCA.
    OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, 'saliva_samples_231018.csv'),
                         os.path.join(SCRIPT_DIR, 'saliva_samples_mapping_file_231018.csv'), from_QIIME=True)
    preproccessed_data = preprocess_data(OtuMf.otu_file, visualize_data=False, taxnomy_level=5)
    # NOTE(review): the result is unpacked into two values here; confirm this
    # matches the number of values apply_pca returns in the version in use.
    otu_after_pca_wo_taxonomy, _ = apply_pca(preproccessed_data, n_components=n_components, visualize=False)
    # otu_after_pca = OtuMf.add_taxonomy_col_to_new_otu_data(otu_after_pca_wo_taxonomy)
    # merged_data_after_pca = OtuMf.merge_mf_with_new_otu_data(otu_after_pca_wo_taxonomy)
    # merged_data_with_age = otu_after_pca_wo_taxonomy.join(OtuMf.mapping_file['age_in_days'])
    # merged_data_with_age = merged_data_with_age[merged_data_with_age.age_in_days.notnull()] # remove NaN days
    # merged_data_with_age_group = otu_after_pca_wo_taxonomy.join(OtuMf.mapping_file[['age_group', 'age_in_days','MouseNumber']])
    # merged_data_with_age_group = merged_data_with_age_group[merged_data_with_age_group.age_group.notnull()] # remove NaN days

    # OtuMf.mapping_file.apply(lambda x: -999 if x['Mucositis_Start'] is None else (datetime.datetime.strptime(x['DATE'], '%d/%m/%Y') - datetime.datetime.strptime(x['Mucositis_Start'], '%d/%m/%Y')).days)


    # Convert the sample date and the mucositis start date with get_datetime,
    # then store their difference (start minus sample date) per row.
    OtuMf.mapping_file['DATE_datetime'] = OtuMf.mapping_file['DATE'].apply(get_datetime)
    OtuMf.mapping_file['Mocosities_start_datetime'] = OtuMf.mapping_file['Mucositis_Start'].apply(get_datetime)
    OtuMf.mapping_file['TIME_BEFORE_MOCO_START'] = OtuMf.mapping_file['Mocosities_start_datetime'] - OtuMf.mapping_file[
        'DATE_datetime']

Example #5

Show file

def prepare_data(n_components=20):
    """Build censored / uncensored time-to-event inputs from the allergy data set.

    Loads the allergy OTU table and mapping file, reduces the OTU table with
    ``preprocess_data`` + ``apply_pca``, removes control samples and patients
    with placeholder dates, computes for every patient that reached the 'A1'
    success tag the number of days from each sample to the patient's last
    sample, keeps only the 'Peanut' allergy rows, and finally splits the
    patients into censored ones (any sample still carrying the 9999 sentinel)
    and uncensored ones.

    Args:
        n_components: Number of PCA components passed to ``apply_pca``.

    Returns:
        A tuple ``(x_for_deep, y_for_deep, x_for_deep_censored,
        y_for_deep_censored, censored_data, not_censored,
        otu_after_pca_wo_taxonomy, OtuMf)``. The ``x_*`` frames are mapping-file
        rows joined with the PCA components (rows without PCA data dropped),
        the ``y_*`` frames hold the target columns for the same rows,
        ``censored_data`` maps a patient id to its last valid sample(s) and
        ``not_censored`` holds the pre-event rows of uncensored patients.
    """
    allergy_col = 'AllergyTypeData131118'
    patient_col = 'PatientNumber210119'
    sample_date_col = 'Date_of_sample'
    event_col = 'time_for_the_event'
    y_columns = [
        'relative_start_date', 'delta_time', 'relative_max_date',
        'mse_coeff', 'time_sense_coeff'
    ]

    def _concat_or_empty(frames):
        """Concatenate the collected frames; an empty DataFrame if there are none."""
        return pd.concat(frames) if frames else pd.DataFrame()

    OtuMf = OtuMfHandler(
        os.path.join(
            SCRIPT_DIR,
            'feature-table_Allergy_cleaned_taxa_290119_updated_in_140219.csv'),
        os.path.join(
            SCRIPT_DIR,
            'mf_merge_ok84_ok93_ok66_69_merged_by_RestoredSampleCode_as_ID_290119.csv'
        ),
        from_QIIME=True,
        id_col='Feature ID',
        taxonomy_col='Taxonomy')
    preproccessed_data = preprocess_data(OtuMf.otu_file,
                                         visualize_data=False,
                                         taxnomy_level=5,
                                         taxonomy_col='Taxonomy',
                                         preform_taxnomy_group=True)
    otu_after_pca_wo_taxonomy, _, _ = apply_pca(preproccessed_data,
                                                n_components=n_components,
                                                visualize=False)

    ######## Pre process (Remove control group) ########
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the original mapping file.
    OtuMf.mapping_file = OtuMf.mapping_file.loc[
        OtuMf.mapping_file[allergy_col] != 'Con'].copy()

    ######## get date of sample in date format ########
    OtuMf.mapping_file[sample_date_col] = OtuMf.mapping_file['Date'].apply(
        get_datetime, time_format='%m/%d/%y')

    ######## remove invalid subjects (those who had samples with no dates or bad dates) ########
    # NOTE(review): the two placeholder dates are presumably what get_datetime
    # returns for missing / unparsable dates -- confirm against its definition.
    bad_date_rows = OtuMf.mapping_file.loc[
        OtuMf.mapping_file[sample_date_col].isin(['1800-01-01', '1900-01-01'])]
    patients_with_bad_date = bad_date_rows[patient_col].unique()
    OtuMf.mapping_file = OtuMf.mapping_file.loc[
        ~OtuMf.mapping_file[patient_col].isin(patients_with_bad_date)].copy()

    ######## Calculate time for event ########
    # 9999 is the "censored" sentinel; it is overwritten only for patients
    # that have at least one sample tagged 'A1'.
    OtuMf.mapping_file[event_col] = 9999
    for subject_id, subject_data in OtuMf.mapping_file.groupby(patient_col):
        if any(subject_data['SuccessDescription'] == 'A1'):  # Uncensored
            # the event date is taken to be the patient's latest sample date
            date_of_event = subject_data[sample_date_col].max()
            time_for_the_event = date_of_event - subject_data[sample_date_col]
            tmp_df = OtuMf.mapping_file.loc[subject_data.index].copy()
            tmp_df[event_col] = time_for_the_event.apply(get_days)
            OtuMf.mapping_file.update(tmp_df)

    ######## Filter alergies ########
    # allergy types ['Sesame', 'Peanut', 'Egg', 'Non', 'Walnut', 'Milk', 'Cashew', 'Hazelnut']
    # OtuMf.mapping_file['AllergyTypeData131118'].value_counts():
    # Peanut 134, Milk 112, Sesame 80, Walnut 72, Egg 28, Cashew 18,
    # Hazelnut 9, Non 9
    allergy_to_use = ['Peanut']
    OtuMf.mapping_file = OtuMf.mapping_file[
        OtuMf.mapping_file[allergy_col].isin(allergy_to_use)]

    ######## Create inputs ########
    # Frames are collected in lists and concatenated once at the end;
    # DataFrame.append was removed in pandas 2.0 and copied on every call.
    censored_data = {}
    not_censored_frames = []
    x_frames = []
    y_frames = []
    x_censored_frames = []
    y_censored_frames = []

    for subject_id, subject_data in OtuMf.mapping_file.groupby(patient_col):
        if 9999 in subject_data[event_col].values:  # censored
            tmp_data = subject_data.join(otu_after_pca_wo_taxonomy)
            # keep only samples that have PCA features; copy so that adding
            # columns below does not act on a slice of tmp_data
            valid = tmp_data.loc[tmp_data[0].notnull()].copy()
            if valid.empty:
                continue
            x_censored_frames.append(subject_data)

            valid = valid.sort_values(by=sample_date_col, ascending=True)
            sample_dates = valid[sample_date_col]
            # days since the patient's first valid sample
            valid['relative_start_date'] = (
                sample_dates - sample_dates.iloc[0]).apply(get_days)
            # days until the patient's last valid sample
            valid['relative_max_date'] = (
                sample_dates.iloc[-1] - sample_dates).apply(get_days)
            valid['delta_time'] = -1
            valid['mse_coeff'] = 0
            valid['time_sense_coeff'] = 1
            y_censored_frames.append(valid[y_columns])

            # get only the last sample
            censored_data[subject_id] = valid.loc[
                valid['relative_max_date'] == min(valid['relative_max_date'])]

        else:  # not censored
            before_event_subjects = subject_data.loc[
                subject_data[event_col] > 0]
            if before_event_subjects.empty:
                continue
            # x / not_censored receive the rows without the helper columns
            # that are added below, so those are computed on a separate copy.
            not_censored_frames.append(before_event_subjects)
            x_frames.append(before_event_subjects)

            timeline = before_event_subjects.copy()
            timeline = timeline.sort_values(by=event_col, ascending=False)
            timeline['relative_start_date'] = (
                timeline[event_col].iloc[0] - timeline[event_col])
            timeline['relative_max_date'] = timeline[event_col]
            timeline['delta_time'] = timeline[event_col]
            timeline['mse_coeff'] = 1
            timeline['time_sense_coeff'] = 0
            y_frames.append(timeline[y_columns])

    not_censored = _concat_or_empty(not_censored_frames)

    # Attach PCA features and drop rows without them (column 0 is the first
    # PCA component); y is re-indexed to the surviving x rows.
    x_for_deep = _concat_or_empty(x_frames).join(otu_after_pca_wo_taxonomy)
    x_for_deep = x_for_deep.loc[x_for_deep[0].notnull()]
    y_for_deep = _concat_or_empty(y_frames).loc[x_for_deep.index]

    x_for_deep_censored = _concat_or_empty(x_censored_frames).join(
        otu_after_pca_wo_taxonomy)
    x_for_deep_censored = x_for_deep_censored.loc[
        x_for_deep_censored[0].notnull()]
    y_for_deep_censored = _concat_or_empty(y_censored_frames).loc[
        x_for_deep_censored.index]

    return x_for_deep, y_for_deep, x_for_deep_censored, y_for_deep_censored, censored_data, not_censored, otu_after_pca_wo_taxonomy, OtuMf

Example #6

Show file

File: analysis.py Project: dafnamagid/microbiome

from sklearn import svm
# from sklearn.svm import SV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import datetime
from gvhd.show_data import calc_results
from gvhd.calculate_distances import calculate_distance
from gvhd.cluster_time_events import cluster_based_on_time

# Number of PCA components requested from apply_pca below.
n_components = 20
# Load the IBD OTU table and its metadata; both paths are relative to the
# current working directory.
OtuMf = OtuMfHandler('otu_IBD_table.csv',
                     'metadata_ok94_ok59.csv',
                     from_QIIME=True)
# Preprocess the OTU table with visualisation enabled and taxonomy grouping
# disabled.
preproccessed_data = preprocess_data(OtuMf.otu_file,
                                     visualize_data=True,
                                     taxnomy_level=5,
                                     preform_taxnomy_group=False)
# NOTE(review): the result is unpacked into two values here; confirm this
# matches the number of values apply_pca returns in the version in use.
otu_after_pca_wo_taxonomy, _ = apply_pca(preproccessed_data,
                                         n_components=n_components,
                                         visualize=False)
# otu_after_pca = OtuMf.add_taxonomy_col_to_new_otu_data(otu_after_pca_wo_taxonomy)
# merged_data_after_pca = OtuMf.merge_mf_with_new_otu_data(otu_after_pca_wo_taxonomy)
# merged_data_with_age = otu_after_pca_wo_taxonomy.join(OtuMf.mapping_file['age_in_days'])
# merged_data_with_age = merged_data_with_age[merged_data_with_age.age_in_days.notnull()] # remove NaN days
# merged_data_with_age_group = otu_after_pca_wo_taxonomy.join(OtuMf.mapping_file[['age_group', 'age_in_days','MouseNumber']])
# merged_data_with_age_group = merged_data_with_age_group[merged_data_with_age_group.age_group.notnull()] # remove NaN days

Example #7

Show file

File: example_data_set.py Project: sharon200102/microbiome

    def _read_file(self, title, bactria_as_feature_file, samples_data_file,
                   allow_printing, perform_anna_preprocess):
        """Load an OTU table, reduce it with PCA and build binary tags and class weights.

        The results are stored on ``self`` (``_preproccessed_data``,
        ``_pca_obj``, ``_index_to_id_map``, ``_id_to_features_map``,
        ``_ids_list``, ``_id_to_tag_map``, ``_weight_map``, ``_feature_list``);
        nothing is returned.

        Args:
            title: Not used by this method.
            bactria_as_feature_file: Path of the feature (OTU) table CSV. It is
                read once as given and once joined onto ``SCRIPT_DIR``.
            samples_data_file: Path of the mapping file, joined onto
                ``SCRIPT_DIR``.
            allow_printing: Not used by this method.
            perform_anna_preprocess: Not used by this method.

        Raises:
            ValueError: If the feature table lacks a 'Feature ID' or
                'Taxonomy' column.

        Note:
            ``n_components`` is not a parameter; it is read from the enclosing
            (module) scope. The taxonomy level comes from
            ``self._taxnomy_level``.
        """
        # NOTE(review): the parsed table is only used to check that the two
        # non-numeric columns exist; the data itself is loaded again by
        # OtuMfHandler below (from SCRIPT_DIR rather than the given path).
        features = pd.read_csv(bactria_as_feature_file, header=1)
        cols = list(features.columns)
        # remove non-numeric values
        cols.remove('Feature ID')
        cols.remove('Taxonomy')

        OtuMf = OtuMfHandler(os.path.join(SCRIPT_DIR, bactria_as_feature_file),
                             os.path.join(SCRIPT_DIR, samples_data_file),
                             from_QIIME=True,
                             id_col='Feature ID',
                             taxonomy_col='Taxonomy')

        preproccessed_data = preprocess_data(OtuMf.otu_file,
                                             visualize_data=False,
                                             taxnomy_level=self._taxnomy_level,
                                             taxonomy_col='Taxonomy',
                                             preform_taxnomy_group=True)
        self._preproccessed_data = preproccessed_data

        otu_after_pca_wo_taxonomy, pca_obj, _ = apply_pca(
            preproccessed_data, n_components=n_components, visualize=False)
        self._pca_obj = pca_obj

        index_to_id_map = {}
        id_to_features_map = {}
        for i, row in enumerate(otu_after_pca_wo_taxonomy.values):
            sample_id = otu_after_pca_wo_taxonomy.index[i]
            id_to_features_map[sample_id] = row
            index_to_id_map[i] = sample_id

        self._index_to_id_map = index_to_id_map
        self._id_to_features_map = id_to_features_map
        ids_list = otu_after_pca_wo_taxonomy.index.tolist()
        self._ids_list = ids_list

        # Binary tag per sample: 1 when the success description is 'A1', else 0.
        XXXXX_column = 'SuccessDescription'
        id_to_tag_map = {}
        for sample in ids_list:
            t = OtuMf.mapping_file.loc[sample, XXXXX_column]
            id_to_tag_map[sample] = 1 if t == 'A1' else 0
        self._id_to_tag_map = id_to_tag_map

        # -------------------------------------------- weights !--------------------------------------------
        # One weight per class present in the tags (in sorted class order):
        # 1 - class_frequency, so the rarer class gets the larger weight.
        y = np.array(list(id_to_tag_map.values()))
        classes_sum = [
            np.sum(y == unique_class) for unique_class in np.unique(y)
        ]
        total = sum(classes_sum)
        classes_ratio = [1 - (a / total) for a in classes_sum]
        # The former per-sample `weights` list was never used, and indexing
        # classes_ratio by label raised IndexError when only label 1 occurred.
        self._weight_map = classes_ratio

        # keep the list of features in the same order as the list of ids
        feature_list = [id_to_features_map[sample_id] for sample_id in ids_list]
        self._feature_list = feature_list

Example #8

Show file

File: analysis_for_rony_using_grid_search.py Project: dafnamagid/microbiome

    OtuMf = OtuMfHandler(
        os.path.join(SCRIPT_DIR, 'learn dataset',
                     'mucositis_first_table_260219.csv'),
        os.path.join(SCRIPT_DIR, 'learn dataset',
                     'mucositis_mapping_file_first_250219_numbers.csv'),
        from_QIIME=True)

    OtuMf = OtuMfHandler(
        os.path.join(SCRIPT_DIR, 'combined_data', 'dataset',
                     'mucositis_combine_table_260219.csv'),
        os.path.join(SCRIPT_DIR, 'combined_data', 'dataset',
                     'mucositis_mapping_file_combine_250219_numbers.csv'),
        from_QIIME=True)

    preproccessed_data = preprocess_data(OtuMf.otu_file,
                                         visualize_data=False,
                                         taxnomy_level=taxnomy_level)
    otu_after_pca_wo_taxonomy, pca_obj, pca_str = apply_pca(
        preproccessed_data, n_components=n_components, visualize=False)
    with open(f'{file_name}.txt', 'w') as f:
        f.write('-------------- REPORT --------------\n')
        f.write(f'Using taxonomy level of {taxnomy_level} \n')
        f.write(f'Using {n_components} PCA components \n')
        f.write(f'{pca_str}\n\n')

    with open(f'{file_name}_with_grid_results.txt', 'w') as f:
        f.write('-------------- REPORT --------------\n')
        f.write(f'Using taxonomy level of {taxnomy_level} \n')
        f.write(f'Using {n_components} PCA components \n')
        f.write(f'{pca_str}\n\n')