Python missing_values Examples, ml_utilities.missing_values Python Examples

Example #1

0

Show file

File: logistic_lasso.py Project: varshakirani/thesis

               contrast_list))
    faces_list = list(
        filter(lambda x: 'Faces' in x and ('5' in x or '4' in x),
               contrast_list))
    relevant_mat_files = n_back_list + faces_list
    start = time()
    for mat_file in relevant_mat_files:
        print(mat_file)
        df1, df2, df3, contrast_name = tools.data_extraction(
            options.data, 2, mat_file, 'face_aal')
        df1 = shuffle(df1)
        df2 = shuffle(df2)
        df3 = shuffle(df3)

        # Combining two pairs off all combination
        df12 = df1.append(df2)
        df23 = df2.append(df3)
        df31 = df3.append(df1)

        # Handle missing values
        df12 = mlu.missing_values(df12)
        df23 = mlu.missing_values(df23)
        df31 = mlu.missing_values(df31)

        run_logistic_lasso(df12, contrast_name, 12, options.output)
        run_logistic_lasso(df23, contrast_name, 23, options.output)
        run_logistic_lasso(df31, contrast_name, 31, options.output)

    print("It took %0.3fs to do 100 times lasso on all contrasts" %
          (time() - start))

Example #2

0

Show file

File: no_gender.py Project: varshakirani/brain_data_analysis

def main():
    options = tools.parse_options()
    start = time.time()
    if os.path.isfile(
            options.input
    ):  # if results are already stored then use that as input
        scoresdf = pd.read_csv(options.input)
    else:  # in previous experiments, if results are not stored then create new dataframe to store the results
        scoresdf = pd.DataFrame(columns=[
            'Score', 'Type', 'Model', 'Classifier', 'Contrast_name',
            'Balanced_accuracy'
        ])

    mat_files = os.listdir(options.data)
    contrast_list = list(
        filter(None, filter(lambda x: re.search('.*_.....mat', x), mat_files)))
    n_back_list = list(
        filter(lambda x: 'nBack' in x and ('2' in x or '3' in x),
               contrast_list))
    faces_list = list(
        filter(lambda x: 'Faces' in x and ('5' in x or '4' in x or '3' in x),
               contrast_list))
    relevant_contrast_list = n_back_list + faces_list  # extracted nBack 2,3 and Faces 3,4,5 contrasts

    # Age and gender information along with subject id is extracted
    file = open(options.additional_data + "/subject_name.txt", "r")
    ids = file.read().split()
    ids = [int(float(id)) for id in ids]
    edf = pd.read_csv(options.additional_data + '/n300.csv')
    edf['subject_cont'] = ids
    edf = edf[['KJØNN', 'subject_cont', 'ALDER']]
    edf = edf.rename(columns={'KJØNN': 'gender', 'ALDER': 'age'})

    for contrast in relevant_contrast_list:
        contrast_name = contrast.split(".")[0]
        if len(scoresdf[scoresdf["Contrast_name"] == contrast_name]):
            continue

        for nClass in range(2, 4, 1):
            #  Considering all classes: Bipolar, Schizo and Control
            if nClass == 3:
                df, contrast_name = tools.data_extraction(
                    options.data, nClass, contrast, options.data_type)
                df = mlu.missing_values(df)
                df = pd.merge(df, edf, on=['subject_cont'], how='inner')
                scoresdf = run_no_gender_ml(df, options, 123, scoresdf,
                                            contrast_name)

            #  Considering combination of 2 classes: Bipolar-Schizo, Schizo-Control and Control-Bipolar
            elif nClass == 2:
                df1, df2, df3, contrast_name = tools.data_extraction(
                    options.data, nClass, contrast, options.data_type)

                # Combining two pairs off all combination
                df12 = df1.append(df2)
                df23 = df2.append(df3)
                df31 = df3.append(df1)

                # Handle missing values
                df12 = mlu.missing_values(df12)
                df23 = mlu.missing_values(df23)
                df31 = mlu.missing_values(df31)

                # Adding age and gender data for Standardization purpose. This additional data will be removed in
                # data preprocessing
                df12 = pd.merge(df12, edf, on=['subject_cont'], how='inner')
                df23 = pd.merge(df23, edf, on=['subject_cont'], how='inner')
                df31 = pd.merge(df31, edf, on=['subject_cont'], how='inner')

                scoresdf = run_no_gender_ml(df12, options, 12, scoresdf,
                                            contrast_name)
                scoresdf = run_no_gender_ml(df23, options, 23, scoresdf,
                                            contrast_name)
                scoresdf = run_no_gender_ml(df31, options, 31, scoresdf,
                                            contrast_name)

        scoresdf.to_csv(options.output + "no_gender_individual.csv",
                        index=False)

    print(
        "It took %s seconds to run %s iterations for %s model after removing gender effect"
        % (time.time() - start, options.number_iterations, options.model))

Example #3

0

Show file

def main():
    print("NI Thesis")
    options = tools.parse_options()
    start = time.time()

    if options.combine:
        o_subtitle = 'combined'
    else:
        o_subtitle = 'individual'

    if os.path.isfile(options.input):
        scoresdf = pd.read_csv(options.input)
    else:
        scoresdf = pd.DataFrame(columns=[
            'Score', 'Type', 'Model', 'Classifier', 'Contrast_name',
            'Balanced_accuracy'
        ])

    mat_files = os.listdir(options.data)
    print(mat_files)
    #To get matfiles which does not ends with 389.mat or 487.mat. Selecting only minified mat files like
    #contrast_list = ['Faces_con_0003.mat', 'Faces_con_0002.mat', 'Faces_con_0001.mat', 'Faces_con_0005.mat',
    #                 'Faces_con_0004.mat', 'nBack_con_0001.mat', 'nBack_con_0002.mat', 'nBack_con_0003.mat']

    contrast_list = list(
        filter(None, filter(lambda x: re.search('.*_.....mat', x), mat_files)))
    #TODO remove this for old Data
    contrast_list = mat_files
    combi_contrast = contrast_permutation(contrast_list)

    if options.combine:
        clist = combi_contrast
    else:
        clist = contrast_list

    for i in range(len(clist)):

        #Getting Contrast name
        if options.combine:
            c1_name = clist[i][0].split(".")[0]
            c2_name = clist[i][1].split(",")[0]
            contrast_name = c1_name + '&' + c2_name
        else:
            contrast_name = clist[i].split(".")[0]

        # Checking if the training is already made for the particular contrast
        # TODO Uncomment this for checking if contrast is present in the file
        if len(scoresdf[scoresdf['Contrast_name'] == contrast_name]):
            continue

        for nClass in range(2, 4, 1):

            if nClass == 3:

                # Read Data and put it into panda data frame. Initially considering only means
                if options.combine:
                    df, contrast_name = tools.combine_contrast(
                        options.data, nClass, clist[i][0], clist[i][1],
                        options.data_type)
                else:
                    df, contrast_name = tools.data_extraction(
                        options.data, nClass, clist[i], options.data_type)
                df = mlu.missing_values(df)
                scoresdf = run_basic_ml(df, options, 123, scoresdf,
                                        contrast_name)

            elif nClass == 2:

                if options.combine:
                    df1, df2, df3, contrast_name = tools.combine_contrast(
                        options.data, nClass, clist[i][0], clist[i][1],
                        options.data_type)

                else:
                    df1, df2, df3, contrast_name = tools.data_extraction(
                        options.data, nClass, clist[i], options.data_type)
                # Combining two pairs off all combination
                df12 = df1.append(df2)
                df23 = df2.append(df3)
                df31 = df3.append(df1)

                # Handle missing values
                df12 = mlu.missing_values(df12)
                df23 = mlu.missing_values(df23)
                df31 = mlu.missing_values(df31)

                scoresdf = run_basic_ml(df12, options, 12, scoresdf,
                                        contrast_name)
                scoresdf = run_basic_ml(df23, options, 23, scoresdf,
                                        contrast_name)
                scoresdf = run_basic_ml(df31, options, 31, scoresdf,
                                        contrast_name)

        scoresdf.to_csv(options.output + "%s.csv" % (o_subtitle), index=False)

    print("It took %s seconds to run %s iterations for %s model" %
          (time.time() - start, options.number_iterations, options.model))

    logger.info(
        "It took %s seconds to run %s iterations for all models for not normalized"
        % (time.time() - start, options.number_iterations))

Example #4

0

Show file

File: gender_age_cor.py Project: varshakirani/thesis

def main():
    options = tools.parse_options()
    start = time.time()
    if os.path.isfile(options.input):
        scoresdf = pd.read_csv(options.input)
    else:
        scoresdf = pd.DataFrame(columns=['Score', 'Type', 'Model', 'Classifier', 'Contrast_name', 'Balanced_accuracy'])

    if options.combine:
        o_subtitle = 'combined'
    else:
        o_subtitle = 'individual'

    ## Gender information and adding it as label to the data by linking the subject_cont
    file = open(options.additional_data + "/subject_name.txt", "r")
    ids = file.read().split()
    ids = [int(float(id)) for id in ids]
    gdf = pd.read_csv(options.additional_data + '/n300.csv')
    gdf['subject_cont'] = ids
    gdf = gdf[['KJØNN', 'subject_cont','ALDER']].copy()
    gdf = gdf.rename(columns={'KJØNN':'gender', 'ALDER':'age'})


    label = 'gender'
    label = 'age'
    label = options.age_gender

    mat_files = os.listdir(options.data)
    contrast_list = list(filter(None, filter(lambda x: re.search('.*_.....mat', x), mat_files)))
    n_back_list = list(filter(lambda x: 'nBack' in x and ('2' in x or '3' in x), contrast_list))
    faces_list = list(filter(lambda x: 'Faces' in x and ('5' in x or '4' in x or '3' in x), contrast_list))
    relevant_mat_files = n_back_list + faces_list
    relevant_mat_files = relevant_mat_files[0:2]
    print(relevant_mat_files)
    for mat_file in relevant_mat_files:
        print(mat_file)
        for nClass in range(2, 4, 1):
            if nClass == 3:
                df, contrast_name = tools.data_extraction(options.data, nClass, mat_file, options.data_type)
                # Adding Age and gender to the dataframe
                df = pd.merge(df, gdf, on=['subject_cont'], how='inner')
                df = mlu.missing_values(df)
                scoresdf = run_gender_cor(df, options, 123, scoresdf, contrast_name, label)

            elif nClass == 2:
                df1, df2, df3, contrast_name = tools.data_extraction(options.data, nClass, mat_file, options.data_type)

                #Adding Age and gender to the dataframe
                df1 = pd.merge(df1, gdf, on=['subject_cont'], how='inner')
                df2 = pd.merge(df2, gdf, on=['subject_cont'], how='inner')
                df3 = pd.merge(df3, gdf, on=['subject_cont'], how='inner')

                # Combining two pairs off all combination
                df12 = df1.append(df2)
                df23 = df2.append(df3)
                df31 = df3.append(df1)

                # Handle missing values
                df12 = mlu.missing_values(df12)
                df23 = mlu.missing_values(df23)
                df31 = mlu.missing_values(df31)

                df1 = mlu.missing_values(df1)
                df2 = mlu.missing_values(df2)
                df3 = mlu.missing_values(df3)


                scoresdf = run_gender_cor(df12, options, 12, scoresdf, contrast_name, label)
                scoresdf = run_gender_cor(df23, options, 23, scoresdf, contrast_name, label)
                scoresdf = run_gender_cor(df31, options, 31, scoresdf, contrast_name, label)

                scoresdf = run_gender_cor(df1, options, 1, scoresdf, contrast_name, label)
                scoresdf = run_gender_cor(df2, options, 2, scoresdf, contrast_name, label)
                scoresdf = run_gender_cor(df3, options, 3, scoresdf, contrast_name, label)
        scoresdf.to_csv(options.output + "%s.csv" % (o_subtitle), index=False)

    print("It took %s seconds to run %s iterations for %s model" % (time.time() - start, options.number_iterations,
                                                                    options.model))

Example #5

0

Show file

    label = options.age_gender

    mat_files = os.listdir(options.data)
    contrast_list = list(filter(None, filter(lambda x: re.search('.*_.....mat', x), mat_files)))
    n_back_list = list(filter(lambda x: 'nBack' in x and ('2' in x or '3' in x), contrast_list))
    faces_list = list(filter(lambda x: 'Faces' in x and ('5' in x or '4' in x or '3' in x), contrast_list))
    relevant_mat_files = n_back_list + faces_list

    for mat_file in relevant_mat_files:
        print(mat_file)
        for nClass in range(2, 4, 1):
            if nClass == 3:
                df, contrast_name = tools.data_extraction(options.data, nClass, mat_file, options.data_type)
                # Adding Age and gender to the dataframe
                df = pd.merge(df, gdf, on=['subject_cont'], how='inner')
                df = mlu.missing_values(df)
                scoresdf = run_gender_cor(df, options, 123, scoresdf, contrast_name, label)

            elif nClass == 2:
                df1, df2, df3, contrast_name = tools.data_extraction(options.data, nClass, mat_file, options.data_type)

                #Adding Age and gender to the dataframe
                df1 = pd.merge(df1, gdf, on=['subject_cont'], how='inner')
                df2 = pd.merge(df2, gdf, on=['subject_cont'], how='inner')
                df3 = pd.merge(df3, gdf, on=['subject_cont'], how='inner')

                # Combining two pairs off all combination
                df12 = df1.append(df2)
                df23 = df2.append(df3)
                df31 = df3.append(df1)

Example #6

0

Show file

def main():
    options = tools.parse_options()
    start = time.time()
    if options.combine:
        o_subtitle = 'combined'
    else:
        o_subtitle = 'individual'

    if os.path.isfile(
            options.input
    ):  # if results are already stored then use that as input
        scoresdf = pd.read_csv(options.input)
    else:  # in previous experiments, if results are not stored then create new dataframe to store the results
        scoresdf = pd.DataFrame(columns=[
            'Score', 'Type', 'Model', 'Classifier', 'Contrast_name',
            'Balanced_accuracy'
        ])

    mat_files = os.listdir(options.data)
    contrast_list = list(
        filter(None, filter(lambda x: re.search('.*_.....mat', x), mat_files)))
    combi_contrast = contrast_permutation(contrast_list)

    if options.combine:
        clist = combi_contrast
    else:
        clist = contrast_list

    for i in range(len(clist)):

        #Getting Contrast name
        if options.combine:
            c1_name = clist[i][0].split(".")[0]
            c2_name = clist[i][1].split(",")[0]
            contrast_name = c1_name + '&' + c2_name
        else:
            contrast_name = clist[i].split(".")[0]

        # Checking if the training is already made for the particular contrast
        # TODO Uncomment this for checking if contrast is present in the file
        if len(scoresdf[scoresdf['Contrast_name'] == contrast_name]):
            continue

        for nClass in range(2, 4, 1):

            if nClass == 3:

                # Read Data and put it into panda data frame. Initially considering only means
                if options.combine:
                    df, contrast_name = tools.combine_contrast(
                        options.data, nClass, clist[i][0], clist[i][1],
                        options.data_type)
                else:
                    df, contrast_name = tools.data_extraction(
                        options.data, nClass, clist[i], options.data_type)
                df = mlu.missing_values(df)
                scoresdf = run_basic_ml(df, options, 123, scoresdf,
                                        contrast_name)

            elif nClass == 2:

                if options.combine:
                    df1, df2, df3, contrast_name = tools.combine_contrast(
                        options.data, nClass, clist[i][0], clist[i][1],
                        options.data_type)

                else:
                    df1, df2, df3, contrast_name = tools.data_extraction(
                        options.data, nClass, clist[i], options.data_type)
                # Combining two pairs off all combination
                df12 = df1.append(df2)
                df23 = df2.append(df3)
                df31 = df3.append(df1)

                # Handle missing values
                df12 = mlu.missing_values(df12)
                df23 = mlu.missing_values(df23)
                df31 = mlu.missing_values(df31)

                scoresdf = run_basic_ml(df12, options, 12, scoresdf,
                                        contrast_name)
                scoresdf = run_basic_ml(df23, options, 23, scoresdf,
                                        contrast_name)
                scoresdf = run_basic_ml(df31, options, 31, scoresdf,
                                        contrast_name)

        scoresdf.to_csv(options.output + "basic_%s.csv" % (o_subtitle),
                        index=False)

    print("It took %s seconds to run %s iterations for %s model" %
          (time.time() - start, options.number_iterations, options.model))

    print(
        "It took %s seconds to run %s iterations for %s model after removing gender effect"
        % (time.time() - start, options.number_iterations, options.model))

Example #7

0

Show file

File: association_analysis.py Project: varshakirani/brain_data_analysis

def main():

    options = tools.parse_options()
    start = time.time()

    ## Get Age, Gender and Subject_cont information ###

    file = open(options.additional_data + "subject_name.txt", "r")
    ids = file.read().split()
    ids = [int(float(id)) for id in ids]
    gdf = pd.read_csv(options.additional_data + 'n300.csv')
    gdf.loc[:, 'subject_cont'] = ids
    gdf = gdf[['KJØNN', 'subject_cont', 'ALDER']]
    gdf = gdf.rename(columns={'KJØNN': 'gender', 'ALDER': 'age'})

    mat_files = os.listdir(options.data)
    contrast_list = list(
        filter(None, filter(lambda x: re.search('.*_.....mat', x), mat_files)))
    n_back_list = list(
        filter(lambda x: 'nBack' in x and ('2' in x or '3' in x),
               contrast_list))
    faces_list = list(
        filter(lambda x: 'Faces' in x and ('5' in x or '4' in x or '3' in x),
               contrast_list))
    relevant_contrast_list = n_back_list + faces_list  # extracted nBack 2,3 and Faces 3,4,5 contrasts

    if os.path.isfile(options.input):
        scoresdf = pd.read_csv(options.input)
    else:
        scoresdf = pd.DataFrame(columns=[
            'feature', 'beta_f', 'beta_a', 'beta_g', 'pvalue_f', 'pvalue_a',
            'pvalue_g', 'Contrast_name', 'Labels'
        ])

    for contrast in relevant_contrast_list:
        contrast_name = contrast.split(".")[0]
        if len(scoresdf[scoresdf["Contrast_name"] == contrast_name]):
            continue

        df1, df2, df3, contrast_name = tools.data_extraction(
            options.data, 2, contrast, options.data_type)

        # Combining two pairs off all combination
        df12 = df1.append(df2)
        df23 = df2.append(df3)
        df31 = df3.append(df1)

        # Handle missing values
        df12 = mlu.missing_values(df12)
        df23 = mlu.missing_values(df23)
        df31 = mlu.missing_values(df31)

        # Adding age and gender data for Standardization purpose. This additional data will be removed in
        # data preprocessing
        df12 = pd.merge(df12, gdf, on=['subject_cont'], how='inner')
        df23 = pd.merge(df23, gdf, on=['subject_cont'], how='inner')
        df31 = pd.merge(df31, gdf, on=['subject_cont'], how='inner')

        scoresdf = run_glm_fit(df12, 12, contrast_name, scoresdf)
        scoresdf = run_glm_fit(df23, 23, contrast_name, scoresdf)
        scoresdf = run_glm_fit(df31, 31, contrast_name, scoresdf)

        scoresdf.to_csv(options.output + "individual.csv", index=False)

    fdr_analysis(options)