def main():
    options = tools.parse_options()
    data = options.data
    additional_data = options.additional_data

    # Subject ids, gender and age information
    file = open(additional_data + "/subject_name.txt", "r")
    ids = file.read().split()
    ids = [int(float(id)) for id in ids]
    gdf = pd.read_csv(additional_data + '/n300.csv')
    gdf['subject_cont'] = ids
    gdf = gdf[['KJØNN', 'subject_cont', 'ALDER']].copy()
    gdf = gdf.rename(columns={'KJØNN': 'gender', 'ALDER': 'age'})

    # One nBack and one Faces contrast file
    mat_files = os.listdir(data)
    n_back_file = list(filter(lambda x: 'nBack' in x, mat_files))[0]
    face_file = list(filter(lambda x: 'Faces' in x, mat_files))[0]
    contrasts = [n_back_file, face_file]

    t_test_scores = pd.DataFrame(
        columns=['statistic', 'pvalue', 'user group', 'task_name'])
    scoresdf = pd.DataFrame(
        columns=['beta', 'pvalue', 'Labels', 'variable', 'task_name'])
    params = ['age', 'gender']
    print(contrasts)

    for mat_file in contrasts:
        for param in params:
            df1, df2, df3, contrast_name = tools.data_extraction(
                data, 2, mat_file)
            df1.fillna(df1.mean(), inplace=True)
            df2.fillna(df2.mean(), inplace=True)
            df3.fillna(df3.mean(), inplace=True)

            # Attach age and gender via the subject id
            df1 = pd.merge(df1, gdf, on=['subject_cont'], how='inner')
            df2 = pd.merge(df2, gdf, on=['subject_cont'], how='inner')
            df3 = pd.merge(df3, gdf, on=['subject_cont'], how='inner')

            df = df1.append(df2).append(df3)
            df = df.loc[:, df.columns.intersection([param, 'label'])]

            # Pairwise combinations of the three groups
            df12 = df1.append(df2)
            df23 = df2.append(df3)
            df31 = df3.append(df1)

            task_name = mat_file.split("_")[0]
            if param == "age":
                plot_age_box_plot(df1, df2, df3, df, task_name, options)

            # GLM fit with the current variable (age or gender) for each group pair
            scoresdf = run_glm_fit(df12, 12, scoresdf, param, task_name)
            scoresdf = run_glm_fit(df23, 23, scoresdf, param, task_name)
            scoresdf = run_glm_fit(df31, 31, scoresdf, param, task_name)

            if param == "age":
                # t-tests comparing the age distributions of each group pair
                t_test_scores = t_test(df12, "BD-Sc", t_test_scores, task_name)
                t_test_scores = t_test(df23, "Sc-Co", t_test_scores, task_name)
                t_test_scores = t_test(df31, "Co-BD", t_test_scores, task_name)
                plot_age_dist(df1, df2, df3, task_name, options)

    plot_gender_dist(options)

    print("\nGLM fit with age and gender variables used individually\n")
    print(scoresdf)
    print("\n\nT-test scores to analyse the age distribution\n")
    print(t_test_scores)
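# ---------------------------------------------------------------------------
# The helpers run_glm_fit() and t_test() used above are defined elsewhere in the
# project and are not shown here. The sketch below is only an illustration, under
# the assumption that run_glm_fit() regresses each imaging feature on the chosen
# covariate (age or gender) with statsmodels, and that t_test() compares the age
# distributions of the two diagnostic groups with SciPy. The column names 'age',
# 'gender' and 'label' follow the code above; the function names with the
# "_sketch" suffix and all other details are assumptions.
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats


def run_glm_fit_sketch(df, labels, scoresdf, param, task_name):
    """Fit feature ~ param for every imaging column and record slope and p-value."""
    meta_cols = {'label', 'subject_cont', 'age', 'gender'}
    feature_cols = [c for c in df.columns if c not in meta_cols]
    X = sm.add_constant(df[param].astype(float))  # assumes param is numerically coded
    for col in feature_cols:
        model = sm.OLS(df[col].astype(float), X, missing='drop').fit()
        row = {'beta': model.params[param], 'pvalue': model.pvalues[param],
               'Labels': labels, 'variable': param, 'task_name': task_name}
        scoresdf = pd.concat([scoresdf, pd.DataFrame([row])], ignore_index=True)
    return scoresdf


def t_test_sketch(df, group_pair, t_test_scores, task_name):
    """Two-sample t-test on age between the two diagnostic groups in df."""
    groups = df['label'].unique()  # assumes df contains exactly two groups
    age_a = df.loc[df['label'] == groups[0], 'age']
    age_b = df.loc[df['label'] == groups[1], 'age']
    stat, pvalue = stats.ttest_ind(age_a, age_b, equal_var=False)
    row = {'statistic': stat, 'pvalue': pvalue,
           'user group': group_pair, 'task_name': task_name}
    return pd.concat([t_test_scores, pd.DataFrame([row])], ignore_index=True)
# ---------------------------------------------------------------------------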
def main():
    options = tools.parse_options()
    start = time.time()

    if os.path.isfile(options.input):
        # if results are already stored then use them as input
        scoresdf = pd.read_csv(options.input)
    else:
        # if no previous results are stored, create a new dataframe to store the results
        scoresdf = pd.DataFrame(columns=[
            'Score', 'Type', 'Model', 'Classifier', 'Contrast_name',
            'Balanced_accuracy'
        ])

    mat_files = os.listdir(options.data)
    contrast_list = list(
        filter(None, filter(lambda x: re.search('.*_.....mat', x), mat_files)))
    n_back_list = list(
        filter(lambda x: 'nBack' in x and ('2' in x or '3' in x), contrast_list))
    faces_list = list(
        filter(lambda x: 'Faces' in x and ('5' in x or '4' in x or '3' in x),
               contrast_list))
    relevant_contrast_list = n_back_list + faces_list  # nBack 2,3 and Faces 3,4,5 contrasts

    # Age and gender information along with the subject id is extracted
    file = open(options.additional_data + "/subject_name.txt", "r")
    ids = file.read().split()
    ids = [int(float(id)) for id in ids]
    edf = pd.read_csv(options.additional_data + '/n300.csv')
    edf['subject_cont'] = ids
    edf = edf[['KJØNN', 'subject_cont', 'ALDER']]
    edf = edf.rename(columns={'KJØNN': 'gender', 'ALDER': 'age'})

    for contrast in relevant_contrast_list:
        contrast_name = contrast.split(".")[0]
        if len(scoresdf[scoresdf["Contrast_name"] == contrast_name]):
            continue

        for nClass in range(2, 4, 1):
            # Considering all three classes: Bipolar, Schizo and Control
            if nClass == 3:
                df, contrast_name = tools.data_extraction(
                    options.data, nClass, contrast, options.data_type)
                df = mlu.missing_values(df)
                df = pd.merge(df, edf, on=['subject_cont'], how='inner')
                scoresdf = run_no_gender_ml(df, options, 123, scoresdf,
                                            contrast_name)

            # Considering combinations of 2 classes: Bipolar-Schizo, Schizo-Control and Control-Bipolar
            elif nClass == 2:
                df1, df2, df3, contrast_name = tools.data_extraction(
                    options.data, nClass, contrast, options.data_type)

                # Combining the two groups of every pairwise combination
                df12 = df1.append(df2)
                df23 = df2.append(df3)
                df31 = df3.append(df1)

                # Handle missing values
                df12 = mlu.missing_values(df12)
                df23 = mlu.missing_values(df23)
                df31 = mlu.missing_values(df31)

                # Adding age and gender data for standardization purposes. This additional
                # data is removed again during data preprocessing
                df12 = pd.merge(df12, edf, on=['subject_cont'], how='inner')
                df23 = pd.merge(df23, edf, on=['subject_cont'], how='inner')
                df31 = pd.merge(df31, edf, on=['subject_cont'], how='inner')

                scoresdf = run_no_gender_ml(df12, options, 12, scoresdf, contrast_name)
                scoresdf = run_no_gender_ml(df23, options, 23, scoresdf, contrast_name)
                scoresdf = run_no_gender_ml(df31, options, 31, scoresdf, contrast_name)

    scoresdf.to_csv(options.output + "no_gender_individual.csv", index=False)
    print("It took %s seconds to run %s iterations for %s model after removing gender effect"
          % (time.time() - start, options.number_iterations, options.model))
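# ---------------------------------------------------------------------------
# run_no_gender_ml() is implemented elsewhere; the script above only states that
# the gender effect is removed before classification. The sketch below shows one
# common way this could be done, assuming each imaging feature is residualised on
# gender with a linear regression before a classifier is scored. The helper names,
# the logistic-regression classifier and the 5-fold cross-validation are
# assumptions, not the project's actual implementation.
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import cross_val_score


def remove_gender_effect_sketch(df):
    """Replace every imaging feature with its residual after regressing out gender."""
    meta_cols = ['label', 'subject_cont', 'age', 'gender']
    feature_cols = [c for c in df.columns if c not in meta_cols]
    g = df[['gender']].astype(float).values  # assumes gender is numerically coded
    cleaned = df.copy()
    for col in feature_cols:
        reg = LinearRegression().fit(g, df[col].values)
        cleaned[col] = df[col].values - reg.predict(g)
    # the demographic bookkeeping columns are dropped before training
    return cleaned.drop(columns=['subject_cont', 'age', 'gender'])


def run_no_gender_ml_sketch(df, classifier_no):
    """Cross-validated balanced accuracy on the residualised features."""
    cleaned = remove_gender_effect_sketch(df)
    X = cleaned.drop(columns=['label']).values
    y = cleaned['label'].values
    scores = cross_val_score(LogisticRegression(max_iter=1000), X, y,
                             cv=5, scoring='balanced_accuracy')
    print("classifier %s: balanced accuracy %.3f" % (classifier_no, scores.mean()))
    return scores.mean()
# ---------------------------------------------------------------------------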
        X, y = mlu.get_features_labels(shuffle(df))
        X = min_max_scaler.fit_transform(X)
        clf_cv.fit(X, y)
        coefs_.append(clf_cv.coef_.ravel().copy())

    a = np.asarray(coefs_)
    np.savetxt(out + "%s_%s_logisticLasso.csv" % (contrast_name, classifier_no),
               a, delimiter=",")


if __name__ == '__main__':
    print("Logistic Regression with Lasso penalty")
    options = tools.parse_options()

    mat_files = os.listdir(options.data)
    contrast_list = list(
        filter(None, filter(lambda x: re.search('.*_.....mat', x), mat_files)))
    n_back_list = list(
        filter(lambda x: 'nBack' in x and ('2' in x or '3' in x), contrast_list))
    faces_list = list(
        filter(lambda x: 'Faces' in x and ('5' in x or '4' in x), contrast_list))
    relevant_mat_files = n_back_list + faces_list

    start = time()
    for mat_file in relevant_mat_files:
        print(mat_file)
        df1, df2, df3, contrast_name = tools.data_extraction(
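# ---------------------------------------------------------------------------
# The loop above uses objects (min_max_scaler, clf_cv, coefs_, out) that are
# created earlier in the script and do not appear in this excerpt. A plausible
# set-up, assuming an L1-penalised (Lasso) logistic regression with internal
# cross-validation as announced by the "__main__" banner, is sketched below; the
# output directory name, the regularisation grid and the solver are assumptions.
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle  # shuffle() is what the loop above calls

out = "results/"                 # hypothetical output directory
min_max_scaler = MinMaxScaler()  # scales every feature to [0, 1]
coefs_ = []                      # collects one coefficient vector per shuffled fit
clf_cv = LogisticRegressionCV(Cs=10,
                              penalty='l1',
                              solver='liblinear',  # liblinear supports the L1 penalty
                              cv=5,
                              max_iter=1000)
# ---------------------------------------------------------------------------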
def main(): print("NI Thesis") options = tools.parse_options() start = time.time() if options.combine: o_subtitle = 'combined' else: o_subtitle = 'individual' if os.path.isfile(options.input): scoresdf = pd.read_csv(options.input) else: scoresdf = pd.DataFrame(columns=[ 'Score', 'Type', 'Model', 'Classifier', 'Contrast_name', 'Balanced_accuracy' ]) mat_files = os.listdir(options.data) print(mat_files) #To get matfiles which does not ends with 389.mat or 487.mat. Selecting only minified mat files like #contrast_list = ['Faces_con_0003.mat', 'Faces_con_0002.mat', 'Faces_con_0001.mat', 'Faces_con_0005.mat', # 'Faces_con_0004.mat', 'nBack_con_0001.mat', 'nBack_con_0002.mat', 'nBack_con_0003.mat'] contrast_list = list( filter(None, filter(lambda x: re.search('.*_.....mat', x), mat_files))) #TODO remove this for old Data contrast_list = mat_files combi_contrast = contrast_permutation(contrast_list) if options.combine: clist = combi_contrast else: clist = contrast_list for i in range(len(clist)): #Getting Contrast name if options.combine: c1_name = clist[i][0].split(".")[0] c2_name = clist[i][1].split(",")[0] contrast_name = c1_name + '&' + c2_name else: contrast_name = clist[i].split(".")[0] # Checking if the training is already made for the particular contrast # TODO Uncomment this for checking if contrast is present in the file if len(scoresdf[scoresdf['Contrast_name'] == contrast_name]): continue for nClass in range(2, 4, 1): if nClass == 3: # Read Data and put it into panda data frame. Initially considering only means if options.combine: df, contrast_name = tools.combine_contrast( options.data, nClass, clist[i][0], clist[i][1], options.data_type) else: df, contrast_name = tools.data_extraction( options.data, nClass, clist[i], options.data_type) df = mlu.missing_values(df) scoresdf = run_basic_ml(df, options, 123, scoresdf, contrast_name) elif nClass == 2: if options.combine: df1, df2, df3, contrast_name = tools.combine_contrast( options.data, nClass, clist[i][0], clist[i][1], options.data_type) else: df1, df2, df3, contrast_name = tools.data_extraction( options.data, nClass, clist[i], options.data_type) # Combining two pairs off all combination df12 = df1.append(df2) df23 = df2.append(df3) df31 = df3.append(df1) # Handle missing values df12 = mlu.missing_values(df12) df23 = mlu.missing_values(df23) df31 = mlu.missing_values(df31) scoresdf = run_basic_ml(df12, options, 12, scoresdf, contrast_name) scoresdf = run_basic_ml(df23, options, 23, scoresdf, contrast_name) scoresdf = run_basic_ml(df31, options, 31, scoresdf, contrast_name) scoresdf.to_csv(options.output + "%s.csv" % (o_subtitle), index=False) print("It took %s seconds to run %s iterations for %s model" % (time.time() - start, options.number_iterations, options.model)) logger.info( "It took %s seconds to run %s iterations for all models for not normalized" % (time.time() - start, options.number_iterations))
def main():
    options = tools.parse_options()
    start = time.time()

    if os.path.isfile(options.input):
        scoresdf = pd.read_csv(options.input)
    else:
        scoresdf = pd.DataFrame(columns=['Score', 'Type', 'Model', 'Classifier',
                                         'Contrast_name', 'Balanced_accuracy'])

    if options.combine:
        o_subtitle = 'combined'
    else:
        o_subtitle = 'individual'

    # Gender and age information, added as labels to the data by linking on subject_cont
    file = open(options.additional_data + "/subject_name.txt", "r")
    ids = file.read().split()
    ids = [int(float(id)) for id in ids]
    gdf = pd.read_csv(options.additional_data + '/n300.csv')
    gdf['subject_cont'] = ids
    gdf = gdf[['KJØNN', 'subject_cont', 'ALDER']].copy()
    gdf = gdf.rename(columns={'KJØNN': 'gender', 'ALDER': 'age'})

    # Predict either age or gender, depending on the command-line option
    label = options.age_gender

    mat_files = os.listdir(options.data)
    contrast_list = list(
        filter(None, filter(lambda x: re.search('.*_.....mat', x), mat_files)))
    n_back_list = list(
        filter(lambda x: 'nBack' in x and ('2' in x or '3' in x), contrast_list))
    faces_list = list(
        filter(lambda x: 'Faces' in x and ('5' in x or '4' in x or '3' in x),
               contrast_list))
    relevant_mat_files = n_back_list + faces_list
    relevant_mat_files = relevant_mat_files[0:2]  # restrict to the first two contrasts
    print(relevant_mat_files)

    for mat_file in relevant_mat_files:
        print(mat_file)
        for nClass in range(2, 4, 1):
            if nClass == 3:
                df, contrast_name = tools.data_extraction(options.data, nClass,
                                                          mat_file, options.data_type)
                # Adding age and gender to the dataframe
                df = pd.merge(df, gdf, on=['subject_cont'], how='inner')
                df = mlu.missing_values(df)
                scoresdf = run_gender_cor(df, options, 123, scoresdf, contrast_name, label)

            elif nClass == 2:
                df1, df2, df3, contrast_name = tools.data_extraction(options.data, nClass,
                                                                     mat_file, options.data_type)
                # Adding age and gender to the dataframes
                df1 = pd.merge(df1, gdf, on=['subject_cont'], how='inner')
                df2 = pd.merge(df2, gdf, on=['subject_cont'], how='inner')
                df3 = pd.merge(df3, gdf, on=['subject_cont'], how='inner')

                # Combining the two groups of every pairwise combination
                df12 = df1.append(df2)
                df23 = df2.append(df3)
                df31 = df3.append(df1)

                # Handle missing values
                df12 = mlu.missing_values(df12)
                df23 = mlu.missing_values(df23)
                df31 = mlu.missing_values(df31)
                df1 = mlu.missing_values(df1)
                df2 = mlu.missing_values(df2)
                df3 = mlu.missing_values(df3)

                scoresdf = run_gender_cor(df12, options, 12, scoresdf, contrast_name, label)
                scoresdf = run_gender_cor(df23, options, 23, scoresdf, contrast_name, label)
                scoresdf = run_gender_cor(df31, options, 31, scoresdf, contrast_name, label)
                scoresdf = run_gender_cor(df1, options, 1, scoresdf, contrast_name, label)
                scoresdf = run_gender_cor(df2, options, 2, scoresdf, contrast_name, label)
                scoresdf = run_gender_cor(df3, options, 3, scoresdf, contrast_name, label)

    scoresdf.to_csv(options.output + "%s.csv" % o_subtitle, index=False)
    print("It took %s seconds to run %s iterations for %s model"
          % (time.time() - start, options.number_iterations, options.model))
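# ---------------------------------------------------------------------------
# run_gender_cor() is defined elsewhere; from the call sites above it receives a
# dataframe together with the demographic label to predict ('age' or 'gender').
# A minimal sketch, assuming gender is treated as a classification target and age
# as a regression target on the imaging features; the model choices, scoring
# metrics and cross-validation scheme are assumptions.
import pandas as pd
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import cross_val_score


def run_gender_cor_sketch(df, label):
    """Cross-validated prediction of the demographic label from imaging features."""
    drop_cols = [c for c in ['label', 'subject_cont', 'age', 'gender'] if c in df.columns]
    X = df.drop(columns=drop_cols).values
    y = df[label].values
    if label == 'gender':
        model, scoring = LogisticRegression(max_iter=1000), 'balanced_accuracy'
    else:  # 'age'
        model, scoring = Ridge(), 'r2'
    return cross_val_score(model, X, y, cv=5, scoring=scoring).mean()
# ---------------------------------------------------------------------------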
def main():
    options = tools.parse_options()
    start = time.time()

    if options.combine:
        o_subtitle = 'combined'
    else:
        o_subtitle = 'individual'

    if os.path.isfile(options.input):
        # if results are already stored then use them as input
        scoresdf = pd.read_csv(options.input)
    else:
        # if no previous results are stored, create a new dataframe to store the results
        scoresdf = pd.DataFrame(columns=[
            'Score', 'Type', 'Model', 'Classifier', 'Contrast_name',
            'Balanced_accuracy'
        ])

    mat_files = os.listdir(options.data)
    contrast_list = list(
        filter(None, filter(lambda x: re.search('.*_.....mat', x), mat_files)))
    combi_contrast = contrast_permutation(contrast_list)

    if options.combine:
        clist = combi_contrast
    else:
        clist = contrast_list

    for i in range(len(clist)):
        # Getting the contrast name
        if options.combine:
            c1_name = clist[i][0].split(".")[0]
            c2_name = clist[i][1].split(".")[0]  # split on '.' to drop the file extension
            contrast_name = c1_name + '&' + c2_name
        else:
            contrast_name = clist[i].split(".")[0]

        # Skip the contrast if it has already been trained
        # TODO Uncomment this to check whether the contrast is present in the file
        if len(scoresdf[scoresdf['Contrast_name'] == contrast_name]):
            continue

        for nClass in range(2, 4, 1):
            if nClass == 3:
                # Read the data into a pandas dataframe. Initially considering only means
                if options.combine:
                    df, contrast_name = tools.combine_contrast(
                        options.data, nClass, clist[i][0], clist[i][1],
                        options.data_type)
                else:
                    df, contrast_name = tools.data_extraction(
                        options.data, nClass, clist[i], options.data_type)
                df = mlu.missing_values(df)
                scoresdf = run_basic_ml(df, options, 123, scoresdf, contrast_name)

            elif nClass == 2:
                if options.combine:
                    df1, df2, df3, contrast_name = tools.combine_contrast(
                        options.data, nClass, clist[i][0], clist[i][1],
                        options.data_type)
                else:
                    df1, df2, df3, contrast_name = tools.data_extraction(
                        options.data, nClass, clist[i], options.data_type)

                # Combining the two groups of every pairwise combination
                df12 = df1.append(df2)
                df23 = df2.append(df3)
                df31 = df3.append(df1)

                # Handle missing values
                df12 = mlu.missing_values(df12)
                df23 = mlu.missing_values(df23)
                df31 = mlu.missing_values(df31)

                scoresdf = run_basic_ml(df12, options, 12, scoresdf, contrast_name)
                scoresdf = run_basic_ml(df23, options, 23, scoresdf, contrast_name)
                scoresdf = run_basic_ml(df31, options, 31, scoresdf, contrast_name)

    scoresdf.to_csv(options.output + "basic_%s.csv" % o_subtitle, index=False)
    print("It took %s seconds to run %s iterations for %s model"
          % (time.time() - start, options.number_iterations, options.model))
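# ---------------------------------------------------------------------------
# run_basic_ml() is implemented elsewhere; the scoresdf columns above suggest it
# records a score, the split type, the model name, a classifier code (12, 23, 31
# or 123), the contrast name and the balanced accuracy. A minimal sketch under
# those assumptions, repeating a shuffled train/test split
# options.number_iterations times; the linear SVM and the 80/20 split are
# assumptions, not the project's actual choices.
import pandas as pd
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


def run_basic_ml_sketch(df, options, classifier_no, scoresdf, contrast_name):
    for _ in range(options.number_iterations):
        X = df.drop(columns=['label', 'subject_cont'], errors='ignore').values
        y = df['label'].values
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y, shuffle=True)
        scaler = StandardScaler().fit(X_train)
        clf = SVC(kernel='linear').fit(scaler.transform(X_train), y_train)
        y_pred = clf.predict(scaler.transform(X_test))
        row = {'Score': clf.score(scaler.transform(X_test), y_test),
               'Type': 'test',
               'Model': options.model,
               'Classifier': classifier_no,
               'Contrast_name': contrast_name,
               'Balanced_accuracy': balanced_accuracy_score(y_test, y_pred)}
        scoresdf = pd.concat([scoresdf, pd.DataFrame([row])], ignore_index=True)
    return scoresdf
# ---------------------------------------------------------------------------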
def main():
    options = tools.parse_options()
    start = time.time()

    # Get age, gender and subject_cont information
    file = open(options.additional_data + "subject_name.txt", "r")
    ids = file.read().split()
    ids = [int(float(id)) for id in ids]
    gdf = pd.read_csv(options.additional_data + 'n300.csv')
    gdf.loc[:, 'subject_cont'] = ids
    gdf = gdf[['KJØNN', 'subject_cont', 'ALDER']]
    gdf = gdf.rename(columns={'KJØNN': 'gender', 'ALDER': 'age'})

    mat_files = os.listdir(options.data)
    contrast_list = list(
        filter(None, filter(lambda x: re.search('.*_.....mat', x), mat_files)))
    n_back_list = list(
        filter(lambda x: 'nBack' in x and ('2' in x or '3' in x), contrast_list))
    faces_list = list(
        filter(lambda x: 'Faces' in x and ('5' in x or '4' in x or '3' in x),
               contrast_list))
    relevant_contrast_list = n_back_list + faces_list  # nBack 2,3 and Faces 3,4,5 contrasts

    if os.path.isfile(options.input):
        scoresdf = pd.read_csv(options.input)
    else:
        scoresdf = pd.DataFrame(columns=[
            'feature', 'beta_f', 'beta_a', 'beta_g', 'pvalue_f', 'pvalue_a',
            'pvalue_g', 'Contrast_name', 'Labels'
        ])

    for contrast in relevant_contrast_list:
        contrast_name = contrast.split(".")[0]
        if len(scoresdf[scoresdf["Contrast_name"] == contrast_name]):
            continue

        df1, df2, df3, contrast_name = tools.data_extraction(
            options.data, 2, contrast, options.data_type)

        # Combining the two groups of every pairwise combination
        df12 = df1.append(df2)
        df23 = df2.append(df3)
        df31 = df3.append(df1)

        # Handle missing values
        df12 = mlu.missing_values(df12)
        df23 = mlu.missing_values(df23)
        df31 = mlu.missing_values(df31)

        # Adding age and gender data for standardization purposes. This additional
        # data is removed again during data preprocessing
        df12 = pd.merge(df12, gdf, on=['subject_cont'], how='inner')
        df23 = pd.merge(df23, gdf, on=['subject_cont'], how='inner')
        df31 = pd.merge(df31, gdf, on=['subject_cont'], how='inner')

        scoresdf = run_glm_fit(df12, 12, contrast_name, scoresdf)
        scoresdf = run_glm_fit(df23, 23, contrast_name, scoresdf)
        scoresdf = run_glm_fit(df31, 31, contrast_name, scoresdf)

    scoresdf.to_csv(options.output + "individual.csv", index=False)
    fdr_analysis(options)
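# ---------------------------------------------------------------------------
# fdr_analysis() is defined elsewhere; given that run_glm_fit() stores per-feature
# p-values (pvalue_f, pvalue_a, pvalue_g), a plausible sketch is a Benjamini-
# Hochberg FDR correction over each p-value column of the saved results. The input
# file name and the 5% threshold are assumptions.
import pandas as pd
from statsmodels.stats.multitest import multipletests


def fdr_analysis_sketch(options, alpha=0.05):
    """Apply Benjamini-Hochberg FDR correction to every p-value column."""
    scoresdf = pd.read_csv(options.output + "individual.csv")
    for col in ['pvalue_f', 'pvalue_a', 'pvalue_g']:
        rejected, corrected, _, _ = multipletests(scoresdf[col].values,
                                                  alpha=alpha, method='fdr_bh')
        scoresdf[col + '_fdr'] = corrected
        scoresdf[col + '_significant'] = rejected
    scoresdf.to_csv(options.output + "individual_fdr.csv", index=False)
    return scoresdf
# ---------------------------------------------------------------------------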