def get_stats_df(len_df_X, atlas, nr_threads, env_name, time_started):
    '''script to save the parameters that are used for each specific analysis'''
    import sklearn
    import matplotlib
    stats = Table().get_clean_df()
    d = {
        'pandas version': Table().pd_ver,
        'numpy version': np.__version__,
        'matplotlib version': matplotlib.__version__,
        'sklearn version': sklearn.__version__,
        'number of iterations': definitions.prediction_defs['NUM_ITER'],
        'atlas': atlas,
        'nr of features': len_df_X,
        'nr of threads': nr_threads,
        'remote name': env_name,
        'analysis started at': time_started,
    }
    i = 0
    for key in d:
        stats.at[i, 'stats'] = key
        stats.at[i, 'values'] = d[key]
        i += 1
    return stats

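# Illustrative usage sketch (not part of the pipeline; the values passed below
# are hypothetical): get_stats_df only records the run parameters in a
# two-column ('stats'/'values') DataFrame that can be saved next to the results.
# import time
# params_df = get_stats_df(len_df_X=120,
#                          atlas='DS',
#                          nr_threads=8,
#                          env_name='local',
#                          time_started=time.strftime("%Y-%m-%d %H:%M:%S"))
# params_df.to_csv('analysis_parameters.csv')
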
def linreg_moderation_results(df_X_linreg, ls_cols_X_atlas, group_param,
                              regression_param, path_dir_save_results, group):
    '''perform moderation analysis
    Args:
        df_X_linreg: pandas.DataFrame with the columns and data for analysis
        ls_cols_X_atlas: list of columns from df_X_linreg that will be used
            for the linear regression analysis
        group_param: str, name of the column from df_X_linreg that was used
            to create the groups
        regression_param: str, name of the column from df_X_linreg that will
            be used for the regression analysis (e.g., Age)
        path_dir_save_results: abspath to the folder where the csv file is saved
        group: group name to use for the results csv file
    Return:
        none; creates a csv file
    '''
    d_result = compute_linreg_data(df_X_linreg, ls_cols_X_atlas,
                                   group_param, regression_param)
    df_result = Table().create_df_from_dict(d_result)
    Table().save_df(df_result,
                    path.join(path_dir_save_results,
                              f'linreg_moderation_{group}.csv'))

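# Illustrative call (hypothetical column names and path):
# linreg_moderation_results(df_X_linreg=df_grid,
#                           ls_cols_X_atlas=['L_hippocampus_vol', 'R_hippocampus_vol'],
#                           group_param='diagnosis',
#                           regression_param='Age',
#                           path_dir_save_results='/tmp/stats',
#                           group='patients')
# would produce /tmp/stats/linreg_moderation_patients.csv
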
def get_groups_and_variables(self, proj_vars, GLM_file_group, vars_fs):
    """creating working variables and dictionaries"""
    cols_2use = proj_vars["variables_for_glm"] + [self.id_col, self.group_col]
    df_groups_clin = Table().get_df_with_columns(GLM_file_group, cols_2use)
    self.ls_groups = pd.unique(df_groups_clin[self.group_col]).tolist()
    self.ids = self.get_ids_ready4glm(df_groups_clin[self.id_col].tolist(),
                                      vars_fs)
    d_init = df_groups_clin.to_dict()
    self.d_subjid = {}
    self.ls_vars_stats = [key for key in d_init if key != self.id_col]
    for rownr in d_init[self.id_col]:
        _id = d_init[self.id_col][rownr]
        if _id in self.ids:
            self.d_subjid[_id] = {}
            for var in self.ls_vars_stats:
                self.d_subjid[_id][var] = d_init[var][rownr]
    self.ls_vars_stats.remove(self.group_col)
    self.make_subjects_per_group(df_groups_clin)

def save_features(dic_feat_comps, expl_variance, file2save, img2save, lang="EN"):
    """features extracted with PCA are saved to a table and an image
    Args:
        dic_feat_comps: {feature_name: explained_variance}
        expl_variance: PCA explained_variance_
        file2save: abspath to the csv file used to save the table
        img2save: abspath to the png file used to save the cumulative variance plot
        lang: language used to describe the results
    """
    df_feat_comps = Table().create_df(dic_feat_comps.values(),
                                      index_col=dic_feat_comps.keys(),
                                      cols=['explained_variance'])
    df_feat_comps.to_csv(file2save)
    plotting.plot_simple(vals=np.cumsum(expl_variance),
                         xlabel=params_lang[lang]['nr_components'],
                         ylabel=params_lang[lang]['expl_cum_var'],
                         path_to_save_file=img2save)

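# Minimal sketch (assumes scikit-learn; toy data only) of how the inputs of
# save_features are typically produced; the component-to-variance mapping
# below is hypothetical and only meant to show the expected shapes.
def _demo_pca_inputs_sketch():
    import numpy as _np
    from sklearn.decomposition import PCA
    X = _np.random.rand(50, 10)
    pca = PCA(n_components=5).fit(X)
    expl_variance = pca.explained_variance_
    dic_feat_comps = {f'component_{i}': var
                      for i, var in enumerate(expl_variance)}
    return dic_feat_comps, expl_variance
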
class ttest_do():
    def __init__(self, df, group_col, ls_cols, groups, path_save_res,
                 p_thresh=0.05):
        self.df = df
        self.group_col = group_col
        self.ls_cols = ls_cols
        self.groups = groups
        self.path_save_res = path_save_res
        self.ls_meas = get_names_of_measurements()
        self.ls_struct = get_names_of_structures()
        # self.tab must exist before compute_ttest_for_col, which calls save_res
        self.tab = Table()
        self.res_ttest = self.compute_ttest_for_col(p_thresh)

    def compute_ttest_for_col(self, p_thresh):
        res_4df = {'features': [], 'ttest': [], 'welch': []}
        res = dict()
        for col in self.ls_cols:
            group1 = self.df[self.df[self.group_col] == self.groups[0]][col]
            group2 = self.df[self.df[self.group_col] == self.groups[1]][col]
            ttest_eq_pop_var = stats.ttest_ind(group1, group2, equal_var=True)
            ttest_welch = stats.ttest_ind(group1, group2, equal_var=False)
            if ttest_eq_pop_var[1] < p_thresh:
                meas, struct = get_structure_measurement(col, self.ls_meas,
                                                         self.ls_struct)
                res[col] = {
                    '{}, mean'.format(self.groups[0]): stats.tmean(group1),
                    '{}, std'.format(self.groups[0]): stats.tstd(group1),
                    '{}, mean'.format(self.groups[1]): stats.tmean(group2),
                    '{}, std'.format(self.groups[1]): stats.tstd(group2),
                    'ttest': ttest_eq_pop_var[1],
                    'welch': ttest_welch[1],
                    'kurtosis': stats.kurtosis(self.df[col]),
                    'skewness': stats.skew(self.df[col])}
                res_4df['features'].append(struct + ' (' + meas + ')')
                res_4df['ttest'].append(ttest_eq_pop_var[1])
                res_4df['welch'].append(ttest_welch[1])
        self.save_res(res_4df)
        return res

    def save_res(self, res_4df):
        df_result = self.tab.create_df_from_dict(res_4df)
        df_result.to_csv(os.path.join(self.path_save_res, 'ttest.csv'))

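# Minimal sketch (toy data only): compute_ttest_for_col relies on the
# equal_var flag of scipy.stats.ttest_ind to obtain both the classical
# Student's t-test (equal_var=True) and Welch's t-test (equal_var=False).
def _demo_student_vs_welch_sketch():
    import numpy as _np
    from scipy import stats as _stats
    group1 = _np.random.normal(0.0, 1.0, size=30)
    group2 = _np.random.normal(0.5, 2.0, size=25)
    p_student = _stats.ttest_ind(group1, group2, equal_var=True)[1]
    p_welch = _stats.ttest_ind(group1, group2, equal_var=False)[1]
    return p_student, p_welch
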
def save_df_Emmanuelle(df, groups, stats_dic, cols2color_sig, path2save,
                       make_with_colors, extensions=('xlsx', 'csv', 'json')):
    if 'xlsx' in extensions:
        import openpyxl
        import string
        df.to_excel('stats_new.xlsx')
        # merge the mean/std sub-index header cells
        file = openpyxl.load_workbook('stats_new.xlsx')
        sheet = file['Sheet1']
        alpha = string.ascii_uppercase
        for ltr in range(len(alpha))[1:(2 * len(groups)) + 1:2]:
            cell1, cell2 = alpha[ltr] + str(2), alpha[ltr + 1] + str(2)
            sheet.merge_cells(str(cell1 + ':' + cell2))
        file.save('stats_new.xlsx')
    if 'json' in extensions:
        utilities.save_json(stats_dic, os.path.join(path2save, 'stats.json'))
    if 'csv' in extensions:
        tab = Table()
        tab.save_df(df, os.path.join(path2save, 'stats_new.csv'),
                    sheet_name='stats')
    if make_with_colors:
        save_2xlsx_with_colors_Emmanuelle(df, 'stats_new.xlsx', path2save,
                                          'stats_wcolors.xlsx',
                                          cols2color_sig=cols2color_sig)

def feature_ranking(X_scaled, y_transform, cols_X):
    """get the ranking of all features
    :param X_scaled: scaled X matrix of features
    :param y_transform: labeled y vector
    :param cols_X: column names of the features in X_scaled
    :return: the sorted feature column and the pandas.DataFrame with all
        features ranked
    """
    clf = RandomForestClassifier()
    feature_selector = RFE(clf)
    feature_selector.fit(X_scaled, y_transform)
    features_rfe_and_rank_df = Table().create_df(
        feature_selector.ranking_,
        index_col=cols_X,
        cols=['ranking']).sort_values(['ranking'])
    features_rfe_and_rank_df['feature'] = features_rfe_and_rank_df.index
    return features_rfe_and_rank_df['feature'], features_rfe_and_rank_df

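# Minimal sketch (toy data only): mirrors feature_ranking() to show the RFE
# output; ranking_ assigns 1 to the selected features and larger integers to
# features eliminated earlier.
def _demo_rfe_ranking_sketch():
    import numpy as _np
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import RFE
    X = _np.random.rand(40, 6)
    y = _np.random.randint(0, 2, size=40)
    selector = RFE(RandomForestClassifier(n_estimators=20)).fit(X, y)
    return selector.ranking_
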
def mkstatisticsf(df_4stats, groups, group_col, path2save,
                  make_with_colors=True):
    '''Creates a descriptive statistics file for publication,
        based on the provided pandas.DataFrame.
        Works only on 2 groups.
    Args:
        df_4stats: pandas.DataFrame
        groups: list/tuple of groups as str/int
        group_col: str() column name in df_4stats that has the group names
            from groups
        path2save: abspath to save the descriptive files
        make_with_colors: will create an additional .xlsx file with colored
            significant results, provided xlwt is installed
    Return:
        json file with results
        .csv file with results
        .xlsx file with results, with significant ones colored in red
    '''
    tab = Table()
    ls_tests = ('mean', 'std', 'kurtosis', 'skewness', 'TTest', 'Welch',
                'ANOVA', 'Bartlett', 'MannWhitneyu', 'Kruskal')
    groups_df = dict()
    for group in groups:
        groups_df[group] = tab.get_df_per_parameter(df_4stats, group_col, group)
    stats_dic = dict()
    vals2chk = df_4stats.columns.tolist()
    if group_col in vals2chk:
        vals2chk.remove(group_col)
    cols2color_sig = list()
    groups = list(groups_df.keys())
    group1 = groups_df[groups[0]]
    group2 = groups_df[groups[1]]
    for test in ls_tests:
        for val in vals2chk:
            results, params = get_stats(test, group1[val], group2[val])
            if test in ('mean', 'std', 'kurtosis', 'skewness'):
                key1 = f'{groups[0]}, {params[0]}'
                key2 = f'{groups[1]}, {params[0]}'
            else:
                key1 = f'{test}, {params[0]}'
                key2 = f'{test}, {params[1]}'
                cols2color_sig.append(key2)
            for key in (key1, key2):
                if key not in stats_dic:
                    stats_dic[key] = dict()
            stats_dic[key1][val] = f'{results[0]}'
            stats_dic[key2][val] = f'{results[1]}'
    df = tab.create_df_from_dict(stats_dic)
    tab.save_df(df, os.path.join(path2save, 'stats_general.csv'),
                sheet_name='stats')
    utilities.save_json(stats_dic, os.path.join(path2save, 'stats_general.json'))
    if make_with_colors:
        save_2xlsx_with_colors(df, path2save=path2save,
                               cols2color_sig=cols2color_sig)

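# Illustrative call (hypothetical DataFrame and path; the 'group' column must
# contain exactly two labels, matching the two-group limitation above):
# mkstatisticsf(df_4stats=df_grid,
#               groups=['patients', 'controls'],
#               group_col='group',
#               path2save='/tmp/stats',
#               make_with_colors=True)
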
class ClusterFile2CSV():

    def __init__(self, file_abspath, result_abspath):
        from stats.db_processing import Table
        self.contrasts = fs_definitions.GLMcontrasts['contrasts']
        self.get_explanations()
        self.col_4constrasts = "Contrast"
        self.header = ("ClusterNo", "Max", "VtxMax", "Size(mm^2)",
                       "TalX", "TalY", "TalZ", "CWP", "CWPLow", "CWPHi",
                       "NVtxs", "WghtVtx", "Annot", self.col_4constrasts,
                       "Explanation")
        self.length_matrix = len(self.header)
        self.content = open(file_abspath, 'r').readlines()
        self.result_abspath = result_abspath
        self.tab = Table()
        self.ls_vals_2chk = self.contrasts.keys()
        self.run()

    def run(self):
        d = dict()
        i = 0
        while i < len(self.content):
            line = self.content[i].replace('\n', '')
            if self.chk_if_vals_in_line(line):
                expl = self.content[i + 1].replace('\n', '').replace(';', '.')
                d[i] = ['', '', '', '', '', '', '', '', '', '', '', '', '',
                        line, expl]
                i += 2
            else:
                line = self.clean_nans_from_list(line.split(' '))
                i += 1
                if len(line) != 0:
                    d[i] = line + ['', '']
        self.save_2table(d)

    def save_2table(self, d):
        df = self.tab.create_df_from_dict(d).T
        column_names = {i[0]: i[1] for i in list(zip(df.columns, self.header))}
        df = df.rename(columns=column_names)
        df = df.set_index(df[self.col_4constrasts])
        df = df.drop(columns=[self.col_4constrasts])
        self.tab.save_df(df, self.result_abspath)

    def chk_if_vals_in_line(self, line):
        '''checks every value from self.ls_vals_2chk;
            returns True as soon as one value is present in the line,
            otherwise returns False
        '''
        exists = False
        for val_2chk in self.ls_vals_2chk:
            if val_2chk in line:
                exists = True
                break
        return exists

    def clean_nans_from_list(self, ls):
        for i in ls[::-1]:
            if i == '':
                ls.remove(i)
        return ls

    def get_explanations(self):
        self.explanations = list()
        for key in self.contrasts:
            for file_name in self.contrasts[key]:
                self.explanations.append(self.contrasts[key][file_name][1])

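# Illustrative usage sketch (hypothetical paths): converts a FreeSurfer GLM
# cluster summary text file into a csv table indexed by contrast.
# ClusterFile2CSV(file_abspath='/path/to/cluster_stats_summary.txt',
#                 result_abspath='/path/to/cluster_stats_summary.csv')
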
class ANOVA_do():
    def __init__(self, df, params_y, ls_cols4anova, path2save,
                 p_thresh=0.05, intercept_thresh=0.05, print_not_FS=False):
        self.df = df
        self.params_y = params_y
        self.ls_cols4anova = ls_cols4anova
        self.sig_cols = dict()
        self.tab = Table()
        self.print_not_FS = print_not_FS
        self.fs_struc_meas = fs_definitions.GetFSStructureMeasurement()
        self.run_anova(p_thresh, intercept_thresh, path2save)

    def run_anova(self, p_thresh, intercept_thresh, path2save):
        ls_err = list()
        for param_y in self.params_y:
            x = np.array(self.df[param_y])
            df_result = self.tab.get_clean_df()
            df_result_list = df_result.copy()
            df_result[param_y] = ''
            df_result_list[param_y] = ''
            ix = 1
            ixx = 1
            # print(f' analysing {len(self.ls_cols4anova)} features for parameter: {param_y}')
            for col in self.ls_cols4anova:
                y = np.array(self.df[col])
                data_tmp = pd.DataFrame({'x': x, col: y})
                model = ols(col + " ~ x", data=data_tmp).fit()
                if model.pvalues.Intercept < p_thresh and model.pvalues.x < intercept_thresh:
                    measurement, structure, ls_err = self.fs_struc_meas.get(col, ls_err)
                    if param_y not in self.sig_cols:
                        self.sig_cols[param_y] = dict()
                    self.sig_cols[param_y][col] = {
                        'rsquared': model.rsquared,
                        'rsquared-adjusted': model.rsquared_adj,
                        'F-statistic': model.fvalue,
                        'AIC': model.aic,
                        'BIC': model.bic,
                        'pvalue_slope': model.pvalues.x,
                        'pvalue_intercept': model.pvalues.Intercept,
                        'tvalue_slope': model.tvalues.x,
                        'tvalue_intercept': model.tvalues.Intercept,
                        'meas': measurement,
                        'struct': structure}
                    df_result_list = self.populate_df(
                        df_result_list, ixx,
                        {param_y: structure,
                         'measure': measurement,
                         'pvalue': '%.4f' % model.pvalues.x})
                    if structure not in df_result[param_y].tolist():
                        df_result = self.populate_df(
                            df_result, ix,
                            {param_y: structure,
                             measurement: '%.4f' % model.pvalues.x})
                        ix += 1
                    else:
                        df_result = self.populate_df(
                            df_result,
                            df_result[param_y].tolist().index(structure),
                            {measurement: '%.4f' % model.pvalues.x})
                    ixx += 1
            self.tab.save_df_tocsv(
                df_result_list,
                path.join(path2save, f'anova_per_significance_{param_y}.csv'))
            self.tab.save_df_tocsv(
                df_result,
                path.join(path2save, f'anova_per_structure_{param_y}.csv'))
        save_json(self.sig_cols,
                  path.join(path2save, 'anova_significant_features.json'))
        if self.print_not_FS:
            print('NOT FreeSurfer structures: ', ls_err)

    def populate_df(self, df, idx, cols_vals):
        for col in cols_vals:
            df.at[idx, col] = cols_vals[col]
        return df

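# Minimal sketch (toy data only): each feature tested in run_anova is fitted
# as an ordinary least squares model "feature ~ x" with statsmodels; the
# intercept and slope p-values used for the significance filter are read from
# model.pvalues.
def _demo_ols_pvalues_sketch():
    import numpy as _np
    import pandas as _pd
    from statsmodels.formula.api import ols as _ols
    rng = _np.random.default_rng(0)
    data = _pd.DataFrame({'x': _np.arange(30.0),
                          'feature': 2.0 * _np.arange(30.0) + rng.normal(size=30)})
    model = _ols("feature ~ x", data=data).fit()
    return model.pvalues.Intercept, model.pvalues.x
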
class CheckIfReady4GLM():

    def __init__(self, nimb_vars, fs_vars, proj_vars, f_ids_processed,
                 f_GLM_group, FS_GLM_dir):
        self.proj_vars = proj_vars
        self.vars_fs = fs_vars
        self.FS_SUBJECTS_DIR = fs_vars['SUBJECTS_DIR']
        self.NIMB_PROCESSED_FS = fs_vars['NIMB_PROCESSED']
        self.f_ids_processed = f_ids_processed
        self.f_GLM_group = f_GLM_group
        self.FS_GLM_dir = FS_GLM_dir
        self.archive_type = '.zip'
        self.tab = Table()
        self.miss = dict()
        self.ids_4fs_glm = dict()
        self.df = self.tab.get_df(self.f_GLM_group)
        self.bids_ids = self.df[self.proj_vars['id_col']].tolist()
        self.ids_exclude_glm = os.path.join(self.FS_GLM_dir,
                                            'excluded_from_glm.json')

    def chk_if_subjects_ready(self):
        fs_proc_ids = self.get_ids_processed()
        miss_bids_ids = [i for i in self.bids_ids if i not in fs_proc_ids.keys()]
        if miss_bids_ids:
            print(f'    {len(miss_bids_ids)} IDs are missing from file: {self.f_ids_processed}')
            print(f'    first 5 missing IDs are: {miss_bids_ids[:5]}')
            for bids_id in miss_bids_ids:
                self.add_to_miss(bids_id, 'id_missing')
        if len(miss_bids_ids) < len(fs_proc_ids.keys()):
            for bids_id in [i for i in self.bids_ids if i not in miss_bids_ids]:
                fs_proc_id = fs_proc_ids[bids_id].replace(self.archive_type, '')
                if os.path.exists(os.path.join(self.FS_SUBJECTS_DIR, bids_id)):
                    self.ids_4fs_glm[bids_id] = bids_id
                    self.chk_glm_files(bids_id)
                elif os.path.exists(os.path.join(self.FS_SUBJECTS_DIR, fs_proc_id)):
                    self.ids_4fs_glm[bids_id] = fs_proc_id
                    self.chk_glm_files(fs_proc_id)
                else:
                    print(f'id {bids_id} or freesurfer id {fs_proc_id} are missing from the {self.FS_SUBJECTS_DIR} folder')
                    self.add_to_miss(bids_id, 'id_missing')
            if self.miss.keys():
                print("    missing files and ids: ", self.miss)
                save_json(self.miss, self.ids_exclude_glm, print_space=8)
                subjs_missing = len(self.miss.keys())
                subjs_present = len(self.ids_4fs_glm.keys())
                print('    Number of participants ready for FreeSurfer GLM:')
                print(f'        in the folder: {self.FS_SUBJECTS_DIR}')
                print(f'        {subjs_present} present')
                print(f'        {subjs_missing} missing')
                not_ready = [i for i in self.miss if "id_missing" not in self.miss[i]]
                maybe_archived = [i for i in self.miss if i not in not_ready]
                if maybe_archived:
                    print("    MAYBE archived: ", maybe_archived)
                    q = "    EXCEPTION! Some IDs are missing, but they could be archived.\n\
    Do you want to do glm analysis with current subjects (y) or try to check the archive (n) ? (y/n)\n\
    (note: if you answer NO, you will be asked to unarchive the\n\
    processed folders of IDs if they are present in FREESURFER_PROCESSED)"
                    if get_yes_no(q) == 1:
                        self.create_fs_glm_df()
                        return True, list()
                    else:
                        return False, maybe_archived
                if not_ready:
                    print("    MISSING FILES: these participants CANNOT be included in the GLM analysis: ",
                          not_ready)
                    q = "    EXCEPTION! Some IDs have missing files and they MUST be excluded from analysis.\n\
    Do you want to continue without the excluded IDs ? (y/n)"
                    if get_yes_no(q) == 1:
                        self.create_fs_glm_df()
                        return True, list()
                    else:
                        return False, not_ready
            else:
                self.create_fs_glm_df()
                return True, list()
        else:
            print('    no ids found')
            return False, list()

    def chk_glm_files(self, bids_id):
        '''it is expected that the BIDS IDs are located in FREESURFER -> SUBJECTS_DIR;
            checks whether the processed files of the subject are present
        Args:
            bids_id: ID of the subject to check
        Return:
            populates the dict of missing subjects
            populates the dict with ids
        '''
        files_not_ok = fs_definitions.ChkFSQcache(self.FS_SUBJECTS_DIR,
                                                  bids_id, self.vars_fs).miss
        if files_not_ok:
            for file in files_not_ok[bids_id]:
                self.add_to_miss(bids_id, file)
            return False
        else:
            return True

    def create_fs_glm_df(self):
        self.rm_missing_ids()
        tmp_id = 'fs_id'
        print('    creating the glm file for FreeSurfer GLM analysis')
        d_ids = {self.proj_vars['id_col']: [i for i in list(self.ids_4fs_glm.keys())],
                 tmp_id: [i for i in list(self.ids_4fs_glm.values())]}
        fs_proc_df = self.tab.create_df_from_dict(d_ids)
        fs_proc_df = self.tab.change_index(fs_proc_df, self.proj_vars['id_col'])
        grid_fs_df_pre = self.tab.change_index(self.df, self.proj_vars['id_col'])
        self.df_ids = self.tab.join_dfs(grid_fs_df_pre, fs_proc_df, how='outer')
        self.df_ids.rename(columns={tmp_id: self.proj_vars['id_col']},
                           inplace=True)
        self.df_ids = self.tab.change_index(self.df_ids,
                                            self.proj_vars['id_col'])
        self.tab.save_df(self.df_ids, self.f_GLM_group)
        PrepareForGLM(self.FS_SUBJECTS_DIR, self.FS_GLM_dir, self.f_GLM_group,
                      self.proj_vars, self.vars_fs)

    def rm_missing_ids(self):
        ls_ix_2rm = list()
        for ix in self.df.index:
            bids_id = self.df.at[ix, self.proj_vars['id_col']]
            if bids_id not in self.ids_4fs_glm.keys():
                ls_ix_2rm.append(ix)
        len_miss = len(ls_ix_2rm)
        if len_miss == 0:
            print('    ALL subjects are present')
        else:
            print(f'    {len_miss} subjects are missing')
            print('    they will be removed from further analysis')
        self.df = self.df.drop(ls_ix_2rm)

    def get_ids_processed(self):
        '''retrieves the bids names of the IDs provided in the GLM file.
            It is expected that each project has a group of subjects that are
            present in the dataset, and that the BIDS names are the ones used
            in the groups_glm file for the ids.
            The f_ids.json file has the BIDS names of the subjects and, for each
            BIDS name, the corresponding names of the source
            file/freesurfer/nilearn/dipy processed zipped files
            (see nimb/example/f_ids.json)
        '''
        print('    extracting the list of ids that were processed with FreeSurfer')
        print(f'        in the file: {self.f_ids_processed}')
        self.ids_bids_proc_all = self.read_json(self.f_ids_processed)
        return {i: self.ids_bids_proc_all[i][DEFAULT.freesurfer_key]
                for i in self.ids_bids_proc_all}
        # old version:
        # return {i: 'path' for i in self.ids_bids_proc_all
        #         if self.ids_bids_proc_all[i]['source'] in ids_src_glm_file}

    def add_to_miss(self, bids_id, file):
        '''add to the dict of missing subjects'''
        if bids_id not in self.miss:
            self.miss[bids_id] = list()
        self.miss[bids_id].append(file)
        if bids_id in self.ids_4fs_glm:
            self.ids_4fs_glm.pop(bids_id, None)

    def read_json(self, f):
        '''read a json file'''
        with open(f, 'r') as jf:
            return json.load(jf)

def mkstatisticsf_Emmanuelle(df_4stats, groups, group_col):
    '''Creates a descriptive statistics table for publication,
        based on the provided pandas.DataFrame.
        Works with one, two or more groups; the tests applied depend on
        the number of groups.
        author: version adjusted by Emmanuelle Mazur-Lainé 202206
    Args:
        df_4stats: pandas.DataFrame
        groups: list/tuple of groups as str/int
        group_col: column name in df_4stats that has the group names from groups
    Return:
        df_new: pandas.DataFrame with the descriptive statistics, with sub-indexes
        stats_dic: dict with the results
        cols2color_sig: list of columns with significant results, to be colored
    '''
    df_4stats = df_4stats.astype(float)
    tab = Table()
    groups_df = dict()
    for group in groups:
        groups_df[group] = tab.get_df_per_parameter(df_4stats, group_col, group)
    stats_dic = dict()
    vals2chk = df_4stats.columns.tolist()
    if group_col in vals2chk:
        vals2chk.remove(group_col)
    cols2color_sig = list()
    groups = list(groups_df.keys())

    if len(groups) == 1:
        ls_tests = ('mean', 'std', 'kurtosis', 'skewness')
    elif len(groups) <= 2:
        ls_tests = ('mean', 'std', 'kurtosis', 'skewness',
                    'TTest', 'Welch', 'MannWhitneyu')
    elif len(groups) > 2:
        ls_tests = ('mean', 'std', 'kurtosis', 'skewness',
                    'Bartlett', 'Kruskal', 'ANOVA')

    for test in ls_tests:
        for val in vals2chk:
            values_per_gr = []
            for i in range(0, len(groups)):
                gr_i = groups_df[groups[i]][val].values
                arr = np.array(gr_i)
                arr_without_nan = arr[np.logical_not(np.isnan(arr))]
                values_per_gr.append(arr_without_nan)
            results, params = get_stats_Emmanuelle(test, groups, values_per_gr)

            if test == 'mean':
                for i in range(len(groups)):
                    for tst in ('mean', 'std'):
                        results, params = get_stats_Emmanuelle(tst, groups,
                                                               values_per_gr)
                        key = f'{groups[i]}, {params}'
                        if key not in stats_dic:
                            stats_dic[key] = dict()
                        stats_dic[key][val] = f'{results[i]}'
            if test in ('kurtosis', 'skewness'):
                for i in range(len(groups)):
                    key = f'{groups[i]}, {params}'
                    if key not in stats_dic:
                        stats_dic[key] = dict()
                    stats_dic[key][val] = f'{results[i]}'
            elif test in ('TTest', 'Welch', 'Bartlett', 'MannWhitneyu',
                          'Kruskal', 'ANOVA'):
                for i in range(len(groups)):
                    key1 = f'{test}, {params[0]}'
                    key2 = f'{test}, {params[1]}'
                    for key in (key1, key2):
                        if key not in stats_dic:
                            stats_dic[key] = dict()
                    stats_dic[key1][val] = f'{results[0]}'
                    stats_dic[key2][val] = f'{results[1]}'
                cols2color_sig.append(key2)

    df = tab.create_df_from_dict(stats_dic)
    df = df.astype(float)

    # creating the new adjusted DataFrame with sub-indexes
    ls_tests_dup = []
    ls_param = []
    ls_keys = list(stats_dic.keys())
    mean_gr_done = False
    for test in ls_tests:
        if test in ('mean', 'std'):
            for i in range(0, len(groups)):
                ls_tests_dup.append('mean/std')
            if mean_gr_done == False:
                for i in range(len(groups)):
                    ls_param.append('gr' + str(i + 1) + ' (val=' + f'{groups[i]}' + ')')
                    ls_param.append('gr' + str(i + 1) + ' (val=' + f'{groups[i]}' + ')')
                mean_gr_done = True
        elif test in ('kurtosis', 'skewness'):
            for i in range(0, len(groups)):
                ls_tests_dup.append(test)
                ls_param.append('gr' + str(i + 1) + ' (val=' + f'{groups[i]}' + ')')
        elif test in ('TTest', 'Welch', 'Bartlett', 'MannWhitneyu',
                      'Kruskal', 'ANOVA'):
            ls_tests_dup.append(test)
            ls_tests_dup.append(test)
    for key in ls_keys[4 * (len(groups)):]:
        ls_param.append((str(key))[-1])

    col = [ls_tests_dup, ls_param]
    tuples = list(zip(*col))
    df_new = pd.DataFrame(df.values,
                          index=pd.Index(df.index),
                          columns=pd.MultiIndex.from_tuples(tuples))
    df_new = df_new.round(3)
    return df_new, stats_dic, cols2color_sig

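# Illustrative call (hypothetical DataFrame and path): the returned values can
# be passed directly to save_df_Emmanuelle defined above.
# df_new, stats_dic, cols2color_sig = mkstatisticsf_Emmanuelle(df_grid,
#                                                              groups=['G1', 'G2', 'G3'],
#                                                              group_col='group')
# save_df_Emmanuelle(df_new, ['G1', 'G2', 'G3'], stats_dic, cols2color_sig,
#                    path2save='/tmp/stats', make_with_colors=True)
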
class RUN_stats():
    """will run the statistical analysis for the provided groups file"""

    def __init__(self, all_vars):
        self.project = all_vars.params.project
        self.project_vars = all_vars.projects[self.project]
        self.stats_paths = self.project_vars['STATS_PATHS']
        self.stats_params = self.project_vars['STATS_PARAMS']
        self.group_col = self.project_vars['group_col']
        self.dir_stats_home = self.stats_paths["STATS_HOME"]
        self.atlas = ('DK', 'DS', 'DKDS')[1]  # currently selects 'DS'
        self.get_steps(all_vars)
        print(f'    Performing statistical analysis in folder: {self.dir_stats_home}')
        print('    materials located at: {:<50}'.format(self.project_vars['materials_DIR'][1]))
        print('    file for analysis: {:<50}'.format(self.project_vars['fname_groups']))
        print('    id column: {:<50}'.format(str(self.project_vars['id_col'])))
        print('    group column: {:<50}'.format(str(self.project_vars['group_col'])))
        # print('    variables to analyse: {:<50}'.format(str(self.project_vars['variables_for_glm'])))
        self.tab = Table()
        self.preproc = preprocessing.Preprocess()
        self.df_user_stats, self.df_final_grid, \
            self.df_adjusted, \
            self.cols_X, \
            self.groups = MakeGrid(self.project_vars).grid()

    def run(self):
        print("running")
        for step in self.steps:
            step2run = self.steps[step]['name']
            if self.steps[step]["run"]:
                print(f"    running step: {step2run}")
                self.run_step(step2run)

    def run_step(self, step2run):
        self.use_features = False
        self.feature_algo = 'PCA'  # 'RFE'
        for group in ['all', ] + self.groups:  # 'all' stands for all groups
            df_X, y_labeled, X_scaled, df_clin_group = \
                self.get_X_data_per_group_all_groups(group)
            df_with_features, features, features_rfe_and_rank_df = \
                self.get_features_df_per_group(group, X_scaled, y_labeled, df_X)
            if group == 'all':
                self.params_y = self.project_vars['variables_for_glm']

                # STEP: run general stats
                if step2run == "STEP_stats_ttest":
                    from stats.stats_stats import ttest_do
                    variables = self.params_y + df_X.columns.tolist()
                    dir_2save = varia.get_dir(path.join(self.dir_stats_home, group))
                    ttest_res = ttest_do(self.tab.join_dfs(df_clin_group, df_X),
                                         self.group_col,
                                         variables,
                                         self.groups,
                                         dir_2save,
                                         p_thresh=0.05).res_ttest

                # STEP: run ANOVA and Simple Linear Regression
                if step2run == "STEP_Anova":
                    from stats.stats_models import ANOVA_do
                    print('performing ANOVA')
                    sig_cols = self.run_anova(features, 0.05, 0.05)

                if step2run == "STEP_SimpLinReg":
                    print('performing Simple Linear Regression on all columns')
                    from stats.plotting import Make_Plot_Regression, Make_plot_group_difference
                    dir_2save = varia.get_dir(self.stats_paths['simp_lin_reg_dir'])
                    param_features = self.run_anova(features, 1.0, 1.0)
                    Make_Plot_Regression(self.df_final_grid, param_features,
                                         self.group_col, dir_2save)
                    dir_2save = varia.get_dir(self.stats_paths['anova'])
                    Make_plot_group_difference(self.df_final_grid, param_features,
                                               self.group_col, self.groups,
                                               dir_2save)
                    # from stats.stats_groups_anova import RUN_GroupAnalysis_ANOVA_SimpleLinearRegression
                    # dir_2save = varia.get_dir(path.join(self.dir_stats_home,
                    #                                     self.stats_paths['anova'] + "_" + group))
                    # RUN_GroupAnalysis_ANOVA_SimpleLinearRegression(self.df_final_grid,
                    #                                                groups,
                    #                                                self.params_y,
                    #                                                self.project_vars['other_params'],
                    #                                                dir_2save,
                    #                                                self.group_col,
                    #                                                features)

                # STEP: run Logistic Regression
                if step2run == "STEP_LogisticRegression":
                    from stats import stats_LogisticRegression
                    print('performing Logistic Regression for all groups')
                    dir_2save = varia.get_dir(
                        path.join(self.dir_stats_home,
                                  self.stats_paths['logistic_regression_dir'] + "_" + group))
                    stats_LogisticRegression.Logistic_Regression(
                        X_scaled, y_labeled, self.group_col, dir_2save)

                # STEP: run Prediction RF SKF
                if step2run == "STEP_Predict_RF_SKF":
                    print('    performing RF SKF Prediction for all groups')
                    df_X_scaled = self.tab.create_df(X_scaled,
                                                     index_col=range(X_scaled.shape[0]),
                                                     cols=self.cols_X)
                    accuracy, best_estimator, average_score_list, _ = predict.SKF_algorithm(
                        features, df_X_scaled[features].values, y_labeled)
                    print("    prediction accuracy computed with RF and SKF based on PCA features is: ",
                          accuracy)
                    # accuracy, best_estimator, average_score_list, _ = predict.SKF_algorithm(
                    #     features_rfe_and_rank_df.feature,
                    #     df_X_scaled[features_rfe_and_rank_df.feature].values, y_labeled)
                    # print("prediction accuracy computed with RF and SKF based on RFE features is: ", accuracy)

                # STEP: run Prediction RF LOO
                if step2run == "STEP_Predict_RF_LOO":
                    print('performing RF Leave-One-Out Prediction for all groups')
                    df_X_scaled = self.tab.create_df(X_scaled,
                                                     index_col=range(X_scaled.shape[0]),
                                                     cols=self.cols_X)
                    accuracy, best_estimator, average_score_list, _ = predict.LOO_algorithm(
                        features, df_X_scaled[features].values, y_labeled)
                    print("    prediction accuracy computed with RF and LOO based on PCA features is: ",
                          accuracy)
                    accuracy, best_estimator, average_score_list, _ = predict.LOO_algorithm(
                        features_rfe_and_rank_df.feature,
                        df_X_scaled[features_rfe_and_rank_df.feature].values,
                        y_labeled)
                    print("    prediction accuracy computed with RF and LOO based on RFE features is: ",
                          accuracy)
            else:
                # run Descriptive Statistics per group
                dir_2save = varia.get_dir(path.join(self.dir_stats_home, 'description'))
                self.run_descriptive_stats(df_clin_group, features, dir_2save)

            # STEP: run Linear Regression Moderation
            if step2run == "STEP_LinRegModeration":
                from stats import stats_models
                print('performing Linear Regression Moderation analysis')
                stats_models.linreg_moderation_results(
                    self.df_final_grid,
                    features,
                    self.project_vars['group_param'],
                    self.project_vars['regression_param'],
                    varia.get_dir(path.join(self.dir_stats_home,
                                            self.stats_paths['linreg_moderation_dir'])),
                    group)

            # STEP: run Laterality
            if step2run == "STEP_Laterality":
                from processing.atlases.atlas_definitions import RReplace
                from stats import stats_laterality
                print('performing Laterality analysis')
                lhrh_feat_d = RReplace(features).contralateral_features
                lhrh_features_list = [i for i in lhrh_feat_d.keys()] + \
                                     [v for v in lhrh_feat_d.values()]
                df_with_features_lhrh = self.tab.get_df_from_df(
                    df_X, usecols=sorted(lhrh_features_list))
                stats_laterality.LateralityAnalysis(
                    df_with_features_lhrh,
                    lhrh_feat_d,
                    group,
                    varia.get_dir(path.join(self.dir_stats_home,
                                            self.stats_paths['laterality_dir']))).run()

    def run_descriptive_stats(self, df_clin_group, features, dir_2save):
        print('running descriptive statistics')

    def run_anova(self, features, p_thresh, intercept_thresh):
        from stats.stats_models import ANOVA_do
        dir_2save = varia.get_dir(self.stats_paths['anova'])
        return ANOVA_do(self.df_final_grid, self.params_y, features, dir_2save,
                        p_thresh=p_thresh,
                        intercept_thresh=intercept_thresh).sig_cols

    def get_X_data_per_group_all_groups(self, group):
        # extract X_scaled values for the brain parameters
        predicted_target = self.project_vars["prediction_target"]
        print(f"    predicted target column is: {predicted_target}")
        if not predicted_target:
            predicted_target = self.group_col
        if group == 'all':
            df_clin_group = self.df_user_stats
            df_X = self.df_adjusted
            y_labeled = preprocessing.label_y(self.df_user_stats, predicted_target)
            X_scaled = preprocessing.scale_X(df_X)
        else:
            df_group = self.tab.get_df_per_parameter(self.df_final_grid,
                                                     self.group_col, group)
            df_clin_group = self.tab.rm_cols_from_df(df_group, self.cols_X)
            df_X = self.tab.rm_cols_from_df(
                df_group,
                [i for i in df_group.columns.tolist() if i not in self.cols_X])
            y_labeled = preprocessing.label_y(df_group, predicted_target)
            X_scaled = preprocessing.scale_X(df_X)
        return df_X, y_labeled, X_scaled, df_clin_group

    def log(self):
        stats = predict.get_stats_df(
            len(self.cols_X), self.atlas,
            self.stats_params["prediction_vars"]['nr_threads'],
            definitions.sys.platform,
            time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))

    def get_features_df_per_group(self, group, X_scaled, y_labeled, df_X):
        features_rfe_and_rank_df = 'none'
        if self.use_features:
            if self.feature_algo == 'PCA':  # using PCA
                dir_2save = varia.get_dir(path.join(self.dir_stats_home,
                                                    self.stats_paths['features']))
                pca_threshold = self.stats_params["prediction_vars"]['pca_threshold']
                features = predict.get_features_based_on_pca(dir_2save,
                                                             pca_threshold,
                                                             X_scaled,
                                                             self.cols_X,
                                                             group,
                                                             self.atlas)
            elif self.feature_algo == 'RFE':  # using RFE
                features, features_rfe_and_rank_df = predict.feature_ranking(
                    X_scaled, y_labeled, self.cols_X)
                print("    number of features extracted by RFE: ",
                      len(features_rfe_and_rank_df.feature))
            df_with_features = self.tab.get_df_from_df(df_X, usecols=features)
        else:
            df_with_features = self.tab.get_df_from_df(df_X, usecols=self.cols_X)
            features = self.cols_X
        return df_with_features, features, features_rfe_and_rank_df

    def get_steps(self, all_vars):
        self.steps = {
            "groups":     {"name": "STEP0_make_groups",       "run": False},
            "ttest":      {"name": "STEP_stats_ttest",        "run": False},
            "anova":      {"name": "STEP_Anova",              "run": False},
            "simplinreg": {"name": "STEP_SimpLinReg",         "run": False},
            "logreg":     {"name": "STEP_LogisticRegression", "run": False},
            "predskf":    {"name": "STEP_Predict_RF_SKF",     "run": False},
            "predloo":    {"name": "STEP_Predict_RF_LOO",     "run": False},
            "linregmod":  {"name": "STEP_LinRegModeration",   "run": False},
            "laterality": {"name": "STEP_Laterality",         "run": False},
        }
        if all_vars.params.step == 0:
            for i in self.steps:
                self.steps[i]["run"] = True
        else:
            self.steps[all_vars.params.step]["run"] = True