def get_stats_df(len_df_X, atlas, nr_threads, env_name, time_started):
    '''Collect the parameters of one analysis run into a two-column table.

    Args:
        len_df_X: value stored in the 'nr of features' row
        atlas: value stored in the 'atlas' row
        nr_threads: value stored in the 'nr of threads' row
        env_name: value stored in the 'remote name' row
        time_started: value stored in the 'analysis started at' row
    Return:
        the object returned by Table().get_clean_df(), filled with one row
        per parameter: column 'stats' holds the parameter name,
        column 'values' holds its value
    '''
    # imported locally: only needed here to report the installed versions
    import sklearn
    import matplotlib

    stats = Table().get_clean_df()
    run_parameters = {
        'pandas version': Table().pd_ver,
        'numpy version': np.__version__,
        'matplotlib version': matplotlib.__version__,
        'sklearn version': sklearn.__version__,
        'number of iterations': definitions.prediction_defs['NUM_ITER'],
        'atlas': atlas,
        'nr of features': len_df_X,
        'nr of threads': nr_threads,
        'remote name': env_name,
        'analysis started at': time_started,
    }
    # rows are numbered from 0 in the insertion order of the dictionary
    for row, (name, value) in enumerate(run_parameters.items()):
        stats.at[row, 'stats'] = name
        stats.at[row, 'values'] = value
    return stats
def __init__(self, all_vars):
    """Read the settings of the selected project, report them and build the grid.

    Args:
        all_vars: object exposing `params.project` (name of the project) and
                  `projects` (mapping project name -> project variables)
    """
    self.project = project = all_vars.params.project
    self.project_vars = project_vars = all_vars.projects[project]
    self.stats_paths = stats_paths = project_vars['STATS_PATHS']
    self.stats_params = project_vars['STATS_PARAMS']
    self.group_col = project_vars['group_col']
    self.dir_stats_home = stats_paths["STATS_HOME"]
    # the atlas is fixed to the second entry of the tuple, i.e. 'DS'
    self.atlas = ('DK', 'DS', 'DKDS')[1]
    self.get_steps(all_vars)

    # report the configuration that will be used
    print(
        f' Performing statistical analysis in folder: {self.dir_stats_home}'
    )
    materials_dir = project_vars['materials_DIR'][1]
    print(' materials located at: {:<50}'.format(materials_dir))
    fname_groups = project_vars['fname_groups']
    print(' file for analysis: {:<50}'.format(fname_groups))
    id_col = str(project_vars['id_col'])
    print(' id column: {:<50}'.format(id_col))
    group_col = str(project_vars['group_col'])
    print(' group column: {:<50}'.format(group_col))
    # print(' variables to analyse: {:<50}'.format(str(self.project_vars['variables_for_glm'])))

    self.tab = Table()
    self.preproc = preprocessing.Preprocess()

    # MakeGrid.grid() returns five values, stored in this exact order
    grid = MakeGrid(project_vars).grid()
    (self.df_user_stats,
     self.df_final_grid,
     self.df_adjusted,
     self.cols_X,
     self.groups) = grid
def linreg_moderation_results(df_X_linreg, ls_cols_X_atlas, group_param,
                              regression_param, path_dir_save_results, group):
    '''Run the moderation analysis and write its result to a csv file.

    Args:
        df_X_linreg: pandas.DataFrame with columns and data for analysis
        ls_cols_X_atlas: list of columns from df_X_linreg that will be used
                         for the linear regression analysis
        group_param: str, name of the column from df_X_linreg that was used
                     to create the groups
        regression_param: str, name of the column from df_X_linreg that will
                          be used for regression analysis (e.g., Age)
        path_dir_save_results: abspath of the folder where the csv file
                               is saved
        group: group name, used in the name of the csv file:
               linreg_moderation_{group}.csv
    Return:
        none
        creates csv file
    '''
    d_result = compute_linreg_data(df_X_linreg,
                                   ls_cols_X_atlas,
                                   group_param,
                                   regression_param)
    df_result = Table().create_df_from_dict(d_result)

    writer = Table()
    file_csv = path.join(path_dir_save_results,
                         f'linreg_moderation_{group}.csv')
    writer.save_df(df_result, file_csv)
def __init__(self,
             df,
             params_y,
             ls_cols4anova,
             path2save,
             p_thresh=0.05,
             intercept_thresh=0.05,
             print_not_FS=False):
    """Store the inputs and immediately run the ANOVA.

    Args:
        df: data used for the analysis, stored as self.df
        params_y: stored as self.params_y
        ls_cols4anova: stored as self.ls_cols4anova
        path2save: forwarded to run_anova
        p_thresh: forwarded to run_anova, default 0.05
        intercept_thresh: forwarded to run_anova, default 0.05
        print_not_FS: stored as self.print_not_FS, default False
    """
    # inputs
    self.df = df
    self.params_y = params_y
    self.ls_cols4anova = ls_cols4anova
    # starts empty
    self.sig_cols = {}
    # helpers
    self.tab = Table()
    self.print_not_FS = print_not_FS
    self.fs_struc_meas = fs_definitions.GetFSStructureMeasurement()

    # all attributes are in place: start the analysis
    self.run_anova(p_thresh, intercept_thresh, path2save)
def __init__(self,
             df,
             group_col,
             ls_cols,
             groups,
             path_save_res,
             p_thresh=0.05):
    """Store the inputs and run the t-test computation.

    Args:
        df: data used for the analysis, stored as self.df
        group_col: stored as self.group_col
        ls_cols: stored as self.ls_cols
        groups: stored as self.groups
        path_save_res: stored as self.path_save_res
        p_thresh: forwarded to compute_ttest_for_col, default 0.05

    The result of compute_ttest_for_col is kept in self.res_ttest.
    """
    self.df = df
    self.group_col = group_col
    self.ls_cols = ls_cols
    self.groups = groups
    self.path_save_res = path_save_res
    self.ls_meas = get_names_of_measurements()
    self.ls_struct = get_names_of_structures()
    # The Table helper is created BEFORE the computation starts, so the
    # instance is fully initialised while compute_ttest_for_col runs;
    # previously self.tab did not exist yet at that point and any use of it
    # during the computation would have raised AttributeError.
    self.tab = Table()
    self.res_ttest = self.compute_ttest_for_col(p_thresh)
def __init__(self, nimb_vars, fs_vars, proj_vars, f_ids_processed,
             f_GLM_group, FS_GLM_dir):
    """Store the paths and variables and load the GLM group file.

    Args:
        nimb_vars: accepted but not used in this constructor
        fs_vars: mapping; keys 'SUBJECTS_DIR' and 'NIMB_PROCESSED' are read
        proj_vars: mapping; key 'id_col' names the id column of the group file
        f_ids_processed: stored as self.f_ids_processed
        f_GLM_group: file that is loaded with Table.get_df
        FS_GLM_dir: folder in which 'excluded_from_glm.json' is located
    """
    # variables
    self.proj_vars = proj_vars
    self.vars_fs = fs_vars
    self.FS_SUBJECTS_DIR = fs_vars['SUBJECTS_DIR']
    self.NIMB_PROCESSED_FS = fs_vars['NIMB_PROCESSED']

    # files and folders
    self.f_ids_processed = f_ids_processed
    self.f_GLM_group = f_GLM_group
    self.FS_GLM_dir = FS_GLM_dir
    self.archive_type = '.zip'

    # helper and containers that start empty
    self.tab = Table()
    self.miss = {}
    self.ids_4fs_glm = {}

    # group file and the ids it contains
    self.df = self.tab.get_df(f_GLM_group)
    id_col = proj_vars['id_col']
    self.bids_ids = self.df[id_col].tolist()
    self.ids_exclude_glm = os.path.join(FS_GLM_dir,
                                        'excluded_from_glm.json')
def __init__(self, file_abspath, result_abspath):
    """Read a cluster file and immediately process it with run().

    Args:
        file_abspath: abspath of the text file to read; all its lines are
                      kept in self.content
        result_abspath: stored as self.result_abspath
    """
    from stats.db_processing import Table

    self.contrasts = fs_definitions.GLMcontrasts['contrasts']
    self.get_explanations()
    self.col_4constrasts = "Contrast"
    self.header = ("ClusterNo", "Max", "VtxMax", "Size(mm^2)", "TalX",
                   "TalY", "TalZ", "CWP", "CWPLow", "CWPHi", "NVtxs",
                   "WghtVtx", "Annot", self.col_4constrasts, "Explanation")
    self.length_matrix = len(self.header)
    # read inside a context manager so the file handle is closed right
    # away instead of being left open until garbage collection
    with open(file_abspath, 'r') as f_clusters:
        self.content = f_clusters.readlines()
    self.result_abspath = result_abspath
    self.tab = Table()
    self.ls_vals_2chk = self.contrasts.keys()
    self.run()
def feature_ranking(X_scaled, y_transform, cols_X):
    """
    rank all features with recursive feature elimination (RFE)
    on top of a RandomForestClassifier
    :param X_scaled: data the selector is fitted on
    :param y_transform: target the selector is fitted on
    :param cols_X: names of the features, used as index of the ranking table
    :return: tuple (feature, table):
             table is the DataFrame with the column 'ranking', sorted by
             ranking, plus a column 'feature' that repeats the index;
             feature is the column 'feature' of that table
    """
    selector = RFE(RandomForestClassifier())
    selector.fit(X_scaled, y_transform)

    # same result as pd.DataFrame(ranking_, index=cols_X, columns=['ranking'])
    ranking_df = Table().create_df(selector.ranking_,
                                   index_col=cols_X,
                                   cols=['ranking'])
    ranking_df = ranking_df.sort_values(['ranking'])
    ranking_df['feature'] = ranking_df.index
    return ranking_df['feature'], ranking_df
def get_groups_and_variables(self, proj_vars, GLM_file_group, vars_fs):
    """create the working variables and dictionaries
    Args:
        proj_vars: mapping; key "variables_for_glm" lists the columns to read
                   in addition to the id and group columns
        GLM_file_group: file that is read with Table.get_df_with_columns
        vars_fs: forwarded to get_ids_ready4glm
    Sets:
        self.ls_groups: unique values of the group column
        self.ids: result of get_ids_ready4glm
        self.d_subjid: {id: {column: value}} for the ids found in self.ids;
                       the group column is part of the inner dictionaries
        self.ls_vars_stats: the columns without the id and group columns
    """
    cols_2use = proj_vars["variables_for_glm"] + [
        self.id_col, self.group_col
    ]
    df_groups_clin = Table().get_df_with_columns(GLM_file_group, cols_2use)

    self.ls_groups = pd.unique(df_groups_clin[self.group_col]).tolist()
    ids_in_file = df_groups_clin[self.id_col].tolist()
    self.ids = self.get_ids_ready4glm(ids_in_file, vars_fs)

    # {column: {row number: value}}
    d_init = df_groups_clin.to_dict()
    self.d_subjid = {}
    self.ls_vars_stats = [col for col in d_init if col != self.id_col]

    for rownr, _id in d_init[self.id_col].items():
        if _id not in self.ids:
            continue
        self.d_subjid[_id] = {
            var: d_init[var][rownr] for var in self.ls_vars_stats
        }

    # the group column is dropped only after d_subjid has been populated
    self.ls_vars_stats.remove(self.group_col)
    self.make_subjects_per_group(df_groups_clin)
def save_features(dic_feat_comps,
                  expl_variance,
                  file2save,
                  img2save,
                  lang="EN"):
    """save the features extracted from PCA to a table and to an image
    Args:
        dic_feat_comps = {feature_name: explained_variance}
        expl_variance = PCA explained_variance_
        file2save = abspath to the csv file to save the table
        img2save = abspath to the png file to save the plot of the
                   cumulative sum of expl_variance
        lang = language used for the axis labels, key of params_lang
    """
    feature_names = dic_feat_comps.keys()
    feature_values = dic_feat_comps.values()
    df_feat_comps = Table().create_df(feature_values,
                                      index_col=feature_names,
                                      cols=['explained_variance'])
    df_feat_comps.to_csv(file2save)

    cumulative_variance = np.cumsum(expl_variance)
    label_x = params_lang[lang]['nr_components']
    label_y = params_lang[lang]['expl_cum_var']
    plotting.plot_simple(vals=cumulative_variance,
                         xlabel=label_x,
                         ylabel=label_y,
                         path_to_save_file=img2save)
def save_df_Emmanuelle(df,
                       groups,
                       stats_dic,
                       cols2color_sig,
                       path2save,
                       make_with_colors,
                       extensions=('xlsx', 'csv', 'json')):
    """Save the statistics in the requested formats.

    Args:
        df: DataFrame with the statistics
        groups: groups of the analysis; two columns are merged per group
        stats_dic: dictionary with the statistics, saved as stats.json
        cols2color_sig: forwarded to save_2xlsx_with_colors_Emmanuelle
        path2save: folder for stats.json and stats_new.csv
        make_with_colors: if true, save_2xlsx_with_colors_Emmanuelle is called
        extensions: formats to write, any of 'xlsx', 'csv', 'json'

    NOTE(review): 'stats_new.xlsx' is written with a relative name, i.e. to
    the current working directory and not to path2save - confirm intended.
    NOTE(review): with make_with_colors and without 'xlsx' in extensions the
    name 'stats_new.xlsx' is still forwarded - verify the helper copes.
    """
    if 'xlsx' in extensions:
        import openpyxl
        import string

        df.to_excel('stats_new.xlsx')

        ########## MERGE MEAN/STD SUB-INDEXES ################
        workbook = openpyxl.load_workbook('stats_new.xlsx')
        sheet = workbook['Sheet1']
        letters = string.ascii_uppercase
        # every second column starting at 'B', two columns per group,
        # never starting beyond the last letter of the alphabet
        stop = min(2 * len(groups) + 1, len(letters))
        for col in range(1, stop, 2):
            first_cell = letters[col] + str(2)
            last_cell = letters[col + 1] + str(2)
            sheet.merge_cells(f'{first_cell}:{last_cell}')
        workbook.save('stats_new.xlsx')

    if 'json' in extensions:
        file_json = os.path.join(path2save, 'stats.json')
        utilities.save_json(stats_dic, file_json)

    if 'csv' in extensions:
        file_csv = os.path.join(path2save, 'stats_new.csv')
        Table().save_df(df, file_csv, sheet_name='stats')

    if make_with_colors:
        save_2xlsx_with_colors_Emmanuelle(df,
                                          'stats_new.xlsx',
                                          path2save,
                                          'stats_wcolors.xlsx',
                                          cols2color_sig=cols2color_sig)
def mkstatisticsf_Emmanuelle(
    df_4stats,
    groups,
    group_col,
):
    '''Creates the descriptive statistics for publication,
    based on the provided pandas.DataFrame.
    The tests depend on the number of groups:
        1 group:   mean, std, kurtosis, skewness
        2 groups:  + TTest, Welch, MannWhitneyu
        >2 groups: + Bartlett, Kruskal, ANOVA
    author: version adjusted by Emmanuelle Mazur-Lainé 202206
    Args:
        df_4stats: pandas.DataFrame; all columns are cast to float
        groups: list/ tuple of groups as str/int
        group_col: column name in df_4stats that has the group names from groups
    Return:
        df_new: pandas.DataFrame with the results, rounded to 3 decimals,
                columns are a MultiIndex (test, parameter)
        stats_dic: dict {key: {column: result as str}}
        cols2color_sig: list of the second key of each statistical test
    Nothing is written to disk by this function.
    '''
    # every column, group_col included, must be convertible to float
    df_4stats = df_4stats.astype(float)
    tab = Table()
    # one sub-table per group
    groups_df = dict()
    for group in groups:
        groups_df[group] = tab.get_df_per_parameter(df_4stats, group_col,
                                                    group)
    stats_dic = dict()
    # columns to analyse: all columns except the group column
    vals2chk = df_4stats.columns.tolist()
    if group_col in vals2chk:
        vals2chk.remove(group_col)
    cols2color_sig = list()
    # groups is rebuilt from the dictionary keys: duplicates are collapsed
    groups = list(groups_df.keys())
    ################################
    # the list of tests depends on the number of groups
    if len(groups) == 1:
        ls_tests = ('mean', 'std', 'kurtosis', 'skewness')
    elif len(groups) <= 2:
        ls_tests = ('mean', 'std', 'kurtosis', 'skewness', 'TTest', 'Welch',
                    'MannWhitneyu')
    elif len(groups) > 2:
        ls_tests = ('mean', 'std', 'kurtosis', 'skewness', 'Bartlett',
                    'Kruskal', 'ANOVA')
    for test in ls_tests:
        for val in vals2chk:
            # values of the column per group, NaN removed
            values_per_gr = []
            for i in range(0, len(groups)):
                gr_i = groups_df[groups[i]][val].values
                arr = np.array(gr_i)
                arr_without_nan = arr[np.logical_not(np.isnan(arr))]
                values_per_gr.append(arr_without_nan)
            results, params = get_stats_Emmanuelle(test, groups, values_per_gr)
            # mean and std are both stored during the 'mean' pass, so the
            # keys of one group are adjacent; for test == 'std' none of the
            # branches below matches and the result above is not used
            if test == 'mean':
                for i in range(len(groups)):
                    for tst in ('mean', 'std'):
                        results, params = get_stats_Emmanuelle(
                            tst, groups, values_per_gr)
                        key = f'{groups[i]}, {params}'
                        if key not in stats_dic:
                            stats_dic[key] = dict()
                        stats_dic[key][val] = f'{results[i]}'
            # one key per group
            if test in ('kurtosis', 'skewness'):
                for i in range(len(groups)):
                    key = f'{groups[i]}, {params}'
                    if key not in stats_dic:
                        stats_dic[key] = dict()
                    stats_dic[key][val] = f'{results[i]}'
            # two keys per statistical test, built from params[0] and params[1]
            elif test in ('TTest', 'Welch', 'Bartlett', 'MannWhitneyu',
                          'Kruskal', 'ANOVA'):
                # NOTE(review): nothing in this loop depends on i, each
                # iteration stores the same values again
                for i in range(len(groups)):
                    key1 = f'{test}, {params[0]}'
                    key2 = f'{test}, {params[1]}'
                    for key in (key1, key2):
                        if key not in stats_dic:
                            stats_dic[key] = dict()
                    stats_dic[key1][val] = f'{results[0]}'
                    stats_dic[key2][val] = f'{results[1]}'
                cols2color_sig.append(key2)
    df = tab.create_df_from_dict(stats_dic)
    # the results were stored as str, convert them back to float
    df = df.astype(float)
    # Creating new adjusted DataFrame with sub-indexes
    ls_tests_dup = []  # first level of the columns: name of the test
    ls_param = []  # second level of the columns: group or parameter
    ls_keys = list(stats_dic.keys())
    mean_gr_done = False
    for test in ls_tests:
        if test in ('mean', 'std'):
            # 'mean' and 'std' each add one 'mean/std' label per group
            for i in range(0, len(groups)):
                ls_tests_dup.append('mean/std')
            # the group labels are added only once, two per group
            # (one for the mean column, one for the std column)
            if mean_gr_done == False:
                for i in range(len(groups)):
                    ls_param.append('gr' + str(i + 1) + ' (val=' +
                                    f'{groups[i]}' + ')')
                    ls_param.append('gr' + str(i + 1) + ' (val=' +
                                    f'{groups[i]}' + ')')
                mean_gr_done = True
        elif test in ('kurtosis', 'skewness'):
            for i in range(0, len(groups)):
                ls_tests_dup.append(test)
                ls_param.append('gr' + str(i + 1) + ' (val=' +
                                f'{groups[i]}' + ')')
        elif test in ('TTest', 'Welch', 'Bartlett', 'MannWhitneyu', 'Kruskal',
                      'ANOVA'):
            # two columns per statistical test
            ls_tests_dup.append(test)
            ls_tests_dup.append(test)
    # the first 4 keys per group are mean, std, kurtosis and skewness;
    # for the remaining keys the label is the last character of the key
    for key in ls_keys[4 * (len(groups)):]:
        ls_param.append((str(key))[-1])
    col = [ls_tests_dup, ls_param]
    tuples = list(zip(*col))
    df_new = pd.DataFrame(df.values,
                          index=pd.Index(df.index),
                          columns=pd.MultiIndex.from_tuples(tuples))
    df_new = df_new.round(3)
    return df_new, stats_dic, cols2color_sig
def mkstatisticsf(df_4stats,
                  groups,
                  group_col,
                  path2save,
                  make_with_colors=True):
    '''Creates the descriptive statistics files for publication,
    based on the provided pandas.DataFrame.
    Only the first 2 groups are compared.
    Args:
        df_4stats: pandas.DataFrame
        groups: list/ tuple of groups as str/int
        group_col: str() column name in df_4stats that has the group names
        path2save: abspath of the folder to save the descriptive files
        make_with_colors: if true, save_2xlsx_with_colors is called with the
                          columns that hold the second result of each
                          statistical test
    Return:
        none
        stats_general.csv and stats_general.json are created in path2save
    '''
    descriptive = ('mean', 'std', 'kurtosis', 'skewness')
    ls_tests = descriptive + ('TTest', 'Welch', 'ANOVA', 'Bartlett',
                              'MannWhitneyu', 'Kruskal')
    tab = Table()

    # one sub-table per group
    groups_df = {
        group: tab.get_df_per_parameter(df_4stats, group_col, group)
        for group in groups
    }

    # columns to analyse: all columns except the group column
    vals2chk = df_4stats.columns.tolist()
    if group_col in vals2chk:
        vals2chk.remove(group_col)

    groups = list(groups_df.keys())
    name1, name2 = groups[0], groups[1]
    group1 = groups_df[name1]
    group2 = groups_df[name2]

    stats_dic = {}
    cols2color_sig = []
    for test in ls_tests:
        is_descriptive = test in descriptive
        for val in vals2chk:
            results, params = get_stats(test, group1[val], group2[val])
            if is_descriptive:
                # one column per group
                key1 = f'{name1}, {params[0]}'
                key2 = f'{name2}, {params[0]}'
            else:
                # two columns per test; the second one is a candidate
                # for colouring
                key1 = f'{test}, {params[0]}'
                key2 = f'{test}, {params[1]}'
                cols2color_sig.append(key2)
            stats_dic.setdefault(key1, {})
            stats_dic.setdefault(key2, {})
            stats_dic[key1][val] = f'{results[0]}'
            stats_dic[key2][val] = f'{results[1]}'

    df = tab.create_df_from_dict(stats_dic)
    file_csv = os.path.join(path2save, 'stats_general.csv')
    tab.save_df(df, file_csv, sheet_name='stats')
    file_json = os.path.join(path2save, 'stats_general.json')
    utilities.save_json(stats_dic, file_json)

    if make_with_colors:
        save_2xlsx_with_colors(df,
                               path2save=path2save,
                               cols2color_sig=cols2color_sig)