Example #1
import numpy as np
import pandas as pd

# Table and get_stats_Emmanuelle are provided by the surrounding project.
def mkstatisticsf_Emmanuelle(df_4stats, groups, group_col):
    '''Creates a descriptive statistics table for publication,
        based on the provided pandas.DataFrame.
        The statistical tests used depend on the number of groups (1, 2 or more).
        author: version adjusted by Emmanuelle Mazur-Lainé 202206
    Args:
        df_4stats: pandas.DataFrame with the values to analyse
        groups: list/tuple of group labels as str/int
        group_col: column name in df_4stats that holds the group labels from groups
    Return:
        df_new: pandas.DataFrame of rounded results with MultiIndex columns
        stats_dic: dict with the raw results
        cols2color_sig: list of column keys intended for coloring significant results
    '''
    df_4stats = df_4stats.astype(float)

    tab = Table()

    groups_df = dict()
    for group in groups:
        groups_df[group] = tab.get_df_per_parameter(df_4stats, group_col,
                                                    group)

    stats_dic = dict()
    vals2chk = df_4stats.columns.tolist()

    if group_col in vals2chk:
        vals2chk.remove(group_col)

    cols2color_sig = list()
    groups = list(groups_df.keys())

    # choose which statistics / tests to compute based on the number of groups
    if len(groups) == 1:
        ls_tests = ('mean', 'std', 'kurtosis', 'skewness')
    elif len(groups) == 2:
        ls_tests = ('mean', 'std', 'kurtosis', 'skewness', 'TTest', 'Welch',
                    'MannWhitneyu')
    else:
        ls_tests = ('mean', 'std', 'kurtosis', 'skewness', 'Bartlett',
                    'Kruskal', 'ANOVA')

    for test in ls_tests:
        for val in vals2chk:
            values_per_gr = []
            for i in range(0, len(groups)):
                gr_i = groups_df[groups[i]][val].values
                arr = np.array(gr_i)
                arr_without_nan = arr[np.logical_not(np.isnan(arr))]
                values_per_gr.append(arr_without_nan)

            results, params = get_stats_Emmanuelle(test, groups, values_per_gr)

            if test == 'mean':
                # mean and std are stored together here, one column per group;
                # the standalone 'std' entry in ls_tests adds nothing new
                for i in range(len(groups)):
                    for tst in ('mean', 'std'):
                        results, params = get_stats_Emmanuelle(
                            tst, groups, values_per_gr)
                        key = f'{groups[i]}, {params}'
                        if key not in stats_dic:
                            stats_dic[key] = dict()
                        stats_dic[key][val] = f'{results[i]}'

            if test in ('kurtosis', 'skewness'):
                for i in range(len(groups)):
                    key = f'{groups[i]}, {params}'
                    if key not in stats_dic:
                        stats_dic[key] = dict()
                    stats_dic[key][val] = f'{results[i]}'

            elif test in ('TTest', 'Welch', 'Bartlett', 'MannWhitneyu',
                          'Kruskal', 'ANOVA'):
                # the keys and results do not depend on the group index,
                # so they are stored once per value
                key1 = f'{test}, {params[0]}'
                key2 = f'{test}, {params[1]}'
                for key in (key1, key2):
                    if key not in stats_dic:
                        stats_dic[key] = dict()
                stats_dic[key1][val] = f'{results[0]}'
                stats_dic[key2][val] = f'{results[1]}'
                cols2color_sig.append(key2)

    df = tab.create_df_from_dict(stats_dic)
    df = df.astype(float)

    # Creating new adjusted DataFrame with sub-indexes
    ls_tests_dup = []
    ls_param = []
    ls_keys = list(stats_dic.keys())

    mean_gr_done = False

    for test in ls_tests:
        if test in ('mean', 'std'):
            for i in range(len(groups)):
                ls_tests_dup.append('mean/std')
            if not mean_gr_done:
                for i in range(len(groups)):
                    # two sub-columns (mean and std) per group
                    ls_param.append(f'gr{i + 1} (val={groups[i]})')
                    ls_param.append(f'gr{i + 1} (val={groups[i]})')
                mean_gr_done = True

        elif test in ('kurtosis', 'skewness'):
            for i in range(len(groups)):
                ls_tests_dup.append(test)
                ls_param.append(f'gr{i + 1} (val={groups[i]})')

        elif test in ('TTest', 'Welch', 'Bartlett', 'MannWhitneyu', 'Kruskal',
                      'ANOVA'):
            ls_tests_dup.append(test)
            ls_tests_dup.append(test)

    # the first 4 * len(groups) keys are the descriptive columns; for the
    # remaining test columns the last character of the key is used as the
    # sub-column label
    for key in ls_keys[4 * len(groups):]:
        ls_param.append(str(key)[-1])

    col = [ls_tests_dup, ls_param]
    tuples = list(zip(*col))

    df_new = pd.DataFrame(df.values,
                          index=pd.Index(df.index),
                          columns=pd.MultiIndex.from_tuples(tuples))

    df_new = df_new.round(3)

    return df_new, stats_dic, cols2color_sig
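
# Hypothetical usage sketch (not part of the original source). It assumes that
# Table and get_stats_Emmanuelle are available from the surrounding project
# and that every column of the DataFrame, including the group column, is
# numeric (required by the .astype(float) call above).
if __name__ == '__main__':
    toy = pd.DataFrame({'group': [1, 1, 1, 2, 2, 2],
                        'volume': [3.1, 2.9, 3.3, 2.5, 2.7, 2.4]})
    df_new, stats_dic, cols2color_sig = mkstatisticsf_Emmanuelle(
        toy, groups=(1, 2), group_col='group')
    print(df_new)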
Example #2
# Table, preprocessing, predict, varia, MakeGrid, definitions, os.path (as
# path) and time are provided by imports at the top of the original module.
class RUN_stats():
    """Runs the statistical analysis for the provided groups file."""
    def __init__(self, all_vars):
        self.project = all_vars.params.project
        self.project_vars = all_vars.projects[self.project]
        self.stats_paths = self.project_vars['STATS_PATHS']
        self.stats_params = self.project_vars['STATS_PARAMS']
        self.group_col = self.project_vars['group_col']
        self.dir_stats_home = self.stats_paths["STATS_HOME"]
        self.atlas = ('DK', 'DS', 'DKDS')[1]  # index 1 selects the 'DS' atlas
        self.get_steps(all_vars)

        print(
            f'    Performing statistical analysis in folder: {self.dir_stats_home}'
        )
        print('    materials located at: {:<50}'.format(
            self.project_vars['materials_DIR'][1]))
        print('    file for analysis: {:<50}'.format(
            self.project_vars['fname_groups']))
        print('    id column: {:<50}'.format(str(self.project_vars['id_col'])))
        print('    group column: {:<50}'.format(
            str(self.project_vars['group_col'])))
        # print('    variables to analyse: {:<50}'.format(str(self.project_vars['variables_for_glm'])))

        self.tab = Table()
        self.preproc = preprocessing.Preprocess()
        self.df_user_stats, self.df_final_grid,\
            self.df_adjusted,\
            self.cols_X,\
            self.groups = MakeGrid(self.project_vars).grid()

    def run(self):
        print("running")
        for step in self.steps:
            step2run = self.steps[step]['name']
            if self.steps[step]["run"]:
                print(f"    running step: {step2run}")
                self.run_step(step2run)

    def run_step(self, step2run):
        self.use_features = False
        self.feature_algo = 'PCA'  #'RFE'

        for group in ['all'] + self.groups:  # 'all' stands for all groups
            df_X, y_labeled, X_scaled, df_clin_group = self.get_X_data_per_group_all_groups(
                group)
            df_with_features, features, features_rfe_and_rank_df = self.get_features_df_per_group(
                group, X_scaled, y_labeled, df_X)

            if group == 'all':
                self.params_y = self.project_vars['variables_for_glm']

                # STEP run general stats
                if step2run == "STEP_stats_ttest":
                    from stats.stats_stats import ttest_do

                    variables = self.params_y + df_X.columns.tolist()
                    dir_2save = varia.get_dir(
                        path.join(self.dir_stats_home, group))
                    ttest_res = ttest_do(self.tab.join_dfs(
                        df_clin_group, df_X),
                                         self.group_col,
                                         variables,
                                         self.groups,
                                         dir_2save,
                                         p_thresh=0.05).res_ttest

                # STEP run ANOVA and Simple Linear Regression
                if step2run == "STEP_Anova":
                    from stats.stats_models import ANOVA_do
                    print('performing ANOVA')
                    sig_cols = self.run_anova(features, 0.05, 0.05)

                if step2run == "STEP_SimpLinReg":
                    print('performing Simple Linear Regression on all columns')
                    from stats.plotting import Make_Plot_Regression, Make_plot_group_difference
                    dir_2save = varia.get_dir(
                        self.stats_paths['simp_lin_reg_dir'])
                    param_features = self.run_anova(features, 1.0, 1.0)
                    Make_Plot_Regression(self.df_final_grid, param_features,
                                         self.group_col, dir_2save)
                    dir_2save = varia.get_dir(self.stats_paths['anova'])
                    Make_plot_group_difference(self.df_final_grid,
                                               param_features, self.group_col,
                                               self.groups, dir_2save)

                    # from stats.stats_groups_anova import RUN_GroupAnalysis_ANOVA_SimpleLinearRegression
                    # dir_2save = varia.get_dir(path.join(self.dir_stats_home,
                    #                                     self.stats_paths['anova']+"_"+group))
                    # RUN_GroupAnalysis_ANOVA_SimpleLinearRegression(self.df_final_grid,
                    #                                         groups,
                    #                                         self.params_y,
                    #                                         self.project_vars['other_params'],
                    #                                         dir_2save,
                    #                                         self.group_col,
                    #                                         features)

                # STEP run ANOVA and Simple Logistic Regression
                if step2run == "STEP_LogisticRegression":
                    from stats import stats_LogisticRegression
                    print('performing Logistic Regression for all groups')
                    dir_2save = varia.get_dir(
                        path.join(
                            self.dir_stats_home,
                            self.stats_paths['logistic_regression_dir'] + "_" +
                            group))
                    stats_LogisticRegression.Logistic_Regression(
                        X_scaled, y_labeled, self.group_col, dir_2save)

                # STEP run Prediction RF SKF
                if step2run == "STEP_Predict_RF_SKF":
                    print('    performing RF SKF Prediction for all groups')
                    df_X_scaled = self.tab.create_df(X_scaled,
                                                     index_col=range(
                                                         X_scaled.shape[0]),
                                                     cols=self.cols_X)
                    accuracy, best_estimator, average_score_list, _ = predict.SKF_algorithm(
                        features, df_X_scaled[features].values, y_labeled)
                    print(
                        "    prediction accuracy computed with RF and SKF based on PCA features is: ",
                        accuracy)
                    # accuracy, best_estimator, average_score_list, _ = predict.SKF_algorithm(
                    #         features_rfe_and_rank_df.feature, df_X_scaled[features_rfe_and_rank_df.feature].values, y_labeled)
                    # print("prediction accuracy computed with RF and SKF based on RFE features is: ",accuracy)

                # STEP run Prediction RF LOO
                if step2run == "STEP_Predict_RF_LOO":
                    print(
                        'performing RF Leave-One-Out Prediction for all groups'
                    )
                    df_X_scaled = self.tab.create_df(X_scaled,
                                                     index_col=range(
                                                         X_scaled.shape[0]),
                                                     cols=self.cols_X)
                    accuracy, best_estimator, average_score_list, _ = predict.LOO_algorithm(
                        features, df_X_scaled[features].values, y_labeled)
                    print(
                        "    prediction accuracy computed with RF and LOO based on PCA features is: ",
                        accuracy)
                    accuracy, best_estimator, average_score_list, _ = predict.LOO_algorithm(
                        features_rfe_and_rank_df.feature,
                        df_X_scaled[features_rfe_and_rank_df.feature].values,
                        y_labeled)
                    print(
                        "    prediction accuracy computed with RF and LOO based on RFE features is: ",
                        accuracy)

            else:
                # run Descriptive Statistics
                dir_2save = varia.get_dir(
                    path.join(self.dir_stats_home, 'description'))
                self.run_descriptive_stats(df_clin_group, features, dir_2save)

                # STEP run Linear Regression Moderation
                if step2run == "STEP_LinRegModeration":
                    from stats import stats_models
                    print('performing Linear Regression Moderation analysis')
                    stats_models.linreg_moderation_results(
                        self.df_final_grid, features,
                        self.project_vars['group_param'],
                        self.project_vars['regression_param'],
                        varia.get_dir(
                            path.join(
                                self.dir_stats_home,
                                self.stats_paths['linreg_moderation_dir'])),
                        group)

                # STEP run Laterality
                if step2run == "STEP_Laterality":
                    from processing.atlases.atlas_definitions import RReplace
                    from stats import stats_laterality
                    print('performing Laterality analysis')
                    lhrh_feat_d = RReplace(features).contralateral_features
                    lhrh_features_list = [i for i in lhrh_feat_d.keys()] + [
                        v for v in lhrh_feat_d.values()
                    ]
                    df_with_features_lhrh = self.tab.get_df_from_df(
                        df_X, usecols=sorted(lhrh_features_list))
                    stats_laterality.LateralityAnalysis(
                        df_with_features_lhrh, lhrh_feat_d, group,
                        varia.get_dir(
                            path.join(
                                self.dir_stats_home,
                                self.stats_paths['laterality_dir']))).run()

    def run_descriptive_stats(self, df_clin_group, features, dir_2save):
        print('running descriptive statistics')

    def run_anova(self, features, p_thresh, intercept_thresh):
        from stats.stats_models import ANOVA_do
        dir_2save = varia.get_dir(self.stats_paths['anova'])
        return ANOVA_do(self.df_final_grid,
                        self.params_y,
                        features,
                        dir_2save,
                        p_thresh=p_thresh,
                        intercept_thresh=intercept_thresh).sig_cols

    def get_X_data_per_group_all_groups(self, group):
        # extract X_scaled values for the brain parameters
        predicted_target = self.project_vars["prediction_target"]
        print(f"    predicted target column is: {predicted_target}")
        if not predicted_target:
            predicted_target = self.group_col
        if group == 'all':
            df_clin_group = self.df_user_stats
            df_X = self.df_adjusted
            y_labeled = preprocessing.label_y(self.df_user_stats,
                                              predicted_target)
            X_scaled = preprocessing.scale_X(df_X)
        else:
            df_group = self.tab.get_df_per_parameter(self.df_final_grid,
                                                     self.group_col, group)
            df_clin_group = self.tab.rm_cols_from_df(df_group, self.cols_X)
            df_X = self.tab.rm_cols_from_df(
                df_group,
                [i for i in df_group.columns.tolist() if i not in self.cols_X])
            y_labeled = preprocessing.label_y(df_group, predicted_target)
            X_scaled = preprocessing.scale_X(df_X)
        return df_X, y_labeled, X_scaled, df_clin_group

    def log(self):
        stats = predict.get_stats_df(
            len(self.cols_X), self.atlas,
            self.stats_params["prediction_vars"]['nr_threads'],
            definitions.sys.platform,
            time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
        return stats

    def get_features_df_per_group(self, group, X_scaled, y_labeled, df_X):
        features_rfe_and_rank_df = 'none'
        if self.use_features:
            if self.feature_algo == 'PCA':  # using PCA
                dir_2save = varia.get_dir(
                    path.join(self.dir_stats_home,
                              self.stats_paths['features']))
                pca_threshold = self.stats_params["prediction_vars"][
                    'pca_threshold']
                features = predict.get_features_based_on_pca(
                    dir_2save, pca_threshold, X_scaled, self.cols_X, group,
                    self.atlas)
            elif self.feature_algo == 'RFE':  # using RFE
                features, features_rfe_and_rank_df = predict.feature_ranking(
                    X_scaled, y_labeled, self.cols_X)
                print("    number of features extracted by RFE: ",
                      len(features_rfe_and_rank_df.feature))
            df_with_features = self.tab.get_df_from_df(df_X, usecols=features)
        else:
            df_with_features = self.tab.get_df_from_df(df_X,
                                                       usecols=self.cols_X)
            features = self.cols_X
        return df_with_features, features, features_rfe_and_rank_df

    def get_steps(self, all_vars):
        self.steps = {
            "groups": {
                "name": "STEP0_make_groups",
                "run": False
            },
            "ttest": {
                "name": "STEP_stats_ttest",
                "run": False
            },
            "anova": {
                "name": "STEP_Anova",
                "run": False
            },
            "simplinreg": {
                "name": "STEP_SimpLinReg",
                "run": False
            },
            "logreg": {
                "name": "STEP_LogisticRegression",
                "run": False
            },
            "predskf": {
                "name": "STEP_Predict_RF_SKF",
                "run": False
            },
            "predloo": {
                "name": "STEP_Predict_RF_LOO",
                "run": False
            },
            "linregmod": {
                "name": "STEP_LinRegModeration",
                "run": False
            },
            "laterality": {
                "name": "STEP_Laterality",
                "run": False
            },
        }
        if all_vars.params.step == 0:
            for i in ("groups", "ttest", "anova", "simplinreg", "logreg",
                      "predskf", "predloo", "linregmod", "laterality"):
                self.steps[i]["run"] = True
        else:
            self.steps[all_vars.params.step]["run"] = True
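
# Hypothetical usage sketch (not part of the original source). RUN_stats
# expects the pipeline's configuration object (here called all_vars), which
# exposes .params.project, .params.step and a .projects dict holding the
# STATS_PATHS, STATS_PARAMS, group_col and related keys read in __init__:
#
#   stats_runner = RUN_stats(all_vars)
#   stats_runner.run()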
Example #3
import os

# Table, get_stats, utilities and save_2xlsx_with_colors are provided by the
# surrounding project.
def mkstatisticsf(df_4stats,
                  groups,
                  group_col,
                  path2save,
                  make_with_colors=True):
    '''Creates a descriptive statistics file for publication,
        based on the provided pandas.DataFrame.
        Works only on 2 groups.
    Args:
        df_4stats: pandas.DataFrame
        groups: list/tuple of groups as str/int
        group_col: str() column name in df_4stats that has the group names from groups
        path2save: abspath of the folder where the descriptive files are saved
        make_with_colors: will create an additional .xlsx file with
                        colored significant results,
                        provided xlwt is installed
    Return:
        json file with results
        .csv file with results
        .xlsx file with results, with significant ones colored in red
    '''

    tab = Table()
    ls_tests = ('mean', 'std', 'kurtosis', 'skewness', 'TTest', 'Welch',
                'ANOVA', 'Bartlett', 'MannWhitneyu', 'Kruskal')

    groups_df = dict()
    for group in groups:
        groups_df[group] = tab.get_df_per_parameter(df_4stats, group_col,
                                                    group)

    stats_dic = dict()
    vals2chk = df_4stats.columns.tolist()
    if group_col in vals2chk:
        vals2chk.remove(group_col)

    cols2color_sig = list()
    groups = list(groups_df.keys())
    group1 = groups_df[groups[0]]
    group2 = groups_df[groups[1]]
    for test in ls_tests:
        for val in vals2chk:
            results, params = get_stats(test, group1[val], group2[val])
            if test in ('mean', 'std', 'kurtosis', 'skewness'):
                key1 = f'{groups[0]}, {params[0]}'
                key2 = f'{groups[1]}, {params[0]}'
            else:
                key1 = f'{test}, {params[0]}'
                key2 = f'{test}, {params[1]}'
                cols2color_sig.append(key2)
            for key in (key1, key2):
                if key not in stats_dic:
                    stats_dic[key] = dict()
            stats_dic[key1][val] = f'{results[0]}'
            stats_dic[key2][val] = f'{results[1]}'

    df = tab.create_df_from_dict(stats_dic)
    tab.save_df(df,
                os.path.join(path2save, 'stats_general.csv'),
                sheet_name='stats')
    utilities.save_json(stats_dic, os.path.join(path2save,
                                                'stats_general.json'))
    if make_with_colors:
        save_2xlsx_with_colors(df,
                               path2save=path2save,
                               cols2color_sig=cols2color_sig)
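
# Hypothetical usage sketch (not part of the original source). It assumes
# Table, get_stats, utilities and save_2xlsx_with_colors are available from
# the surrounding project and that exactly two group labels occur in the
# group column (only the first two groups are compared):
#
#   mkstatisticsf(df_4stats=my_df,
#                 groups=('patients', 'controls'),
#                 group_col='diagnosis',
#                 path2save='/tmp/stats_out',
#                 make_with_colors=True)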