Example #1
import os
import json

# Table, save_json, get_yes_no, fs_definitions, PrepareForGLM and DEFAULT are
# helpers provided by the surrounding nimb project; their import paths are
# project-specific and therefore omitted here.


class CheckIfReady4GLM:
    def __init__(self, nimb_vars, fs_vars, proj_vars, f_ids_processed,
                 f_GLM_group, FS_GLM_dir):
        self.proj_vars = proj_vars
        self.vars_fs = fs_vars
        self.FS_SUBJECTS_DIR = fs_vars['SUBJECTS_DIR']
        self.NIMB_PROCESSED_FS = fs_vars['NIMB_PROCESSED']
        self.f_ids_processed = f_ids_processed
        self.f_GLM_group = f_GLM_group
        self.FS_GLM_dir = FS_GLM_dir
        self.archive_type = '.zip'
        self.tab = Table()
        self.miss = dict()
        self.ids_4fs_glm = dict()
        self.df = self.tab.get_df(self.f_GLM_group)
        self.bids_ids = self.df[self.proj_vars['id_col']].tolist()
        self.ids_exclude_glm = os.path.join(self.FS_GLM_dir,
                                            'excluded_from_glm.json')

    def chk_if_subjects_ready(self):

        fs_proc_ids = self.get_ids_processed()
        miss_bids_ids = [
            i for i in self.bids_ids if i not in fs_proc_ids
        ]
        if miss_bids_ids:
            print(
                f'    {len(miss_bids_ids)} IDs are missing from file: {self.f_ids_processed}'
            )
            print(f'        first 5 IDs are: {miss_bids_ids[:5]}')
            for bids_id in miss_bids_ids:
                self.add_to_miss(bids_id, 'id_missing')

        # proceed if at least one ID from the GLM file was found among the processed IDs
        if len(miss_bids_ids) < len(self.bids_ids):
            for bids_id in [
                    i for i in self.bids_ids if i not in miss_bids_ids
            ]:
                fs_proc_id = fs_proc_ids[bids_id].replace(
                    self.archive_type, '')
                if os.path.exists(os.path.join(self.FS_SUBJECTS_DIR, bids_id)):
                    self.ids_4fs_glm[bids_id] = bids_id
                    self.chk_glm_files(bids_id)
                elif os.path.exists(
                        os.path.join(self.FS_SUBJECTS_DIR, fs_proc_id)):
                    self.ids_4fs_glm[bids_id] = fs_proc_id
                    self.chk_glm_files(fs_proc_id)
                else:
                    print(f'id {bids_id} or FreeSurfer id {fs_proc_id} '
                          f'is missing from the {self.FS_SUBJECTS_DIR} folder')
                    self.add_to_miss(bids_id, 'id_missing')
            if self.miss:
                print("    missing files and ids: ", self.miss)
                save_json(self.miss, self.ids_exclude_glm, print_space=8)
                subjs_missing = len(self.miss)
                subjs_present = len(self.ids_4fs_glm)
                print('    Number of participants ready for FreeSurfer GLM:')
                print(f'        in the folder: {self.FS_SUBJECTS_DIR}')
                print(f'        {subjs_present} present')
                print(f'        {subjs_missing} missing')
                not_ready = [
                    i for i in self.miss if "id_missing" not in self.miss[i]
                ]
                maybe_archived = [i for i in self.miss if i not in not_ready]
                if maybe_archived:
                    print("   MAYBE archived: ", maybe_archived)
                    q = "    EXCEPTION! Some IDs are missing, but they could be archived.\n\
                    Do you want to do glm analysis with current subjects (y) or try to check the archive (n) ? (y/n)\n\
                        (note: if you answer NO, you will be asked to unarchive the \n\
                        processed folders of IDs if they are present in FREESURFER_PROCESSED)"

                    if get_yes_no(q) == 1:
                        self.create_fs_glm_df()
                        return True, list()
                    else:
                        return False, maybe_archived
                if not_ready:
                    print(
                        "    MISSING FILES: these participants CANNOT be included in the GLM analysis: ",
                        not_ready)
                    q = ("    EXCEPTION! Some IDs have missing files and MUST be excluded from the analysis.\n"
                         "    Do you want to continue without the excluded IDs? (y/n)")

                    if get_yes_no(q) == 1:
                        self.create_fs_glm_df()
                        return True, list()
                    else:
                        return False, not_ready
            else:
                self.create_fs_glm_df()
                return True, list()
        else:
            print('    no ids found')
            return False, list()

    def chk_glm_files(self, bids_id):
        '''checks whether the QC files of a subject are present;
            the BIDS IDs are expected to be located in FREESURFER -> SUBJECTS_DIR
        Args:
            bids_id: ID of the subject to check
        Return:
            bool: True if all files are present, False otherwise;
            missing files are added to self.miss via add_to_miss()
        '''
        files_not_ok = fs_definitions.ChkFSQcache(self.FS_SUBJECTS_DIR,
                                                  bids_id, self.vars_fs).miss
        if files_not_ok:
            for file in files_not_ok[bids_id]:
                self.add_to_miss(bids_id, file)
            return False
        else:
            return True

    def create_fs_glm_df(self):
        self.rm_missing_ids()
        tmp_id = 'fs_id'
        print('    creating the glm file for FreeSurfer GLM analysis')
        d_ids = {
            self.proj_vars['id_col']: list(self.ids_4fs_glm.keys()),
            tmp_id: list(self.ids_4fs_glm.values())
        }
        fs_proc_df = self.tab.create_df_from_dict(d_ids)
        fs_proc_df = self.tab.change_index(fs_proc_df,
                                           self.proj_vars['id_col'])
        grid_fs_df_pre = self.tab.change_index(self.df,
                                               self.proj_vars['id_col'])
        self.df_ids = self.tab.join_dfs(grid_fs_df_pre,
                                        fs_proc_df,
                                        how='outer')
        self.df_ids.rename(columns={tmp_id: self.proj_vars['id_col']},
                           inplace=True)
        self.df_ids = self.tab.change_index(self.df_ids,
                                            self.proj_vars['id_col'])
        self.tab.save_df(self.df_ids, self.f_GLM_group)
        PrepareForGLM(self.FS_SUBJECTS_DIR, self.FS_GLM_dir, self.f_GLM_group,
                      self.proj_vars, self.vars_fs)

    def rm_missing_ids(self):
        ls_ix_2rm = list()
        for ix in self.df.index:
            bids_id = self.df.at[ix, self.proj_vars['id_col']]
            if bids_id not in self.ids_4fs_glm:
                ls_ix_2rm.append(ix)
        len_miss = len(ls_ix_2rm)
        if len_miss == 0:
            print('        ALL subjects are present')
        else:
            print(f'        {len_miss} subjects are missing')
            print('            they will be removed from further analysis')
        self.df = self.df.drop(ls_ix_2rm)

    def get_ids_processed(self):
        '''retrieves the BIDS names of the IDs provided in the GLM file.
            Each project is expected to have a group of subjects present in the dataset,
            and the BIDS names are expected to be the ones used for the ids in the groups_glm file.
            The f_ids.json file maps each BIDS name to the corresponding names of the
            source / FreeSurfer / nilearn / dipy processed zipped files;
            see nimb/example/f_ids.json
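            A minimal illustration of the expected layout (the subject and
            file names below are hypothetical, and the FreeSurfer key is
            whatever DEFAULT.freesurfer_key defines):
                {
                    "sub-01": {"source":     "sub-01_src.zip",
                               "freesurfer": "sub-01_fs.zip"}
                }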
        '''
        print('    extracting list of ids that were processed with FreeSurfer')
        print(f'        in the file: {self.f_ids_processed}')
        self.ids_bids_proc_all = self.read_json(self.f_ids_processed)
        return {
            i: self.ids_bids_proc_all[i][DEFAULT.freesurfer_key]
            for i in self.ids_bids_proc_all
        }
        # return {i: 'path' for i in self.ids_bids_proc_all if self.ids_bids_proc_all[i]['source'] in ids_src_glm_file} #old version

    def add_to_miss(self, bids_id, file):
        '''add to the list of missing subjects
        '''
        if bids_id not in self.miss:
            self.miss[bids_id] = list()
        self.miss[bids_id].append(file)
        self.ids_4fs_glm.pop(bids_id, None)

    def read_json(self, f):
        '''read a json file
        '''
        with open(f, 'r') as jf:
            return json.load(jf)
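
A minimal usage sketch for CheckIfReady4GLM (every path and configuration value below is hypothetical, for illustration only; in nimb they come from the project configuration files):

# illustration only: hypothetical inputs for CheckIfReady4GLM
fs_vars = {'SUBJECTS_DIR': '/data/fs/subjects',
           'NIMB_PROCESSED': '/data/fs/processed'}
proj_vars = {'id_col': 'bids_id'}

checker = CheckIfReady4GLM(nimb_vars={},
                           fs_vars=fs_vars,
                           proj_vars=proj_vars,
                           f_ids_processed='/data/f_ids.json',
                           f_GLM_group='/data/glm_groups.csv',
                           FS_GLM_dir='/data/fs/glm')
ready, ids_to_unarchive = checker.chk_if_subjects_ready()
if not ready and ids_to_unarchive:
    print('IDs that may need unarchiving:', ids_to_unarchive)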
Example #2
from os import path
import time

# Table, MakeGrid, preprocessing, predict, varia, definitions and the stats.*
# modules imported inside the step methods are helpers provided by the
# surrounding nimb project; their import paths are project-specific.


class RUN_stats:
    """will run statistical analysis for the provided groups file"""
    def __init__(self, all_vars):
        self.project = all_vars.params.project
        self.project_vars = all_vars.projects[self.project]
        self.stats_paths = self.project_vars['STATS_PATHS']
        self.stats_params = self.project_vars['STATS_PARAMS']
        self.group_col = self.project_vars['group_col']
        self.dir_stats_home = self.stats_paths["STATS_HOME"]
        self.atlas = ('DK', 'DS', 'DKDS')[1]  # index 1 selects the 'DS' atlas
        self.get_steps(all_vars)

        print(
            f'    Performing statistical analysis in folder: {self.dir_stats_home}'
        )
        print('    materials located at: {:<50}'.format(
            self.project_vars['materials_DIR'][1]))
        print('    file for analysis: {:<50}'.format(
            self.project_vars['fname_groups']))
        print('    id column: {:<50}'.format(str(self.project_vars['id_col'])))
        print('    group column: {:<50}'.format(
            str(self.project_vars['group_col'])))
        # print('    variables to analyse: {:<50}'.format(str(self.project_vars['variables_for_glm'])))

        self.tab = Table()
        self.preproc = preprocessing.Preprocess()
        self.df_user_stats, self.df_final_grid,\
            self.df_adjusted,\
            self.cols_X,\
            self.groups = MakeGrid(self.project_vars).grid()

    def run(self):
        print("running")
        for step in self.steps:
            step2run = self.steps[step]['name']
            if self.steps[step]["run"]:
                print(f"    running step: {step2run}")
                self.run_step(step2run)

    def run_step(self, step2run):
        self.use_features = False
        self.feature_algo = 'PCA'  # alternative: 'RFE'

        for group in [
                'all',
        ] + self.groups:  #'all' stands for all groups
            df_X, y_labeled, X_scaled, df_clin_group = self.get_X_data_per_group_all_groups(
                group)
            df_with_features, features, features_rfe_and_rank_df = self.get_features_df_per_group(
                group, X_scaled, y_labeled, df_X)

            if group == 'all':
                self.params_y = self.project_vars['variables_for_glm']

                # STEP run general stats
                if step2run == "STEP_stats_ttest":
                    from stats.stats_stats import ttest_do

                    variables = self.params_y + df_X.columns.tolist()
                    dir_2save = varia.get_dir(
                        path.join(self.dir_stats_home, group))
                    ttest_res = ttest_do(self.tab.join_dfs(
                        df_clin_group, df_X),
                                         self.group_col,
                                         variables,
                                         self.groups,
                                         dir_2save,
                                         p_thresh=0.05).res_ttest

                # STEP run ANOVA and Simple Linear Regression
                if step2run == "STEP_Anova":
                    from stats.stats_models import ANOVA_do
                    print('performing ANOVA')
                    sig_cols = self.run_anova(features, 0.05, 0.05)

                if step2run == "STEP_SimpLinReg":
                    print('performing Simple Linear Regression on all columns')
                    from stats.plotting import Make_Plot_Regression, Make_plot_group_difference
                    dir_2save = varia.get_dir(
                        self.stats_paths['simp_lin_reg_dir'])
                    param_features = self.run_anova(features, 1.0, 1.0)
                    Make_Plot_Regression(self.df_final_grid, param_features,
                                         self.group_col, dir_2save)
                    dir_2save = varia.get_dir(self.stats_paths['anova'])
                    Make_plot_group_difference(self.df_final_grid,
                                               param_features, self.group_col,
                                               self.groups, dir_2save)

                    # from stats.stats_groups_anova import RUN_GroupAnalysis_ANOVA_SimpleLinearRegression
                    # dir_2save = varia.get_dir(path.join(self.dir_stats_home,
                    #                                     self.stats_paths['anova']+"_"+group))
                    # RUN_GroupAnalysis_ANOVA_SimpleLinearRegression(self.df_final_grid,
                    #                                         groups,
                    #                                         self.params_y,
                    #                                         self.project_vars['other_params'],
                    #                                         dir_2save,
                    #                                         self.group_col,
                    #                                         features)

                # STEP run ANOVA and Simple Logistic Regression
                if step2run == "STEP_LogisticRegression":
                    from stats import stats_LogisticRegression
                    print('performing Logistic Regression for all groups')
                    dir_2save = varia.get_dir(
                        path.join(
                            self.dir_stats_home,
                            self.stats_paths['logistic_regression_dir'] + "_" +
                            group))
                    stats_LogisticRegression.Logistic_Regression(
                        X_scaled, y_labeled, self.group_col, dir_2save)

                # STEP run Prediction RF SKF
                if step2run == "STEP_Predict_RF_SKF":
                    print('    performing RF SKF Prediction for all groups')
                    df_X_scaled = self.tab.create_df(X_scaled,
                                                     index_col=range(
                                                         X_scaled.shape[0]),
                                                     cols=self.cols_X)
                    accuracy, best_estimator, average_score_list, _ = predict.SKF_algorithm(
                        features, df_X_scaled[features].values, y_labeled)
                    print(
                        "    prediction accuracy computed with RF and SKF based on PCA features is: ",
                        accuracy)
                    # accuracy, best_estimator, average_score_list, _ = predict.SKF_algorithm(
                    #         features_rfe_and_rank_df.feature, df_X_scaled[features_rfe_and_rank_df.feature].values, y_labeled)
                    # print("prediction accuracy computed with RF and SKF based on RFE features is: ",accuracy)

                # STEP run Prediction RF LOO
                if step2run == "STEP_Predict_RF_LOO":
                    print(
                        'performing RF Leave-One-Out Prediction for all groups'
                    )
                    df_X_scaled = self.tab.create_df(X_scaled,
                                                     index_col=range(
                                                         X_scaled.shape[0]),
                                                     cols=self.cols_X)
                    accuracy, best_estimator, average_score_list, _ = predict.LOO_algorithm(
                        features, df_X_scaled[features].values, y_labeled)
                    print(
                        "    prediction accuracy computed with RF and LOO based on PCA features is: ",
                        accuracy)
                    accuracy, best_estimator, average_score_list, _ = predict.LOO_algorithm(
                        features_rfe_and_rank_df.feature,
                        df_X_scaled[features_rfe_and_rank_df.feature].values,
                        y_labeled)
                    print(
                        "    prediction accuracy computed with RF and LOO based on RFE features is: ",
                        accuracy)

            else:
                # run Descriptive Statistics
                dir_2save = varia.get_dir(
                    path.join(self.dir_stats_home, 'description'))
                self.run_descriptive_stats(df_clin_group, features, dir_2save)

                # STEP run Linear Regression Moderation
                if step2run == "STEP_LinRegModeration":
                    from stats import stats_models
                    print('performing Linear Regression Moderation analysis')
                    stats_models.linreg_moderation_results(
                        self.df_final_grid, features,
                        self.project_vars['group_param'],
                        self.project_vars['regression_param'],
                        varia.get_dir(
                            path.join(
                                self.dir_stats_home,
                                self.stats_paths['linreg_moderation_dir'])),
                        group)

                # STEP run Laterality
                if step2run == "STEP_Laterality":
                    from processing.atlases.atlas_definitions import RReplace
                    from stats import stats_laterality
                    print('performing Laterality analysis')
                    lhrh_feat_d = RReplace(features).contralateral_features
                    lhrh_features_list = list(lhrh_feat_d.keys()) + list(
                        lhrh_feat_d.values())
                    df_with_features_lhrh = self.tab.get_df_from_df(
                        df_X, usecols=sorted(lhrh_features_list))
                    stats_laterality.LateralityAnalysis(
                        df_with_features_lhrh, lhrh_feat_d, group,
                        varia.get_dir(
                            path.join(
                                self.dir_stats_home,
                                self.stats_paths['laterality_dir']))).run()

    def run_descriptive_stats(self, df_clin_group, features, dir_2save):
        # placeholder: the descriptive statistics step is not implemented yet
        print('running descriptive statistics')

    def run_anova(self, features, p_thresh, intercept_thresh):
        from stats.stats_models import ANOVA_do
        dir_2save = varia.get_dir(self.stats_paths['anova'])
        return ANOVA_do(self.df_final_grid,
                        self.params_y,
                        features,
                        dir_2save,
                        p_thresh=p_thresh,
                        intercept_thresh=intercept_thresh).sig_cols

    def get_X_data_per_group_all_groups(self, group):
        # extract X_scaled values for the brain parameters
        predicted_target = self.project_vars["prediction_target"]
        if not predicted_target:
            predicted_target = self.group_col
        print(f"    predicted target column is: {predicted_target}")
        if group == 'all':
            df_clin_group = self.df_user_stats
            df_X = self.df_adjusted
            y_labeled = preprocessing.label_y(self.df_user_stats,
                                              predicted_target)
            X_scaled = preprocessing.scale_X(df_X)
        else:
            df_group = self.tab.get_df_per_parameter(self.df_final_grid,
                                                     self.group_col, group)
            df_clin_group = self.tab.rm_cols_from_df(df_group, self.cols_X)
            df_X = self.tab.rm_cols_from_df(
                df_group,
                [i for i in df_group.columns.tolist() if i not in self.cols_X])
            y_labeled = preprocessing.label_y(df_group, predicted_target)
            X_scaled = preprocessing.scale_X(df_X)
        return df_X, y_labeled, X_scaled, df_clin_group

    def log(self):
        stats = predict.get_stats_df(
            len(self.cols_X), self.atlas,
            self.stats_params["prediction_vars"]['nr_threads'],
            definitions.sys.platform,
            time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))

    def get_features_df_per_group(self, group, X_scaled, y_labeled, df_X):
        features_rfe_and_rank_df = 'none'  # string sentinel when RFE ranking is not computed
        if self.use_features:
            if self.feature_algo == 'PCA':  # using PCA
                dir_2save = varia.get_dir(
                    path.join(self.dir_stats_home,
                              self.stats_paths['features']))
                pca_threshold = self.stats_params["prediction_vars"][
                    'pca_threshold']
                features = predict.get_features_based_on_pca(
                    dir_2save, pca_threshold, X_scaled, self.cols_X, group,
                    self.atlas)
            elif self.feature_algo == 'RFE':  # using RFE
                features, features_rfe_and_rank_df = predict.feature_ranking(
                    X_scaled, y_labeled, self.cols_X)
                print("    number of features extracted by RFE: ",
                      len(features_rfe_and_rank_df.feature))
            df_with_features = self.tab.get_df_from_df(df_X, usecols=features)
        else:
            df_with_features = self.tab.get_df_from_df(df_X,
                                                       usecols=self.cols_X)
            features = self.cols_X
        return df_with_features, features, features_rfe_and_rank_df

    def get_steps(self, all_vars):
        self.steps = {
            "groups": {
                "name": "STEP0_make_groups",
                "run": False
            },
            "ttest": {
                "name": "STEP_stats_ttest",
                "run": False
            },
            "anova": {
                "name": "STEP_Anova",
                "run": False
            },
            "simplinreg": {
                "name": "STEP_SimpLinReg",
                "run": False
            },
            "logreg": {
                "name": "STEP_LogisticRegression",
                "run": False
            },
            "predskf": {
                "name": "STEP_Predict_RF_SKF",
                "run": False
            },
            "predloo": {
                "name": "STEP_Predict_RF_LOO",
                "run": False
            },
            "linregmod": {
                "name": "STEP_LinRegModeration",
                "run": False
            },
            "laterality": {
                "name": "STEP_Laterality",
                "run": False
            },
        }
        if all_vars.params.step == 0:
            for step_key in self.steps:
                self.steps[step_key]["run"] = True
        else:
            self.steps[all_vars.params.step]["run"] = True
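
A minimal driver sketch for RUN_stats (the all_vars object below is a hypothetical stand-in for nimb's configuration container; all keys, paths and values are illustrative only):

from types import SimpleNamespace

# illustration only: a hypothetical all_vars carrying the attributes used above
all_vars = SimpleNamespace(
    params=SimpleNamespace(project='myproj', step='anova'),
    projects={'myproj': {
        'STATS_PATHS': {'STATS_HOME': '/data/stats',
                        'anova': 'anova',
                        'simp_lin_reg_dir': 'simp_lin_reg',
                        'logistic_regression_dir': 'logreg',
                        'linreg_moderation_dir': 'linreg_moderation',
                        'laterality_dir': 'laterality',
                        'features': 'features'},
        'STATS_PARAMS': {'prediction_vars': {'pca_threshold': 0.95,
                                             'nr_threads': 4}},
        'group_col': 'diagnosis',
        'materials_DIR': ('local', '/data/materials'),
        'fname_groups': 'groups.csv',
        'id_col': 'bids_id',
        'variables_for_glm': ['age'],
        'prediction_target': 'diagnosis',
    }},
)

stats_runner = RUN_stats(all_vars)  # requires the nimb helpers noted above
stats_runner.run()                  # runs only the steps flagged "run": True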