def run(group_config_path):
    """Load a group-level config and hand it to the randomise workflow.

    Parameters
    ----------
    group_config_path : str
        Path to the group configuration YAML; passed to ``load_config_yml``.

    Notes
    -----
    If the config's ``participant_list`` is ``None``, the participants are
    taken to be the sub-directories of the config's ``pipeline_dir``.
    The resulting list is given to ``randomise_merged_file`` and
    ``randomise_merged_mask``, and their outputs are forwarded to
    ``prep_randomise_workflow`` (working/output/crash dirs are passed as
    ``None``).  Nothing is returned.
    """
    import os
    import subprocess

    # NOTE(review): this runs in a child shell, so it presumably cannot
    # change the environment of the current process - kept only to
    # preserve existing behavior; confirm whether it can be dropped.
    subprocess.getoutput('source ~/.bashrc')

    group_config_obj = load_config_yml(group_config_path)
    pipeline_output_folder = group_config_obj.pipeline_dir

    if group_config_obj.participant_list is not None:
        s_paths = group_config_obj.participant_list
    else:
        # os.listdir() yields bare entry names, so the directory test has to
        # be made against the full path - testing the bare name would check
        # it relative to the current working directory instead.
        s_paths = [
            entry for entry in os.listdir(pipeline_output_folder)
            if os.path.isdir(os.path.join(pipeline_output_folder, entry))
        ]

    merged_file = randomise_merged_file(s_paths)
    out_file = randomise_merged_mask(s_paths)

    prep_randomise_workflow(group_config_obj,
                            merged_file=merged_file,
                            mask_file=out_file,
                            working_dir=None,
                            output_dir=None,
                            crash_dir=None)
def load_subject_file(group_config_path):
    """Return the participant list for a group-level configuration.

    This function used to be defined twice in a row with identical bodies
    (the second definition simply shadowed the first); the two copies are
    merged into this single definition.

    Parameters
    ----------
    group_config_path : str
        Path to the group configuration YAML; passed to ``load_config_yml``.

    Returns
    -------
    The config's ``participant_list`` when it is not ``None``; otherwise a
    list with the names of the sub-directories found in the config's
    ``pipeline_dir``.  (Previously the value was computed but never
    returned, so callers always received ``None``.)
    """
    import os

    group_config_obj = load_config_yml(group_config_path)
    pipeline_output_folder = group_config_obj.pipeline_dir

    if group_config_obj.participant_list is not None:
        return group_config_obj.participant_list

    # os.listdir() yields bare entry names; join them with the parent folder
    # before testing, otherwise the check is made against the current
    # working directory.
    return [
        entry for entry in os.listdir(pipeline_output_folder)
        if os.path.isdir(os.path.join(pipeline_output_folder, entry))
    ]
def _order_design_columns(column_names):
    """Return the names with the 'participant_' columns sorted and last."""
    other_cols = []
    participant_cols = []
    for name in column_names:
        if 'participant_' in name:
            participant_cols.append(name)
        else:
            other_cols.append(name)
    return other_cols + sorted(participant_cols)


def _recode_repeated_levels(model_df, column):
    """Recode a two/three-level column of model_df in place as '1'/'-1'/'0'."""
    values = list(model_df[column].values)
    # NOTE(review): set ordering decides which level becomes '1' vs '-1';
    # for string levels this is not guaranteed to be stable between runs.
    levels = list(set(values))
    if len(levels) > 1:
        level_map = {levels[0]: '1', levels[1]: '-1'}
        if len(levels) == 3:
            level_map[levels[2]] = '0'
        model_df[column] = [level_map[value] for value in values]


def _group_vector_from_indicators(model_df, group_ev):
    """Build the group vector from two or three 1/0 indicator columns."""
    count_words = {2: "two", 3: "three"}

    if len(group_ev) not in count_words:
        # we're only going to see this if someone plays around with
        # their preset or config file manually
        err = "\n\n[!] If you are seeing this message, it's because:\n" \
              "1. You are using the group-level analysis presets\n" \
              "2. You are running a model with multiple groups having " \
              "their variances modeled separately (i.e. multiple " \
              "values in the FSL FLAME .grp input file), and\n" \
              "3. For some reason, the configuration has been set up " \
              "in a way where CPAC currently thinks you're including " \
              "only one group, or more than three, neither of which " \
              "are supported.\n\nGroups provided:\n{0}" \
              "\n\n".format(str(group_ev))
        raise Exception(err)

    grp_vector = []
    indicator_columns = [model_df[ev] for ev in group_ev]
    for flags in zip(*indicator_columns):
        # the first indicator equal to 1 decides the (1-based) group number
        for group_number, flag in enumerate(flags, 1):
            if flag == 1:
                grp_vector.append(group_number)
                break
        else:
            err = "\n\n[!] The {0} categorical covariates you " \
                  "provided as the {0} separate groups (in order " \
                  "to model each group's variances separately) " \
                  "either have more than 2 levels (1/0), or are " \
                  "not encoded as 1's and 0's.\n\nCovariates:\n" \
                  "{1}\n\n".format(count_words[len(group_ev)],
                                   "\n".join(str(ev) for ev in group_ev))
            raise Exception(err)
    return grp_vector


def build_feat_model(model_df, model_name, group_config_file, resource_id,
                     preproc_strat, session_id, series_or_repeated_label):
    """Build the FSL-FEAT group model files for one output/strategy combo.

    This function runs once per derivative type and preproc strat
    combination during group analysis.  It creates the model directory,
    writes a new participant list, merges the input files and their mask,
    builds the design matrix (through Patsy, unless the group config has a
    ``preset`` attribute), and writes ``design_matrix.csv``, a contrasts CSV
    and ``groups.txt``.

    Parameters
    ----------
    model_df : pandas.DataFrame
        Phenotype/design table.  The columns "Filepath" and
        "participant_id" are always read; "Series" and "Raw_Filepath" are
        read when individual masks or custom ROI means are needed.  The
        frame is modified in place (demeaning, level recoding).
    model_name : str
        Used in the output directory and merged-file names.
    group_config_file : str
        Path of the group configuration YAML (read via ``load_config_yml``).
    resource_id : str
        Name of the derivative; used as a path component and to select
        derivative-specific sub-directories.
    preproc_strat : str
        '/'-separated strategy path; its last component is treated as the
        file name and '.nii' / '.gz' are stripped from the path.
    session_id, series_or_repeated_label : str
        Path components of the output directory.

    Returns
    -------
    tuple
        ``(dmat_csv_path, new_sub_file, contrast_out_path)``.

    Raises
    ------
    Exception
        If 'Custom_ROI_Mean' is in the design formula without a custom ROI
        mask, if the multi-group indicator columns are invalid, if Patsy
        fails, or if an already-edited contrasts CSV would be overwritten.
    """
    import os
    import re

    import patsy
    import pandas as pd
    import numpy as np

    from CPAC.pipeline.cpac_group_runner import load_config_yml
    from CPAC.utils.create_group_analysis_info_files import \
        write_design_matrix_csv

    group_config_obj = load_config_yml(group_config_file)

    pipeline_ID = group_config_obj.pipeline_dir.rstrip('/').split('/')[-1]

    # remove file names from preproc_strat
    filename = preproc_strat.split("/")[-1]
    preproc_strat = preproc_strat.replace('.nii', '').replace('.gz', '')
    preproc_strat = preproc_strat.lstrip("/").rstrip("/")

    # create path for output directory
    model_dir = os.path.join(group_config_obj.output_dir,
                             'cpac_group_analysis',
                             'FSL_FEAT',
                             '{0}'.format(pipeline_ID),
                             'group_model_{0}'.format(model_name))

    out_dir = os.path.join(model_dir,
                           resource_id,
                           session_id,
                           series_or_repeated_label,
                           preproc_strat)

    # the presence of a "preset" attribute marks a preset-generated model
    preset = hasattr(group_config_obj, 'preset')

    def _first_file_label(pattern):
        # match against the first input file's name with up to two
        # extensions (e.g. ".nii.gz") removed
        base = os.path.basename(model_df["Filepath"][0])
        stem = os.path.splitext(os.path.splitext(base)[0])[0]
        return re.search(pattern, stem).group(0)

    if 'sca_roi' in resource_id:
        out_dir = os.path.join(out_dir, _first_file_label(r'sca_ROI_(\d)+'))

    if 'dr_tempreg_maps_zstat_files_to_standard_smooth' in resource_id:
        out_dir = os.path.join(out_dir,
                               _first_file_label(r'temp_reg_map_z_(\d)+'))

    if 'centrality' in resource_id:
        names = ['degree_centrality_binarize',
                 'degree_centrality_weighted',
                 'eigenvector_centrality_binarize',
                 'eigenvector_centrality_weighted',
                 'lfcd_binarize',
                 'lfcd_weighted']
        for name in names:
            if name in filename:
                out_dir = os.path.join(out_dir, name)
                break

    if 'tempreg_maps' in resource_id:
        out_dir = os.path.join(out_dir, _first_file_label(r'\w*[#]*\d+'))

    model_path = os.path.join(out_dir, 'model_files')

    # create the actual directories
    create_dir(model_path, "group analysis output")

    # create new subject list based on which subjects are left after checking
    # for missing outputs
    new_participant_list = []
    for part in model_df["participant_id"]:
        # do this instead of using "set" just in case, to preserve order
        #   only reason there may be duplicates is because of multiple-series
        #   repeated measures runs
        if part not in new_participant_list:
            new_participant_list.append(part)

    # without an explicit participant list, the pipeline directory is handed
    # to write_new_sub_file() in its place
    if group_config_obj.participant_list is None:
        sub_list_source = group_config_obj.pipeline_dir
    else:
        sub_list_source = group_config_obj.participant_list
    new_sub_file = write_new_sub_file(model_path, sub_list_source,
                                      new_participant_list)

    group_config_obj.update('participant_list', new_sub_file)

    num_subjects = len(model_df["participant_id"])

    # start processing the dataframe further
    design_formula = group_config_obj.design_formula

    # demean EVs set for demeaning
    for demean_EV in group_config_obj.ev_selections.get("demean", []):
        model_df[demean_EV] = model_df[demean_EV].astype(float)
        model_df[demean_EV] = model_df[demean_EV].sub(
            model_df[demean_EV].mean())

    # demean the motion params
    if ("MeanFD" in design_formula) or ("MeanDVARS" in design_formula):
        for param in ["MeanFD_Power", "MeanFD_Jenkinson", "MeanDVARS"]:
            model_df[param] = model_df[param].astype(float)
            model_df[param] = model_df[param].sub(model_df[param].mean())

    # create 4D merged copefile, in the correct order, identical to design
    # matrix
    merge_outfile = model_name + "_" + resource_id + "_merged.nii.gz"
    merge_outfile = os.path.join(model_path, merge_outfile)

    merge_file = create_merged_copefile(model_df["Filepath"].tolist(),
                                        merge_outfile)

    # create merged group mask
    merge_mask_outfile = '_'.join(
        [model_name, resource_id, "merged_mask.nii.gz"])
    merge_mask_outfile = os.path.join(model_path, merge_mask_outfile)
    merge_mask = create_merge_mask(merge_file, merge_mask_outfile)

    if "Group Mask" in group_config_obj.mean_mask:
        mask_for_means = merge_mask
    else:
        individual_masks_dir = os.path.join(model_path, "individual_masks")
        create_dir(individual_masks_dir, "individual masks")
        # NOTE(review): every mask is written to disk, but only the mask of
        # the last row remains in mask_for_means for the steps below.
        for unique_id, series_id, raw_filepath in zip(
                model_df["participant_id"],
                model_df["Series"],
                model_df["Raw_Filepath"]):
            mask_for_means_path = os.path.join(
                individual_masks_dir,
                "%s_%s_%s_mask.nii.gz" % (unique_id, series_id, resource_id))
            mask_for_means = create_merge_mask(raw_filepath,
                                               mask_for_means_path)

    # calculate measure means, and demean
    if "Measure_Mean" in design_formula:
        model_df = calculate_measure_mean_in_df(model_df, mask_for_means)

    # calculate custom ROIs, and demean (in workflow?)
    if "Custom_ROI_Mean" in design_formula:

        custom_roi_mask = group_config_obj.custom_roi_mask

        if custom_roi_mask is None or custom_roi_mask in ("None", "none", ""):
            err = "\n\n[!] You included 'Custom_ROI_Mean' in your design " \
                  "formula, but you didn't supply a custom ROI mask file." \
                  "\n\nDesign formula: %s\n\n" % design_formula
            raise Exception(err)

        # make sure the custom ROI mask file is the same resolution as the
        # output files - if not, resample and warn the user
        roi_mask = check_mask_file_resolution(
            list(model_df["Raw_Filepath"])[0], custom_roi_mask,
            mask_for_means, model_path, resource_id)

        # trim the custom ROI mask to be within mask constraints
        output_mask = os.path.join(model_path,
                                   "masked_%s" % os.path.basename(roi_mask))
        roi_mask = trim_mask(roi_mask, mask_for_means, output_mask)

        # calculate
        model_df = calculate_custom_roi_mean_in_df(model_df, roi_mask)

        # update the design formula: the single "Custom_ROI_Mean" term is
        # replaced by the individual Custom_ROI_Mean_<n> columns
        new_design_substring = ""
        for col in model_df.columns:
            if "Custom_ROI_Mean_" in str(col):
                if str(col) == "Custom_ROI_Mean_1":
                    new_design_substring = new_design_substring + " %s" % col
                else:
                    new_design_substring = \
                        new_design_substring + " + %s" % col
        design_formula = design_formula.replace("Custom_ROI_Mean",
                                                new_design_substring)

    # when present, this is the config's own list (it is extended below)
    cat_list = group_config_obj.ev_selections.get("categorical", [])

    # prep design for repeated measures, if applicable
    if len(group_config_obj.sessions_list) > 0:
        if "session" in model_df.columns:
            # if these columns were added by the model builder automatically
            design_formula = design_formula + " + session"
            if "session" not in cat_list:
                cat_list.append("session")
    if len(group_config_obj.series_list) > 0:
        design_formula = design_formula + " + Series"
        if "Series" not in cat_list:
            cat_list.append("Series")

    if "session" in model_df.columns:
        # if these columns were added by the model builder automatically
        for col in model_df.columns:
            # should only grab the repeated measures-designed
            # participant_{ID} columns, not the "participant_id" column!
            if "participant_" in col and "_id" not in col:
                design_formula = design_formula + " + %s" % col
                cat_list.append(col)

    # parse out the EVs in the design formula at this point in time
    #   this is essentially a list of the EVs that are to be included
    ev_list = parse_out_covariates(design_formula)

    # SPLIT GROUPS here.
    #   CURRENT PROBLEMS: was creating a few doubled-up new columns
    grp_vector = [1] * num_subjects

    if group_config_obj.group_sep:

        # the grouping variable arrives as a list (or a comma-separated
        # string) when coming from the group-level analysis presets that
        # deal with multiple groups - in these cases the design matrix is
        # already set up for the multiple groups, and the usual
        # "model group variances separately" processing is bypassed
        group_ev = group_config_obj.grouping_var

        if isinstance(group_ev, list) or "," in group_ev:
            if not isinstance(group_ev, list):
                group_ev = group_ev.split(",")
            grp_vector = _group_vector_from_indicators(model_df, group_ev)

        else:
            # model group variances separately
            old_ev_list = ev_list

            model_df, grp_vector, ev_list, cat_list = split_groups(
                model_df, group_config_obj.grouping_var, ev_list, cat_list)

            # make the grouping variable categorical for Patsy (if we try to
            # do this automatically below, it will categorical-ize all of
            # the substrings too)
            design_formula = design_formula.replace(
                group_config_obj.grouping_var,
                "C(" + group_config_obj.grouping_var + ")")
            # NOTE(review): coding_scheme is indexed with [0] further down,
            # so this comparison with a plain string may never be true -
            # confirm the expected type.
            if group_config_obj.coding_scheme == "Sum":
                design_formula = design_formula.replace(")", ", Sum)")

            # update design formula: each original EV is replaced by the
            # per-group EVs ("<ev>__FOR...") produced by split_groups()
            rename = {}
            for old_ev in old_ev_list:
                for new_ev in ev_list:
                    if old_ev + "__FOR" in new_ev:
                        rename.setdefault(old_ev, []).append(new_ev)

            for old_ev in rename:
                design_formula = design_formula.replace(
                    old_ev, " + ".join(rename[old_ev]))

    # prep design formula for Patsy
    design_formula = patsify_design_formula(
        design_formula, cat_list, group_config_obj.coding_scheme[0])

    if not preset:
        # send to Patsy
        try:
            dmatrix = patsy.dmatrix(design_formula, model_df)
            dmatrix_column_names = dmatrix.design_info.column_names
        except Exception as e:
            err = "\n\n[!] Something went wrong with processing the group model "\
                  "design matrix using the Python Patsy package. Patsy might " \
                  "not be properly installed, or there may be an issue with the "\
                  "formatting of the design matrix.\n\nDesign matrix columns: " \
                  "%s\n\nPatsy-formatted design formula: %s\n\nError details: " \
                  "%s\n\n" % (model_df.columns, design_formula, e)
            raise Exception(err)
    else:
        # presets: the table already is the design matrix; only the
        # repeated-measures level columns have to be made numeric
        for level_column in ('Sessions', 'Series'):
            if level_column in model_df:
                _recode_repeated_levels(model_df, level_column)

        keep_cols = [x for x in model_df.columns if x in design_formula]
        dmatrix = model_df[keep_cols].astype('float')
        dmatrix_column_names = list(dmatrix.columns)

    # check the model for multicollinearity - Patsy takes care of this, but
    # just in case
    check_multicollinearity(np.array(dmatrix))

    # make sure "column_names" is in the same order as the original EV column
    # header ordering in model_df - mainly for repeated measures, to make sure
    # participants_<ID> cols are at end for clarity for users
    column_names = _order_design_columns(dmatrix_column_names)

    # check to make sure there are more time points than EVs!
    # NOTE(review): the problem is only printed and model generation carries
    # on regardless - confirm whether this should raise instead.
    if len(column_names) >= num_subjects:
        err = "\n\n################## MODEL NOT GENERATED ##################" \
              "\n\n[!] CPAC says: There are more EVs than there are " \
              "participants currently included in the model for:\n\n" \
              "Derivative: {0}\nSession: {1}\nScan: {2}\nPreproc strategy:" \
              "\n {3}\n\n" \
              "There must be more participants than EVs in the design.\n\n" \
              "Number of participants: {4}\nNumber of EVs: {5}\n\nEV/" \
              "covariate list: {6}\n\nNote: If you specified to model group " \
              "variances separately, the amount of EVs can nearly double " \
              "once they are split along the grouping variable.\n\nIf the " \
              "number of participants is lower than the number of " \
              "participants in your group analysis inclusion list, this " \
              "may be because not every participant originally included has " \
              "an output for {7} for this scan and preprocessing strategy in " \
              "the individual-level analysis output directory.\n\nDesign " \
              "formula going in: {8}" \
              "\n\n#########################################################" \
              "\n\n".format(resource_id, session_id, series_or_repeated_label,
                            preproc_strat, num_subjects, len(column_names),
                            column_names, resource_id, design_formula)
        print(err)

    # check the merged file's order
    check_merged_file(model_df["Filepath"], merge_file)

    # we must demean the categorical regressors if the Intercept/Grand Mean
    # is included in the model, otherwise FLAME produces blank outputs
    if "Intercept" in column_names:

        col_name_indices = dmatrix.design_info.column_name_indexes
        cat_indices = [int(col_index)
                       for col_name, col_index in col_name_indices.items()
                       if "C(" in col_name]

        # note: dmat_T is now no longer a DesignMatrix Patsy object, but only
        # an array
        dmat_T = dmatrix.transpose()

        for index in cat_indices:
            # the row is untouched until the assignment, so its mean only
            # needs to be computed once
            row_mean = dmat_T[index].mean()
            dmat_T[index] = [val - row_mean for val in dmat_T[index]]

        # we can go back, but we won't be the same
        dmatrix = dmat_T.transpose()

    # the frame is built with the original column order and then re-ordered
    dmatrix_df = pd.DataFrame(np.array(dmatrix),
                              index=model_df["participant_id"],
                              columns=dmatrix_column_names)
    dmatrix_df = dmatrix_df[column_names]

    dmat_csv_path = os.path.join(model_path, "design_matrix.csv")

    write_design_matrix_csv(dmatrix_df, model_df["participant_id"],
                            column_names, dmat_csv_path)

    # time for contrasts
    # always start from the design columns, so the template below can be
    # written even when both custom_contrasts and contrasts are set
    contrasts_columns = list(column_names)
    if (group_config_obj.custom_contrasts is None) or \
            (group_config_obj.contrasts is None):
        # if no custom contrasts matrix CSV provided (i.e. the user
        # specified contrasts in the GUI)
        if group_config_obj.f_tests:
            # NOTE(review): the first and last elements of f_tests are
            # skipped here - confirm that this is intended.
            for i in group_config_obj.f_tests[1:-1]:
                contrasts_columns.append('f_test_{0}'.format(i))

    contrast_out_path = os.path.join(model_dir, "contrasts.csv")

    if preset:
        cons = pd.read_csv(group_config_obj.custom_contrasts)
        with open(contrast_out_path, "w") as f:
            cons.to_csv(f, index=False)
    else:
        # refuse to overwrite a contrasts file that no longer looks like
        # the blank template (extra rows, or non-zero weights in row one)
        if os.path.isfile(contrast_out_path):
            contrasts_df = pd.read_csv(contrast_out_path)
            num_rows = contrasts_df.shape[0]
            if num_rows > 1 or (num_rows == 1 and np.count_nonzero(
                    contrasts_df.values[0][1:]) > 0):
                msg = "\n\n[!] C-PAC says: It appears you have modified your " \
                      "contrasts CSV file already- back up this file before " \
                      "building your model again to avoid overwriting your " \
                      "changes.\n\nContrasts file:\n{0}" \
                      "\n\n".format(contrast_out_path)
                raise Exception(msg)

        # blank template: a header row plus one all-zero contrast
        with open(contrast_out_path, "w") as f:
            f.write('Contrasts')
            f.write(''.join(',{0}'.format(col) for col in contrasts_columns))
            f.write('\ncontrast_1')
            f.write(',0' * len(contrasts_columns))

    groups_out_path = os.path.join(model_path, 'groups.txt')
    with open(groups_out_path, 'w') as f:
        for val in grp_vector:
            f.write('{0}\n'.format(val))

    msg = 'Model successfully generated for..\nDerivative: {0}\nSession: {1}' \
          '\nScan: {2}\nPreprocessing strategy:\n {3}\n\nModel directory:' \
          '\n{4}\n\nGroup configuration file:\n{5}\n\nContrasts template CSV:' \
          '\n{6}\n\nDefine your contrasts in this contrasts template CSV and ' \
          'save your changes, then run FSL-FEAT either using the GUI ' \
          'interface or through the command-line like so:\n\n cpac group ' \
          'feat run <path to group config.yml>' \
          '\n'.format(resource_id, session_id, series_or_repeated_label,
                      preproc_strat, model_path, group_config_file,
                      contrast_out_path)
    print(
        '-------------------------------------------------------------------')
    print(msg)
    print(
        '-------------------------------------------------------------------')

    return dmat_csv_path, new_sub_file, contrast_out_path
def prep_group_analysis_workflow(model_df, pipeline_config_path, \ model_name, group_config_path, resource_id, preproc_strat, \ series_or_repeated_label): # # this function runs once per derivative type and preproc strat combo # during group analysis # import os import patsy import numpy as np import nipype.pipeline.engine as pe import nipype.interfaces.utility as util import nipype.interfaces.io as nio from CPAC.pipeline.cpac_group_runner import load_config_yml from CPAC.utils.create_flame_model_files import create_flame_model_files from CPAC.utils.create_group_analysis_info_files import write_design_matrix_csv pipeline_config_obj = load_config_yml(pipeline_config_path) group_config_obj = load_config_yml(group_config_path) pipeline_ID = pipeline_config_obj.pipelineName # remove file names from preproc_strat filename = preproc_strat.split("/")[-1] preproc_strat = preproc_strat.replace(filename, "") preproc_strat = preproc_strat.lstrip("/").rstrip("/") # get thresholds z_threshold = float(group_config_obj.z_threshold[0]) p_threshold = float(group_config_obj.p_threshold[0]) sub_id_label = group_config_obj.participant_id_label ftest_list = [] readme_flags = [] # determine if f-tests are included or not custom_confile = group_config_obj.custom_contrasts if ((custom_confile == None) or (custom_confile == '') or \ ("None" in custom_confile) or ("none" in custom_confile)): custom_confile = None if (len(group_config_obj.f_tests) == 0) or \ (group_config_obj.f_tests == None): fTest = False else: fTest = True ftest_list = group_config_obj.f_tests else: if not os.path.exists(custom_confile): errmsg = "\n[!] CPAC says: You've specified a custom contrasts " \ ".CSV file for your group model, but this file cannot " \ "be found. 
Please double-check the filepath you have " \ "entered.\n\nFilepath: %s\n\n" % custom_confile raise Exception(errmsg) with open(custom_confile, "r") as f: evs = f.readline() evs = evs.rstrip('\r\n').split(',') count_ftests = 0 fTest = False for ev in evs: if "f_test" in ev: count_ftests += 1 if count_ftests > 0: fTest = True # create path for output directory out_dir = os.path.join(group_config_obj.output_dir, "group_analysis_results_%s" % pipeline_ID, "group_model_%s" % model_name, resource_id, series_or_repeated_label, preproc_strat) if 'sca_roi' in resource_id: out_dir = os.path.join(out_dir, re.search('sca_ROI_(\d)+',os.path.splitext(\ os.path.splitext(os.path.basename(\ model_df["Filepath"][0]))[0])[0]).group(0)) if 'dr_tempreg_maps_zstat_files_to_standard_smooth' in resource_id: out_dir = os.path.join(out_dir, re.search('temp_reg_map_z_(\d)+',os.path.splitext(\ os.path.splitext(os.path.basename(\ model_df["Filepath"][0]))[0])[0]).group(0)) if 'centrality' in resource_id: names = ['degree_centrality_binarize', 'degree_centrality_weighted', \ 'eigenvector_centrality_binarize', 'eigenvector_centrality_weighted', 'lfcd_binarize', 'lfcd_weighted'] for name in names: if name in filename: out_dir = os.path.join(out_dir, name) break if 'tempreg_maps' in resource_id: out_dir = os.path.join(out_dir, re.search('\w*[#]*\d+', \ os.path.splitext(os.path.splitext(os.path.basename(\ model_df["Filepath"][0]))[0])[0]).group(0)) model_path = os.path.join(out_dir, 'model_files') second_half_out = \ out_dir.split("group_analysis_results_%s" % pipeline_ID)[1] # generate working directory for this output's group analysis run work_dir = os.path.join(pipeline_config_obj.workingDirectory, \ "group_analysis", second_half_out.lstrip("/")) log_dir = os.path.join(pipeline_config_obj.logDirectory, \ "group_analysis", second_half_out.lstrip("/")) # create the actual directories create_dir(model_path, "group analysis output") create_dir(work_dir, "group analysis working") create_dir(log_dir, 
"group analysis logfile") # create new subject list based on which subjects are left after checking # for missing outputs new_participant_list = [] for part in list(model_df["Participant"]): # do this instead of using "set" just in case, to preserve order # only reason there may be duplicates is because of multiple-series # repeated measures runs if part not in new_participant_list: new_participant_list.append(part) new_sub_file = write_new_sub_file(model_path, \ group_config_obj.participant_list, \ new_participant_list) group_config_obj.update('participant_list', new_sub_file) num_subjects = len(list(model_df["Participant"])) # start processing the dataframe further design_formula = group_config_obj.design_formula # demean EVs set for demeaning for demean_EV in group_config_obj.ev_selections.get("demean", []): model_df[demean_EV] = model_df[demean_EV].astype(float) model_df[demean_EV] = model_df[demean_EV].sub( model_df[demean_EV].mean()) # demean the motion params if ("MeanFD" in design_formula) or ("MeanDVARS" in design_formula): params = ["MeanFD_Power", "MeanFD_Jenkinson", "MeanDVARS"] for param in params: model_df[param] = model_df[param].astype(float) model_df[param] = model_df[param].sub(model_df[param].mean()) # create 4D merged copefile, in the correct order, identical to design # matrix merge_outfile = model_name + "_" + resource_id + "_merged.nii.gz" merge_outfile = os.path.join(model_path, merge_outfile) merge_file = create_merged_copefile(list(model_df["Filepath"]), \ merge_outfile) # create merged group mask merge_mask_outfile = model_name + "_" + resource_id + \ "_merged_mask.nii.gz" merge_mask_outfile = os.path.join(model_path, merge_mask_outfile) merge_mask = create_merge_mask(merge_file, merge_mask_outfile) if "Group Mask" in group_config_obj.mean_mask: mask_for_means = merge_mask else: individual_masks_dir = os.path.join(model_path, "individual_masks") create_dir(individual_masks_dir, "individual masks") for unique_id, series_id, raw_filepath in 
zip( model_df["Participant"], model_df["Series"], model_df["Raw_Filepath"]): mask_for_means_path = os.path.join( individual_masks_dir, "%s_%s_%s_mask.nii.gz" % (unique_id, series_id, resource_id)) mask_for_means = create_merge_mask(raw_filepath, mask_for_means_path) readme_flags.append("individual_masks") # calculate measure means, and demean if "Measure_Mean" in design_formula: model_df = calculate_measure_mean_in_df(model_df, mask_for_means) # calculate custom ROIs, and demean (in workflow?) if "Custom_ROI_Mean" in design_formula: custom_roi_mask = group_config_obj.custom_roi_mask if (custom_roi_mask == None) or (custom_roi_mask == "None") or \ (custom_roi_mask == "none") or (custom_roi_mask == ""): err = "\n\n[!] You included 'Custom_ROI_Mean' in your design " \ "formula, but you didn't supply a custom ROI mask file." \ "\n\nDesign formula: %s\n\n" % design_formula raise Exception(err) # make sure the custom ROI mask file is the same resolution as the # output files - if not, resample and warn the user roi_mask = check_mask_file_resolution(list(model_df["Raw_Filepath"])[0], \ custom_roi_mask, mask_for_means, \ model_path, resource_id) # trim the custom ROI mask to be within mask constraints output_mask = os.path.join(model_path, "masked_%s" \ % os.path.basename(roi_mask)) roi_mask = trim_mask(roi_mask, mask_for_means, output_mask) readme_flags.append("custom_roi_mask_trimmed") # calculate model_df = calculate_custom_roi_mean_in_df(model_df, roi_mask) # update the design formula new_design_substring = "" for col in model_df.columns: if "Custom_ROI_Mean_" in str(col): if str(col) == "Custom_ROI_Mean_1": new_design_substring = new_design_substring + " %s" % col else: new_design_substring = new_design_substring + " + %s" % col design_formula = design_formula.replace("Custom_ROI_Mean", new_design_substring) cat_list = [] if "categorical" in group_config_obj.ev_selections.keys(): cat_list = group_config_obj.ev_selections["categorical"] # prep design for repeated 
measures, if applicable if len(group_config_obj.sessions_list) > 0: design_formula = design_formula + " + Session" if "Session" not in cat_list: cat_list.append("Session") if len(group_config_obj.series_list) > 0: design_formula = design_formula + " + Series" if "Series" not in cat_list: cat_list.append("Series") for col in list(model_df.columns): if "participant_" in col: design_formula = design_formula + " + %s" % col cat_list.append(col) # parse out the EVs in the design formula at this point in time # this is essentially a list of the EVs that are to be included ev_list = parse_out_covariates(design_formula) # SPLIT GROUPS here. # CURRENT PROBLEMS: was creating a few doubled-up new columns grp_vector = [1] * num_subjects if group_config_obj.group_sep: # model group variances separately old_ev_list = ev_list model_df, grp_vector, ev_list, cat_list = split_groups(model_df, \ group_config_obj.grouping_var, \ ev_list, cat_list) # make the grouping variable categorical for Patsy (if we try to # do this automatically below, it will categorical-ize all of # the substrings too) design_formula = design_formula.replace(group_config_obj.grouping_var, \ "C(" + group_config_obj.grouping_var + ")") if group_config_obj.coding_scheme == "Sum": design_formula = design_formula.replace(")", ", Sum)") # update design formula rename = {} for old_ev in old_ev_list: for new_ev in ev_list: if old_ev + "__FOR" in new_ev: if old_ev not in rename.keys(): rename[old_ev] = [] rename[old_ev].append(new_ev) for old_ev in rename.keys(): design_formula = design_formula.replace(old_ev, \ " + ".join(rename[old_ev])) # prep design formula for Patsy design_formula = patsify_design_formula(design_formula, cat_list, \ group_config_obj.coding_scheme[0]) print design_formula # send to Patsy try: dmatrix = patsy.dmatrix(design_formula, model_df) except Exception as e: err = "\n\n[!] Something went wrong with processing the group model "\ "design matrix using the Python Patsy package. 
Patsy might " \ "not be properly installed, or there may be an issue with the "\ "formatting of the design matrix.\n\nPatsy-formatted design " \ "formula: %s\n\nError details: %s\n\n" \ % (model_df.columns, design_formula, e) raise Exception(err) print dmatrix.design_info.column_names print dmatrix # check the model for multicollinearity - Patsy takes care of this, but # just in case check_multicollinearity(np.array(dmatrix)) # prepare for final stages column_names = dmatrix.design_info.column_names # what is this for? design_matrix = np.array(dmatrix, dtype=np.float16) # check to make sure there are more time points than EVs! if len(column_names) >= num_subjects: err = "\n\n[!] CPAC says: There are more EVs than there are " \ "participants currently included in the model for %s. There " \ "must be more participants than EVs in the design.\n\nNumber " \ "of participants: %d\nNumber of EVs: %d\n\nEV/covariate list: "\ "%s\n\nNote: If you specified to model group " \ "variances separately, the amount of EVs can nearly double " \ "once they are split along the grouping variable.\n\n" \ "If the number of subjects is lower than the number of " \ "subjects in your group analysis subject list, this may be " \ "because not every subject in the subject list has an output " \ "for %s in the individual-level analysis output directory.\n\n"\ % (resource_id, num_subjects, len(column_names), column_names, \ resource_id) raise Exception(err) # time for contrasts contrasts_list = None contrasts_vectors = None if ((custom_confile == None) or (custom_confile == '') or \ ("None" in custom_confile) or ("none" in custom_confile)): # if no custom contrasts matrix CSV provided (i.e. 
the user # specified contrasts in the GUI) contrasts_list = group_config_obj.contrasts contrasts_vectors = create_contrasts_dict(dmatrix, contrasts_list, resource_id) # check the merged file's order check_merged_file(model_df["Filepath"], merge_file) # we must demean the categorical regressors if the Intercept/Grand Mean # is included in the model, otherwise FLAME produces blank outputs if "Intercept" in column_names: cat_indices = [] col_name_indices = dmatrix.design_info.column_name_indexes for col_name in col_name_indices.keys(): if "C(" in col_name: cat_indices.append(int(col_name_indices[col_name])) # note: dmat_T is now no longer a DesignMatrix Patsy object, but only # an array dmat_T = dmatrix.transpose() for index in cat_indices: new_row = [] for val in dmat_T[index]: new_row.append(val - dmat_T[index].mean()) dmat_T[index] = new_row # we can go back, but we won't be the same dmatrix = dmat_T.transpose() readme_flags.append("cat_demeaned") # send off the info so the FLAME input model files can be generated! 
mat_file, grp_file, con_file, fts_file = create_flame_model_files( dmatrix, column_names, contrasts_vectors, contrasts_list, custom_confile, ftest_list, group_config_obj.group_sep, grp_vector, group_config_obj.coding_scheme[0], model_name, resource_id, model_path) dmat_csv_path = os.path.join(model_path, "design_matrix.csv") write_design_matrix_csv(dmatrix, model_df["Participant"], column_names, dmat_csv_path) # workflow time wf_name = "%s_%s" % (resource_id, series_or_repeated_label) wf = pe.Workflow(name=wf_name) wf.base_dir = work_dir crash_dir = os.path.join(pipeline_config_obj.crashLogDirectory, "group_analysis", model_name) wf.config['execution'] = { 'hash_method': 'timestamp', 'crashdump_dir': crash_dir } # gpa_wf # Creates the actual group analysis workflow gpa_wf = create_group_analysis(fTest, "gp_analysis_%s" % wf_name) gpa_wf.inputs.inputspec.merged_file = merge_file gpa_wf.inputs.inputspec.merge_mask = merge_mask gpa_wf.inputs.inputspec.z_threshold = z_threshold gpa_wf.inputs.inputspec.p_threshold = p_threshold gpa_wf.inputs.inputspec.parameters = (pipeline_config_obj.FSLDIR, 'MNI152') gpa_wf.inputs.inputspec.mat_file = mat_file gpa_wf.inputs.inputspec.con_file = con_file gpa_wf.inputs.inputspec.grp_file = grp_file if fTest: gpa_wf.inputs.inputspec.fts_file = fts_file # ds # Creates the datasink node for group analysis ds = pe.Node(nio.DataSink(), name='gpa_sink') # if c.mixedScanAnalysis == True: # out_dir = re.sub(r'(\w)*scan_(\w)*(\d)*(\w)*[/]', '', out_dir) ds.inputs.base_directory = str(out_dir) ds.inputs.container = '' ds.inputs.regexp_substitutions = [(r'(?<=rendered)(.)*[/]', '/'), (r'(?<=model_files)(.)*[/]', '/'), (r'(?<=merged)(.)*[/]', '/'), (r'(?<=stats/clusterMap)(.)*[/]', '/'), (r'(?<=stats/unthreshold)(.)*[/]', '/'), (r'(?<=stats/threshold)(.)*[/]', '/'), (r'_cluster(.)*[/]', ''), (r'_slicer(.)*[/]', ''), (r'_overlay(.)*[/]', '')] ########datasink connections######### #if fTest: # wf.connect(gp_flow, 'outputspec.fts', # ds, 
'model_files.@0') #wf.connect(gp_flow, 'outputspec.mat', # ds, 'model_files.@1' ) #wf.connect(gp_flow, 'outputspec.con', # ds, 'model_files.@2') #wf.connect(gp_flow, 'outputspec.grp', # ds, 'model_files.@3') wf.connect(gpa_wf, 'outputspec.merged', ds, 'merged') wf.connect(gpa_wf, 'outputspec.zstats', ds, 'stats.unthreshold') wf.connect(gpa_wf, 'outputspec.zfstats', ds, 'stats.unthreshold.@01') wf.connect(gpa_wf, 'outputspec.fstats', ds, 'stats.unthreshold.@02') wf.connect(gpa_wf, 'outputspec.cluster_threshold_zf', ds, 'stats.threshold') wf.connect(gpa_wf, 'outputspec.cluster_index_zf', ds, 'stats.clusterMap') wf.connect(gpa_wf, 'outputspec.cluster_localmax_txt_zf', ds, 'stats.clusterMap.@01') wf.connect(gpa_wf, 'outputspec.overlay_threshold_zf', ds, 'rendered') wf.connect(gpa_wf, 'outputspec.rendered_image_zf', ds, 'rendered.@01') wf.connect(gpa_wf, 'outputspec.cluster_threshold', ds, 'stats.threshold.@01') wf.connect(gpa_wf, 'outputspec.cluster_index', ds, 'stats.clusterMap.@02') wf.connect(gpa_wf, 'outputspec.cluster_localmax_txt', ds, 'stats.clusterMap.@03') wf.connect(gpa_wf, 'outputspec.overlay_threshold', ds, 'rendered.@02') wf.connect(gpa_wf, 'outputspec.rendered_image', ds, 'rendered.@03') ###################################### # Run the actual group analysis workflow wf.run() print "\n\nWorkflow finished for model %s\n\n" % wf_name
def _order_design_columns(column_names):
    """Return the names with every 'participant_' column sorted and last."""
    regular_cols = [col for col in column_names if 'participant_' not in col]
    id_cols = sorted(col for col in column_names if 'participant_' in col)
    return regular_cols + id_cols


def _group_vector_from_indicators(model_df, group_ev):
    """Build the group vector from two or three 1/0 indicator columns.

    Each row is assigned the 1-based position of the first column in
    ``group_ev`` whose value equals 1. An ``Exception`` is raised when a row
    has no such column, or when ``group_ev`` does not name exactly two or
    three columns.
    """
    count_words = {2: "two", 3: "three"}

    if len(group_ev) not in count_words:
        # we're only going to see this if someone plays around with
        # their preset or config file manually
        err = "\n\n[!] If you are seeing this message, it's because:\n" \
              "1. You are using the group-level analysis presets\n" \
              "2. You are running a model with multiple groups having " \
              "their variances modeled separately (i.e. multiple " \
              "values in the FSL FLAME .grp input file), and\n" \
              "3. For some reason, the configuration has been set up " \
              "in a way where CPAC currently thinks you're including " \
              "only one group, or more than three, neither of which " \
              "are supported.\n\nGroups provided:\n{0}" \
              "\n\n".format(str(group_ev))
        raise Exception(err)

    grp_vector = []
    indicator_columns = [model_df[ev] for ev in group_ev]
    for flags in zip(*indicator_columns):
        for group_number, flag in enumerate(flags, start=1):
            if flag == 1:
                grp_vector.append(group_number)
                break
        else:
            # no indicator column was set to 1 for this row
            err = "\n\n[!] The {word} categorical covariates you " \
                  "provided as the {word} separate groups (in order " \
                  "to model each group's variances separately) " \
                  "either have more than 2 levels (1/0), or are " \
                  "not encoded as 1's and 0's.\n\nCovariates:\n" \
                  "{covs}\n\n".format(
                      word=count_words[len(group_ev)],
                      covs="\n".join(str(ev) for ev in group_ev))
            raise Exception(err)

    return grp_vector


def build_feat_model(model_df, model_name, group_config_file, resource_id,
                     preproc_strat, session_id, series_or_repeated_label):
    """Generate the FSL FEAT group-level model files for one output.

    This function runs once per derivative type and preproc strat combo
    during group analysis. It loads the group configuration, creates the
    model output directory, builds the merged input file and mask, builds
    the design matrix (through Patsy, or directly from ``model_df`` when the
    group config exposes a ``preset`` attribute) and writes the design
    matrix CSV, a contrasts CSV and ``groups.txt``.

    Parameters
    ----------
    model_df : pandas.DataFrame
        One row per input file. The columns "participant_id" and "Filepath"
        are always read; "Series", "Raw_Filepath", the motion parameter
        columns and the EV columns are read depending on the configuration.
        Demeaning is applied to this dataframe in place.
    model_name : str
        Used in the 'group_model_<name>' directory and merged file names.
    group_config_file : str
        Passed to ``load_config_yml``; also echoed in the final message.
    resource_id : str
        Name of the derivative; selects extra output sub-directories.
    preproc_strat : str
        Strategy path; its last '/' component is kept as the file name for
        the centrality lookup and the NIfTI extensions are stripped.
    session_id : str
        Output path component.
    series_or_repeated_label : str
        Output path component.

    Returns
    -------
    tuple
        ``(dmat_csv_path, new_sub_file, contrast_out_path)``, where
        ``new_sub_file`` is whatever ``write_new_sub_file`` returned.

    Raises
    ------
    Exception
        If 'Custom_ROI_Mean' is in the design formula without a custom ROI
        mask, if the separate-group columns are invalid, if Patsy fails to
        build the design matrix, or if an existing contrasts CSV appears to
        have been edited already.
    """
    import os
    import re

    import numpy as np
    import pandas as pd
    import patsy

    from CPAC.pipeline.cpac_group_runner import load_config_yml
    from CPAC.utils.create_group_analysis_info_files import \
        write_design_matrix_csv

    group_config_obj = load_config_yml(group_config_file)

    pipeline_ID = group_config_obj.pipeline_dir.rstrip('/').split('/')[-1]

    # keep the trailing path component for the centrality lookup below, then
    # strip the NIfTI extensions and the surrounding slashes
    filename = preproc_strat.split("/")[-1]
    preproc_strat = preproc_strat.replace('.nii', '').replace('.gz', '')
    preproc_strat = preproc_strat.lstrip("/").rstrip("/")

    # create path for output directory
    model_dir = os.path.join(group_config_obj.output_dir,
                             'cpac_group_analysis', 'FSL_FEAT',
                             '{0}'.format(pipeline_ID),
                             'group_model_{0}'.format(model_name))

    out_dir = os.path.join(model_dir, resource_id, session_id,
                           series_or_repeated_label, preproc_strat)

    # "preset" is True only when the config object has a "preset" attribute
    try:
        getattr(group_config_obj, "preset")
        preset = True
    except AttributeError:
        preset = False

    def _search_first_file(pattern):
        # Deliberately lazy: model_df["Filepath"][0] is only looked up by
        # the branches below that actually need it.
        stem = os.path.splitext(os.path.splitext(
            os.path.basename(model_df["Filepath"][0]))[0])[0]
        return re.search(pattern, stem).group(0)

    def _demean_column(col):
        model_df[col] = model_df[col].astype(float)
        model_df[col] = model_df[col].sub(model_df[col].mean())

    if 'sca_roi' in resource_id:
        out_dir = os.path.join(out_dir,
                               _search_first_file(r'sca_ROI_(\d)+'))

    if 'dr_tempreg_maps_zstat_files_to_standard_smooth' in resource_id:
        out_dir = os.path.join(out_dir,
                               _search_first_file(r'temp_reg_map_z_(\d)+'))

    if 'centrality' in resource_id:
        names = ['degree_centrality_binarize',
                 'degree_centrality_weighted',
                 'eigenvector_centrality_binarize',
                 'eigenvector_centrality_weighted',
                 'lfcd_binarize',
                 'lfcd_weighted']

        for name in names:
            if name in filename:
                out_dir = os.path.join(out_dir, name)
                break

    if 'tempreg_maps' in resource_id:
        out_dir = os.path.join(out_dir, _search_first_file(r'\w*[#]*\d+'))

    model_path = os.path.join(out_dir, 'model_files')

    # create the actual directories
    create_dir(model_path, "group analysis output")

    # create new subject list based on which subjects are left after checking
    # for missing outputs. Order is preserved; the only reason there may be
    # duplicates is because of multiple-series repeated measures runs.
    new_participant_list = []
    seen_participants = set()
    for part in model_df["participant_id"]:
        if part not in seen_participants:
            seen_participants.add(part)
            new_participant_list.append(part)

    if group_config_obj.participant_list is None:
        sub_source = group_config_obj.pipeline_dir
    else:
        sub_source = group_config_obj.participant_list
    new_sub_file = write_new_sub_file(model_path, sub_source,
                                      new_participant_list)

    group_config_obj.update('participant_list', new_sub_file)

    num_subjects = len(model_df["participant_id"])

    # start processing the dataframe further
    design_formula = group_config_obj.design_formula

    # demean EVs set for demeaning
    for demean_EV in group_config_obj.ev_selections.get("demean", []):
        _demean_column(demean_EV)

    # demean the motion params (all three columns are demeaned as soon as
    # either name appears in the formula)
    if ("MeanFD" in design_formula) or ("MeanDVARS" in design_formula):
        for param in ["MeanFD_Power", "MeanFD_Jenkinson", "MeanDVARS"]:
            _demean_column(param)

    # create 4D merged copefile, in the correct order, identical to design
    # matrix
    merge_outfile = model_name + "_" + resource_id + "_merged.nii.gz"
    merge_outfile = os.path.join(model_path, merge_outfile)

    merge_file = create_merged_copefile(model_df["Filepath"].tolist(),
                                        merge_outfile)

    # create merged group mask
    merge_mask_outfile = '_'.join([model_name, resource_id,
                                   "merged_mask.nii.gz"])
    merge_mask_outfile = os.path.join(model_path, merge_mask_outfile)
    merge_mask = create_merge_mask(merge_file, merge_mask_outfile)

    if "Group Mask" in group_config_obj.mean_mask:
        mask_for_means = merge_mask
    else:
        individual_masks_dir = os.path.join(model_path, "individual_masks")
        create_dir(individual_masks_dir, "individual masks")
        # NOTE(review): mask_for_means is rebound on every iteration, so
        # only the mask of the last row is used further down - confirm this
        # is intended.
        for unique_id, series_id, raw_filepath in zip(
                model_df["participant_id"], model_df["Series"],
                model_df["Raw_Filepath"]):
            mask_for_means_path = os.path.join(
                individual_masks_dir,
                "%s_%s_%s_mask.nii.gz" % (unique_id, series_id, resource_id))
            mask_for_means = create_merge_mask(raw_filepath,
                                               mask_for_means_path)

    # calculate measure means, and demean
    if "Measure_Mean" in design_formula:
        model_df = calculate_measure_mean_in_df(model_df, mask_for_means)

    # calculate custom ROIs, and demean (in workflow?)
    if "Custom_ROI_Mean" in design_formula:

        custom_roi_mask = group_config_obj.custom_roi_mask

        if custom_roi_mask is None or \
                custom_roi_mask in ("None", "none", ""):
            err = "\n\n[!] You included 'Custom_ROI_Mean' in your design " \
                  "formula, but you didn't supply a custom ROI mask file." \
                  "\n\nDesign formula: %s\n\n" % design_formula
            raise Exception(err)

        # make sure the custom ROI mask file is the same resolution as the
        # output files - if not, resample and warn the user
        roi_mask = check_mask_file_resolution(
            list(model_df["Raw_Filepath"])[0], custom_roi_mask,
            mask_for_means, model_path, resource_id)

        # trim the custom ROI mask to be within mask constraints
        output_mask = os.path.join(model_path, "masked_%s"
                                   % os.path.basename(roi_mask))
        roi_mask = trim_mask(roi_mask, mask_for_means, output_mask)

        # calculate
        model_df = calculate_custom_roi_mean_in_df(model_df, roi_mask)

        # update the design formula: swap the placeholder for the generated
        # Custom_ROI_Mean_<n> columns
        new_design_substring = ""
        for col in model_df.columns:
            if "Custom_ROI_Mean_" in str(col):
                if str(col) == "Custom_ROI_Mean_1":
                    new_design_substring = new_design_substring + \
                        " %s" % col
                else:
                    new_design_substring = new_design_substring + \
                        " + %s" % col
        design_formula = design_formula.replace("Custom_ROI_Mean",
                                                new_design_substring)

    # note: when present, this is the config's own list object, so the
    # appends below also extend ev_selections["categorical"]
    cat_list = []
    if "categorical" in group_config_obj.ev_selections:
        cat_list = group_config_obj.ev_selections["categorical"]

    # prep design for repeated measures, if applicable
    if len(group_config_obj.sessions_list) > 0:
        if "session" in model_df.columns:
            # if these columns were added by the model builder automatically
            design_formula = design_formula + " + session"
            if "session" not in cat_list:
                cat_list.append("session")

    if len(group_config_obj.series_list) > 0:
        design_formula = design_formula + " + Series"
        if "Series" not in cat_list:
            cat_list.append("Series")

    if "session" in model_df.columns:
        # if these columns were added by the model builder automatically
        for col in model_df.columns:
            # should only grab the repeated measures-designed participant_{ID}
            # columns, not the "participant_id" column!
            if "participant_" in col and "_id" not in col:
                design_formula = design_formula + " + %s" % col
                cat_list.append(col)

    # parse out the EVs in the design formula at this point in time
    #   this is essentially a list of the EVs that are to be included
    ev_list = parse_out_covariates(design_formula)

    # SPLIT GROUPS here.
    #   CURRENT PROBLEMS: was creating a few doubled-up new columns
    grp_vector = [1] * num_subjects

    if group_config_obj.group_sep:

        # the grouping variable arrives as a list (or a comma-separated
        # string) when coming from the presets that deal with multiple
        # groups - in these cases the design matrix is already set up for
        # the multiple groups, so the usual group-splitting is bypassed and
        # only the group vector for the FSL FLAME .grp file is built
        group_ev = group_config_obj.grouping_var

        if isinstance(group_ev, list) or "," in group_ev:
            if "," in group_ev:
                group_ev = group_ev.split(",")
            grp_vector = _group_vector_from_indicators(model_df, group_ev)

        else:
            # model group variances separately
            old_ev_list = ev_list

            model_df, grp_vector, ev_list, cat_list = split_groups(
                model_df, group_config_obj.grouping_var, ev_list, cat_list)

            # make the grouping variable categorical for Patsy (if we try to
            # do this automatically below, it will categorical-ize all of
            # the substrings too)
            design_formula = design_formula.replace(
                group_config_obj.grouping_var,
                "C(" + group_config_obj.grouping_var + ")")
            # NOTE(review): coding_scheme is compared to "Sum" here but
            # indexed with [0] below - confirm which form the config holds.
            if group_config_obj.coding_scheme == "Sum":
                design_formula = design_formula.replace(")", ", Sum)")

            # update design formula
            rename = {}
            for old_ev in old_ev_list:
                for new_ev in ev_list:
                    if old_ev + "__FOR" in new_ev:
                        rename.setdefault(old_ev, []).append(new_ev)

            for old_ev, new_evs in rename.items():
                design_formula = design_formula.replace(old_ev,
                                                        " + ".join(new_evs))

    # prep design formula for Patsy
    design_formula = patsify_design_formula(design_formula, cat_list,
                                            group_config_obj.coding_scheme[0])

    if not preset:
        # send to Patsy
        try:
            dmatrix = patsy.dmatrix(design_formula, model_df)
            dmatrix_column_names = dmatrix.design_info.column_names
        except Exception as e:
            err = "\n\n[!] Something went wrong with processing the group " \
                  "model design matrix using the Python Patsy package. " \
                  "Patsy might not be properly installed, or there may be " \
                  "an issue with the formatting of the design matrix." \
                  "\n\nDesign matrix columns: %s\n\nPatsy-formatted design " \
                  "formula: %s\n\nError details: %s\n\n" \
                  % (model_df.columns, design_formula, e)
            raise Exception(err)
    else:
        # recode the repeated-measures levels as '1' / '-1' (/ '0')
        # NOTE(review): the level order comes from set(), so which level
        # receives '1' is not guaranteed to be stable between runs.
        for repeated_col in ('Sessions', 'Series'):
            if repeated_col in model_df:
                levels = list(set(list(model_df[repeated_col].values)))
                if len(levels) > 1:
                    level_map = {levels[0]: '1', levels[1]: '-1'}
                    if len(levels) == 3:
                        level_map.update({levels[2]: '0'})
                    model_df[repeated_col] = [
                        level_map[level]
                        for level in list(model_df[repeated_col].values)]

        keep_cols = [x for x in model_df.columns if x in design_formula]
        dmatrix = model_df[keep_cols].astype('float')
        dmatrix_column_names = list(dmatrix.columns)

    # check the model for multicollinearity - Patsy takes care of this, but
    # just in case
    check_multicollinearity(np.array(dmatrix))

    dmat_csv_path = os.path.join(model_path, "design_matrix.csv")

    # keep the original EV ordering but move the participant_<ID> columns
    # to the end (sorted) - mainly for repeated measures, for clarity
    column_names = _order_design_columns(dmatrix_column_names)

    # check to make sure there are more time points than EVs!
    # NOTE(review): the message is only printed; execution continues.
    if len(column_names) >= num_subjects:
        err = (
            "\n\n################## MODEL NOT GENERATED ##################"
            "\n\n[!] CPAC says: There are more EVs than there are "
            "participants currently included in the model for:\n\n"
            "Derivative: {0}\nSession: {1}\nScan: {2}\nPreproc strategy:"
            "\n {3}\n\n"
            "There must be more participants than EVs in the design.\n\n"
            "Number of participants: {4}\nNumber of EVs: {5}\n\nEV/"
            "covariate list: {6}\n\nNote: If you specified to model group "
            "variances separately, the amount of EVs can nearly double "
            "once they are split along the grouping variable.\n\nIf the "
            "number of participants is lower than the number of "
            "participants in your group analysis inclusion list, this "
            "may be because not every participant originally included has "
            "an output for {7} for this scan and preprocessing strategy in "
            "the individual-level analysis output directory.\n\nDesign "
            "formula going in: {8}"
            "\n\n#########################################################"
            "\n\n"
        ).format(resource_id, session_id, series_or_repeated_label,
                 preproc_strat, num_subjects, len(column_names),
                 column_names, resource_id, design_formula)
        print(err)

    # check the merged file's order
    check_merged_file(model_df["Filepath"], merge_file)

    # we must demean the categorical regressors if the Intercept/Grand Mean
    # is included in the model, otherwise FLAME produces blank outputs
    if "Intercept" in column_names:
        # NOTE(review): design_info only exists on the Patsy matrix; the
        # preset branch builds a plain DataFrame - presumably presets never
        # name a column "Intercept", confirm.
        col_name_indices = dmatrix.design_info.column_name_indexes
        cat_indices = [int(col_index)
                       for col_name, col_index in col_name_indices.items()
                       if "C(" in col_name]

        # note: dmat_T is now no longer a DesignMatrix Patsy object, but only
        # an array
        dmat_T = dmatrix.transpose()

        for index in cat_indices:
            dmat_T[index] = dmat_T[index] - dmat_T[index].mean()

        # we can go back, but we won't be the same
        dmatrix = dmat_T.transpose()

    dmatrix_df = pd.DataFrame(np.array(dmatrix),
                              index=model_df["participant_id"],
                              columns=dmatrix_column_names)

    # same ordering as above: participant_<ID> columns last
    dmatrix_df = dmatrix_df[column_names]

    write_design_matrix_csv(dmatrix_df, model_df["participant_id"],
                            column_names, dmat_csv_path)

    # time for contrasts: the template header is the design matrix columns,
    # plus the f-test columns when either contrasts setting is unset
    contrasts_columns = list(column_names)
    if (group_config_obj.custom_contrasts is None) or \
            (group_config_obj.contrasts is None):
        if group_config_obj.f_tests:
            # the first and last entries of f_tests are skipped
            for i in group_config_obj.f_tests[1:-1]:
                contrasts_columns.append('f_test_{0}'.format(i))

    contrast_out_path = os.path.join(model_dir, "contrasts.csv")

    if preset:
        # presets ship their own contrasts file: copy it next to the model
        cons = pd.read_csv(group_config_obj.custom_contrasts)
        with open(contrast_out_path, "w") as f:
            cons.to_csv(f, index=False)
    else:
        # refuse to overwrite a template that no longer looks blank
        if os.path.isfile(contrast_out_path):
            contrasts_df = pd.read_csv(contrast_out_path)
            if contrasts_df.shape[0] > 1 or \
                    np.count_nonzero(contrasts_df.values[0][1:]) > 0:
                msg = "\n\n[!] C-PAC says: It appears you have modified " \
                      "your contrasts CSV file already- back up this file " \
                      "before building your model again to avoid " \
                      "overwriting your changes.\n\nContrasts file:\n{0}" \
                      "\n\n".format(contrast_out_path)
                raise Exception(msg)

        # blank template: header row plus one all-zero contrast row
        with open(contrast_out_path, "w") as f:
            f.write('Contrasts' + ''.join(',{0}'.format(col)
                                          for col in contrasts_columns))
            f.write('\ncontrast_1' + ',0' * len(contrasts_columns))

    groups_out_path = os.path.join(model_path, 'groups.txt')
    with open(groups_out_path, 'w') as f:
        f.writelines('{0}\n'.format(val) for val in grp_vector)

    msg = (
        'Model successfully generated for..\nDerivative: {0}\nSession: {1}'
        '\nScan: {2}\nPreprocessing strategy:\n {3}\n\nModel directory:'
        '\n{4}\n\nGroup configuration file:\n{5}\n\nContrasts template CSV:'
        '\n{6}\n\nDefine your contrasts in this contrasts template CSV and '
        'save your changes, then run FSL-FEAT either using the GUI '
        'interface or through the command-line like so:\n\n cpac group '
        'feat run <path to group config.yml>'
        '\n'
    ).format(resource_id, session_id, series_or_repeated_label,
             preproc_strat, model_path, group_config_file,
             contrast_out_path)

    print('-------------------------------------------------------------------')
    print(msg)
    print('-------------------------------------------------------------------')

    return dmat_csv_path, new_sub_file, contrast_out_path
def prep_group_analysis_workflow(model_df, pipeline_config_path, \ model_name, group_config_path, resource_id, preproc_strat, \ series_or_repeated_label): # # this function runs once per derivative type and preproc strat combo # during group analysis # import os import patsy import numpy as np import nipype.pipeline.engine as pe import nipype.interfaces.utility as util import nipype.interfaces.io as nio from CPAC.pipeline.cpac_group_runner import load_config_yml from CPAC.utils.create_flame_model_files import create_flame_model_files from CPAC.utils.create_group_analysis_info_files import write_design_matrix_csv pipeline_config_obj = load_config_yml(pipeline_config_path) group_config_obj = load_config_yml(group_config_path) pipeline_ID = pipeline_config_obj.pipelineName # remove file names from preproc_strat filename = preproc_strat.split("/")[-1] preproc_strat = preproc_strat.replace(filename,"") preproc_strat = preproc_strat.lstrip("/").rstrip("/") # get thresholds z_threshold = float(group_config_obj.z_threshold[0]) p_threshold = float(group_config_obj.p_threshold[0]) sub_id_label = group_config_obj.participant_id_label ftest_list = [] readme_flags = [] # determine if f-tests are included or not custom_confile = group_config_obj.custom_contrasts if ((custom_confile == None) or (custom_confile == '') or \ ("None" in custom_confile) or ("none" in custom_confile)): custom_confile = None if (len(group_config_obj.f_tests) == 0) or \ (group_config_obj.f_tests == None): fTest = False else: fTest = True ftest_list = group_config_obj.f_tests else: if not os.path.exists(custom_confile): errmsg = "\n[!] CPAC says: You've specified a custom contrasts " \ ".CSV file for your group model, but this file cannot " \ "be found. 
Please double-check the filepath you have " \ "entered.\n\nFilepath: %s\n\n" % custom_confile raise Exception(errmsg) with open(custom_confile,"r") as f: evs = f.readline() evs = evs.rstrip('\r\n').split(',') count_ftests = 0 fTest = False for ev in evs: if "f_test" in ev: count_ftests += 1 if count_ftests > 0: fTest = True # create path for output directory out_dir = os.path.join(group_config_obj.output_dir, \ "group_analysis_results_%s" % pipeline_ID, \ "group_model_%s" % model_name, resource_id, \ series_or_repeated_label, preproc_strat) if 'sca_roi' in resource_id: out_dir = os.path.join(out_dir, \ re.search('sca_roi_(\d)+',os.path.splitext(\ os.path.splitext(os.path.basename(\ model_df["Filepath"][0]))[0])[0]).group(0)) if 'dr_tempreg_maps_zstat_files_to_standard_smooth' in resource_id: out_dir = os.path.join(out_dir, \ re.search('temp_reg_map_z_(\d)+',os.path.splitext(\ os.path.splitext(os.path.basename(\ model_df["Filepath"][0]))[0])[0]).group(0)) if 'centrality' in resource_id: names = ['degree_centrality_binarize', 'degree_centrality_weighted', \ 'eigenvector_centrality_binarize', \ 'eigenvector_centrality_weighted', \ 'lfcd_binarize', 'lfcd_weighted'] for name in names: if name in filename: out_dir = os.path.join(out_dir, name) break if 'tempreg_maps' in resource_id: out_dir = os.path.join(out_dir, re.search('\w*[#]*\d+', \ os.path.splitext(os.path.splitext(os.path.basename(\ model_df["Filepath"][0]))[0])[0]).group(0)) model_path = os.path.join(out_dir, 'model_files') second_half_out = \ out_dir.split("group_analysis_results_%s" % pipeline_ID)[1] # generate working directory for this output's group analysis run work_dir = os.path.join(pipeline_config_obj.workingDirectory, \ "group_analysis", second_half_out.lstrip("/")) log_dir = os.path.join(pipeline_config_obj.logDirectory, \ "group_analysis", second_half_out.lstrip("/")) # create the actual directories create_dir(model_path, "group analysis output") create_dir(work_dir, "group analysis working") 
create_dir(log_dir, "group analysis logfile") # create new subject list based on which subjects are left after checking # for missing outputs new_participant_list = [] for part in list(model_df["Participant"]): # do this instead of using "set" just in case, to preserve order # only reason there may be duplicates is because of multiple-series # repeated measures runs if part not in new_participant_list: new_participant_list.append(part) new_sub_file = write_new_sub_file(model_path, \ group_config_obj.participant_list, \ new_participant_list) group_config_obj.update('participant_list',new_sub_file) num_subjects = len(list(model_df["Participant"])) # start processing the dataframe further design_formula = group_config_obj.design_formula # demean EVs set for demeaning for demean_EV in group_config_obj.ev_selections["demean"]: model_df[demean_EV] = model_df[demean_EV].astype(float) model_df[demean_EV] = model_df[demean_EV].sub(model_df[demean_EV].mean()) # demean the motion params if ("MeanFD" in design_formula) or ("MeanDVARS" in design_formula): params = ["MeanFD_Power", "MeanFD_Jenkinson", "MeanDVARS"] for param in params: model_df[param] = model_df[param].astype(float) model_df[param] = model_df[param].sub(model_df[param].mean()) # create 4D merged copefile, in the correct order, identical to design # matrix merge_outfile = model_name + "_" + resource_id + "_merged.nii.gz" merge_outfile = os.path.join(model_path, merge_outfile) merge_file = create_merged_copefile(list(model_df["Filepath"]), \ merge_outfile) # create merged group mask merge_mask_outfile = model_name + "_" + resource_id + \ "_merged_mask.nii.gz" merge_mask_outfile = os.path.join(model_path, merge_mask_outfile) merge_mask = create_merge_mask(merge_file, merge_mask_outfile) if "Group Mask" in group_config_obj.mean_mask: mask_for_means = merge_mask else: individual_masks_dir = os.path.join(model_path, "individual_masks") create_dir(individual_masks_dir, "individual masks") for unique_id, series_id, 
raw_filepath in zip(model_df["Participant"], model_df["Series"], model_df["Raw_Filepath"]): mask_for_means_path = os.path.join(individual_masks_dir, "%s_%s_%s_mask.nii.gz" % (unique_id, series_id, resource_id)) mask_for_means = create_merge_mask(raw_filepath, mask_for_means_path) readme_flags.append("individual_masks") # calculate measure means, and demean if "Measure_Mean" in design_formula: model_df = calculate_measure_mean_in_df(model_df, mask_for_means) # calculate custom ROIs, and demean (in workflow?) if "Custom_ROI_Mean" in design_formula: custom_roi_mask = group_config_obj.custom_roi_mask if (custom_roi_mask == None) or (custom_roi_mask == "None") or \ (custom_roi_mask == "none") or (custom_roi_mask == ""): err = "\n\n[!] You included 'Custom_ROI_Mean' in your design " \ "formula, but you didn't supply a custom ROI mask file." \ "\n\nDesign formula: %s\n\n" % design_formula raise Exception(err) # make sure the custom ROI mask file is the same resolution as the # output files - if not, resample and warn the user roi_mask = check_mask_file_resolution(list(model_df["Raw_Filepath"])[0], \ custom_roi_mask, mask_for_means, \ model_path, resource_id) # trim the custom ROI mask to be within mask constraints output_mask = os.path.join(model_path, "masked_%s" \ % os.path.basename(roi_mask)) roi_mask = trim_mask(roi_mask, mask_for_means, output_mask) readme_flags.append("custom_roi_mask_trimmed") # calculate model_df = calculate_custom_roi_mean_in_df(model_df, roi_mask) # update the design formula new_design_substring = "" for col in model_df.columns: if "Custom_ROI_Mean_" in str(col): if str(col) == "Custom_ROI_Mean_1": new_design_substring = new_design_substring + " %s" % col else: new_design_substring = new_design_substring +" + %s" % col design_formula = design_formula.replace("Custom_ROI_Mean", \ new_design_substring) cat_list = [] if "categorical" in group_config_obj.ev_selections.keys(): cat_list = group_config_obj.ev_selections["categorical"] # prep design for 
repeated measures, if applicable if len(group_config_obj.sessions_list) > 0: design_formula = design_formula + " + Session" if "Session" not in cat_list: cat_list.append("Session") if len(group_config_obj.series_list) > 0: design_formula = design_formula + " + Series" if "Series" not in cat_list: cat_list.append("Series") for col in list(model_df.columns): if "participant_" in col: design_formula = design_formula + " + %s" % col cat_list.append(col) # parse out the EVs in the design formula at this point in time # this is essentially a list of the EVs that are to be included ev_list = parse_out_covariates(design_formula) # SPLIT GROUPS here. # CURRENT PROBLEMS: was creating a few doubled-up new columns grp_vector = [1] * num_subjects if group_config_obj.group_sep: # model group variances separately old_ev_list = ev_list model_df, grp_vector, ev_list, cat_list = split_groups(model_df, \ group_config_obj.grouping_var, \ ev_list, cat_list) # make the grouping variable categorical for Patsy (if we try to # do this automatically below, it will categorical-ize all of # the substrings too) design_formula = design_formula.replace(group_config_obj.grouping_var, \ "C(" + group_config_obj.grouping_var + ")") if group_config_obj.coding_scheme == "Sum": design_formula = design_formula.replace(")", ", Sum)") # update design formula rename = {} for old_ev in old_ev_list: for new_ev in ev_list: if old_ev + "__FOR" in new_ev: if old_ev not in rename.keys(): rename[old_ev] = [] rename[old_ev].append(new_ev) for old_ev in rename.keys(): design_formula = design_formula.replace(old_ev, \ " + ".join(rename[old_ev])) # prep design formula for Patsy design_formula = patsify_design_formula(design_formula, cat_list, \ group_config_obj.coding_scheme[0]) print design_formula # send to Patsy try: dmatrix = patsy.dmatrix(design_formula, model_df) except Exception as e: err = "\n\n[!] Something went wrong with processing the group model "\ "design matrix using the Python Patsy package. 
Patsy might " \ "not be properly installed, or there may be an issue with the "\ "formatting of the design matrix.\n\nPatsy-formatted design " \ "formula: %s\n\nError details: %s\n\n" \ % (model_df.columns, design_formula, e) raise Exception(err) print dmatrix.design_info.column_names print dmatrix # check the model for multicollinearity - Patsy takes care of this, but # just in case check_multicollinearity(np.array(dmatrix)) # prepare for final stages column_names = dmatrix.design_info.column_names # what is this for? design_matrix = np.array(dmatrix, dtype=np.float16) # check to make sure there are more time points than EVs! if len(column_names) >= num_subjects: err = "\n\n[!] CPAC says: There are more EVs than there are " \ "participants currently included in the model for %s. There " \ "must be more participants than EVs in the design.\n\nNumber " \ "of participants: %d\nNumber of EVs: %d\n\nEV/covariate list: "\ "%s\n\nNote: If you specified to model group " \ "variances separately, the amount of EVs can nearly double " \ "once they are split along the grouping variable.\n\n" \ "If the number of subjects is lower than the number of " \ "subjects in your group analysis subject list, this may be " \ "because not every subject in the subject list has an output " \ "for %s in the individual-level analysis output directory.\n\n"\ % (resource_id, num_subjects, len(column_names), column_names, \ resource_id) raise Exception(err) # time for contrasts contrasts_dict = None if ((custom_confile == None) or (custom_confile == '') or \ ("None" in custom_confile) or ("none" in custom_confile)): # if no custom contrasts matrix CSV provided (i.e. 
the user # specified contrasts in the GUI) contrasts_list = group_config_obj.contrasts contrasts_dict = create_contrasts_dict(dmatrix, contrasts_list, resource_id) # check the merged file's order check_merged_file(model_df["Filepath"], merge_file) # we must demean the categorical regressors if the Intercept/Grand Mean # is included in the model, otherwise FLAME produces blank outputs if "Intercept" in column_names: cat_indices = [] col_name_indices = dmatrix.design_info.column_name_indexes for col_name in col_name_indices.keys(): if "C(" in col_name: cat_indices.append(int(col_name_indices[col_name])) # note: dmat_T is now no longer a DesignMatrix Patsy object, but only # an array dmat_T = dmatrix.transpose() for index in cat_indices: new_row = [] for val in dmat_T[index]: new_row.append(val - dmat_T[index].mean()) dmat_T[index] = new_row # we can go back, but we won't be the same dmatrix = dmat_T.transpose() readme_flags.append("cat_demeaned") # send off the info so the FLAME input model files can be generated! 
mat_file, grp_file, con_file, fts_file = create_flame_model_files(dmatrix, \ column_names, contrasts_dict, custom_confile, ftest_list, \ group_config_obj.group_sep, grp_vector, group_config_obj.coding_scheme[0], \ model_name, resource_id, model_path) dmat_csv_path = os.path.join(model_path, "design_matrix.csv") write_design_matrix_csv(dmatrix, model_df["Participant"], column_names, \ dmat_csv_path) # workflow time wf_name = "%s_%s" % (resource_id, series_or_repeated_label) wf = pe.Workflow(name=wf_name) wf.base_dir = work_dir crash_dir = os.path.join(pipeline_config_obj.crashLogDirectory, \ "group_analysis", model_name) wf.config['execution'] = {'hash_method': 'timestamp', \ 'crashdump_dir': crash_dir} # gpa_wf # Creates the actual group analysis workflow gpa_wf = create_group_analysis(fTest, "gp_analysis_%s" % wf_name) gpa_wf.inputs.inputspec.merged_file = merge_file gpa_wf.inputs.inputspec.merge_mask = merge_mask gpa_wf.inputs.inputspec.z_threshold = z_threshold gpa_wf.inputs.inputspec.p_threshold = p_threshold gpa_wf.inputs.inputspec.parameters = (pipeline_config_obj.FSLDIR, \ 'MNI152') gpa_wf.inputs.inputspec.mat_file = mat_file gpa_wf.inputs.inputspec.con_file = con_file gpa_wf.inputs.inputspec.grp_file = grp_file if fTest: gpa_wf.inputs.inputspec.fts_file = fts_file # ds # Creates the datasink node for group analysis ds = pe.Node(nio.DataSink(), name='gpa_sink') # if c.mixedScanAnalysis == True: # out_dir = re.sub(r'(\w)*scan_(\w)*(\d)*(\w)*[/]', '', out_dir) ds.inputs.base_directory = str(out_dir) ds.inputs.container = '' ds.inputs.regexp_substitutions = [(r'(?<=rendered)(.)*[/]','/'), (r'(?<=model_files)(.)*[/]','/'), (r'(?<=merged)(.)*[/]','/'), (r'(?<=stats/clusterMap)(.)*[/]','/'), (r'(?<=stats/unthreshold)(.)*[/]','/'), (r'(?<=stats/threshold)(.)*[/]','/'), (r'_cluster(.)*[/]',''), (r'_slicer(.)*[/]',''), (r'_overlay(.)*[/]','')] ########datasink connections######### #if fTest: # wf.connect(gp_flow, 'outputspec.fts', # ds, 'model_files.@0') 
#wf.connect(gp_flow, 'outputspec.mat', # ds, 'model_files.@1' ) #wf.connect(gp_flow, 'outputspec.con', # ds, 'model_files.@2') #wf.connect(gp_flow, 'outputspec.grp', # ds, 'model_files.@3') wf.connect(gpa_wf, 'outputspec.merged', ds, 'merged') wf.connect(gpa_wf, 'outputspec.zstats', ds, 'stats.unthreshold') wf.connect(gpa_wf, 'outputspec.zfstats', ds,'stats.unthreshold.@01') wf.connect(gpa_wf, 'outputspec.fstats', ds,'stats.unthreshold.@02') wf.connect(gpa_wf, 'outputspec.cluster_threshold_zf', ds, 'stats.threshold') wf.connect(gpa_wf, 'outputspec.cluster_index_zf', ds,'stats.clusterMap') wf.connect(gpa_wf, 'outputspec.cluster_localmax_txt_zf', ds, 'stats.clusterMap.@01') wf.connect(gpa_wf, 'outputspec.overlay_threshold_zf', ds, 'rendered') wf.connect(gpa_wf, 'outputspec.rendered_image_zf', ds, 'rendered.@01') wf.connect(gpa_wf, 'outputspec.cluster_threshold', ds, 'stats.threshold.@01') wf.connect(gpa_wf, 'outputspec.cluster_index', ds, 'stats.clusterMap.@02') wf.connect(gpa_wf, 'outputspec.cluster_localmax_txt', ds, 'stats.clusterMap.@03') wf.connect(gpa_wf, 'outputspec.overlay_threshold', ds, 'rendered.@02') wf.connect(gpa_wf, 'outputspec.rendered_image', ds, 'rendered.@03') ###################################### # Run the actual group analysis workflow wf.run() print "\n\nWorkflow finished for model %s\n\n" % wf_name