Python load_config_ymlの例

プログラミング言語: Python

名前空間/パッケージ名: CPAC.pipeline.cpac_group_runner

メソッド/関数: load_config_yml

hotexamples.comのコード掲載数: 7

Python load_config_yml - 7件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのCPAC.pipeline.cpac_group_runner.load_config_ymlの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示

def run(group_config_path):
    """Set up the randomise group-level workflow from a group config file.

    The group configuration at ``group_config_path`` is loaded with
    ``load_config_yml``.  The participant entries are taken from the
    configuration's ``participant_list``; when that is ``None`` they fall
    back to the names of the sub-directories found directly inside
    ``pipeline_dir``.  Those entries are passed to ``randomise_merged_file``
    and ``randomise_merged_mask``, and the results are handed to
    ``prep_randomise_workflow``.

    Parameters
    ----------
    group_config_path : str
        Path of the group-analysis configuration YAML file.
    """
    import os
    import subprocess

    # NOTE(review): the command runs in a separate shell process, so it
    # presumably cannot change this process's environment - confirm whether
    # it is still needed before removing it.
    subprocess.getoutput('source ~/.bashrc')

    group_config_obj = load_config_yml(group_config_path)
    pipeline_output_folder = group_config_obj.pipeline_dir

    if group_config_obj.participant_list is not None:
        s_paths = group_config_obj.participant_list
    else:
        # os.listdir() yields bare entry names, so each one has to be joined
        # with the pipeline folder before it can be tested with isdir();
        # testing the bare name would check it against the current working
        # directory instead.
        s_paths = [
            entry for entry in os.listdir(pipeline_output_folder)
            if os.path.isdir(os.path.join(pipeline_output_folder, entry))
        ]

    merged_file = randomise_merged_file(s_paths)

    out_file = randomise_merged_mask(s_paths)

    prep_randomise_workflow(group_config_obj,
                            merged_file=merged_file,
                            mask_file=out_file,
                            working_dir=None,
                            output_dir=None,
                            crash_dir=None)

コード例 #2

ファイルを表示

ファイル: cpac_randomise_pipeline.py プロジェクト: FCP-INDI/C-PAC

def load_subject_file(group_config_path):
    """Work out the participant entries for a group configuration.

    Loads the group configuration with ``load_config_yml`` and uses its
    ``participant_list`` when one is set; otherwise the names of the
    sub-directories found directly inside ``pipeline_dir`` are used.

    NOTE(review): as written the function computes ``s_paths`` but does not
    return it - confirm whether a ``return s_paths`` is missing here.
    """
    group_config_obj = load_config_yml(group_config_path)
    pipeline_output_folder = group_config_obj.pipeline_dir

    if group_config_obj.participant_list is not None:
        s_paths = group_config_obj.participant_list
    else:
        # os.listdir() yields bare names; join them with the pipeline folder
        # so isdir() inspects the right location rather than the current
        # working directory.
        s_paths = [
            entry for entry in os.listdir(pipeline_output_folder)
            if os.path.isdir(os.path.join(pipeline_output_folder, entry))
        ]

コード例 #3

ファイルを表示

def load_subject_file(group_config_path):
    """Work out the participant entries for a group configuration.

    Loads the group configuration with ``load_config_yml`` and uses its
    ``participant_list`` when one is set; otherwise the names of the
    sub-directories found directly inside ``pipeline_dir`` are used.

    NOTE(review): as written the function computes ``s_paths`` but does not
    return it - confirm whether a ``return s_paths`` is missing here.
    """
    group_config_obj = load_config_yml(group_config_path)
    pipeline_output_folder = group_config_obj.pipeline_dir

    if group_config_obj.participant_list is not None:
        s_paths = group_config_obj.participant_list
    else:
        # os.listdir() yields bare names; join them with the pipeline folder
        # so isdir() inspects the right location rather than the current
        # working directory.
        s_paths = [
            entry for entry in os.listdir(pipeline_output_folder)
            if os.path.isdir(os.path.join(pipeline_output_folder, entry))
        ]

コード例 #4

ファイルを表示

def build_feat_model(model_df, model_name, group_config_file, resource_id,
                     preproc_strat, session_id, series_or_repeated_label):
    """Build the FSL-FEAT group model files for one derivative/strategy combo.

    This function runs once per derivative type and preprocessing strategy
    combination during group analysis.  It creates the model output
    directories, writes a participant file, merges the input files and
    their masks, assembles the design matrix (through Patsy, or directly
    from ``model_df`` when the group configuration has a ``preset``
    attribute), and writes the design matrix CSV, a contrasts CSV and the
    FLAME groups file.

    Parameters
    ----------
    model_df : pandas.DataFrame
        Table describing the model.  The code reads the "participant_id"
        and "Filepath" columns and, depending on the configuration, the
        "Series", "Raw_Filepath", "session" and "Sessions" columns as well
        as the EV columns named in the design formula.
    model_name : str
        Name of the group model; used in directory and file names.
    group_config_file : str
        Path of the group configuration file, loaded with
        ``load_config_yml``.
    resource_id : str
        Name of the derivative being modelled.
    preproc_strat : str
        Slash-separated preprocessing strategy path ending in a file name.
    session_id : str
        Path component placed under the resource directory.
    series_or_repeated_label : str
        Path component placed under the session directory.

    Returns
    -------
    tuple
        ``(dmat_csv_path, new_sub_file, contrast_out_path)``.

    Raises
    ------
    Exception
        If "Custom_ROI_Mean" is in the design formula without a custom ROI
        mask, if the grouping covariates are not valid for separate group
        variances, if Patsy fails to build the design matrix, or if an
        existing contrasts CSV appears to have been edited already.
    """

    import os
    # the output-directory lookups below use regular expressions
    import re
    import patsy
    import pandas as pd
    import numpy as np

    import nipype.pipeline.engine as pe
    import nipype.interfaces.utility as util
    import nipype.interfaces.io as nio
    from CPAC.pipeline.cpac_group_runner import load_config_yml

    from CPAC.utils.create_group_analysis_info_files import write_design_matrix_csv, \
        write_blank_contrast_csv

    def _first_filepath_stem():
        # base name of the first input file with up to two extensions
        # (e.g. ".nii.gz") removed; evaluated lazily, only when needed
        base = os.path.basename(model_df["Filepath"][0])
        return os.path.splitext(os.path.splitext(base)[0])[0]

    def _participant_cols_last(names):
        # keep the regular columns in their given order and move the
        # repeated-measures "participant_<ID>" columns, sorted, to the end
        # for clarity for users
        regular = [name for name in names if 'participant_' not in name]
        id_cols = sorted(name for name in names if 'participant_' in name)
        return regular + id_cols

    group_config_obj = load_config_yml(group_config_file)

    pipeline_ID = group_config_obj.pipeline_dir.rstrip('/').split('/')[-1]

    # remember the trailing file name (used for the centrality lookup
    # below), then strip the NIfTI extensions and surrounding slashes
    filename = preproc_strat.split("/")[-1]
    preproc_strat = preproc_strat.replace('.nii', '').replace('.gz', '')
    preproc_strat = preproc_strat.lstrip("/").rstrip("/")

    readme_flags = []

    # normalise "no custom contrasts file" spellings to None
    custom_confile = group_config_obj.custom_contrasts

    if ((custom_confile is None) or (custom_confile == '')
            or ("None" in custom_confile) or ("none" in custom_confile)):
        custom_confile = None

    # create path for output directory
    model_dir = os.path.join(group_config_obj.output_dir,
                             'cpac_group_analysis', 'FSL_FEAT',
                             '{0}'.format(pipeline_ID),
                             'group_model_{0}'.format(model_name))

    out_dir = os.path.join(model_dir, resource_id, session_id,
                           series_or_repeated_label, preproc_strat)

    # a "preset" attribute on the configuration switches the design-matrix
    # and contrasts handling further down
    try:
        preset_contrast = group_config_obj.preset
        preset = True
    except AttributeError:
        preset = False

    if 'sca_roi' in resource_id:
        out_dir = os.path.join(
            out_dir,
            re.search(r'sca_ROI_(\d)+', _first_filepath_stem()).group(0))

    if 'dr_tempreg_maps_zstat_files_to_standard_smooth' in resource_id:
        out_dir = os.path.join(
            out_dir,
            re.search(r'temp_reg_map_z_(\d)+',
                      _first_filepath_stem()).group(0))

    if 'centrality' in resource_id:
        names = [
            'degree_centrality_binarize', 'degree_centrality_weighted',
            'eigenvector_centrality_binarize',
            'eigenvector_centrality_weighted', 'lfcd_binarize', 'lfcd_weighted'
        ]

        for name in names:
            if name in filename:
                out_dir = os.path.join(out_dir, name)
                break

    if 'tempreg_maps' in resource_id:
        out_dir = os.path.join(
            out_dir,
            re.search(r'\w*[#]*\d+', _first_filepath_stem()).group(0))

    model_path = os.path.join(out_dir, 'model_files')

    # create the actual directories
    create_dir(model_path, "group analysis output")

    # create new subject list based on which subjects are left after checking
    # for missing outputs
    new_participant_list = []
    for part in model_df["participant_id"]:
        # do this instead of using "set" just in case, to preserve order
        #   only reason there may be duplicates is because of multiple-series
        #   repeated measures runs
        if part not in new_participant_list:
            new_participant_list.append(part)

    if group_config_obj.participant_list is None:
        new_sub_file = write_new_sub_file(model_path,
                                          group_config_obj.pipeline_dir,
                                          new_participant_list)
    else:
        new_sub_file = write_new_sub_file(model_path,
                                          group_config_obj.participant_list,
                                          new_participant_list)

    group_config_obj.update('participant_list', new_sub_file)

    num_subjects = len(list(model_df["participant_id"]))

    # start processing the dataframe further
    design_formula = group_config_obj.design_formula

    # demean EVs set for demeaning
    for demean_EV in group_config_obj.ev_selections.get("demean", []):
        model_df[demean_EV] = model_df[demean_EV].astype(float)
        model_df[demean_EV] = model_df[demean_EV].sub(
            model_df[demean_EV].mean())

    # demean the motion params
    if ("MeanFD" in design_formula) or ("MeanDVARS" in design_formula):
        params = ["MeanFD_Power", "MeanFD_Jenkinson", "MeanDVARS"]
        for param in params:
            model_df[param] = model_df[param].astype(float)
            model_df[param] = model_df[param].sub(model_df[param].mean())

    # create 4D merged copefile, in the correct order, identical to design
    # matrix
    merge_outfile = model_name + "_" + resource_id + "_merged.nii.gz"
    merge_outfile = os.path.join(model_path, merge_outfile)

    merge_file = create_merged_copefile(model_df["Filepath"].tolist(),
                                        merge_outfile)

    # create merged group mask
    merge_mask_outfile = '_'.join(
        [model_name, resource_id, "merged_mask.nii.gz"])
    merge_mask_outfile = os.path.join(model_path, merge_mask_outfile)
    merge_mask = create_merge_mask(merge_file, merge_mask_outfile)

    if "Group Mask" in group_config_obj.mean_mask:
        mask_for_means = merge_mask
    else:
        individual_masks_dir = os.path.join(model_path, "individual_masks")
        create_dir(individual_masks_dir, "individual masks")
        # NOTE(review): mask_for_means ends up holding only the mask of the
        # last row - confirm this is what the mean calculations expect.
        for unique_id, series_id, raw_filepath in zip(
                model_df["participant_id"], model_df["Series"],
                model_df["Raw_Filepath"]):
            mask_for_means_path = os.path.join(
                individual_masks_dir,
                "%s_%s_%s_mask.nii.gz" % (unique_id, series_id, resource_id))
            mask_for_means = create_merge_mask(raw_filepath,
                                               mask_for_means_path)
        readme_flags.append("individual_masks")

    # calculate measure means, and demean
    if "Measure_Mean" in design_formula:
        model_df = calculate_measure_mean_in_df(model_df, mask_for_means)

    # calculate custom ROIs, and demean (in workflow?)
    if "Custom_ROI_Mean" in design_formula:

        custom_roi_mask = group_config_obj.custom_roi_mask

        if (custom_roi_mask is None) or (custom_roi_mask == "None") or \
                (custom_roi_mask == "none") or (custom_roi_mask == ""):
            err = "\n\n[!] You included 'Custom_ROI_Mean' in your design " \
                  "formula, but you didn't supply a custom ROI mask file." \
                  "\n\nDesign formula: %s\n\n" % design_formula
            raise Exception(err)

        # make sure the custom ROI mask file is the same resolution as the
        # output files - if not, resample and warn the user
        roi_mask = check_mask_file_resolution(
            list(model_df["Raw_Filepath"])[0], custom_roi_mask, mask_for_means,
            model_path, resource_id)

        # trim the custom ROI mask to be within mask constraints
        output_mask = os.path.join(model_path, "masked_%s" \
                                   % os.path.basename(roi_mask))
        roi_mask = trim_mask(roi_mask, mask_for_means, output_mask)
        readme_flags.append("custom_roi_mask_trimmed")

        # calculate
        model_df = calculate_custom_roi_mean_in_df(model_df, roi_mask)

        # update the design formula: the single "Custom_ROI_Mean" term is
        # replaced by the sum of the generated Custom_ROI_Mean_<n> columns
        new_design_substring = ""

        for col in model_df.columns:
            if "Custom_ROI_Mean_" in str(col):
                if str(col) == "Custom_ROI_Mean_1":
                    new_design_substring = new_design_substring + " %s" % col
                else:
                    new_design_substring = new_design_substring + " + %s" % col
        design_formula = design_formula.replace("Custom_ROI_Mean",
                                                new_design_substring)

    cat_list = []
    if "categorical" in group_config_obj.ev_selections.keys():
        cat_list = group_config_obj.ev_selections["categorical"]

    # prep design for repeated measures, if applicable
    if len(group_config_obj.sessions_list) > 0:
        if "session" in model_df.columns:
            # if these columns were added by the model builder automatically
            design_formula = design_formula + " + session"
            if "session" not in cat_list:
                cat_list.append("session")

    if len(group_config_obj.series_list) > 0:
        design_formula = design_formula + " + Series"
        if "Series" not in cat_list:
            cat_list.append("Series")

    if "session" in model_df.columns:
        # if these columns were added by the model builder automatically
        for col in model_df.columns:
            # should only grab the repeated measures-designed participant_{ID}
            # columns, not the "participant_id" column!
            if "participant_" in col and "_id" not in col:
                design_formula = design_formula + " + %s" % col
                cat_list.append(col)

    # parse out the EVs in the design formula at this point in time
    #   this is essentially a list of the EVs that are to be included
    ev_list = parse_out_covariates(design_formula)

    # SPLIT GROUPS here.
    #   CURRENT PROBLEMS: was creating a few doubled-up new columns
    grp_vector = [1] * num_subjects

    if group_config_obj.group_sep:

        # check if the group_ev parameter is a list instead of a string:
        # this was added to handle the new group-level analysis presets. this
        # is the only modification that was required to the group analysis
        # workflow, and it handles cases where the group variances must be
        # modeled separately, by creating separate groups for the FSL FLAME
        # .grp file.
        #     the group_ev parameter gets sent in as a list if coming from any
        #     of the presets that deal with multiple groups- in these cases,
        #     the pheno_df/design matrix is already set up properly for the
        #     multiple groups, and we need to bypass all of the processing
        #     that usually occurs when the "modeling group variances
        #     separately" option is enabled in the group analysis config YAML
        group_ev = group_config_obj.grouping_var

        if isinstance(group_ev, list) or "," in group_ev:
            grp_vector = []

            if "," in group_ev:
                group_ev = group_ev.split(",")

            if len(group_ev) == 2:
                for x, y in zip(model_df[group_ev[0]], model_df[group_ev[1]]):
                    if x == 1:
                        grp_vector.append(1)
                    elif y == 1:
                        grp_vector.append(2)
                    else:
                        err = "\n\n[!] The two categorical covariates you " \
                              "provided as the two separate groups (in order " \
                              "to model each group's variances separately) " \
                              "either have more than 2 levels (1/0), or are " \
                              "not encoded as 1's and 0's.\n\nCovariates:\n" \
                              "{0}\n{1}\n\n".format(group_ev[0], group_ev[1])
                        raise Exception(err)

            elif len(group_ev) == 3:
                for x, y, z in zip(model_df[group_ev[0]],
                                   model_df[group_ev[1]],
                                   model_df[group_ev[2]]):
                    if x == 1:
                        grp_vector.append(1)
                    elif y == 1:
                        grp_vector.append(2)
                    elif z == 1:
                        grp_vector.append(3)
                    else:
                        err = "\n\n[!] The three categorical covariates you " \
                              "provided as the three separate groups (in order " \
                              "to model each group's variances separately) " \
                              "either have more than 2 levels (1/0), or are " \
                              "not encoded as 1's and 0's.\n\nCovariates:\n" \
                              "{0}\n{1}\n{2}\n\n".format(group_ev[0],
                                                         group_ev[1],
                                                         group_ev[2])
                        raise Exception(err)

            else:
                # we're only going to see this if someone plays around with
                # their preset or config file manually
                err = "\n\n[!] If you are seeing this message, it's because:\n" \
                      "1. You are using the group-level analysis presets\n" \
                      "2. You are running a model with multiple groups having " \
                      "their variances modeled separately (i.e. multiple " \
                      "values in the FSL FLAME .grp input file), and\n" \
                      "3. For some reason, the configuration has been set up " \
                      "in a way where CPAC currently thinks you're including " \
                      "only one group, or more than three, neither of which " \
                      "are supported.\n\nGroups provided:\n{0}" \
                      "\n\n".format(str(group_ev))
                raise Exception(err)

        else:
            # model group variances separately
            old_ev_list = ev_list

            model_df, grp_vector, ev_list, cat_list = split_groups(
                model_df, group_config_obj.grouping_var, ev_list, cat_list)

            # make the grouping variable categorical for Patsy (if we try to
            # do this automatically below, it will categorical-ize all of
            # the substrings too)
            design_formula = design_formula.replace(
                group_config_obj.grouping_var,
                "C(" + group_config_obj.grouping_var + ")")
            # NOTE(review): coding_scheme is compared as a string here but
            # indexed with [0] further down - confirm its expected type.
            if group_config_obj.coding_scheme == "Sum":
                design_formula = design_formula.replace(")", ", Sum)")

            # update design formula: every EV that was split along the
            # grouping variable is replaced by the sum of its split versions
            rename = {}
            for old_ev in old_ev_list:
                for new_ev in ev_list:
                    if old_ev + "__FOR" in new_ev:
                        if old_ev not in rename.keys():
                            rename[old_ev] = []
                        rename[old_ev].append(new_ev)

            for old_ev in rename.keys():
                design_formula = design_formula.replace(
                    old_ev, " + ".join(rename[old_ev]))

    # prep design formula for Patsy
    design_formula = patsify_design_formula(design_formula, cat_list,
                                            group_config_obj.coding_scheme[0])

    if not preset:
        # send to Patsy
        try:
            dmatrix = patsy.dmatrix(design_formula, model_df)
            dmatrix.design_info.column_names.append(model_df["Filepath"])
            dmatrix_column_names = dmatrix.design_info.column_names
        except Exception as e:
            err = "\n\n[!] Something went wrong with processing the group model "\
                  "design matrix using the Python Patsy package. Patsy might " \
                  "not be properly installed, or there may be an issue with the "\
                  "formatting of the design matrix.\n\nDesign matrix columns: " \
                  "%s\n\nPatsy-formatted design formula: %s\n\nError details: " \
                  "%s\n\n" % (model_df.columns, design_formula, e)
            raise Exception(err)
    else:
        # recode the levels of the repeated-measures columns as the strings
        # '1', '-1' (and '0' for a third level) before the float conversion
        for level_col in ('Sessions', 'Series'):
            if level_col in model_df:
                sess_levels = list(set(list(model_df[level_col].values)))
                if len(sess_levels) > 1:
                    sess_map = {sess_levels[0]: '1', sess_levels[1]: '-1'}
                    if len(sess_levels) == 3:
                        sess_map.update({sess_levels[2]: '0'})
                    new_sess = [
                        s.replace(s, sess_map[s])
                        for s in list(model_df[level_col].values)
                    ]
                    model_df[level_col] = new_sess

        keep_cols = [x for x in model_df.columns if x in design_formula]
        dmatrix = model_df[keep_cols].astype('float')
        dmatrix_column_names = list(dmatrix.columns)

    # check the model for multicollinearity - Patsy takes care of this, but
    # just in case
    check_multicollinearity(np.array(dmatrix))

    # make sure "column_names" is in the same order as the original EV column
    # header ordering in model_df - mainly for repeated measures, to make sure
    # participants_<ID> cols are at end for clarity for users
    column_names = _participant_cols_last(dmatrix_column_names)

    # check to make sure there are more time points than EVs!
    # (this only prints the warning; model generation continues)
    if len(column_names) >= num_subjects:
        err = "\n\n################## MODEL NOT GENERATED ##################" \
              "\n\n[!] CPAC says: There are more EVs than there are " \
              "participants currently included in the model for:\n\n" \
              "Derivative: {0}\nSession: {1}\nScan: {2}\nPreproc strategy:" \
              "\n    {3}\n\n" \
              "There must be more participants than EVs in the design.\n\n" \
              "Number of participants: {4}\nNumber of EVs: {5}\n\nEV/" \
              "covariate list: {6}\n\nNote: If you specified to model group " \
              "variances separately, the amount of EVs can nearly double " \
              "once they are split along the grouping variable.\n\nIf the " \
              "number of participants is lower than the number of " \
              "participants in your group analysis inclusion list, this " \
              "may be because not every participant originally included has " \
              "an output for {7} for this scan and preprocessing strategy in " \
              "the individual-level analysis output directory.\n\nDesign " \
              "formula going in: {8}" \
              "\n\n#########################################################" \
              "\n\n".format(resource_id, session_id, series_or_repeated_label,
                            preproc_strat, num_subjects, len(column_names),
                            column_names, resource_id, design_formula)
        print(err)

    # check the merged file's order
    check_merged_file(model_df["Filepath"], merge_file)

    # we must demean the categorical regressors if the Intercept/Grand Mean
    # is included in the model, otherwise FLAME produces blank outputs
    # NOTE(review): this reads dmatrix.design_info, which the preset branch
    # above (a plain DataFrame) presumably does not provide - confirm that
    # "Intercept" can never be a column in the preset case.
    if "Intercept" in column_names:
        cat_indices = []
        col_name_indices = dmatrix.design_info.column_name_indexes
        for col_name in col_name_indices.keys():
            if "C(" in col_name:
                cat_indices.append(int(col_name_indices[col_name]))

        # note: dmat_T is now no longer a DesignMatrix Patsy object, but only
        # an array
        dmat_T = dmatrix.transpose()

        for index in cat_indices:
            new_row = []
            for val in dmat_T[index]:
                new_row.append(val - dmat_T[index].mean())
            dmat_T[index] = new_row

        # we can go back, but we won't be the same
        dmatrix = dmat_T.transpose()
        readme_flags.append("cat_demeaned")

    dmatrix_df = pd.DataFrame(np.array(dmatrix),
                              index=model_df["participant_id"],
                              columns=dmatrix_column_names)

    # same column ordering as above, now applied to the dataframe itself
    column_names = _participant_cols_last(dmatrix_df.columns.tolist())

    dmatrix_df = dmatrix_df[column_names]

    dmat_csv_path = os.path.join(model_path, "design_matrix.csv")

    write_design_matrix_csv(dmatrix_df, model_df["participant_id"],
                            column_names, dmat_csv_path)

    # time for contrasts
    # start from a copy of the design-matrix columns so the template header
    # is always defined, whichever branch is taken below
    contrasts_columns = list(column_names)
    if (group_config_obj.custom_contrasts is None) or \
            (group_config_obj.contrasts is None):
        # if no custom contrasts matrix CSV provided (i.e. the user
        # specified contrasts in the GUI)
        if group_config_obj.f_tests:
            for i in group_config_obj.f_tests[1:len(group_config_obj.f_tests) -
                                              1]:
                contrasts_columns.append('f_test_{0}'.format(i))

    contrast_out_path = os.path.join(model_dir, "contrasts.csv")

    if preset:
        cons = pd.read_csv(group_config_obj.custom_contrasts)
        with open(contrast_out_path, "w") as f:
            cons.to_csv(f, index=False)
    else:
        # refuse to overwrite a contrasts file that already has more than the
        # single all-zero template row
        if os.path.isfile(contrast_out_path):
            contrasts_df = pd.read_csv(contrast_out_path)
            if contrasts_df.shape[0] > 1 or np.count_nonzero(
                    contrasts_df.values[0][1:]) > 0:
                msg = "\n\n[!] C-PAC says: It appears you have modified your " \
                      "contrasts CSV file already- back up this file before " \
                      "building your model again to avoid overwriting your " \
                      "changes.\n\nContrasts file:\n{0}" \
                      "\n\n".format(contrast_out_path)
                raise Exception(msg)

        # write a blank template: a header row plus one all-zero contrast
        with open(contrast_out_path, "w") as f:
            f.write('Contrasts')
            for col in contrasts_columns:
                f.write(',{0}'.format(col))
            f.write('\ncontrast_1')
            for col in contrasts_columns:
                f.write(',0')

    groups_out_path = os.path.join(model_path, 'groups.txt')
    with open(groups_out_path, 'w') as f:
        for val in grp_vector:
            f.write('{0}\n'.format(val))

    msg = 'Model successfully generated for..\nDerivative: {0}\nSession: {1}' \
          '\nScan: {2}\nPreprocessing strategy:\n    {3}\n\nModel directory:' \
          '\n{4}\n\nGroup configuration file:\n{5}\n\nContrasts template CSV:' \
          '\n{6}\n\nDefine your contrasts in this contrasts template CSV and ' \
          'save your changes, then run FSL-FEAT either using the GUI ' \
          'interface or through the command-line like so:\n\n    cpac group ' \
          'feat run <path to group config.yml>' \
           '\n'.format(resource_id, session_id, series_or_repeated_label,
                       preproc_strat, model_path, group_config_file,
                       contrast_out_path)
    print(
        '-------------------------------------------------------------------')
    print(msg)
    print(
        '-------------------------------------------------------------------')

    return dmat_csv_path, new_sub_file, contrast_out_path

コード例 #5

ファイルを表示

ファイル: cpac_ga_model_generator.py プロジェクト: spisakt/C-PAC

def prep_group_analysis_workflow(model_df, pipeline_config_path,
                                 model_name, group_config_path, resource_id,
                                 preproc_strat, series_or_repeated_label):
    """Build the FSL FLAME model files for one output and run group analysis.

    This function runs once per derivative type and preprocessing strategy
    combination during group analysis. It loads the pipeline and group
    configuration files, prepares the output/working/log directories,
    demeans the requested EVs, merges the input files, builds the design
    matrix with Patsy, writes the FLAME model files, and finally builds and
    runs the nipype group analysis workflow.

    Parameters
    ----------
    model_df : pandas.DataFrame
        Model/phenotype dataframe. The columns "Participant", "Filepath",
        "Series" and "Raw_Filepath" are read here, in addition to whatever
        EV columns the design formula refers to.
    pipeline_config_path : str
        Path to the pipeline configuration YAML (read with
        ``load_config_yml``).
    model_name : str
        Name of the group model; used in directory and file names.
    group_config_path : str
        Path to the group analysis configuration YAML (read with
        ``load_config_yml``).
    resource_id : str
        Name of the derivative/output being analyzed.
    preproc_strat : str
        "/"-separated preprocessing strategy path ending in a file name; the
        file name is split off and the remainder is used as a sub-directory.
    series_or_repeated_label : str
        Label used as a sub-directory and in the workflow name.

    Returns
    -------
    None
        Results are written to disk by the workflow's datasink.

    Raises
    ------
    Exception
        If the custom contrasts file does not exist, if "Custom_ROI_Mean" is
        in the design formula without a custom ROI mask, if Patsy fails to
        build the design matrix, or if there are not more participants than
        EVs in the design.
    """

    import os
    import re
    import patsy
    import numpy as np

    import nipype.pipeline.engine as pe
    import nipype.interfaces.utility as util
    import nipype.interfaces.io as nio

    from CPAC.pipeline.cpac_group_runner import load_config_yml
    from CPAC.utils.create_flame_model_files import create_flame_model_files
    from CPAC.utils.create_group_analysis_info_files import write_design_matrix_csv

    def _first_filepath_stem():
        # base name of the first input file with up to two extensions
        # (e.g. ".nii.gz") stripped; evaluated lazily so that model_df is
        # only indexed for the resources that actually need it
        return os.path.splitext(os.path.splitext(os.path.basename(
            model_df["Filepath"][0]))[0])[0]

    pipeline_config_obj = load_config_yml(pipeline_config_path)
    group_config_obj = load_config_yml(group_config_path)

    pipeline_ID = pipeline_config_obj.pipelineName

    # remove the file name from preproc_strat. only the trailing path
    # component is split off - str.replace() would also delete any earlier
    # occurrence of the same text inside the strategy path
    preproc_strat, _, filename = preproc_strat.rpartition("/")
    preproc_strat = preproc_strat.strip("/")

    # get thresholds
    z_threshold = float(group_config_obj.z_threshold[0])

    p_threshold = float(group_config_obj.p_threshold[0])

    sub_id_label = group_config_obj.participant_id_label

    ftest_list = []
    readme_flags = []

    # determine if f-tests are included or not
    custom_confile = group_config_obj.custom_contrasts

    if (custom_confile is None or custom_confile == '' or
            "None" in custom_confile or "none" in custom_confile):

        custom_confile = None

        # check for None *before* calling len(), otherwise a missing f-test
        # list raises a TypeError instead of simply disabling f-tests
        f_tests = group_config_obj.f_tests
        if f_tests is None or len(f_tests) == 0:
            fTest = False
        else:
            fTest = True
            ftest_list = f_tests

    else:

        if not os.path.exists(custom_confile):
            errmsg = "\n[!] CPAC says: You've specified a custom contrasts " \
                     ".CSV file for your group model, but this file cannot " \
                     "be found. Please double-check the filepath you have " \
                     "entered.\n\nFilepath: %s\n\n" % custom_confile
            raise Exception(errmsg)

        # only the header row is needed: f-tests are enabled when any
        # column name contains "f_test"
        with open(custom_confile, "r") as f:
            evs = f.readline()

        evs = evs.rstrip('\r\n').split(',')
        fTest = any("f_test" in ev for ev in evs)

    # create path for output directory
    out_dir = os.path.join(group_config_obj.output_dir,
                           "group_analysis_results_%s" % pipeline_ID,
                           "group_model_%s" % model_name, resource_id,
                           series_or_repeated_label, preproc_strat)

    # some outputs get one more directory level, taken from the name of the
    # first input file. note that these checks are not mutually exclusive:
    # the dual regression resource name also contains "tempreg_maps"
    if 'sca_roi' in resource_id:
        out_dir = os.path.join(
            out_dir,
            re.search(r'sca_ROI_(\d)+', _first_filepath_stem()).group(0))

    if 'dr_tempreg_maps_zstat_files_to_standard_smooth' in resource_id:
        out_dir = os.path.join(
            out_dir,
            re.search(r'temp_reg_map_z_(\d)+',
                      _first_filepath_stem()).group(0))

    if 'centrality' in resource_id:
        names = ['degree_centrality_binarize', 'degree_centrality_weighted',
                 'eigenvector_centrality_binarize',
                 'eigenvector_centrality_weighted',
                 'lfcd_binarize', 'lfcd_weighted']

        for name in names:
            if name in filename:
                out_dir = os.path.join(out_dir, name)
                break

    if 'tempreg_maps' in resource_id:
        out_dir = os.path.join(
            out_dir,
            re.search(r'\w*[#]*\d+', _first_filepath_stem()).group(0))

    model_path = os.path.join(out_dir, 'model_files')

    # everything after the "group_analysis_results_<pipeline>" directory is
    # mirrored under the working and log directories
    second_half_out = \
        out_dir.split("group_analysis_results_%s" % pipeline_ID)[1]

    # generate working directory for this output's group analysis run
    work_dir = os.path.join(pipeline_config_obj.workingDirectory,
                            "group_analysis", second_half_out.lstrip("/"))

    log_dir = os.path.join(pipeline_config_obj.logDirectory,
                           "group_analysis", second_half_out.lstrip("/"))

    # create the actual directories
    create_dir(model_path, "group analysis output")
    create_dir(work_dir, "group analysis working")
    create_dir(log_dir, "group analysis logfile")

    # create new subject list based on which subjects are left after checking
    # for missing outputs
    new_participant_list = []
    for part in list(model_df["Participant"]):
        # do this instead of using "set" just in case, to preserve order
        #   only reason there may be duplicates is because of multiple-series
        #   repeated measures runs
        if part not in new_participant_list:
            new_participant_list.append(part)

    new_sub_file = write_new_sub_file(model_path,
                                      group_config_obj.participant_list,
                                      new_participant_list)

    group_config_obj.update('participant_list', new_sub_file)

    # one row per input file (duplicated participants are counted)
    num_subjects = len(model_df["Participant"])

    # start processing the dataframe further
    design_formula = group_config_obj.design_formula

    # demean EVs set for demeaning
    for demean_EV in group_config_obj.ev_selections.get("demean", []):
        model_df[demean_EV] = model_df[demean_EV].astype(float)
        model_df[demean_EV] = model_df[demean_EV].sub(
            model_df[demean_EV].mean())

    # demean the motion params. all three columns are demeaned as soon as
    # either name appears in the formula, so all three must be in model_df
    if ("MeanFD" in design_formula) or ("MeanDVARS" in design_formula):
        params = ["MeanFD_Power", "MeanFD_Jenkinson", "MeanDVARS"]
        for param in params:
            model_df[param] = model_df[param].astype(float)
            model_df[param] = model_df[param].sub(model_df[param].mean())

    # create 4D merged copefile, in the correct order, identical to design
    # matrix
    merge_outfile = model_name + "_" + resource_id + "_merged.nii.gz"
    merge_outfile = os.path.join(model_path, merge_outfile)

    merge_file = create_merged_copefile(list(model_df["Filepath"]),
                                        merge_outfile)

    # create merged group mask
    merge_mask_outfile = model_name + "_" + resource_id + \
                         "_merged_mask.nii.gz"
    merge_mask_outfile = os.path.join(model_path, merge_mask_outfile)
    merge_mask = create_merge_mask(merge_file, merge_mask_outfile)

    if "Group Mask" in group_config_obj.mean_mask:
        mask_for_means = merge_mask
    else:
        individual_masks_dir = os.path.join(model_path, "individual_masks")
        create_dir(individual_masks_dir, "individual masks")
        # NOTE(review): mask_for_means is overwritten on every iteration, so
        # only the mask of the last row is used further down - confirm that
        # this is intended
        for unique_id, series_id, raw_filepath in zip(
                model_df["Participant"], model_df["Series"],
                model_df["Raw_Filepath"]):

            mask_for_means_path = os.path.join(
                individual_masks_dir,
                "%s_%s_%s_mask.nii.gz" % (unique_id, series_id, resource_id))
            mask_for_means = create_merge_mask(raw_filepath,
                                               mask_for_means_path)
        readme_flags.append("individual_masks")

    # calculate measure means, and demean
    if "Measure_Mean" in design_formula:
        model_df = calculate_measure_mean_in_df(model_df, mask_for_means)

    # calculate custom ROIs, and demean (in workflow?)
    if "Custom_ROI_Mean" in design_formula:

        custom_roi_mask = group_config_obj.custom_roi_mask

        if custom_roi_mask is None or custom_roi_mask in ("None", "none", ""):
            err = "\n\n[!] You included 'Custom_ROI_Mean' in your design " \
                  "formula, but you didn't supply a custom ROI mask file." \
                  "\n\nDesign formula: %s\n\n" % design_formula
            raise Exception(err)

        # make sure the custom ROI mask file is the same resolution as the
        # output files - if not, resample and warn the user
        roi_mask = check_mask_file_resolution(
            list(model_df["Raw_Filepath"])[0], custom_roi_mask,
            mask_for_means, model_path, resource_id)

        # trim the custom ROI mask to be within mask constraints
        output_mask = os.path.join(model_path, "masked_%s"
                                   % os.path.basename(roi_mask))
        roi_mask = trim_mask(roi_mask, mask_for_means, output_mask)
        readme_flags.append("custom_roi_mask_trimmed")

        # calculate
        model_df = calculate_custom_roi_mean_in_df(model_df, roi_mask)

        # update the design formula: the single "Custom_ROI_Mean" term is
        # replaced by the sum of the Custom_ROI_Mean_<n> columns
        new_design_substring = ""
        for col in model_df.columns:
            if "Custom_ROI_Mean_" in str(col):
                if str(col) == "Custom_ROI_Mean_1":
                    new_design_substring = new_design_substring + " %s" % col
                else:
                    new_design_substring = new_design_substring + " + %s" % col
        design_formula = design_formula.replace("Custom_ROI_Mean",
                                                new_design_substring)

    cat_list = []
    if "categorical" in group_config_obj.ev_selections:
        cat_list = group_config_obj.ev_selections["categorical"]

    # prep design for repeated measures, if applicable
    if len(group_config_obj.sessions_list) > 0:
        design_formula = design_formula + " + Session"
        if "Session" not in cat_list:
            cat_list.append("Session")
    if len(group_config_obj.series_list) > 0:
        design_formula = design_formula + " + Series"
        if "Series" not in cat_list:
            cat_list.append("Series")
    for col in list(model_df.columns):
        if "participant_" in col:
            design_formula = design_formula + " + %s" % col
            cat_list.append(col)

    # parse out the EVs in the design formula at this point in time
    #   this is essentially a list of the EVs that are to be included
    ev_list = parse_out_covariates(design_formula)

    # SPLIT GROUPS here.
    #   CURRENT PROBLEMS: was creating a few doubled-up new columns
    grp_vector = [1] * num_subjects

    if group_config_obj.group_sep:

        # model group variances separately
        old_ev_list = ev_list

        model_df, grp_vector, ev_list, cat_list = split_groups(
            model_df, group_config_obj.grouping_var, ev_list, cat_list)

        # make the grouping variable categorical for Patsy (if we try to
        # do this automatically below, it will categorical-ize all of
        # the substrings too)
        design_formula = design_formula.replace(
            group_config_obj.grouping_var,
            "C(" + group_config_obj.grouping_var + ")")
        # NOTE(review): coding_scheme is compared to a string here but
        # indexed with [0] everywhere else in this function - confirm which
        # type it really is. also, this rewrites every ")" in the formula
        if group_config_obj.coding_scheme == "Sum":
            design_formula = design_formula.replace(")", ", Sum)")

        # update design formula: map each original EV to the per-group
        # "<EV>__FOR..." columns created by split_groups
        rename = {}
        for old_ev in old_ev_list:
            for new_ev in ev_list:
                if old_ev + "__FOR" in new_ev:
                    rename.setdefault(old_ev, []).append(new_ev)

        for old_ev in rename:
            design_formula = design_formula.replace(
                old_ev, " + ".join(rename[old_ev]))

    # prep design formula for Patsy
    design_formula = patsify_design_formula(design_formula, cat_list,
                                            group_config_obj.coding_scheme[0])
    print(design_formula)
    # send to Patsy
    try:
        dmatrix = patsy.dmatrix(design_formula, model_df)
    except Exception as e:
        # the message needs one placeholder per argument; with a mismatch
        # the "%" formatting itself raises and hides the real Patsy error
        err = "\n\n[!] Something went wrong with processing the group model "\
              "design matrix using the Python Patsy package. Patsy might " \
              "not be properly installed, or there may be an issue with the "\
              "formatting of the design matrix.\n\nDesign matrix columns: " \
              "%s\n\nPatsy-formatted design formula: %s\n\nError details: " \
              "%s\n\n" % (model_df.columns, design_formula, e)
        raise Exception(err)

    print(dmatrix.design_info.column_names)
    print(dmatrix)

    # check the model for multicollinearity - Patsy takes care of this, but
    # just in case
    check_multicollinearity(np.array(dmatrix))

    # prepare for final stages
    column_names = dmatrix.design_info.column_names

    # check to make sure there are more time points than EVs!
    if len(column_names) >= num_subjects:
        err = "\n\n[!] CPAC says: There are more EVs than there are " \
              "participants currently included in the model for %s. There " \
              "must be more participants than EVs in the design.\n\nNumber " \
              "of participants: %d\nNumber of EVs: %d\n\nEV/covariate list: "\
              "%s\n\nNote: If you specified to model group " \
              "variances separately, the amount of EVs can nearly double " \
              "once they are split along the grouping variable.\n\n" \
              "If the number of subjects is lower than the number of " \
              "subjects in your group analysis subject list, this may be " \
              "because not every subject in the subject list has an output " \
              "for %s in the individual-level analysis output directory.\n\n"\
              % (resource_id, num_subjects, len(column_names), column_names,
                 resource_id)
        raise Exception(err)

    # time for contrasts
    contrasts_list = None
    contrasts_vectors = None

    # custom_confile was normalized above: it is None unless the user gave
    # an existing custom contrasts CSV
    if custom_confile is None:

        # if no custom contrasts matrix CSV provided (i.e. the user
        # specified contrasts in the GUI)
        contrasts_list = group_config_obj.contrasts
        contrasts_vectors = create_contrasts_dict(dmatrix, contrasts_list,
                                                  resource_id)

    # check the merged file's order
    check_merged_file(model_df["Filepath"], merge_file)

    # we must demean the categorical regressors if the Intercept/Grand Mean
    # is included in the model, otherwise FLAME produces blank outputs
    if "Intercept" in column_names:

        col_name_indices = dmatrix.design_info.column_name_indexes
        cat_indices = [int(col_index)
                       for col_name, col_index in col_name_indices.items()
                       if "C(" in col_name]

        # note: dmat_T is now no longer a DesignMatrix Patsy object, but only
        # an array
        dmat_T = dmatrix.transpose()

        for index in cat_indices:
            # the mean is taken once per regressor, before the row is
            # overwritten
            row_mean = dmat_T[index].mean()
            dmat_T[index] = [val - row_mean for val in dmat_T[index]]

        # we can go back, but we won't be the same
        dmatrix = dmat_T.transpose()

        readme_flags.append("cat_demeaned")

    # send off the info so the FLAME input model files can be generated!
    mat_file, grp_file, con_file, fts_file = create_flame_model_files(
        dmatrix, column_names, contrasts_vectors, contrasts_list,
        custom_confile, ftest_list, group_config_obj.group_sep, grp_vector,
        group_config_obj.coding_scheme[0], model_name, resource_id, model_path)

    dmat_csv_path = os.path.join(model_path, "design_matrix.csv")
    write_design_matrix_csv(dmatrix, model_df["Participant"], column_names,
                            dmat_csv_path)

    # workflow time
    wf_name = "%s_%s" % (resource_id, series_or_repeated_label)
    wf = pe.Workflow(name=wf_name)

    wf.base_dir = work_dir
    crash_dir = os.path.join(pipeline_config_obj.crashLogDirectory,
                             "group_analysis", model_name)

    wf.config['execution'] = {
        'hash_method': 'timestamp',
        'crashdump_dir': crash_dir
    }

    # gpa_wf
    # Creates the actual group analysis workflow
    gpa_wf = create_group_analysis(fTest, "gp_analysis_%s" % wf_name)

    gpa_wf.inputs.inputspec.merged_file = merge_file
    gpa_wf.inputs.inputspec.merge_mask = merge_mask

    gpa_wf.inputs.inputspec.z_threshold = z_threshold
    gpa_wf.inputs.inputspec.p_threshold = p_threshold
    gpa_wf.inputs.inputspec.parameters = (pipeline_config_obj.FSLDIR, 'MNI152')

    gpa_wf.inputs.inputspec.mat_file = mat_file
    gpa_wf.inputs.inputspec.con_file = con_file
    gpa_wf.inputs.inputspec.grp_file = grp_file

    if fTest:
        gpa_wf.inputs.inputspec.fts_file = fts_file

    # ds
    # Creates the datasink node for group analysis
    ds = pe.Node(nio.DataSink(), name='gpa_sink')

    ds.inputs.base_directory = str(out_dir)
    ds.inputs.container = ''

    # strip the node-specific sub-directories from the datasink output paths
    ds.inputs.regexp_substitutions = [(r'(?<=rendered)(.)*[/]', '/'),
                                      (r'(?<=model_files)(.)*[/]', '/'),
                                      (r'(?<=merged)(.)*[/]', '/'),
                                      (r'(?<=stats/clusterMap)(.)*[/]', '/'),
                                      (r'(?<=stats/unthreshold)(.)*[/]', '/'),
                                      (r'(?<=stats/threshold)(.)*[/]', '/'),
                                      (r'_cluster(.)*[/]', ''),
                                      (r'_slicer(.)*[/]', ''),
                                      (r'_overlay(.)*[/]', '')]

    # datasink connections
    #   the model files (.mat/.con/.grp/.fts) are not connected to the
    #   datasink here
    wf.connect(gpa_wf, 'outputspec.merged', ds, 'merged')
    wf.connect(gpa_wf, 'outputspec.zstats', ds, 'stats.unthreshold')
    wf.connect(gpa_wf, 'outputspec.zfstats', ds, 'stats.unthreshold.@01')
    wf.connect(gpa_wf, 'outputspec.fstats', ds, 'stats.unthreshold.@02')
    wf.connect(gpa_wf, 'outputspec.cluster_threshold_zf', ds,
               'stats.threshold')
    wf.connect(gpa_wf, 'outputspec.cluster_index_zf', ds, 'stats.clusterMap')
    wf.connect(gpa_wf, 'outputspec.cluster_localmax_txt_zf', ds,
               'stats.clusterMap.@01')
    wf.connect(gpa_wf, 'outputspec.overlay_threshold_zf', ds, 'rendered')
    wf.connect(gpa_wf, 'outputspec.rendered_image_zf', ds, 'rendered.@01')
    wf.connect(gpa_wf, 'outputspec.cluster_threshold', ds,
               'stats.threshold.@01')
    wf.connect(gpa_wf, 'outputspec.cluster_index', ds, 'stats.clusterMap.@02')
    wf.connect(gpa_wf, 'outputspec.cluster_localmax_txt', ds,
               'stats.clusterMap.@03')
    wf.connect(gpa_wf, 'outputspec.overlay_threshold', ds, 'rendered.@02')
    wf.connect(gpa_wf, 'outputspec.rendered_image', ds, 'rendered.@03')

    # Run the actual group analysis workflow
    wf.run()

    print("\n\nWorkflow finished for model %s\n\n" % wf_name)

コード例 #6

ファイルを表示

ファイル: cpac_ga_model_generator.py プロジェクト: FCP-INDI/C-PAC

def build_feat_model(model_df, model_name, group_config_file, resource_id,
                     preproc_strat, session_id, series_or_repeated_label):
    
    #
    # this function runs once per derivative type and preproc strat combo
    # during group analysis
    #

    import os
    import patsy
    import pandas as pd
    import numpy as np

    import nipype.pipeline.engine as pe
    import nipype.interfaces.utility as util
    import nipype.interfaces.io as nio
    from CPAC.pipeline.cpac_group_runner import load_config_yml
    
    from CPAC.utils.create_group_analysis_info_files import write_design_matrix_csv, \
        write_blank_contrast_csv

    group_config_obj = load_config_yml(group_config_file)

    pipeline_ID = group_config_obj.pipeline_dir.rstrip('/').split('/')[-1]
    #sublist_txt = group_config_obj.participant_list

    #if sublist_txt == None:
    #    print ("Warning! You have not provided a subject list. CPAC will use all the subjects in pipeline directory") 
    #    sublist_txt = group_config_obj.participant_list
    #else:
    #    sublist_txt = group_config_obj.particpant_list

    # remove file names from preproc_strat
    filename = preproc_strat.split("/")[-1]
    preproc_strat = preproc_strat.replace('.nii', '').replace('.gz', '')
    preproc_strat = preproc_strat.lstrip("/").rstrip("/")

    ftest_list = []
    readme_flags = []

    # determine if f-tests are included or not
    custom_confile = group_config_obj.custom_contrasts

    if ((custom_confile is None) or (custom_confile == '') or
            ("None" in custom_confile) or ("none" in custom_confile)):
        custom_confile = None

    #    if (len(group_config_obj.f_tests) == 0) or \
    #            (group_config_obj.f_tests is None):
    #        fTest = False
    #    else:
    #        fTest = True
    #        ftest_list = group_config_obj.f_tests

    #else:
    #    if not os.path.exists(custom_confile):
    #        errmsg = "\n[!] CPAC says: You've specified a custom contrasts " \
    #                 ".CSV file for your group model, but this file cannot " \
    #                 "be found. Please double-check the filepath you have " \
    #                 "entered.\n\nFilepath: %s\n\n" % custom_confile
    #        raise Exception(errmsg)#

    #    with open(custom_confile, "r") as f:
    #        evs = f.readline()

    #    evs = evs.rstrip('\r\n').split(',')
    #    count_ftests = 0

    #    fTest = False

    #    for ev in evs:
    #        if "f_test" in ev:
    #            count_ftests += 1

    # create path for output directory
    model_dir = os.path.join(group_config_obj.output_dir,
                             'cpac_group_analysis',
                             'FSL_FEAT',
                             '{0}'.format(pipeline_ID),
                             'group_model_{0}'.format(model_name))

    out_dir = os.path.join(model_dir,
                           resource_id,
                           session_id,
                           series_or_repeated_label,
                           preproc_strat)

    try:
        preset_contrast = group_config_obj.preset
        preset = True
    except AttributeError:
        preset = False

    if 'sca_roi' in resource_id:
        out_dir = os.path.join(out_dir,
            re.search('sca_ROI_(\d)+', os.path.splitext(\
                os.path.splitext(os.path.basename(\
                    model_df["Filepath"][0]))[0])[0]).group(0))
            
    if 'dr_tempreg_maps_zstat_files_to_standard_smooth' in resource_id:
        out_dir = os.path.join(out_dir,
            re.search('temp_reg_map_z_(\d)+', os.path.splitext(\
                os.path.splitext(os.path.basename(\
                    model_df["Filepath"][0]))[0])[0]).group(0))
            
    if 'centrality' in resource_id:
        names = ['degree_centrality_binarize',
                 'degree_centrality_weighted',
                 'eigenvector_centrality_binarize',
                 'eigenvector_centrality_weighted',
                 'lfcd_binarize', 'lfcd_weighted']

        for name in names:
            if name in filename:
                out_dir = os.path.join(out_dir, name)
                break

    if 'tempreg_maps' in resource_id:
        out_dir = os.path.join(out_dir, re.search('\w*[#]*\d+',
            os.path.splitext(os.path.splitext(os.path.basename(\
                model_df["Filepath"][0]))[0])[0]).group(0))

    model_path = os.path.join(out_dir, 'model_files')

    # create the actual directories
    create_dir(model_path, "group analysis output")

    # create new subject list based on which subjects are left after checking
    # for missing outputs
    new_participant_list = []
    for part in model_df["participant_id"]:
        # do this instead of using "set" just in case, to preserve order
        #   only reason there may be duplicates is because of multiple-series
        #   repeated measures runs
        if part not in new_participant_list:
            new_participant_list.append(part)
    
    if group_config_obj.participant_list == None:
        #participant_list = os.listdir(group_config_obj.pipeline_dir)
        new_sub_file = write_new_sub_file(model_path,
                                          group_config_obj.pipeline_dir,
                                          new_participant_list)
    else: 
        new_sub_file = write_new_sub_file(model_path,
                                      group_config_obj.participant_list,
                                      new_participant_list)

    group_config_obj.update('participant_list', new_sub_file)

    num_subjects = len(list(model_df["participant_id"]))
    
    # start processing the dataframe further
    design_formula = group_config_obj.design_formula

    # demean EVs set for demeaning
    for demean_EV in group_config_obj.ev_selections.get("demean",[]):
        model_df[demean_EV] = model_df[demean_EV].astype(float)
        model_df[demean_EV] = model_df[demean_EV].sub(model_df[demean_EV].mean())

    # demean the motion params
    if ("MeanFD" in design_formula) or ("MeanDVARS" in design_formula):
        params = ["MeanFD_Power", "MeanFD_Jenkinson", "MeanDVARS"]
        for param in params:
            model_df[param] = model_df[param].astype(float)
            model_df[param] = model_df[param].sub(model_df[param].mean())

    # create 4D merged copefile, in the correct order, identical to design
    # matrix
    merge_outfile = model_name + "_" + resource_id + "_merged.nii.gz"
    merge_outfile = os.path.join(model_path, merge_outfile)

    merge_file = create_merged_copefile(model_df["Filepath"].tolist(),
                                        merge_outfile)

    # create merged group mask
    merge_mask_outfile = '_'.join([model_name, resource_id,
                                   "merged_mask.nii.gz"])
    merge_mask_outfile = os.path.join(model_path, merge_mask_outfile)
    merge_mask = create_merge_mask(merge_file, merge_mask_outfile)

    if "Group Mask" in group_config_obj.mean_mask:
        mask_for_means = merge_mask
    else:
        individual_masks_dir = os.path.join(model_path,
                                            "individual_masks")
        create_dir(individual_masks_dir, "individual masks")
        for unique_id, series_id, raw_filepath in zip(
                model_df["participant_id"],
                model_df["Series"], model_df["Raw_Filepath"]):
            mask_for_means_path = os.path.join(individual_masks_dir,
                                               "%s_%s_%s_mask.nii.gz" % (
                                               unique_id, series_id,
                                               resource_id))
            mask_for_means = create_merge_mask(raw_filepath,
                                               mask_for_means_path)
        readme_flags.append("individual_masks")

    # calculate measure means, and demean
    if "Measure_Mean" in design_formula:
        model_df = calculate_measure_mean_in_df(model_df, mask_for_means)

    # calculate custom ROIs, and demean (in workflow?)
    if "Custom_ROI_Mean" in design_formula:

        custom_roi_mask = group_config_obj.custom_roi_mask

        if (custom_roi_mask == None) or (custom_roi_mask == "None") or \
                (custom_roi_mask == "none") or (custom_roi_mask == ""):
            err = "\n\n[!] You included 'Custom_ROI_Mean' in your design " \
                  "formula, but you didn't supply a custom ROI mask file." \
                  "\n\nDesign formula: %s\n\n" % design_formula
            raise Exception(err)

        # make sure the custom ROI mask file is the same resolution as the
        # output files - if not, resample and warn the user
        roi_mask = check_mask_file_resolution(list(model_df["Raw_Filepath"])[0],
                                              custom_roi_mask, mask_for_means,
                                              model_path, resource_id)
        

        # trim the custom ROI mask to be within mask constraints
        output_mask = os.path.join(model_path, "masked_%s" \
                                   % os.path.basename(roi_mask))
        roi_mask = trim_mask(roi_mask, mask_for_means, output_mask)
        readme_flags.append("custom_roi_mask_trimmed")

        # calculate
        model_df = calculate_custom_roi_mean_in_df(model_df, roi_mask)

        # update the design formula
        new_design_substring = ""
        
        for col in model_df.columns:
            if "Custom_ROI_Mean_" in str(col):
                if str(col) == "Custom_ROI_Mean_1":
                    new_design_substring = new_design_substring + " %s" % col
                else:
                    new_design_substring = new_design_substring +" + %s" % col
        design_formula = design_formula.replace("Custom_ROI_Mean",
                                                new_design_substring)

    cat_list = []
    if "categorical" in group_config_obj.ev_selections.keys():
        cat_list = group_config_obj.ev_selections["categorical"]
    
    # prep design for repeated measures, if applicable
    if len(group_config_obj.sessions_list) > 0:
        if "session" in model_df.columns:
            # if these columns were added by the model builder automatically
            design_formula = design_formula + " + session"
            if "session" not in cat_list:
                cat_list.append("session")

    if len(group_config_obj.series_list) > 0:
        design_formula = design_formula + " + Series"
        if "Series" not in cat_list:
            cat_list.append("Series")

    if "session" in model_df.columns:
        # if these columns were added by the model builder automatically
        for col in model_df.columns:
            # should only grab the repeated measures-designed participant_{ID}
            # columns, not the "participant_id" column!
            if "participant_" in col and "_id" not in col:
                design_formula = design_formula + " + %s" % col
                cat_list.append(col)
    
    # parse out the EVs in the design formula at this point in time
    #   this is essentially a list of the EVs that are to be included
    ev_list = parse_out_covariates(design_formula)
    
    # SPLIT GROUPS here.
    #   CURRENT PROBLEMS: was creating a few doubled-up new columns
    grp_vector = [1] * num_subjects
    
    if group_config_obj.group_sep:

        # check if the group_ev parameter is a list instead of a string:
        # this was added to handle the new group-level analysis presets. this
        # is the only modification that was required to the group analysis
        # workflow, and it handles cases where the group variances must be
        # modeled separately, by creating separate groups for the FSL FLAME
        # .grp file.
        #     the group_ev parameter gets sent in as a list if coming from any
        #     of the presets that deal with multiple groups- in these cases,
        #     the pheno_df/design matrix is already set up properly for the
        #     multiple groups, and we need to bypass all of the processing
        #     that usually occurs when the "modeling group variances
        #     separately" option is enabled in the group analysis config YAML
        group_ev = group_config_obj.grouping_var
        
        if isinstance(group_ev, list) or "," in group_ev:
            grp_vector = []

            if "," in group_ev:
                group_ev = group_ev.split(",")

            if len(group_ev) == 2:
                for x, y in zip(model_df[group_ev[0]], model_df[group_ev[1]]):
                    if x == 1:
                        grp_vector.append(1)
                    elif y == 1:
                        grp_vector.append(2)
                    else:
                        err = "\n\n[!] The two categorical covariates you " \
                              "provided as the two separate groups (in order " \
                              "to model each group's variances separately) " \
                              "either have more than 2 levels (1/0), or are " \
                              "not encoded as 1's and 0's.\n\nCovariates:\n" \
                              "{0}\n{1}\n\n".format(group_ev[0], group_ev[1])
                        raise Exception(err)

            elif len(group_ev) == 3:
                for x, y, z in zip(model_df[group_ev[0]], model_df[group_ev[1]],
                                   model_df[group_ev[2]]):
                    if x == 1:
                        grp_vector.append(1)
                    elif y == 1:
                        grp_vector.append(2)
                    elif z == 1:
                        grp_vector.append(3)
                    else:
                        err = "\n\n[!] The three categorical covariates you " \
                              "provided as the three separate groups (in order " \
                              "to model each group's variances separately) " \
                              "either have more than 2 levels (1/0), or are " \
                              "not encoded as 1's and 0's.\n\nCovariates:\n" \
                              "{0}\n{1}\n{2}\n\n".format(group_ev[0],
                                                         group_ev[1],
                                                         group_ev[2])
                        raise Exception(err)

            else:
                # we're only going to see this if someone plays around with
                # their preset or config file manually
                err = "\n\n[!] If you are seeing this message, it's because:\n" \
                      "1. You are using the group-level analysis presets\n" \
                      "2. You are running a model with multiple groups having " \
                      "their variances modeled separately (i.e. multiple " \
                      "values in the FSL FLAME .grp input file), and\n" \
                      "3. For some reason, the configuration has been set up " \
                      "in a way where CPAC currently thinks you're including " \
                      "only one group, or more than three, neither of which " \
                      "are supported.\n\nGroups provided:\n{0}" \
                      "\n\n".format(str(group_ev))
                raise Exception(err)

        else:
            # model group variances separately
            old_ev_list = ev_list
            
            model_df, grp_vector, ev_list, cat_list = split_groups(model_df,
                                    group_config_obj.grouping_var,
                                    ev_list, cat_list)

            # make the grouping variable categorical for Patsy (if we try to
            # do this automatically below, it will categorical-ize all of
            # the substrings too)
            design_formula = design_formula.replace(group_config_obj.grouping_var,
                                      "C(" + group_config_obj.grouping_var + ")")
            if group_config_obj.coding_scheme == "Sum":
                design_formula = design_formula.replace(")", ", Sum)")

            # update design formula
            rename = {}
            for old_ev in old_ev_list:
                for new_ev in ev_list:
                    if old_ev + "__FOR" in new_ev:
                        if old_ev not in rename.keys():
                            rename[old_ev] = []
                        rename[old_ev].append(new_ev)

            for old_ev in rename.keys():
                design_formula = design_formula.replace(old_ev,
                                                        " + ".join(rename[old_ev]))

    # prep design formula for Patsy
    design_formula = patsify_design_formula(design_formula, cat_list,
                                            group_config_obj.coding_scheme[0])

    if not preset:
        # send to Patsy
        try:
            dmatrix = patsy.dmatrix(design_formula, model_df)
            dmatrix.design_info.column_names.append(model_df["Filepath"])
            dmatrix_column_names = dmatrix.design_info.column_names
        except Exception as e:
            err = "\n\n[!] Something went wrong with processing the group model "\
                  "design matrix using the Python Patsy package. Patsy might " \
                  "not be properly installed, or there may be an issue with the "\
                  "formatting of the design matrix.\n\nDesign matrix columns: " \
                  "%s\n\nPatsy-formatted design formula: %s\n\nError details: " \
                  "%s\n\n" % (model_df.columns, design_formula, e)
            raise Exception(err)
    else:
        if 'Sessions' in model_df:
            sess_levels = list(set(list(model_df['Sessions'].values)))
            if len(sess_levels) > 1:
                sess_map = {sess_levels[0]: '1', sess_levels[1]: '-1'}
                if len(sess_levels) == 3:
                    sess_map.update({sess_levels[2]: '0'})
                new_sess = [s.replace(s, sess_map[s]) for s in list(model_df['Sessions'].values)]
                model_df['Sessions'] = new_sess
        if 'Series' in model_df:
            sess_levels = list(set(list(model_df['Series'].values)))
            if len(sess_levels) > 1:
                sess_map = {sess_levels[0]: '1', sess_levels[1]: '-1'}
                if len(sess_levels) == 3:
                    sess_map.update({sess_levels[2]: '0'})
                new_sess = [s.replace(s, sess_map[s]) for s in list(model_df['Series'].values)]
                model_df['Series'] = new_sess
        
        keep_cols = [x for x in model_df.columns if x in design_formula]
        dmatrix = model_df[keep_cols].astype('float')
        dmatrix_column_names = list(dmatrix.columns)

    # check the model for multicollinearity - Patsy takes care of this, but
    # just in case
    check_multicollinearity(np.array(dmatrix))
    
    dmat_csv_path = os.path.join(model_path, "design_matrix.csv")
    contrast_out_path = os.path.join(out_dir, "contrast.csv")
    
    # make sure "column_names" is in the same order as the original EV column
    # header ordering in model_df - mainly for repeated measures, to make sure
    # participants_<ID> cols are at end for clarity for users
    dmat_cols = []
    dmat_id_cols = []
    for dmat_col in dmatrix_column_names:
        if 'participant_' in dmat_col:
            dmat_id_cols.append(dmat_col)
        else:
            dmat_cols.append(dmat_col)
    column_names = dmat_cols
    dmat_id_cols = sorted(dmat_id_cols)
    column_names += dmat_id_cols

    # check to make sure there are more time points than EVs!
    if len(column_names) >= num_subjects:
        err = "\n\n################## MODEL NOT GENERATED ##################" \
              "\n\n[!] CPAC says: There are more EVs than there are " \
              "participants currently included in the model for:\n\n" \
              "Derivative: {0}\nSession: {1}\nScan: {2}\nPreproc strategy:" \
              "\n    {3}\n\n" \
              "There must be more participants than EVs in the design.\n\n" \
              "Number of participants: {4}\nNumber of EVs: {5}\n\nEV/" \
              "covariate list: {6}\n\nNote: If you specified to model group " \
              "variances separately, the amount of EVs can nearly double " \
              "once they are split along the grouping variable.\n\nIf the " \
              "number of participants is lower than the number of " \
              "participants in your group analysis inclusion list, this " \
              "may be because not every participant originally included has " \
              "an output for {7} for this scan and preprocessing strategy in " \
              "the individual-level analysis output directory.\n\nDesign " \
              "formula going in: {8}" \
              "\n\n#########################################################" \
              "\n\n".format(resource_id, session_id, series_or_repeated_label, 
                            preproc_strat, num_subjects, len(column_names), 
                            column_names, resource_id, design_formula)
        print(err)
        
    # check the merged file's order
    check_merged_file(model_df["Filepath"], merge_file)

    # we must demean the categorical regressors if the Intercept/Grand Mean
    # is included in the model, otherwise FLAME produces blank outputs
    if "Intercept" in column_names:
        cat_indices = []
        col_name_indices = dmatrix.design_info.column_name_indexes
        for col_name in col_name_indices.keys():
            if "C(" in col_name:
                cat_indices.append(int(col_name_indices[col_name]))

        # note: dmat_T is now no longer a DesignMatrix Patsy object, but only
        # an array
        dmat_T = dmatrix.transpose()
        
        for index in cat_indices:
            new_row = []
            for val in dmat_T[index]:
                new_row.append(val - dmat_T[index].mean())
            dmat_T[index] = new_row

        # we can go back, but we won't be the same
        dmatrix = dmat_T.transpose()
        readme_flags.append("cat_demeaned")

    dmatrix_df = pd.DataFrame(np.array(dmatrix), 
                              index=model_df["participant_id"],
                              columns=dmatrix_column_names)
    cols = dmatrix_df.columns.tolist()

    # make sure "column_names" is in the same order as the original EV column
    # header ordering in model_df - mainly for repeated measures, to make sure
    # participants_<ID> cols are at end for clarity for users
    dmat_cols = []
    dmat_id_cols = []
    for dmat_col in cols:
        if 'participant_' in dmat_col:
            dmat_id_cols.append(dmat_col)
        else:
            dmat_cols.append(dmat_col)
    column_names = dmat_cols
    dmat_id_cols = sorted(dmat_id_cols)
    column_names += dmat_id_cols

    dmatrix_df = dmatrix_df[column_names]

    dmat_csv_path = os.path.join(model_path, "design_matrix.csv")

    write_design_matrix_csv(dmatrix_df, model_df["participant_id"],
                            column_names, dmat_csv_path)
    
    # time for contrasts
    if (group_config_obj.custom_contrasts == None) or (group_config_obj.contrasts == None):
        # if no custom contrasts matrix CSV provided (i.e. the user
        # specified contrasts in the GUI)
        contrasts_columns = column_names
        if group_config_obj.f_tests:
            for i in group_config_obj.f_tests[1:len(group_config_obj.f_tests)-1]:
                contrasts_columns.append('f_test_{0}'.format(i)) 
    else:
        pass

    contrast_out_path = os.path.join(model_dir, "contrasts.csv")

    if preset:
        cons = pd.read_csv(group_config_obj.custom_contrasts)
        with open(contrast_out_path, "w") as f:
            cons.to_csv(f, index=False)
    else:
        if os.path.isfile(contrast_out_path):
            contrasts_df = pd.read_csv(contrast_out_path)
            if contrasts_df.shape[0] > 1 or np.count_nonzero(contrasts_df.values[0][1:]) > 0:
                msg = "\n\n[!] C-PAC says: It appears you have modified your " \
                      "contrasts CSV file already- back up this file before " \
                      "building your model again to avoid overwriting your " \
                      "changes.\n\nContrasts file:\n{0}" \
                      "\n\n".format(contrast_out_path)
                raise Exception(msg)

        with open(contrast_out_path, "w") as f:
            f.write('Contrasts')
            for col in contrasts_columns:
                f.write(',{0}'.format(col))
            f.write('\ncontrast_1')
            for col in contrasts_columns:
                f.write(',0')

    groups_out_path = os.path.join(model_path, 'groups.txt')
    with open(groups_out_path, 'w') as f:
        for val in grp_vector:
            f.write('{0}\n'.format(val))

    msg = 'Model successfully generated for..\nDerivative: {0}\nSession: {1}' \
          '\nScan: {2}\nPreprocessing strategy:\n    {3}\n\nModel directory:' \
          '\n{4}\n\nGroup configuration file:\n{5}\n\nContrasts template CSV:' \
          '\n{6}\n\nDefine your contrasts in this contrasts template CSV and ' \
          'save your changes, then run FSL-FEAT either using the GUI ' \
          'interface or through the command-line like so:\n\n    cpac group ' \
          'feat run <path to group config.yml>' \
           '\n'.format(resource_id, session_id, series_or_repeated_label,
                       preproc_strat, model_path, group_config_file,
                       contrast_out_path)
    print('-------------------------------------------------------------------')
    print(msg)
    print('-------------------------------------------------------------------')

    return dmat_csv_path, new_sub_file, contrast_out_path

コード例 #7

ファイルを表示

def prep_group_analysis_workflow(model_df, pipeline_config_path,
                                 model_name, group_config_path, resource_id,
                                 preproc_strat, series_or_repeated_label):
    """Build the FSL FLAME model files for one output and run the workflow.

    This function runs once per derivative type and preproc strat combo
    during group analysis. In order, it:

    1. loads the pipeline and group configuration YAMLs,
    2. creates the output, working and log directories,
    3. writes a new participant list holding only the participants that
       are present in ``model_df``,
    4. demeans the selected EVs, creates the merged 4D file and its mask,
    5. assembles the design formula and builds the design matrix with
       Patsy,
    6. writes the FLAME model files and the design matrix CSV, and
    7. wires up and runs the nipype group analysis workflow with a
       DataSink.

    Parameters
    ----------
    model_df
        Table of participants going into the model. It is accessed by
        column label and through ``.columns`` / ``.astype`` / ``.sub``
        (presumably a pandas DataFrame -- confirm against the caller).
        The columns "Filepath" and "Participant" are always read; "Series"
        and "Raw_Filepath" are read when individual masks are requested.
        EV columns selected for demeaning are overwritten in place.
    pipeline_config_path
        Path handed to ``load_config_yml`` for the pipeline configuration.
    model_name
        Name of the group model; used in directory and file names.
    group_config_path
        Path handed to ``load_config_yml`` for the group configuration.
    resource_id
        Name of the derivative being analysed; used in paths and to pick
        the derivative-specific output sub-directory.
    preproc_strat
        "/"-separated strategy path whose last component is a file name;
        the file name is stripped before the path is used.
    series_or_repeated_label
        Label used as an output sub-directory and in the workflow name.

    Returns
    -------
    None
        All results are written to disk.

    Raises
    ------
    Exception
        If the custom contrasts CSV cannot be found, if "Custom_ROI_Mean"
        is in the design formula without a custom ROI mask, if Patsy
        fails to build the design matrix, or if there are not more
        participants than EVs.
    """

    import os
    # imported here, next to the other dependencies, so the regex searches
    # below do not rely on a module-level import
    import re
    import patsy
    import numpy as np

    import nipype.pipeline.engine as pe
    import nipype.interfaces.utility as util
    import nipype.interfaces.io as nio

    from CPAC.pipeline.cpac_group_runner import load_config_yml
    from CPAC.utils.create_flame_model_files import create_flame_model_files
    from CPAC.utils.create_group_analysis_info_files import write_design_matrix_csv

    pipeline_config_obj = load_config_yml(pipeline_config_path)
    group_config_obj = load_config_yml(group_config_path)

    pipeline_ID = pipeline_config_obj.pipelineName

    # remove file names from preproc_strat
    filename = preproc_strat.split("/")[-1]
    preproc_strat = preproc_strat.replace(filename, "")
    preproc_strat = preproc_strat.strip("/")

    # get thresholds
    z_threshold = float(group_config_obj.z_threshold[0])
    p_threshold = float(group_config_obj.p_threshold[0])

    ftest_list = []
    readme_flags = []

    # determine if f-tests are included or not
    custom_confile = group_config_obj.custom_contrasts

    if (custom_confile is None or custom_confile == '' or
            "None" in custom_confile or "none" in custom_confile):

        # no usable custom contrasts file: normalise to None so the later
        # stages only have to test for None
        custom_confile = None

        # test for None *before* calling len(), otherwise a missing f-test
        # list raises a TypeError instead of simply disabling f-tests
        f_tests = group_config_obj.f_tests
        if f_tests is None or len(f_tests) == 0:
            fTest = False
        else:
            fTest = True
            ftest_list = f_tests

    else:

        if not os.path.exists(custom_confile):
            errmsg = "\n[!] CPAC says: You've specified a custom contrasts " \
                     ".CSV file for your group model, but this file cannot " \
                     "be found. Please double-check the filepath you have " \
                     "entered.\n\nFilepath: %s\n\n" % custom_confile
            raise Exception(errmsg)

        # only the header row is needed to find out whether any f-test
        # columns are present
        with open(custom_confile, "r") as f:
            evs = f.readline()

        evs = evs.rstrip('\r\n').split(',')
        fTest = any("f_test" in ev for ev in evs)

    # create path for output directory
    out_dir = os.path.join(group_config_obj.output_dir,
                           "group_analysis_results_%s" % pipeline_ID,
                           "group_model_%s" % model_name, resource_id,
                           series_or_repeated_label, preproc_strat)

    def _first_filepath_stem():
        # base name of the first "Filepath" entry with up to two
        # extensions stripped; only evaluated by the branches that need it
        base = os.path.basename(model_df["Filepath"][0])
        return os.path.splitext(os.path.splitext(base)[0])[0]

    if 'sca_roi' in resource_id:
        out_dir = os.path.join(
            out_dir,
            re.search(r'sca_roi_(\d)+', _first_filepath_stem()).group(0))

    if 'dr_tempreg_maps_zstat_files_to_standard_smooth' in resource_id:
        out_dir = os.path.join(
            out_dir,
            re.search(r'temp_reg_map_z_(\d)+',
                      _first_filepath_stem()).group(0))

    if 'centrality' in resource_id:
        names = ['degree_centrality_binarize', 'degree_centrality_weighted',
                 'eigenvector_centrality_binarize',
                 'eigenvector_centrality_weighted',
                 'lfcd_binarize', 'lfcd_weighted']

        for name in names:
            if name in filename:
                out_dir = os.path.join(out_dir, name)
                break

    if 'tempreg_maps' in resource_id:
        out_dir = os.path.join(
            out_dir,
            re.search(r'\w*[#]*\d+', _first_filepath_stem()).group(0))

    model_path = os.path.join(out_dir, 'model_files')

    # everything after the "group_analysis_results_<pipeline>" component is
    # mirrored under the working and log directories
    second_half_out = \
        out_dir.split("group_analysis_results_%s" % pipeline_ID)[1]
    relative_out = second_half_out.lstrip("/")

    # generate working directory for this output's group analysis run
    work_dir = os.path.join(pipeline_config_obj.workingDirectory,
                            "group_analysis", relative_out)

    log_dir = os.path.join(pipeline_config_obj.logDirectory,
                           "group_analysis", relative_out)

    # create the actual directories
    create_dir(model_path, "group analysis output")
    create_dir(work_dir, "group analysis working")
    create_dir(log_dir, "group analysis logfile")

    # create new subject list based on which subjects are left after checking
    # for missing outputs
    new_participant_list = []
    for part in list(model_df["Participant"]):
        # do this instead of using "set" just in case, to preserve order
        #   only reason there may be duplicates is because of multiple-series
        #   repeated measures runs
        if part not in new_participant_list:
            new_participant_list.append(part)

    new_sub_file = write_new_sub_file(model_path,
                                      group_config_obj.participant_list,
                                      new_participant_list)

    group_config_obj.update('participant_list', new_sub_file)

    num_subjects = len(list(model_df["Participant"]))

    # start processing the dataframe further
    design_formula = group_config_obj.design_formula

    # demean EVs set for demeaning
    for demean_EV in group_config_obj.ev_selections["demean"]:
        model_df[demean_EV] = model_df[demean_EV].astype(float)
        model_df[demean_EV] = model_df[demean_EV].sub(model_df[demean_EV].mean())

    # demean the motion params
    if ("MeanFD" in design_formula) or ("MeanDVARS" in design_formula):
        params = ["MeanFD_Power", "MeanFD_Jenkinson", "MeanDVARS"]
        for param in params:
            model_df[param] = model_df[param].astype(float)
            model_df[param] = model_df[param].sub(model_df[param].mean())

    # create 4D merged copefile, in the correct order, identical to design
    # matrix
    merge_outfile = model_name + "_" + resource_id + "_merged.nii.gz"
    merge_outfile = os.path.join(model_path, merge_outfile)

    merge_file = create_merged_copefile(list(model_df["Filepath"]),
                                        merge_outfile)

    # create merged group mask
    merge_mask_outfile = model_name + "_" + resource_id + \
                             "_merged_mask.nii.gz"
    merge_mask_outfile = os.path.join(model_path, merge_mask_outfile)
    merge_mask = create_merge_mask(merge_file, merge_mask_outfile)

    if "Group Mask" in group_config_obj.mean_mask:
        mask_for_means = merge_mask
    else:
        individual_masks_dir = os.path.join(model_path, "individual_masks")
        create_dir(individual_masks_dir, "individual masks")
        # NOTE(review): mask_for_means is overwritten on every iteration,
        # so only the mask created for the last row is passed on below --
        # confirm this is intended
        for unique_id, series_id, raw_filepath in zip(model_df["Participant"],
                                                      model_df["Series"],
                                                      model_df["Raw_Filepath"]):

            mask_for_means_path = os.path.join(
                individual_masks_dir,
                "%s_%s_%s_mask.nii.gz" % (unique_id, series_id, resource_id))
            mask_for_means = create_merge_mask(raw_filepath,
                                               mask_for_means_path)
        readme_flags.append("individual_masks")

    # calculate measure means, and demean
    if "Measure_Mean" in design_formula:
        model_df = calculate_measure_mean_in_df(model_df, mask_for_means)

    # calculate custom ROIs, and demean (in workflow?)
    if "Custom_ROI_Mean" in design_formula:

        custom_roi_mask = group_config_obj.custom_roi_mask

        if custom_roi_mask in (None, "None", "none", ""):
            err = "\n\n[!] You included 'Custom_ROI_Mean' in your design " \
                  "formula, but you didn't supply a custom ROI mask file." \
                  "\n\nDesign formula: %s\n\n" % design_formula
            raise Exception(err)

        # make sure the custom ROI mask file is the same resolution as the
        # output files - if not, resample and warn the user
        roi_mask = check_mask_file_resolution(
            list(model_df["Raw_Filepath"])[0], custom_roi_mask,
            mask_for_means, model_path, resource_id)

        # trim the custom ROI mask to be within mask constraints
        output_mask = os.path.join(model_path,
                                   "masked_%s" % os.path.basename(roi_mask))
        roi_mask = trim_mask(roi_mask, mask_for_means, output_mask)
        readme_flags.append("custom_roi_mask_trimmed")

        # calculate
        model_df = calculate_custom_roi_mean_in_df(model_df, roi_mask)

        # update the design formula: the single "Custom_ROI_Mean" term is
        # replaced by one term per Custom_ROI_Mean_<n> column
        new_design_substring = ""
        for col in model_df.columns:
            if "Custom_ROI_Mean_" in str(col):
                if str(col) == "Custom_ROI_Mean_1":
                    new_design_substring = new_design_substring + " %s" % col
                else:
                    new_design_substring = new_design_substring + " + %s" % col
        design_formula = design_formula.replace("Custom_ROI_Mean",
                                                new_design_substring)

    cat_list = []
    if "categorical" in group_config_obj.ev_selections.keys():
        cat_list = group_config_obj.ev_selections["categorical"]

    # prep design for repeated measures, if applicable
    if len(group_config_obj.sessions_list) > 0:
        design_formula = design_formula + " + Session"
        if "Session" not in cat_list:
            cat_list.append("Session")
    if len(group_config_obj.series_list) > 0:
        design_formula = design_formula + " + Series"
        if "Series" not in cat_list:
            cat_list.append("Series")
    for col in list(model_df.columns):
        if "participant_" in col:
            design_formula = design_formula + " + %s" % col
            cat_list.append(col)

    # parse out the EVs in the design formula at this point in time
    #   this is essentially a list of the EVs that are to be included
    ev_list = parse_out_covariates(design_formula)

    # SPLIT GROUPS here.
    #   CURRENT PROBLEMS: was creating a few doubled-up new columns
    grp_vector = [1] * num_subjects

    if group_config_obj.group_sep:

        # model group variances separately
        old_ev_list = ev_list

        model_df, grp_vector, ev_list, cat_list = split_groups(
            model_df, group_config_obj.grouping_var, ev_list, cat_list)

        # make the grouping variable categorical for Patsy (if we try to
        # do this automatically below, it will categorical-ize all of
        # the substrings too)
        design_formula = design_formula.replace(
            group_config_obj.grouping_var,
            "C(" + group_config_obj.grouping_var + ")")
        # NOTE(review): coding_scheme is compared to a string here but is
        # indexed with [0] further down -- confirm which form the config
        # actually provides
        if group_config_obj.coding_scheme == "Sum":
            design_formula = design_formula.replace(")", ", Sum)")

        # update design formula: each EV that was split along the grouping
        # variable is replaced by the sum of its "<EV>__FOR..." columns
        rename = {}
        for old_ev in old_ev_list:
            for new_ev in ev_list:
                if old_ev + "__FOR" in new_ev:
                    if old_ev not in rename:
                        rename[old_ev] = []
                    rename[old_ev].append(new_ev)

        for old_ev in rename:
            design_formula = design_formula.replace(
                old_ev, " + ".join(rename[old_ev]))

    # prep design formula for Patsy
    design_formula = patsify_design_formula(design_formula, cat_list,
                                            group_config_obj.coding_scheme[0])
    print(design_formula)

    # send to Patsy
    try:
        dmatrix = patsy.dmatrix(design_formula, model_df)
    except Exception as e:
        # three placeholders for three values; with fewer placeholders the
        # formatting itself fails and hides the underlying Patsy error
        err = "\n\n[!] Something went wrong with processing the group model " \
              "design matrix using the Python Patsy package. Patsy might " \
              "not be properly installed, or there may be an issue with the " \
              "formatting of the design matrix.\n\nDesign matrix columns: " \
              "%s\n\nPatsy-formatted design formula: %s\n\nError details: " \
              "%s\n\n" % (model_df.columns, design_formula, e)
        raise Exception(err)

    print(dmatrix.design_info.column_names)
    print(dmatrix)

    # check the model for multicollinearity - Patsy takes care of this, but
    # just in case
    check_multicollinearity(np.array(dmatrix))

    # prepare for final stages
    column_names = dmatrix.design_info.column_names

    # check to make sure there are more time points than EVs!
    if len(column_names) >= num_subjects:
        err = "\n\n[!] CPAC says: There are more EVs than there are " \
              "participants currently included in the model for %s. There " \
              "must be more participants than EVs in the design.\n\nNumber " \
              "of participants: %d\nNumber of EVs: %d\n\nEV/covariate list: "\
              "%s\n\nNote: If you specified to model group " \
              "variances separately, the amount of EVs can nearly double " \
              "once they are split along the grouping variable.\n\n" \
              "If the number of subjects is lower than the number of " \
              "subjects in your group analysis subject list, this may be " \
              "because not every subject in the subject list has an output " \
              "for %s in the individual-level analysis output directory.\n\n"\
              % (resource_id, num_subjects, len(column_names), column_names,
                 resource_id)
        raise Exception(err)

    # time for contrasts
    contrasts_dict = None

    # custom_confile was normalised to None above whenever no usable custom
    # contrasts file was supplied
    if custom_confile is None:

        # if no custom contrasts matrix CSV provided (i.e. the user
        # specified contrasts in the GUI)
        contrasts_list = group_config_obj.contrasts
        contrasts_dict = create_contrasts_dict(dmatrix, contrasts_list,
                                               resource_id)

    # check the merged file's order
    check_merged_file(model_df["Filepath"], merge_file)

    # we must demean the categorical regressors if the Intercept/Grand Mean
    # is included in the model, otherwise FLAME produces blank outputs
    if "Intercept" in column_names:

        col_name_indices = dmatrix.design_info.column_name_indexes
        cat_indices = [int(col_index)
                       for col_name, col_index in col_name_indices.items()
                       if "C(" in col_name]

        # note: dmat_T is now no longer a DesignMatrix Patsy object, but only
        # an array
        dmat_T = dmatrix.transpose()

        for index in cat_indices:
            # the row is not modified until the assignment below, so its
            # mean only needs to be computed once
            row = dmat_T[index]
            row_mean = row.mean()
            dmat_T[index] = [val - row_mean for val in row]

        # we can go back, but we won't be the same
        dmatrix = dmat_T.transpose()

        readme_flags.append("cat_demeaned")

    # send off the info so the FLAME input model files can be generated!
    mat_file, grp_file, con_file, fts_file = create_flame_model_files(
        dmatrix, column_names, contrasts_dict, custom_confile, ftest_list,
        group_config_obj.group_sep, grp_vector,
        group_config_obj.coding_scheme[0], model_name, resource_id,
        model_path)

    dmat_csv_path = os.path.join(model_path, "design_matrix.csv")
    write_design_matrix_csv(dmatrix, model_df["Participant"], column_names,
                            dmat_csv_path)

    # workflow time
    wf_name = "%s_%s" % (resource_id, series_or_repeated_label)
    wf = pe.Workflow(name=wf_name)

    wf.base_dir = work_dir
    crash_dir = os.path.join(pipeline_config_obj.crashLogDirectory,
                             "group_analysis", model_name)

    wf.config['execution'] = {'hash_method': 'timestamp',
                              'crashdump_dir': crash_dir}

    # gpa_wf
    # Creates the actual group analysis workflow
    gpa_wf = create_group_analysis(fTest, "gp_analysis_%s" % wf_name)

    gpa_wf.inputs.inputspec.merged_file = merge_file
    gpa_wf.inputs.inputspec.merge_mask = merge_mask

    gpa_wf.inputs.inputspec.z_threshold = z_threshold
    gpa_wf.inputs.inputspec.p_threshold = p_threshold
    gpa_wf.inputs.inputspec.parameters = (pipeline_config_obj.FSLDIR,
                                          'MNI152')

    gpa_wf.inputs.inputspec.mat_file = mat_file
    gpa_wf.inputs.inputspec.con_file = con_file
    gpa_wf.inputs.inputspec.grp_file = grp_file

    if fTest:
        gpa_wf.inputs.inputspec.fts_file = fts_file

    # ds
    # Creates the datasink node for group analysis
    ds = pe.Node(nio.DataSink(), name='gpa_sink')

    #     if c.mixedScanAnalysis == True:
    #         out_dir = re.sub(r'(\w)*scan_(\w)*(\d)*(\w)*[/]', '', out_dir)

    ds.inputs.base_directory = str(out_dir)
    ds.inputs.container = ''

    ds.inputs.regexp_substitutions = [(r'(?<=rendered)(.)*[/]', '/'),
                                      (r'(?<=model_files)(.)*[/]', '/'),
                                      (r'(?<=merged)(.)*[/]', '/'),
                                      (r'(?<=stats/clusterMap)(.)*[/]', '/'),
                                      (r'(?<=stats/unthreshold)(.)*[/]', '/'),
                                      (r'(?<=stats/threshold)(.)*[/]', '/'),
                                      (r'_cluster(.)*[/]', ''),
                                      (r'_slicer(.)*[/]', ''),
                                      (r'_overlay(.)*[/]', '')]

    ########datasink connections#########
    #if fTest:
    #    wf.connect(gp_flow, 'outputspec.fts',
    #               ds, 'model_files.@0')

    #wf.connect(gp_flow, 'outputspec.mat',
    #           ds, 'model_files.@1' )
    #wf.connect(gp_flow, 'outputspec.con',
    #           ds, 'model_files.@2')
    #wf.connect(gp_flow, 'outputspec.grp',
    #           ds, 'model_files.@3')
    wf.connect(gpa_wf, 'outputspec.merged',
               ds, 'merged')
    wf.connect(gpa_wf, 'outputspec.zstats',
               ds, 'stats.unthreshold')
    wf.connect(gpa_wf, 'outputspec.zfstats',
               ds, 'stats.unthreshold.@01')
    wf.connect(gpa_wf, 'outputspec.fstats',
               ds, 'stats.unthreshold.@02')
    wf.connect(gpa_wf, 'outputspec.cluster_threshold_zf',
               ds, 'stats.threshold')
    wf.connect(gpa_wf, 'outputspec.cluster_index_zf',
               ds, 'stats.clusterMap')
    wf.connect(gpa_wf, 'outputspec.cluster_localmax_txt_zf',
               ds, 'stats.clusterMap.@01')
    wf.connect(gpa_wf, 'outputspec.overlay_threshold_zf',
               ds, 'rendered')
    wf.connect(gpa_wf, 'outputspec.rendered_image_zf',
               ds, 'rendered.@01')
    wf.connect(gpa_wf, 'outputspec.cluster_threshold',
               ds, 'stats.threshold.@01')
    wf.connect(gpa_wf, 'outputspec.cluster_index',
               ds, 'stats.clusterMap.@02')
    wf.connect(gpa_wf, 'outputspec.cluster_localmax_txt',
               ds, 'stats.clusterMap.@03')
    wf.connect(gpa_wf, 'outputspec.overlay_threshold',
               ds, 'rendered.@02')
    wf.connect(gpa_wf, 'outputspec.rendered_image',
               ds, 'rendered.@03')

    ######################################

    # Run the actual group analysis workflow
    wf.run()

    print("\n\nWorkflow finished for model %s\n\n" % wf_name)