Example #1
import pandas as pd
import pingouin as pg
from scipy.stats import pearsonr


def cronbach_alpha_scale_if_deleted(df):
    # Cronbach's alpha for the full scale
    gca = pg.cronbach_alpha(df)
    rows = []
    for column in df:
        # Recompute the scale statistics with the current item removed
        sub_df = df.drop([column], axis=1)
        ac = pg.cronbach_alpha(sub_df)
        scale_mean = sub_df.mean().sum()
        variance = sub_df.sum(axis=1).var()
        # Corrected item-total correlation: item vs. mean of remaining items
        pr = pearsonr(sub_df.mean(axis=1), df[column])
        rows.append({
            'Item': column,
            "Scale Mean if Item Deleted": scale_mean,
            "Scale Variance if Item Deleted": variance,
            "Corrected Item-Total Correlation": pr[0],
            "Cronbach's Alpha if Item Deleted": ac[0]
        })
    # DataFrame.append is deprecated/removed, so build the table from a list
    result = pd.DataFrame(rows, columns=[
        "Item", "Scale Mean if Item Deleted", "Scale Variance if Item Deleted",
        "Corrected Item-Total Correlation", "Cronbach's Alpha if Item Deleted"
    ])
    return [gca, result]
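
A minimal usage sketch (not part of the original example), using synthetic item scores:

item_scores = pd.DataFrame({
    'q1': [4, 5, 3, 4, 5, 2],
    'q2': [3, 5, 4, 4, 4, 2],
    'q3': [4, 4, 3, 5, 5, 3],
})
overall_alpha, item_stats = cronbach_alpha_scale_if_deleted(item_scores)
print(overall_alpha)  # (alpha, 95% CI) as returned by pingouin
print(item_stats)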
Example #2
def benchmark_reproducibility(comb, modality, alg, sub_dict_clean, disc,
                              int_consist, final_missingness_summary):
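    # NOTE: this snippet assumes that `mets`, `ids`, `icc`, `gen_sub_vec`,
    # `discr_stat`, `flatten`, the sklearn imputers/scaler, numpy (`np`) and
    # pandas (`pd`) are imported or defined at module level in the source.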
    df_summary = pd.DataFrame(
        columns=['grid', 'modality', 'embedding', 'discriminability'])
    print(comb)
    df_summary.at[0, "modality"] = modality
    df_summary.at[0, "embedding"] = alg

    if modality == 'func':
        try:
            extract, hpass, model, res, atlas, smooth = comb
        except ValueError:
            print(f"Missing {comb}...")
            extract, hpass, model, res, atlas = comb
            smooth = '0'
        comb_tuple = (atlas, extract, hpass, model, res, smooth)
    else:
        directget, minlength, model, res, atlas, tol = comb
        comb_tuple = (atlas, directget, minlength, model, res, tol)

    df_summary.at[0, "grid"] = comb_tuple

    missing_sub_seshes = final_missingness_summary.loc[
        (final_missingness_summary['alg'] == alg) &
        (final_missingness_summary['modality'] == modality) &
        (final_missingness_summary['grid'] == comb_tuple)
    ].drop_duplicates(subset='id')

    # int_consist
    if int_consist is True and alg == 'topology':
        try:
            import pingouin as pg
        except ImportError:
            print("Cannot evaluate test-retest int_consist. pingouin"
                  " must be installed!")
        for met in mets:
            id_dict = {}
            for ID in ids:
                id_dict[ID] = {}
                for ses in sub_dict_clean[ID].keys():
                    if comb_tuple in sub_dict_clean[ID][ses][modality][
                            alg].keys():
                        id_dict[ID][ses] = \
                        sub_dict_clean[ID][ses][modality][alg][comb_tuple][
                            mets.index(met)][0]
            df_wide = pd.DataFrame(id_dict).T
            if df_wide.empty:
                del df_wide
                return pd.Series()
            df_wide = df_wide.add_prefix(f"{met}_visit_")
            df_wide.replace(0, np.nan, inplace=True)
            try:
                c_alpha = pg.cronbach_alpha(data=df_wide)
            except:
                print('FAILED...')
                print(df_wide)
                del df_wide
                return pd.Series()
            df_summary.at[0, f"cronbach_alpha_{met}"] = c_alpha[0]
            del df_wide

    # icc
    if icc is True and alg == 'topology':
        try:
            import pingouin as pg
        except ImportError:
            print("Cannot evaluate ICC. pingouin" " must be installed!")
        for met in mets:
            id_dict = {}
            dfs = []
            for ses in [str(i) for i in range(1, 11)]:
                for ID in ids:
                    id_dict[ID] = {}
                    if comb_tuple in sub_dict_clean[ID][ses][modality][
                            alg].keys():
                        id_dict[ID][ses] = \
                        sub_dict_clean[ID][ses][modality][alg][comb_tuple][
                            mets.index(met)][0]
                    df = pd.DataFrame(id_dict).T
                    if df.empty:
                        del df
                        return pd.Series()
                    df.columns.values[0] = f"{met}"
                    df.replace(0, np.nan, inplace=True)
                    df['id'] = df.index
                    df['ses'] = ses
                    df.reset_index(drop=True, inplace=True)
                    dfs.append(df)
            df_long = pd.concat(dfs, names=[
                'id', 'ses', f"{met}"
            ]).drop(columns=[str(i) for i in range(1, 10)])
            try:
                c_icc = pg.intraclass_corr(data=df_long,
                                           targets='id',
                                           raters='ses',
                                           ratings=f"{met}",
                                           nan_policy='omit').round(3)
                c_icc = c_icc.set_index("Type")
                df_summary.at[0, f"icc_{met}"] = pd.DataFrame(
                    c_icc.drop(
                        index=['ICC1', 'ICC2', 'ICC3'])['ICC']).mean()[0]
            except:
                print('FAILED...')
                print(df_long)
                del df_long
                return pd.Series()
            del df_long

    if disc is True:
        vect_all = []
        for ID in ids:
            try:
                out = gen_sub_vec(sub_dict_clean, ID, modality, alg,
                                  comb_tuple)
            except:
                print(f"{ID} {modality} {alg} {comb_tuple} failed...")
                continue
            # print(out)
            vect_all.append(out)
        vect_all = [
            i for i in vect_all if i is not None and not np.isnan(i).all()
        ]
        if len(vect_all) > 0:
            if alg == 'topology':
                X_top = np.swapaxes(np.hstack(vect_all), 0, 1)
                bad_ixs = [i[1] for i in np.argwhere(np.isnan(X_top))]
                for m in set(bad_ixs):
                    if (X_top.shape[0] - bad_ixs.count(m)) / \
                        X_top.shape[0] < 0.50:
                        X_top = np.delete(X_top, m, axis=1)
            else:
                if len(vect_all) > 0:
                    X_top = np.array(pd.concat(vect_all, axis=0))
                else:
                    return pd.Series()
            shapes = []
            for ix, i in enumerate(vect_all):
                shapes.append(i.shape[0] * [list(ids)[ix]])
            Y = np.array(list(flatten(shapes)))
            if alg == 'topology':
                imp = IterativeImputer(max_iter=50, random_state=42)
            else:
                imp = SimpleImputer()
            X_top = imp.fit_transform(X_top)
            scaler = StandardScaler()
            X_top = scaler.fit_transform(X_top)
            try:
                discr_stat_val, rdf = discr_stat(X_top, Y)
            except:
                return pd.Series()
            df_summary.at[0, "discriminability"] = discr_stat_val
            print(discr_stat_val)
            print("\n")
            # print(rdf)
            del discr_stat_val
        del vect_all
    return df_summary
Example #3
def main():
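    # NOTE: `outfile`, `f_getFilePath`, `f_powerTest`, `f_correlation`,
    # `f_scatterplot`, `moses` (presumably sklearn.model_selection), `pingouin`,
    # and `datetime` are assumed to be imported or defined elsewhere in the
    # project; this snippet is not self-contained.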

    #Initialize project
    print('Project: Iris Classification', file=outfile)
    print('Author: Aakriti Sinha', file=outfile)
    print('Last run on ', datetime.now(), file=outfile)

    #------------------------------------------------------------------------------
    #Raw Data
    #Get raw dataframe
    from data.make_dataset import df_iris

    #Describe raw data
    print('\nRaw Dataset Snapshot', file=outfile)
    print(df_iris.head(), '\n', file=outfile)
    print('\nRaw Data Description', file=outfile)
    print(df_iris.describe(), '\n', file=outfile)
    print('List of categories in categorical variable', file=outfile)
    print(df_iris['species'].unique(), '\n', file=outfile)

    #------------------------------------------------------------------------------

    #Data Cleaning
    #Get tidy dataframe
    from data.clean_data import df_iris, missing_message
    print(missing_message)

    #Describe clean data
    print('\n\nClean Dataset Snapshot', file=outfile)
    print(df_iris.head(), '\n', file=outfile)
    print('\nClean Data Description', file=outfile)
    data_desc = df_iris.describe()
    print(data_desc, '\n', file=outfile)
    print('List of categories in categorical variable', file=outfile)
    cat_list = df_iris['species'].unique()
    print(cat_list, '\n', file=outfile)
    print('Distribution of categories', file=outfile)
    cat_dist = df_iris.groupby('species').count()
    print(cat_dist, file=outfile)

    #Save clean data description report
    abs_file_path = f_getFilePath("reports\\iris_clean_description.txt")
    cleandescfile = open(abs_file_path, 'w')
    print('\nClean Data Description', file=cleandescfile)
    print(data_desc, '\n', file=cleandescfile)
    print('List of categories in categorical variable', file=cleandescfile)
    print(cat_list, '\n', file=cleandescfile)
    print('Distribution of categories', file=cleandescfile)
    print(cat_dist, file=cleandescfile)

    cleandescfile.close()

    #------------------------------------------------------------------------------

    #Test power of dataset

    f_powerTest(df_iris)

    #------------------------------------------------------------------------------

    #Feature Scaling
    print('\n\nFeature Scaling: Centering, Standardizing and Normalizing',
          file=outfile)
    from data.scale_data import df_iris
    #Describe scaled data
    print('\nScaled Dataset Snapshot', file=outfile)
    print(df_iris.head(), '\n', file=outfile)
    print('\nScaled Data Description', file=outfile)
    data_desc = df_iris.describe()
    print(data_desc, '\n', file=outfile)
    print('List of categories in categorical variable', file=outfile)
    cat_list = df_iris['species'].unique()
    print(cat_list, '\n', file=outfile)
    print('Distribution of categories', file=outfile)
    cat_dist = df_iris.groupby('species').count()
    print(cat_dist, file=outfile)

    #Save scaled data description report
    abs_file_path = f_getFilePath("reports\\iris_scaled_description.txt")
    scaledescfile = open(abs_file_path, 'w')
    print('\nScaled Data Description', file=scaledescfile)
    print(data_desc, '\n', file=scaledescfile)
    print('List of categories in categorical variable', file=scaledescfile)
    print(cat_list, '\n', file=scaledescfile)
    print('Distribution of categories', file=scaledescfile)
    print(cat_dist, file=scaledescfile)

    scaledescfile.close()

    #------------------------------------------------------------------------------

    #Check Correlation
    corr_csv_name = 'reports\\correlation.csv'
    corr_image_name = 'reports\\figures\\Correlation_Heatmap.png'
    correlation = f_correlation(df_iris, corr_csv_name, corr_image_name)
    #Scatterplot Matrix
    scplt_image_name = 'reports\\figures\\Scatterplot_Matrix.png'
    f_scatterplot(df_iris, scplt_image_name)

    print('\n**MULTICOLLINEARITY FOUND**', file=outfile)

    #------------------------------------------------------------------------------

    #Factor Analysis
    print('\nFACTOR ANALYSIS\n', file=outfile)

    #Testing factorability
    #        f_testFactorability(df_iris, correlation)
    from features.factor_analysis import df_iris_scores
    df_iris_scores['species'] = df_iris['species']

    #Check Correlation
    corr_csv_name = 'reports\\correlation_factors.csv'
    corr_image_name = 'reports\\figures\\Correlation_Heatmap_Factors.png'
    correlation = f_correlation(df_iris_scores, corr_csv_name, corr_image_name)
    #Scatterplot Matrix
    scplt_image_name = 'reports\\figures\\Scatterplot_Matrix_Factors.png'
    f_scatterplot(df_iris_scores, scplt_image_name)

    print(
        '\n**Factor 2 has low correlation with Species. So dropping Factor 2**\n',
        file=outfile)
    df_iris_scores = df_iris_scores.drop('Factor2', axis=1)

    #Save selected feature scores
    abs_file_path = f_getFilePath("data\\processed\\iris_scores.csv")
    df_iris_scores.to_csv(abs_file_path, index=False, encoding='utf-8')

    #Describe selected features
    print('\nSelected Features Snapshot', file=outfile)
    print(df_iris_scores.head(), '\n', file=outfile)
    print('\nSelected Features Description', file=outfile)
    data_desc = df_iris_scores.describe()
    print(data_desc, file=outfile)
    print('List of categories in categorical variable', file=outfile)
    cat_list = df_iris['species'].unique()
    print(cat_list, '\n', file=outfile)
    print('Distribution of categories', file=outfile)
    cat_dist = df_iris.groupby('species').count()
    print(cat_dist, file=outfile)

    #Save selected factors description report
    abs_file_path = f_getFilePath('reports\\iris_factors_description.txt')
    fadescfile = open(abs_file_path, 'w')
    print(data_desc, file=fadescfile)
    print('\nList of categories in categorical variable\n',
          cat_list,
          file=fadescfile)
    print('\nDistribution of categories\n', cat_dist, file=fadescfile)
    print('\nCronbach Alpha: ',
          pingouin.cronbach_alpha(df_iris_scores),
          file=fadescfile)
    fadescfile.close()

    #------------------------------------------------------------------------------

    #Model Development

    #Train-Test Split
    print(cat_list)
    print(df_iris_scores.iloc[:, :-1].head())
    print(df_iris_scores.iloc[:, -1].head())
    #        train_x, test_x, train_y, test_y = moses.train_test_split(df_iris_scores.iloc[:,:-1], df_iris_scores.iloc[:,-1], train_size=0.7, test_size=0.3, random_state = 42)

    train_x, test_x, train_y, test_y = moses.train_test_split(
        df_iris_scores.iloc[:, :-1],
        df_iris_scores.iloc[:, -1],
        train_size=0.7,
        test_size=0.3,
        random_state=42,
        stratify=df_iris_scores.iloc[:, -1])
    abs_file_path = f_getFilePath("data\\processed\\iris_train_x.csv")
    train_x.to_csv(abs_file_path, index=False, encoding='utf-8')
    abs_file_path = f_getFilePath("data\\processed\\iris_train_y.csv")
    train_y.to_csv(abs_file_path,
                   header=['species'],
                   index=False,
                   encoding='utf-8')
    abs_file_path = f_getFilePath("data\\processed\\iris_test_x.csv")
    test_x.to_csv(abs_file_path, index=False, encoding='utf-8')
    abs_file_path = f_getFilePath("data\\processed\\iris_test_y.csv")
    test_y.to_csv(abs_file_path,
                  header=['species'],
                  index=False,
                  encoding='utf-8')

    #Train model
    import models.train_model
Example #4
def main():
    import sys
    import os
    from datetime import datetime
    from joblib import Parallel, delayed
    import tempfile
    import dill
    import numpy as np
    import pandas as pd
    from pynets.statistics.utils import make_subject_dict, cleanNullTerms, \
        get_ensembles_top, get_ensembles_embedding, \
        build_grid
    from colorama import Fore, Style
    try:
        import pynets
    except ImportError:
        print(
            "PyNets not installed! Ensure that you are referencing the correct"
            " site-packages and using Python3.6+")

    if len(sys.argv) < 1:
        print("\nMissing command-line inputs! See help options with the -h"
              " flag.\n")
        sys.exit(1)

    # Parse inputs
    #base_dir = '/scratch/04171/dpisner/HNU/HNU_outs/triple'
    #base_dir = '/scratch/04171/dpisner/HNU/HNU_outs/outputs_language'
    #base_dir = '/working/hcp_test_retest'
    base_dir = '/media/dpys/data/HCP_trt'
    thr_type = "MST"
    icc = False
    disc = True
    int_consist = False
    modality = 'func'

    embedding_types = ['ASE', 'OMNI', 'betweenness', 'eigenvector']
    parcellations = ['intersection', 'language', 'ventral', 'union']
    # template = 'CN200'
    template = 'MNI152_T1'
    mets = []

    metaparams_func = [
        "parcellation", "granularity", "model", 'hpass', 'signal', 'tol'
    ]
    metaparams_dwi = [
        "parcellation", "granularity", "model", 'traversal', 'minlength', 'tol'
    ]

    #sessions = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
    sessions = ['1', '2']
    ####

    print(f"{Fore.LIGHTBLUE_EX}\nBenchmarking API\n")

    print(Style.RESET_ALL)

    print(f"{Fore.LIGHTGREEN_EX}Gathering sampled data...")

    print(Style.RESET_ALL)

    for embedding_type in embedding_types:
        subject_dict_file_path = (
            f"{base_dir}/pynets_subject_dict_{modality}_"
            f"{embedding_type}_{template}_{parcellations}.pkl")
        subject_mod_grids_file_path = (
            f"{base_dir}/pynets_modality_grids_{modality}_"
            f"{embedding_type}_{template}_{parcellations}.pkl")
        missingness_summary = (
            f"{base_dir}/pynets_missingness_summary_{modality}_"
            f"{embedding_type}_{template}_{parcellations}.csv")
        icc_tmps_dir = f"{base_dir}/icc_tmps/{parcellations}_{modality}_" \
                       f"{embedding_type}"
        os.makedirs(icc_tmps_dir, exist_ok=True)
        if not os.path.isfile(subject_dict_file_path):
            subject_dict, modality_grids, missingness_frames = \
                make_subject_dict(
                    [modality], base_dir, thr_type, mets, [embedding_type],
                    template, sessions, parcellations
                )
            sub_dict_clean = cleanNullTerms(subject_dict)
            missingness_frames = [
                i for i in missingness_frames if isinstance(i, pd.DataFrame)
            ]
            if len(missingness_frames) > 0:
                if len(missingness_frames) > 1:
                    final_missingness_summary = pd.concat(missingness_frames)
                else:
                    final_missingness_summary = missingness_frames[0]
                final_missingness_summary.to_csv(
                    missingness_summary, index=False)
                final_missingness_summary.id = \
                    final_missingness_summary.id.astype(
                        'str').str.split('_', expand=True)[0]
            else:
                final_missingness_summary = pd.Series()
            with open(subject_dict_file_path, "wb") as f:
                dill.dump(sub_dict_clean, f)
            f.close()
            with open(subject_mod_grids_file_path, "wb") as f:
                dill.dump(modality_grids, f)
            f.close()
        else:
            with open(subject_dict_file_path, 'rb') as f:
                sub_dict_clean = dill.load(f)
            f.close()
            with open(subject_mod_grids_file_path, "rb") as f:
                modality_grids = dill.load(f)
            f.close()
            if os.path.isfile(missingness_summary):
                final_missingness_summary = pd.read_csv(missingness_summary)
                final_missingness_summary.id = \
                    final_missingness_summary.id.astype('str').str.split(
                        '_', expand=True)[0]
            else:
                final_missingness_summary = pd.Series()
        ids = sub_dict_clean.keys()

        # print(f"MODALITY: {modality}")
        metaparams = eval(f"metaparams_{modality}")
        metaparam_dict = {}

        # print(f"EMBEDDING TYPE: {embedding_type}")
        # if os.path.isfile(f"{base_dir}/grid_clean_{modality}_{alg}.csv"):
        #     continue

        if embedding_type == 'topology':
            ensembles, df_top = get_ensembles_top(modality, thr_type,
                                                  f"{base_dir}/pynets")
        else:
            ensembles = get_ensembles_embedding(modality, embedding_type,
                                                base_dir)
        grid = build_grid(modality, metaparam_dict,
                          sorted(list(set(metaparams))), ensembles)[1]

        grid = [i for i in grid if any(n in i for n in parcellations)]

        good_grids = []
        for grid_param in grid:
            grid_finds = []
            for ID in ids:
                if ID not in sub_dict_clean.keys():
                    print(f"ID: {ID} not found...")
                    continue

                if str(sessions[0]) not in sub_dict_clean[ID].keys():
                    print(f"Session: {sessions[0]} not found for ID {ID}...")
                    continue

                if modality not in sub_dict_clean[ID][str(sessions[0])].keys():
                    print(f"Modality: {modality} not found for ID {ID}, "
                          f"ses-{sessions[0]}...")
                    continue

                if embedding_type not in \
                    sub_dict_clean[ID][str(sessions[0])][modality].keys():
                    print(f"Modality: {modality} not found for ID {ID}, "
                          f"ses-{sessions[0]}, {embedding_type}...")
                    continue

                if grid_param in \
                    list(sub_dict_clean[ID][str(sessions[0])][modality][
                             embedding_type].keys()):
                    grid_finds.append(grid_param)
            if len(grid_finds) < 0.75 * len(ids):
                print(f"Less than 75% of {grid_param} found. Removing from "
                      f"grid...")
                continue
            else:
                good_grids.append(grid_param)

        modality_grids[modality] = good_grids

        cache_dir = tempfile.mkdtemp()

        with Parallel(n_jobs=-1,
                      backend='loky',
                      verbose=10,
                      max_nbytes='200000M',
                      temp_folder=cache_dir) as parallel:
            outs = parallel(
                delayed(benchmark_reproducibility)
                (base_dir, comb, modality, embedding_type, sub_dict_clean,
                 disc, final_missingness_summary, icc_tmps_dir, icc, mets, ids,
                 template) for comb in grid)
        # outs = []
        # for comb in grid:
        #     outs.append(benchmark_reproducibility(
        #             base_dir, comb, modality, embedding_type, sub_dict_clean,
        #             disc, final_missingness_summary, icc_tmps_dir, icc,
        #             mets, ids, template
        #         ))

        df_summary = pd.concat(
            [i for i in outs if i is not None and not i.empty], axis=0)
        df_summary = df_summary.dropna(axis=0, how='all')
        print(f"Saving to {base_dir}/grid_clean_{modality}_{embedding_type}_"
              f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}.csv...")
        df_summary.to_csv(
            f"{base_dir}"
            f"/grid_clean_{modality}_{embedding_type}_{parcellations}_"
            f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}"
            f".csv",
            index=False)

        # int_consist
        if int_consist is True and embedding_type == 'topology':
            try:
                import pingouin as pg
            except ImportError:
                print("Cannot evaluate test-retest int_consist. pingouin"
                      " must be installed!")

            df_summary_cronbach = pd.DataFrame(
                columns=['modality', 'embedding', 'cronbach'])
            df_summary_cronbach.at[0, "modality"] = modality
            df_summary_cronbach.at[0, "embedding"] = embedding_type

            for met in mets:
                cronbach_ses_list = []
                for ses in range(1, len(sessions)):
                    id_dict = {}
                    for ID in ids:
                        id_dict[ID] = {}
                        for comb in grid:
                            if modality == 'func':
                                try:
                                    signal, hpass, model, granularity, atlas, \
                                        tol = comb
                                except BaseException:
                                    print(f"Missing {comb}...")
                                    signal, hpass, model, granularity, atlas = comb
                                    tol = '0'
                                comb_tuple = (atlas, signal, hpass, model,
                                              granularity, tol)
                            else:
                                traversal, minlength, model, granularity, atlas, \
                                    tol = comb
                                comb_tuple = (atlas, traversal, minlength,
                                              model, granularity, tol)
                            if comb_tuple in sub_dict_clean[ID][str(
                                    ses)][modality][embedding_type].keys():
                                if isinstance(
                                        sub_dict_clean[ID][str(ses)][modality]
                                    [embedding_type][comb_tuple], np.ndarray):
                                    id_dict[ID][comb] = sub_dict_clean[ID][str(
                                        ses)][modality][embedding_type][
                                            comb_tuple][mets.index(met)][0]
                                else:
                                    continue
                            else:
                                continue
                    df_wide = pd.DataFrame(id_dict)
                    if df_wide.empty is True:
                        continue
                    else:
                        df_wide = df_wide.add_prefix(f"{met}_comb_")
                        df_wide.replace(0, np.nan, inplace=True)
                        print(df_wide)
                    try:
                        c_alpha = pg.cronbach_alpha(data=df_wide.dropna(
                            axis=1, how='all'),
                                                    nan_policy='listwise')
                        cronbach_ses_list.append(c_alpha[0])
                    except BaseException:
                        print('FAILED...')
                        print(df_wide)
                        del df_wide
                    del df_wide
                df_summary_cronbach.at[0, f"average_cronbach_{met}"] = \
                    np.nanmean(cronbach_ses_list)
            print(f"Saving to {base_dir}/grid_clean_{modality}_"
                  f"{embedding_type}_cronbach_"
                  f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}.csv...")
            df_summary_cronbach.to_csv(
                f"{base_dir}/grid_clean_{modality}_"
                f"{embedding_type}_cronbach"
                f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}"
                f".csv",
                index=False)

    return
Example #5
def main():
    import sys
    import os
    from datetime import datetime
    from joblib import Parallel, delayed
    import tempfile
    import dill
    import numpy as np
    import pandas as pd
    from pynets.stats.utils import make_subject_dict, cleanNullTerms, \
        get_ensembles_top, get_ensembles_embedding, \
        build_grid
    from colorama import Fore, Style
    try:
        import pynets
    except ImportError:
        print(
            "PyNets not installed! Ensure that you are referencing the correct"
            " site-packages and using Python3.6+")

    if len(sys.argv) < 1:
        print("\nMissing command-line inputs! See help options with the -h"
              " flag.\n")
        sys.exit(1)

    #### Parse inputs
    base_dir = '/scratch/04171/dpisner/HNU/HNU_outs/archives/triple_network'
    #base_dir = '/scratch/04171/dpisner/HNU/HNU_outs/outputs_language'
    thr_type = "MST"
    icc = True
    disc = True
    int_consist = False
    modality = 'func'

    embedding_types = ['betweenness']
    #rsns = ['language']
    rsns = ['triple', 'kmeans']
    template = 'CN200'
    # template = 'MNI152_T1'
    mets = [
        "global_efficiency", "average_shortest_path_length",
        "degree_assortativity_coefficient", "average_betweenness_centrality",
        "average_eigenvector_centrality", "smallworldness", "modularity"
    ]

    metaparams_func = ["rsn", "res", "model", 'hpass', 'extract', 'smooth']
    metaparams_dwi = ["rsn", "res", "model", 'directget', 'minlength', 'tol']

    sessions = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
    ####

    print(f"{Fore.LIGHTBLUE_EX}\nBenchmarking API\n")

    print(Style.RESET_ALL)

    print(f"{Fore.LIGHTGREEN_EX}Gathering sampled data...")

    print(Style.RESET_ALL)

    for embedding_type in embedding_types:
        subject_dict_file_path = (f"{base_dir}/pynets_subject_dict_{modality}_"
                                  f"{embedding_type}_{template}.pkl")
        subject_mod_grids_file_path = (
            f"{base_dir}/pynets_modality_grids_{modality}_"
            f"{embedding_type}_{template}.pkl")
        missingness_summary = (
            f"{base_dir}/pynets_missingness_summary_{modality}_"
            f"{embedding_type}_{template}.csv")
        icc_tmps_dir = f"{base_dir}/icc_tmps/{modality}_" \
                       f"{embedding_type}"
        os.makedirs(icc_tmps_dir, exist_ok=True)
        if not os.path.isfile(subject_dict_file_path):
            subject_dict, modality_grids, missingness_frames = make_subject_dict(
                [modality], base_dir, thr_type, mets, [embedding_type],
                template, sessions, rsns)
            sub_dict_clean = cleanNullTerms(subject_dict)
            missingness_frames = [
                i for i in missingness_frames if isinstance(i, pd.DataFrame)
            ]
            if len(missingness_frames) > 0:
                if len(missingness_frames) > 1:
                    final_missingness_summary = pd.concat(missingness_frames)
                else:
                    final_missingness_summary = missingness_frames[0]
                final_missingness_summary.to_csv(
                    missingness_summary, index=False)
                final_missingness_summary.id = \
                    final_missingness_summary.id.astype(
                        'str').str.split('_', expand=True)[0]
            else:
                final_missingness_summary = pd.Series()
            with open(subject_dict_file_path, "wb") as f:
                dill.dump(sub_dict_clean, f)
            f.close()
            with open(subject_mod_grids_file_path, "wb") as f:
                dill.dump(modality_grids, f)
            f.close()
        else:
            with open(subject_dict_file_path, 'rb') as f:
                sub_dict_clean = dill.load(f)
            f.close()
            with open(subject_mod_grids_file_path, "rb") as f:
                modality_grids = dill.load(f)
            f.close()
            if os.path.isfile(missingness_summary):
                final_missingness_summary = pd.read_csv(missingness_summary)
                final_missingness_summary.id = final_missingness_summary.id.astype(
                    'str').str.split('_', expand=True)[0]
            else:
                final_missingness_summary = pd.Series()
        ids = sub_dict_clean.keys()

        # print(f"MODALITY: {modality}")
        metaparams = eval(f"metaparams_{modality}")
        metaparam_dict = {}

        # print(f"EMBEDDING TYPE: {embedding_type}")
        # if os.path.isfile(f"{base_dir}/grid_clean_{modality}_{alg}.csv"):
        #     continue

        if embedding_type == 'topology':
            ensembles, df_top = get_ensembles_top(modality, thr_type,
                                                  f"{base_dir}/pynets")
        else:
            ensembles = get_ensembles_embedding(modality, embedding_type,
                                                base_dir)
        grid = build_grid(modality, metaparam_dict,
                          sorted(list(set(metaparams))), ensembles)[1]

        grid = [
            i for i in grid if '200' not in i and '400' not in i
            and '600' not in i and '800' not in i
        ]

        if modality == "func":
            modality_grids[modality] = grid
        else:
            modality_grids[modality] = grid

        cache_dir = tempfile.mkdtemp()

        with Parallel(n_jobs=-1,
                      require="sharedmem",
                      backend='threading',
                      verbose=10,
                      max_nbytes='200000M',
                      temp_folder=cache_dir) as parallel:
            outs = parallel(
                delayed(benchmark_reproducibility)
                (base_dir, comb, modality, embedding_type, sub_dict_clean,
                 disc, final_missingness_summary, icc_tmps_dir, icc, mets, ids,
                 template) for comb in grid)
        # outs = []
        # for comb in grid:
        #     outs.append(benchmark_reproducibility(base_dir, comb, modality, embedding_type, sub_dict_clean,
        #             disc, final_missingness_summary, icc_tmps_dir, icc,
        #             mets, ids))

        df_summary = pd.concat(
            [i for i in outs if i is not None and not i.empty], axis=0)
        df_summary = df_summary.dropna(axis=0, how='all')
        print(f"Saving to {base_dir}/grid_clean_{modality}_{embedding_type}_"
              f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}.csv...")
        df_summary.to_csv(
            f"{base_dir}"
            f"/grid_clean_{modality}_{embedding_type}_"
            f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}.csv",
            index=False)

        # int_consist
        if int_consist is True and embedding_type == 'topology':
            try:
                import pingouin as pg
            except ImportError:
                print("Cannot evaluate test-retest int_consist. pingouin"
                      " must be installed!")

            df_summary_cronbach = pd.DataFrame(
                columns=['modality', 'embedding', 'cronbach'])
            df_summary_cronbach.at[0, "modality"] = modality
            df_summary_cronbach.at[0, "embedding"] = embedding_type

            for met in mets:
                cronbach_ses_list = []
                for ses in range(1, 10):
                    id_dict = {}
                    for ID in ids:
                        id_dict[ID] = {}
                        for comb in grid:
                            if modality == 'func':
                                try:
                                    extract, hpass, model, res, atlas, smooth = comb
                                except BaseException:
                                    print(f"Missing {comb}...")
                                    extract, hpass, model, res, atlas = comb
                                    smooth = '0'
                                comb_tuple = (atlas, extract, hpass, model,
                                              res, smooth)
                            else:
                                directget, minlength, model, res, atlas, tol = comb
                                comb_tuple = (atlas, directget, minlength,
                                              model, res, tol)
                            if comb_tuple in sub_dict_clean[ID][str(
                                    ses)][modality][embedding_type].keys():
                                if isinstance(
                                        sub_dict_clean[ID][str(ses)][modality]
                                    [embedding_type][comb_tuple], np.ndarray):
                                    id_dict[ID][comb] = sub_dict_clean[ID][str(
                                        ses)][modality][embedding_type][
                                            comb_tuple][mets.index(met)][0]
                                else:
                                    continue
                            else:
                                continue
                    df_wide = pd.DataFrame(id_dict)
                    if df_wide.empty is True:
                        continue
                    else:
                        df_wide = df_wide.add_prefix(f"{met}_comb_")
                        df_wide.replace(0, np.nan, inplace=True)
                        print(df_wide)
                    try:
                        c_alpha = pg.cronbach_alpha(data=df_wide.dropna(
                            axis=1, how='all'),
                                                    nan_policy='listwise')
                        cronbach_ses_list.append(c_alpha[0])
                    except BaseException:
                        print('FAILED...')
                        print(df_wide)
                        del df_wide
                    del df_wide
                df_summary_cronbach.at[0,
                                       f"average_cronbach_{met}"] = np.nanmean(
                                           cronbach_ses_list)
            print(
                f"Saving to {base_dir}/grid_clean_{modality}_{embedding_type}_cronbach_"
                f"{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}.csv...")
            df_summary_cronbach.to_csv(
                f"{base_dir}/grid_clean_{modality}_{embedding_type}_cronbach{datetime.today().strftime('%Y-%m-%d-%H:%M:%S')}.csv",
                index=False)

    return
Example #6
fa.fit(x)
loads = fa.loadings_
pd.DataFrame.from_records(loads)
#A, P, BI, O
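
# A hedged sketch (not in the original notebook): label the loading matrix with
# the intended constructs, assuming `x` is the item DataFrame and the factor
# analyzer `fa` was fit with four factors corresponding to A, P, BI, and O.
loadings_df = pd.DataFrame(loads, index=x.columns,
                           columns=['A', 'P', 'BI', 'O'])
print(loadings_df.round(3))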

# In[ ]:

#Cronbach's alpha - new method

#Create the factors
factor1 = x[['A1', 'A2', 'A3', 'A4']]
factor2 = x[['P1', 'P2', 'P3', 'P4']]
factor3 = x[['BI1', 'BI2', 'BI3', 'BI4']]
factor4 = x[['O1', 'O2', 'O3']]
#Get Cronbach's alpha
factor1_alpha = pg.cronbach_alpha(factor1)
factor2_alpha = pg.cronbach_alpha(factor2)
factor3_alpha = pg.cronbach_alpha(factor3)
factor4_alpha = pg.cronbach_alpha(factor4)
print(factor1_alpha, factor2_alpha, factor3_alpha, factor4_alpha)

#The alphas evaluated are 0.84, 0.68, 0.86, and 0.65

# # Covariance: ANCOVA
# https://www.statology.org/ancova-python/
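
# In[ ]:

# A minimal ANCOVA sketch following the statology recipe linked above, using
# pingouin.ancova (not part of the original notebook); the DataFrame below is
# synthetic placeholder data.
ancova_df = pd.DataFrame({
    'group': ['a'] * 5 + ['b'] * 5,
    'pre_score': [6, 7, 5, 8, 6, 5, 6, 4, 7, 5],
    'post_score': [8, 9, 7, 10, 8, 6, 7, 5, 8, 6],
})
aov = pg.ancova(data=ancova_df, dv='post_score', between='group',
                covar='pre_score')
print(aov)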

# # Hypothesis Testing using Pearson

# In[ ]:
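
# A hedged sketch (not in the original notebook): Pearson correlation test
# between two questionnaire items from `x`, chosen purely for illustration.
pearson_res = pg.corr(x['A1'], x['BI1'], method='pearson')
print(pearson_res[['r', 'p-val', 'CI95%']])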