def main():
    keys = [
        'cortical_thickness',
        'subcortical_volume',
        'dat_scans',
        'csf_assays',
        'behavioral_measures'
    ]

    # load processed data
    fname = op.join(directories.snf, 'scale500_deterministic.h5')
    hdf = structures.Frog(fname)
    data = [hdf.load(f'/processed/pd_{key}') for key in keys]

    # compare clustering + embedding results for ALL vs NO-MRI data
    print('=' * 80)
    print('Comparing SNF outputs with and without MRI features\n')
    all_consensus, nomri_consensus = compare_nomri_clusters(hdf)

    # generate demographic dataframe with NO-MRI cluster labels
    demographics = hdf.load('/raw/pd_demographics').reset_index()
    demographics = demographics.assign(cluster=pd.Categorical(nomri_consensus))

    # run one-way and two-way ANOVA to assess cluster discriminability
    print('\n' + '=' * 80)
    print('Testing no-MRI cluster differences in PD-ICA atrophy score\n')
    run_pdatrophy_anova(demographics)

    print('\n' + '=' * 80)
    print('Running mass-univariate ANOVA for no-MRI cluster differences\n')
    run_univariate_anova(data, demographics, run_tukey=False)

    print('\n' + '=' * 80)
    print('Running two-way ANOVA comparing SNF w/ and w/o MRI features\n')
    run_twoway_anova(data, all_consensus, nomri_consensus)
def main():
    # load HDF file
    fname = op.join(directories.snf, 'scale500_deterministic.h5')
    hdf = structures.Frog(fname)

    print('=' * 80)
    print('Calculating supplementary cluster demographic information\n')
    get_cluster_demographics(hdf)

    print('\n' + '=' * 80)
    print('Comparing Fereshtehnejad et al., 2017 clustering results\n')
    compare_fereshtehnejad2017(hdf)

    print('\n' + '=' * 80)
    print('Comparing alternative distance metrics\n')
    compare_alt_distance(hdf)

    print('\n' + '=' * 80)
    print('Comparing SNF results across regions of hyperparameter space\n')
    compare_hyperparameter_variation(hdf)

    print('\n' + '=' * 80)
    print('Comparing SNF results of HC & PD clustering with ground truth\n')
    fig = compare_hcpd_snf(hdf)
    if SAVE_FIGS:
        fname = op.join(directories.figs, 'pdhc_clustering')
        utils.savefig(fname, fig)
def load_and_residualize_data(hdf, groups=None):
    """
    Loads raw data and returns residualized outputs

    Parameters
    ----------
    hdf : structures.Frog
        HDF5 file with saved data
    groups : list of str, optional
        Which groups to load. Must be in ['pd', 'hc']. If not specified all
        groups are loaded. Default: None

    Returns
    -------
    data : list of pandas.DataFrame
        Residualized data
    """

    if isinstance(hdf, str):
        hdf = structures.Frog(hdf)

    if groups is None:
        groups = ['pd', 'hc']

    raw_data = [
        pd.concat([hdf[f'/raw/{group}_{key}'] for group in groups])
        for key in KEYS
    ]
    regressors = [
        pd.concat([hdf[f'/regressors/{group}_{key}'] for group in groups])
        for key in KEYS
    ]

    proc_data = []
    for data, reg in zip(raw_data, regressors):
        resid = pd.DataFrame(stats.residualize(reg, data, normalize=False),
                             index=data.index,
                             columns=data.columns)
        proc_data.append(resid)

    return proc_data
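# Hedged usage sketch (assumes the scale500 HDF5 file produced by the
# preprocessing script exists on disk; one residualized DataFrame is returned
# per entry in KEYS):
#
#     hdf = structures.Frog(op.join(directories.snf, 'scale500_deterministic.h5'))
#     pd_only = load_and_residualize_data(hdf, groups=['pd'])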
def main():
    # N.B. this will NOT work unless you set the environment variables
    #      $PPMI_USER and $PPMI_PASSWORD prior to running this script.
    #      these variables must be the username and password you received when
    #      registering for the PPMI. for more information on data access see:
    #      https://www.ppmi-info.org/access-data-specimens/download-data/
    pypmi.fetch_studydata('all', path=directories.ppmi, overwrite=False)

    # load demographic data and keep only individuals with PD and healthy
    # control (HC) individuals. we'll use the information in this data frame
    # to residualize our data against different variables (e.g., age, gender)
    print('Loading demographics information...')
    demographics = pypmi.load_demographics(directories.ppmi) \
                        .query('diagnosis in ["pd", "hc"]') \
                        .set_index('participant')
    demographics['family_history'] = demographics['family_history'].astype(bool)

    # load all non-MRI data
    print('Loading all non-MRI data (this step may take some time)...')
    datscan = pypmi.load_datscan(directories.ppmi, measures='all')
    biospec = pypmi.load_biospecimen(directories.ppmi, measures='all')
    behavior = pypmi.load_behavior(directories.ppmi, measures='all')

    # sometimes, because of how PPMI data were collected, there are slight
    # variations in the recorded date for the same visit, resulting in scores
    # for a single visit being split across two or more rows in the dataframe
    # (i.e., one row might have MoCA scores for visit "V01" and the other has
    # UPDRS scores for visit "V01")
    # to remedy this we use the pandas `DataFrame.combine_first()` method, merging
    # scores from both rows and retaining the earliest date as the "true" date
    # (dates were generally only ~1 month different and if that difference
    # makes a significant impact on our results then I quit)
    print('Wrangling non-MRI data into a usable format...')
    first = behavior.drop_duplicates(['participant', 'visit'], 'first') \
                    .reset_index(drop=True)
    last = behavior.drop_duplicates(['participant', 'visit'], 'last') \
                   .reset_index(drop=True)
    behavior = first.combine_first(last)
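    # a hedged illustration with hypothetical values: if `first` holds
    # (participant=1, visit="V01", moca=27.0, updrs=NaN) and `last` holds
    # (participant=1, visit="V01", moca=NaN, updrs=22.0), then
    # first.combine_first(last) yields (moca=27.0, updrs=22.0) -- each missing
    # entry in `first` is filled with the value from `last` at that location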

    # get first visit scores for non-MRI data
    datscan, dat_date = get_visit(datscan, list(demographics.index), visit='SC')
    biospec, bio_date = get_visit(biospec, list(demographics.index), visit='BL')

    # behavioral data acquisition was split across screening + baseline visits
    # so we need to take the earliest visit for each measure
    # that is, not all measures were collected at screening so we need to use
    # the baseline visit scores for those measures
    # unfortunately which visit various measures were initially collected at
    # DIFFERED for PD and HC individuals, so we need to do this separately for
    # the two groups and then merge them back together... ¯\_(ツ)_/¯
    beh, beh_dates = [], []
    for diagnosis in ['pd', 'hc']:
        participants = demographics.query(f'diagnosis == "{diagnosis}"').index
        beh_sc, beh_date = get_visit(behavior, list(participants), visit='SC')
        beh_bl, _ = get_visit(behavior, list(participants), visit='BL')
        drop = np.intersect1d(beh_sc.columns, beh_bl.columns)
        beh += [pd.merge(beh_sc, beh_bl.drop(drop, axis=1), on='participant')]
        beh_dates += [beh_date]
    behavior = pd.concat(beh, join='inner')
    beh_date = pd.concat(beh_dates, join='inner')

    # iterate through all combinations of cortical + subcortical parcellations
    # note: there's only one subcortical parcellation (we had considered doing
    # more but the number of good subcortical parcellations is...limited)
    cth_data = sorted(glob.glob(op.join(directories.parcels, '*thickness.npy')))
    vol_data = sorted(glob.glob(op.join(directories.parcels, '*volume.npy')))
    for cth, vol in itertools.product(cth_data, vol_data):

        # determine which cortical / subcortical parcellation combo we're
        # using; this determines the name of the output file, which encodes
        # the resolution of the cortical parcellation and the datatype of the
        # subcortical parcellation
        (scale, ) = re.search(r'res-(\d+)', cth).groups()
        (dtype, ) = re.search(r'_hemi-both_(\S+)_', vol).groups()
        hdf = structures.Frog(op.join(directories.snf,
                                      f'scale{scale}_{dtype}.h5'))
        print(f'Loading MRI data for {op.basename(hdf.filename)}...')

        # load parcellated cortical thickness data
        ct_parc = nndata.fetch_cammoun2012(data_dir=directories.rois,
                                           verbose=0)['info']
        ct_parc = pd.read_csv(ct_parc).query(f'scale == "scale{scale}" '
                                             '& structure == "cortex"')
        ct_parc['label'] = (ct_parc['label'] + '_'
                            + ct_parc['hemisphere'].apply(str.lower))
        cortthick, cth_date = get_parcels(cth, session=1, return_date=True,
                                          parcellation=ct_parc)

        # load parcellated subcortical volume data
        sv_parc = nndata.fetch_pauli2018(data_dir=directories.rois,
                                         verbose=0)['info']
        sv_parc = pd.read_csv(sv_parc)
        subvolume, vol_date = get_parcels(vol, session=1, return_date=True,
                                          parcellation=sv_parc)

        # perform batch correction on MRI data
        # first, grab the demographics of subjects for whom we have neuro data.
        # then, remove all sites where we only have data from one subject since
        # we cannot generate batch correction parameters in these instances.
        # finally, perform the actual batch correction using `neurocombat`
        cortthick, subvolume, demo = \
            preprocess.intersect_subjects(cortthick, subvolume, demographics)
        sites, counts = np.unique(demo['site'], return_counts=True)
        demo = demo[demo['site'].isin(sites[counts > 1])]
        cortthick, subvolume, demo = \
            preprocess.intersect_subjects(cortthick, subvolume, demo)
        cortthick.iloc[:, :] = batch_correct(cortthick, demo)
        subvolume.iloc[:, :] = batch_correct(subvolume, demo)

        # only keep subjects for whom we have all datatypes
        # we preprocess HC and PD data separately because part of the process
        # involves imputation and we want to impute missing data using values
        # from each diagnostic group, separately
        data = [cortthick, subvolume, datscan, biospec, behavior]
        *data, demo = preprocess.intersect_subjects(*data, demo)
        hc_data, hc_demo = snfprep(data, demo.query('diagnosis == "hc"'))
        pd_data, pd_demo = snfprep(data, demo.query('diagnosis == "pd"'))

        # only keep features for which we have both PD and HC data
        for n, (hc_dtype, pd_dtype) in enumerate(zip(hc_data, pd_data)):
            cols = np.intersect1d(hc_dtype.columns, pd_dtype.columns)
            hc_data[n], pd_data[n] = hc_data[n][cols], pd_data[n][cols]

        # "regress out" age, gender, age x gender interactions (and total
        # estimated intracranial volume, if MRI data) from all data.
        # we also want to save all this data to disk so we can load it easily
        # in the future! do that for all the raw data, regressor matrices, and
        # processed (i.e., residualized) data
        # we do this because we don't want these sorts of things to bias our
        # initial analyses when creating the fused networks
        keys = [
            'cortical_thickness',
            'subcortical_volume',
            'dat_scans',
            'csf_assays',
            'behavioral_measures'
        ]
        dates = [cth_date, vol_date, dat_date, bio_date, beh_date]
        for grp, dataset, demo in zip(['pd', 'hc'],
                                      [pd_data, hc_data],
                                      [pd_demo, hc_demo]):
            hdf.save(demo, f'/raw/{grp}_demographics', overwrite=False)
            for n, (df, key, date) in enumerate(zip(dataset, keys, dates)):
                reg = gen_regressors(date, demo)

                # get comparative regressors / data (this is always healthy
                # individuals -- we use them to estimate the betas for the
                # residualization process)
                comp_reg, comp_df = gen_regressors(date, hc_demo), hc_data[n]

                resid = nnstats.residualize(reg, df, comp_reg, comp_df,
                                            normalize=False)
                resid = pd.DataFrame(resid, index=df.index, columns=df.columns)

                hdf.save(df, f'/raw/{grp}_{key}', overwrite=False)
                hdf.save(reg, f'/regressors/{grp}_{key}', overwrite=False)
                hdf.save(resid, f'/processed/{grp}_{key}', overwrite=False)
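# A minimal numpy sketch of the residualization idea used above (not the
# actual nnstats.residualize implementation): nuisance betas are estimated in
# the comparison (HC) group and then removed from the target (PD) data
import numpy as np
def residualize_sketch(reg, data, comp_reg, comp_data):
    """Removes effects of `reg` from `data` using betas fit on `comp_*`"""
    # fit betas in the comparison group (prepending an intercept column)
    X_comp = np.column_stack([np.ones(len(comp_reg)), np.asarray(comp_reg)])
    betas, *_ = np.linalg.lstsq(X_comp, np.asarray(comp_data), rcond=None)
    # subtract the modeled nuisance effects from the target data
    X = np.column_stack([np.ones(len(reg)), np.asarray(reg)])
    return np.asarray(data) - X @ betas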
def get_nmi_mod(method, print_summary=True):
    """
    Gets normalized mutual information and modularity for `method`

    Parameters
    ----------
    method : {'snf', 'rbf'}
        Method to use for calculating metrics
    print_summary : bool, optional
        Whether to print summary statistics (mean, SD, ranges) for generated
        metrics. Default: True

    Returns
    -------
    nmi : numpy.ndarray
        Normalized mutual information
    mod : numpy.ndarray
        Modularity estimates
    """

    methods = ['snf', 'rbf']
    if method not in methods:
        raise ValueError(f'Provided `method` {method} invalid. Must be one '
                         f'of {methods}.')

    scales = [f'scale{f}' for f in ['033', '060', '125', '250', '500']]
    keys = [
        'cortical_thickness', 'subcortical_volume', 'dat_scans', 'csf_assays',
        'behavioral_measures', 'all'
    ]

    # iterate over all CT dimensionalities and generate NMI / mod estimates
    nmi, mod = [], []
    for scale in scales:
        # get data for provided scale
        fname = op.join(directories.snf, f'{scale}_deterministic.h5')
        hdf = structures.Frog(fname)
        pd_data = [hdf.load(f'/processed/pd_{key}') for key in keys[:-1]]

        # generate affinity matrix and cluster labels
        # if we're using SNF we can simply load the pre-computed matrices + labels
        if method == 'snf':
            path = '/snf/processed/{}/sqeuclidean/gridsearch/{}'
            affinities = [
                hdf.load(path.format(key, 'fusion_avg')) for key in keys
            ]
            labels = [hdf.load(path.format(key, 'consensus')) for key in keys]
        # otherwise, we have to generate the affinities using cosine similarity
        # and then use spectral clustering to generate the labels
        elif method == 'rbf':
            affinities = [
                metrics.pairwise.cosine_similarity(sstats.zscore(f)) + 1
                for f in pd_data
            ] + [
                metrics.pairwise.cosine_similarity(
                    sstats.zscore(np.column_stack(pd_data))) + 1
            ]
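            # (the +1 offset shifts the cosine similarities from [-1, 1] into
            # [0, 2], presumably to keep the affinity matrices non-negative
            # for spectral clustering)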
            labels = [
                spectral_clustering(aff, n_clusters=3, random_state=1234)
                for aff in affinities
            ]

        # get NMI + modularity estimates
        nmi.append(snf.metrics.nmi(labels)[-1, :-1])
        mod.append(list(gen_mod(affinities[:-1], labels[-1])))

    nmi, mod = np.asarray(nmi), np.asarray(mod)

    if print_summary:
        _print_summary(nmi, 'NMI')
        print()
        _print_summary(mod, 'modularity')
        print()

    return nmi, mod
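# Hedged usage sketch (assumes the per-scale HDF5 files already contain the
# SNF gridsearch outputs referenced above):
#
#     nmi_snf, mod_snf = get_nmi_mod('snf')
#     nmi_rbf, mod_rbf = get_nmi_mod('rbf', print_summary=False)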
def main():
    keys = [
        'cortical_thickness', 'subcortical_volume', 'dat_scans', 'csf_assays',
        'behavioral_measures'
    ]

    # load processed data
    fname = op.join(directories.snf, 'scale500_deterministic.h5')
    hdf = structures.Frog(fname)
    data = [hdf.load(f'/processed/pd_{key}') for key in keys]

    # also load the gridsearch results back into memory.
    # here, fusion is shape (K, M, N, N),
    #        zrand is shape (C, K, M)
    # where `K` is the nearest-neighbors parameter of SNF,
    #       `M` is the scaling (mu) parameter of SNF,
    #       `C` is the different cluster # solutions (2, 3, & 4 clusters), and
    #       `N` is the number of PD patients
    fusion = hdf.load('/snf/processed/all/sqeuclidean/gridsearch/fusion')
    zrand = hdf.load('/snf/processed/all/sqeuclidean/gridsearch/zrand')
    consensus = hdf.load('/snf/processed/all/sqeuclidean/gridsearch/consensus')

    print('=' * 80)
    print('Calculating variance explained by diffusion map embedding\n')
    mask = get_zrand_mask(zrand)
    embedding, realigned = get_embedding_variance(fusion[mask])

    print('\n' + '=' * 80)
    print('Calculating prediction model performance\n')
    run_prediction_models(hdf, feats=['pigd', 'tremor'])

    print('\n' + '=' * 80)
    print('Calculating diffusion map embedding dimension correlations\n')
    fig, corr_idxs = gen_scatterplots(data, embedding)
    if SAVE_FIGS:
        fname = op.join(directories.figs, 'diffusion_correlations')
        utils.savefig(fname, fig)

    # load demographics information
    demographics = hdf.load('/raw/pd_demographics').reset_index()
    demographics = demographics.assign(cluster=consensus)
    pdatrophy = run_pdatrophy_anova(demographics,
                                    verbose=False,
                                    run_tukey=False)['atrophy']
    fig = gen_figure(data, embedding, realigned, consensus, pdatrophy,
                     corr_idxs)
    if SAVE_FIGS:
        fname = op.join(directories.figs, 'diffusion_embedding')
        utils.savefig(fname, fig)

    # compare with PCA on concatenated data
    embedding = hdf['/snf/processed/all/sqeuclidean/gridsearch/embedding']
    consensus = hdf['/snf/processed/all/sqeuclidean/gridsearch/consensus']

    zdata = sstats.zscore(np.column_stack(data), ddof=1)
    u, s, v = np.linalg.svd(zdata, full_matrices=False)
    v = v.T
    pc_scores = zdata @ v
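    # note: np.linalg.svd returns the right singular vectors as rows of `v`,
    # so after transposing, projecting the z-scored data onto them yields
    # principal component scores (i.e., PCA on the concatenated data)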

    # set up the figure for plotting
    fig, axes = plt.subplots(2, 5, figsize=(25, 10))
    axes[0, 0].remove()
    axes[0, -1].remove()

    # first, we'll see how the pc_scores look plotted against one another
    # we'll use SNF-derived clusters to examine the distribution of patients
    axes[0, 1].scatter(pc_scores[:, 0],
                       pc_scores[:, 1],
                       c=consensus,
                       rasterized=True,
                       cmap=ListedColormap(defaults.three_cluster_cmap),
                       edgecolor=defaults.edgegray,
                       s=60,
                       linewidth=0.5)
    sns.despine(ax=axes[0, 1])
    axes[0, 1].set(xticklabels=[], yticklabels=[], xlabel='pc1', ylabel='pc2')

    # then, let's check how well pc_scores correlate with embedding scores
    corrs = efficient_pearsonr(pc_scores[:, :10], embedding)[0]
    for n, ax in enumerate(axes[0, 2:4]):
        sns.regplot(pc_scores[:, n],
                    embedding[:, n],
                    ax=ax,
                    ci=None,
                    scatter_kws=scatter_kws,
                    line_kws=line_kws)
        ax.set(xlabel=f'pc{n + 1}',
               ylabel=f'embedding dimension {n + 1}',
               xticklabels=[],
               yticklabels=[])
        sns.despine(ax=ax)
        ax.set_title(f'r = {corrs[n]:.2f}')

    for n, (ax, dt) in enumerate(zip(axes[1], data)):
        zdt = sstats.zscore(dt, ddof=1)
        u, s, v = np.linalg.svd(zdt, full_matrices=False)
        dt_scores = (zdt @ v.T)[:, 0]
        sns.regplot(dt_scores,
                    pc_scores[:, 0],
                    ax=ax,
                    ci=None,
                    scatter_kws=scatter_kws,
                    line_kws=line_kws)
        ax.set(ylabel='pc1\n(all data)' if n == 0 else '',
               xlabel=f'pc1\n({keys[n].replace("_", " ")})',
               xticklabels=[],
               yticklabels=[])
        sns.despine(ax=ax)
        ax.set_title(
            f'r = {np.corrcoef(pc_scores[:, 0], dt_scores)[0, 1]:.2f}')

    fig.tight_layout()
    if SAVE_FIGS:
        fname = op.join(directories.figs, 'principal_components')
        utils.savefig(fname, fig)
def main():
    keys = [
        'cortical_thickness', 'subcortical_volume', 'dat_scans', 'csf_assays',
        'behavioral_measures'
    ]

    # load processed data
    fname = op.join(directories.snf, 'scale500_deterministic.h5')
    hdf = structures.Frog(fname)
    data = [hdf.load(f'/processed/pd_{key}') for key in keys]

    # load demographics information
    demographics = hdf.load('/raw/pd_demographics')
    demographics['gender'] = demographics[
        'gender'].cat.remove_unused_categories()

    # also load the gridsearch results back into memory.
    # here, labels is shape (K, M, C, N), and
    #        zrand is shape (C, K, M)
    # where `K` is the nearest-neighbors parameter of SNF,
    #       `M` is the scaling (mu) parameter of SNF,
    #       `C` is the different cluster # solutions (2, 3, & 4 clusters), and
    #       `N` is the number of PD patients
    labels = hdf.load('/snf/processed/all/sqeuclidean/gridsearch/labels')
    zrand = hdf.load('/snf/processed/all/sqeuclidean/gridsearch/zrand')

    # find consensus clusters and update demographics table
    print('=' * 80)
    print('Generating consensus clustering assignments\n')
    assignments, consensus, agreement = get_consensus_clusters(labels, zrand)
    demographics = demographics.assign(cluster=pd.Categorical(consensus))

    # run all the different tests assessing the clusters
    print('\n' + '=' * 80)
    print('Testing modularity of agreement matrix for consensus clusters\n')
    run_modularity_test(agreement, consensus)

    print('\n' + '=' * 80)
    print('Testing cluster differences for confounding variables\n')
    run_confound_tests(demographics)

    print('\n' + '=' * 80)
    print('Running mass-univariate ANOVA for cluster differences\n')
    run_univariate_anova(data, demographics, run_tukey=False)

    print('\n' + '=' * 80)
    print('Testing cluster differences in PD-ICA atrophy score\n')
    run_pdatrophy_anova(demographics.reset_index())

    print('\n' + '=' * 80)
    print('Running longitudinal models for PIGD + tremor scores\n')
    run_lme(demographics, 'pigd')
    run_lme(demographics, 'tremor')

    # use all this info to generate what will serve as the basis for figure 4
    fig = gen_figure(data, demographics, agreement, consensus, assignments,
                     zrand)
    if SAVE_FIGS:
        fname = op.join(directories.figs, 'patient_clusters')
        utils.savefig(fname, fig)

    # figures for comparing longitudinal outcomes from SNF-derived biotypes
    # with biotypes derived from other subsets of data / methodologies
    # first, start with biotypes produced from all data w/SNF
    path = '/snf/processed/{}/sqeuclidean/gridsearch/consensus'
    clusters = [
        hdf[path.format('all')], hdf[path.format('behavioral_measures')],
        spectral_clustering(metrics.pairwise.cosine_similarity(
            sstats.zscore(np.column_stack(data), ddof=1)) + 1,
                            n_clusters=3,
                            random_state=1234)
    ]
    for clust, fn in zip(clusters, ['snf', 'behavior', 'concatenate']):
        demographics = demographics.assign(cluster=pd.Categorical(clust))
        fig = supplementary_longitudinal_outcomes(demographics)
        run_lme(demographics, 'pigd')
        run_lme(demographics, 'tremor')
        if SAVE_FIGS:
            utils.savefig(op.join(directories.figs, 'supp_long', fn), fig)
            plt.close(fig=fig)
def main():
    # grab all the HDF5 files that exist
    for hdf in sorted(glob.glob(op.join(directories.snf, '*.h5'))):

        # prepare HDF file and pre-load data
        hdf = structures.Frog(hdf)
        data = [hdf.load(f'/processed/pd_{key}') for key in KEYS]

        # the only gridsearches we need to run for all the resolutions are the
        # basic one where we save out `fusion_avg` and `consensus`
        # we need ALL the data modalities, and each data modality independently
        run_gridsearch(data=data,
                       hdf=hdf,
                       path='processed/all',
                       saveall=False,
                       metrics='sqeuclidean')
        for n, key in enumerate(KEYS):
            run_gridsearch(data=[data[n]],
                           hdf=hdf,
                           path=f'processed/{key}',
                           saveall=False,
                           metrics='sqeuclidean')

    # for the highest resolution data we want to run a BUNCH of auxiliary
    # analyses, though
    hdf = op.join(directories.snf, 'scale500_deterministic.h5')
    hdf = structures.Frog(hdf)
    data = [hdf.load(f'/processed/pd_{key}') for key in KEYS]

    # SNF for all non-MRI data
    run_gridsearch(data=data[2:],
                   hdf=hdf,
                   path='processed/nomri',
                   saveall=True,
                   metrics='sqeuclidean')

    for n, key in enumerate(KEYS):
        # SNF for all modalities except one
        run_gridsearch(data=[d for i, d in enumerate(data) if i != n],
                       hdf=hdf,
                       path=f'processed/no_{key}',
                       saveall=False,
                       metrics='sqeuclidean')

    # SNF removing "holdout" behavioral variables (all + non-MRI)
    if 'behavioral_measures' in KEYS:
        idx = KEYS.index('behavioral_measures')
        data[idx] = data[idx].drop(['tremor', 'pigd'], axis=1)
    run_gridsearch(data=data,
                   hdf=hdf,
                   path='processed/holdout/all',
                   saveall=False,
                   metrics='sqeuclidean')
    run_gridsearch(data=data[2:],
                   hdf=hdf,
                   path='processed/holdout/nomri',
                   saveall=False,
                   metrics='sqeuclidean')

    # finally, run SNF for combined HC / PD subjects
    data = load_and_residualize_data(hdf)
    run_gridsearch(data=data,
                   hdf=hdf,
                   path='processed/pdhc',
                   saveall=False,
                   metrics='sqeuclidean')
def run_gridsearch(data, hdf, path, metrics=None, saveall=True):
    """
    Runs gridsearch on `data` and saves outputs to `hdf`[`path`]

    Parameters
    ----------
    data : list of array_like
        Data on which to run SNF gridsearch
    hdf : str or structures.Frog
        Filepath to or loaded structures.Frog object to save output data
    path : str
        Will be inserted into "/snf/{path}/{metric}/gridsearch", specifying
        the path in `hdf` to which gridsearch results will be saved
    metrics : list of str, optional
        Which distance metrics SNF should be run with. If not specified will
        use ['sqeuclidean', 'cityblock', 'cosine']. Default: None
    saveall : bool, optional
        Whether to save all outputs of gridsearch (i.e., all fused matrices,
        all clustering assignments, z-rand convolved similarity matrices, AND
        consensus clustering assignments) instead of only consensus clusters,
        average fused matrix, and agreement matrix. Default: True

    Returns
    -------
    hdf : structures.Frog
        Same as provided input but with new gridsearch results!
    """

    if metrics is None:
        metrics = ['sqeuclidean', 'cityblock', 'cosine']
    elif isinstance(metrics, str):
        metrics = [metrics]

    if isinstance(hdf, str):
        hdf = structures.Frog(hdf)

    n_subj = len(data[0])
    fname = op.basename(hdf.filename)
    print(f'Running grid-search for {fname} with {len(data)} datatypes; '
          f'saving to path "{path}"')

    # set K / mu (hyperparameters) that we'll explore (10,000 combinations)
    K = np.arange(5, 105)
    mu = np.logspace(np.log10(0.3), np.log10(10), 100)
    # only consider two-, three-, and four-cluster solutions in this space
    n_clusters = [2, 3, 4]

    for metric in metrics:
        # check that the gridsearch wasn't already run for this combination.
        # no need to repeat needless computations!
        mpath = f'/snf/{path}/{metric}/gridsearch'
        if mpath in hdf.groups():
            check = ['consensus', 'fusion_avg', 'agreement', 'embedding']
            if saveall:
                check += ['fusion', 'labels', 'zrand']
            if all(op.join(mpath, p) in hdf.keys() for p in check):
                continue

        # generate fused networks + cluster assignments for all the parameters
        print(f'Generating outputs from gridsearch with {metric} distance')
        fuse = delayed(fuse_and_label)
        gridres = Parallel(n_jobs=N_PROC)(
            fuse(data, k, m, n_clusters, metric)
            for k, m in tqdm.tqdm(list(itertools.product(K, mu))))

        # wrangle outputs from gridsearch and reshape
        fusion, labels = [np.stack(f, axis=0) for f in zip(*gridres)]
        fusion = fusion.reshape(len(K), len(mu), n_subj, n_subj)
        labels = labels.reshape(len(K), len(mu), len(n_clusters), n_subj)

        # don't parallelize zrand_convolve across cluster solutions because
        # it's already parallelizing at a lower level
        print('Convolving cluster assignments with z-Rand kernel')
        zrand_avg = [
            zrand_convolve(labels[..., n, :], n_proc=N_PROC)
            for n in range(len(n_clusters))
        ]

        # make a record of all the gridsearch outputs if desired
        if saveall:
            results = dict(fusion=fusion, labels=labels, zrand=zrand_avg)
        else:
            results = dict()

        # we'll use the 95%ile of all the z-rand scores to threshold the
        # similarity matrices and extract "stable" regions as a mask of the
        # hyperparameter space
        zrand_thr = np.percentile(zrand_avg, 95)
        mask = [cluster_img_2d(z, zrand_thr)[1] != 0 for z in zrand_avg]
        zrand_mask = np.sum(mask, axis=0) > 0

        # only keep assignments / fused networks from stable regions
        stable_idx = np.where(zrand_mask)
        labels, fusion = labels[stable_idx], fusion[stable_idx]

        # extract stable community assignments and make consensus
        comms = labels.reshape(-1, labels.shape[-1]).T
        cons, ag = cluster.find_consensus(comms,
                                          return_agreement=True,
                                          seed=1234)
        results['consensus'] = cons

        # run diffusion map embedding and generate average, aligned embedding
        embeddings = [dme(network, n_components=10) for network in fusion]
        realigned, xfms = align.iterative_alignment(embeddings, n_iters=1)
        results['embedding'] = np.mean(realigned, axis=0)

        # we'll keep the average fused network and the agreement matrix to
        # use for calculating modularity
        results['fusion_avg'] = np.mean(fusion, axis=0)
        results['agreement'] = ag

        hdf.save(results, mpath, overwrite=True)

    return hdf