Example #1
def calc_pval(x, y, nulls):
    """
    Calculates p-values for simulations in `x` and `y` using `spatnull`

    Parameters
    ----------
    {x, y} : (N,) array_like
        Simulated GRF brain maps
    nulls : (N, P) array_like
        Null versions of `y` GRF brain map

    Returns
    -------
    pval : float
        P-value of correlation for `x` and `y` against `nulls`
    perms : np.ndarray
        Correlations of `x` with `nulls`
    """

    x, y, nulls = np.asanyarray(x), np.asanyarray(y), np.asanyarray(nulls)

    # calculate real + permuted correlation coefficients
    real = nnstats.efficient_pearsonr(x, y, nan_policy='omit')[0]
    perms = nnstats.efficient_pearsonr(x, nulls, nan_policy='omit')[0]
    pval = (np.sum(np.abs(perms) >= np.abs(real)) + 1) / (len(perms) + 1)

    return pval, perms
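
# Minimal usage sketch for calc_pval() with synthetic data; shapes follow the
# docstring above ((N,) brain maps, (N, P) null maps). Purely illustrative --
# this helper is not part of the original workflow.
def _demo_calc_pval(n=100, n_nulls=1000, seed=1234):
    rng = np.random.default_rng(seed)
    x_demo = rng.standard_normal(n)                 # (N,) simulated map
    y_demo = rng.standard_normal(n)                 # (N,) simulated map
    nulls_demo = rng.standard_normal((n, n_nulls))  # (N, P) nulls of `y`
    # the +1 correction in calc_pval() means the p-value is never exactly 0
    return calc_pval(x_demo, y_demo, nulls_demo)
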
def get_runtime(parcellation, scale, spatnull):
    """
    Runs spatial null models for given combination of inputs

    Parameters
    ----------
    parcellation : str
        Name of parcellation to be used
    scale : str
        Scale of `parcellation` to be used
    spatnull : str
        Name of spin method to be used
    """

    # filenames (for I/O)
    fn = SPDIR / parcellation / spatnull / f'{scale}_spins.csv'

    # load simulated data
    alphadir = SIMDIR / ALPHA
    if parcellation == 'vertex':
        x, y = simnulls.load_vertex_data(alphadir, sim=0)
    else:
        x, y = simnulls.load_parc_data(alphadir, parcellation, scale, sim=0)

    # start timer only after loading data (accounts for load-time diff b/w vertex/parc)
    start = time.time()

    # calculate the null p-values
    if spatnull == 'naive-para':
        # the parametric p-value is computed only so it contributes to the
        # measured runtime; the result itself is intentionally discarded
        nnstats.efficient_pearsonr(x, y, nan_policy='omit')[1]
        nulls = None
    elif spatnull == 'naive-nonpara':
        nulls = naive_nonpara(y, fn=fn)
    elif spatnull == 'vazquez-rodriguez':
        nulls = vazquez_rodriguez(y, parcellation, scale, fn=fn)
    elif spatnull == 'vasa':
        nulls = vasa(y, parcellation, scale, fn=fn)
    elif spatnull == 'hungarian':
        nulls = hungarian(y, parcellation, scale, fn=fn)
    elif spatnull == 'cornblath':
        fn = SPDIR / 'vertex' / 'vazquez-rodriguez' / 'fsaverage5_spins.csv'
        nulls = cornblath(y, parcellation, scale, fn=fn)
    elif spatnull == 'baum':
        nulls = baum(y, parcellation, scale, fn=fn)
    elif spatnull in ('burt2018', 'burt2020', 'moran'):
        nulls = make_surrogates(y, parcellation, scale, spatnull, fn=fn)
    else:
        raise ValueError(f'Invalid spatnull: {spatnull}')

    if nulls is not None:
        # compute (and discard) the null p-value so that it counts toward the
        # measured runtime
        simnulls.calc_pval(x, y, nulls)

    end = time.time()
    ct = CompTime(parcellation, scale, spatnull, end - start)
    print(ct)

    return asdict(ct)
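
# Hypothetical driver for get_runtime(): times every (parcellation, scale,
# spin-method) combination and collects the returned CompTime dicts into a
# DataFrame. The PARCELLATIONS mapping and SPATNULLS list here are
# illustrative placeholders, not values defined elsewhere in this script.
def _time_all_nulls():
    import itertools
    import pandas as pd

    PARCELLATIONS = {'vertex': ['fsaverage5'], 'atl-cammoun2012': ['scale125']}
    SPATNULLS = ['naive-para', 'vazquez-rodriguez', 'burt2020']

    rows = []
    for (parc, scales), spatnull in itertools.product(PARCELLATIONS.items(),
                                                      SPATNULLS):
        for scale in scales:
            rows.append(get_runtime(parc, scale, spatnull))
    return pd.DataFrame(rows)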
Example #3
def combine_nulls(parcellation, scale, spatnull, alpha):
    """
    Combines outputs of all simulations into single files for provided inputs

    Parameters
    ----------
    parcellation : str
        Name of parcellation to be used
    scale : str
        Scale of `parcellation` to be used
    spatnull : str
        Name of spin method to be used
    alpha : str
        Spatial autocorrelation parameter (directory name, e.g. 'alpha-1.0') to be used
    """

    print(f'{spatnull} {alpha} {parcellation} {scale}')

    nulldir = SIMDIR / alpha / parcellation / 'nulls' / spatnull
    pvals_fn = nulldir / f'{scale}_nulls.csv'
    perms_fn = nulldir / f'{scale}_perms.csv'

    # some of the spatial null models were run in serial mode (one output file
    # per simulation); those are missing the combined top-level file, so we
    # stitch their per-simulation outputs together here
    if not pvals_fn.exists():
        pvals, perms = np.zeros(N_SIM), np.zeros((N_PERM, N_SIM))
        for sim in range(N_SIM):
            pvals[sim] = \
                np.loadtxt(nulldir / 'pvals' / f'{scale}_nulls_{sim:04d}.csv')
            perms[:, sim] = \
                np.loadtxt(nulldir / 'pvals' / f'{scale}_perms_{sim:04d}.csv')
        putils.save_dir(pvals_fn, pvals, overwrite=False)
        putils.save_dir(perms_fn, perms, overwrite=False)
    else:
        pvals = np.loadtxt(pvals_fn)

    # grab the empirical correlations for each simulation---good to have
    if parcellation == 'vertex':
        x, y = simnulls.load_vertex_data(SIMDIR / alpha, n_sim=N_SIM)
    else:
        x, y = simnulls.load_parc_data(SIMDIR / alpha,
                                       parcellation,
                                       scale,
                                       n_sim=N_SIM)
    corrs = nnstats.efficient_pearsonr(x, y, nan_policy='omit')[0]

    return pd.DataFrame(
        dict(parcellation=parcellation,
             scale=scale,
             spatnull=spatnull,
             alpha=alpha,
             corr=corrs,
             sim=range(len(pvals)),
             pval=pvals))
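
# Possible aggregation of combine_nulls() into one long-format table across
# all parameter combinations; ALPHAS / COMBOS / SPATNULLS below are
# illustrative placeholders rather than values defined in this script.
def _aggregate_all_nulls():
    import itertools

    ALPHAS = ['alpha-0.0', 'alpha-1.0', 'alpha-2.0', 'alpha-3.0']
    COMBOS = [('vertex', 'fsaverage5'), ('atl-cammoun2012', 'scale125')]
    SPATNULLS = ['naive-nonpara', 'vazquez-rodriguez', 'burt2020']

    frames = [
        combine_nulls(parc, scale, spatnull, alpha)
        for (parc, scale), spatnull, alpha in
        itertools.product(COMBOS, SPATNULLS, ALPHAS)
    ]
    return pd.concat(frames, ignore_index=True)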
Example #4
def pval_by_subsets(parcellation, scale, spatnull, alpha):
    """
    Parameters
    ----------
    parcellation : str
        Name of parcellation to be used
    scale : str
        Scale of `parcellation` to be used
    spatnull : str
        Name of spin method to be used
    alpha : str
        Spatial autocorrelation parameter (directory name, e.g. 'alpha-1.0') to be used

    Returns
    -------
    pvals : pd.DataFrame
    """

    print(spatnull, alpha, parcellation, scale)

    if spatnull == 'naive-para':
        return

    # load simulated data
    alphadir = SIMDIR / alpha
    if parcellation == 'vertex':
        x, y = simnulls.load_vertex_data(alphadir, sim=SIM)
    else:
        x, y = simnulls.load_parc_data(alphadir, parcellation, scale, sim=SIM)

    corr = nnstats.efficient_pearsonr(x, y, nan_policy='omit')[0]
    perms = np.loadtxt(alphadir / parcellation / 'nulls' / spatnull / 'pvals' /
                       f'{scale}_perms_{SIM}.csv')

    orig = pval_from_perms(corr, perms)
    pvals = defaultdict(list)
    for subset in [100, 500, 1000, 5000]:
        rs = np.random.default_rng(SEED)
        for n in range(N_PVALS):
            # select `subset` correlations from `perms` and calculate p-value
            # store the p-value and repeat `N_PVALS` times
            sub = rs.choice(perms, size=subset, replace=False)
            pvals[subset].append(pval_from_perms(corr, sub) - orig)
        # arrays are nicer than lists
        pvals[subset] = np.asarray(pvals[subset])

    df = pd.melt(pd.DataFrame(pvals), var_name='n_nulls', value_name='d(pval)')
    # add metadata identifying this parameter combination
    df = df.assign(parcellation=parcellation,
                   scale=scale,
                   spatnull=spatnull,
                   alpha=alpha)

    return df[['parcellation', 'scale', 'spatnull', 'alpha', 'n_nulls',
               'd(pval)']]
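
# `pval_from_perms()` is referenced above but not defined in this excerpt; a
# plausible stand-in, mirroring the (b + 1) / (n + 1) formula used by
# simnulls.calc_pval(), would be:
def _pval_from_perms_sketch(real, perms):
    """Two-tailed permutation p-value of `real` against null correlations."""
    perms = np.asarray(perms)
    return (np.sum(np.abs(perms) >= np.abs(real)) + 1) / (len(perms) + 1)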
Example #5
def test_efficient_pearsonr_errors():
    with pytest.raises(ValueError):
        stats.efficient_pearsonr(range(4), range(5))

    assert all(np.isnan(a) for a in stats.efficient_pearsonr([], []))
Example #6
def test_efficient_pearsonr(x, y, expected):
    assert np.allclose(stats.efficient_pearsonr(x, y), expected)
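
# The (x, y, expected) arguments above are presumably supplied by a
# pytest.mark.parametrize decorator (or fixture) not shown in this excerpt.
# A hypothetical, self-contained equivalent that checks efficient_pearsonr
# against scipy.stats.pearsonr on random data:
import numpy as np
import pytest
from scipy import stats as sstats

_rng = np.random.default_rng(0)
_CASES = []
for _ in range(3):
    _x, _y = _rng.standard_normal(50), _rng.standard_normal(50)
    _r, _p = sstats.pearsonr(_x, _y)
    _CASES.append((_x, _y, (_r, _p)))


@pytest.mark.parametrize('x, y, expected', _CASES)
def test_efficient_pearsonr_random(x, y, expected):
    assert np.allclose(stats.efficient_pearsonr(x, y), expected)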
Example #7
        xl = fg.axes[0, 0].get_xlim()
        for ax in fg.axes.flat:
            ax.hlines(0.05,
                      *xl,
                      linestyle='dashed',
                      color='black',
                      linewidth=1.0)
        savefig(fg.fig, FIGDIR / 'prob' / f'{parc}_probp05.svg')

    # plot shuffled correlation distributions as func of SA (vertex only)
    data = pd.DataFrame(columns=['alpha', 'corrs'])
    alphas = ['alpha-0.0', 'alpha-1.0', 'alpha-2.0', 'alpha-3.0']
    for alpha in alphas:
        x, y = load_vertex_data(SIMDIR / alpha, n_sim=1000)
        y = y[:, np.random.default_rng(1).permutation(1000)]
        corrs = nnstats.efficient_pearsonr(x, y, nan_policy='omit')[0]
        data = pd.concat([data, pd.DataFrame({
            'alpha': alpha,
            'corrs': corrs
        })], ignore_index=True)
    colors = np.asarray(sns.color_palette('Greens', 10, desat=0.5))
    ax = sns.kdeplot(x='corrs',
                     hue='alpha',
                     data=data,
                     legend=False,
                     palette=list(colors[[2, 4, 6, 8]]),
                     hue_order=alphas,
                     clip=[-1, 1],
                     linewidth=3.0)
    ax.set(xlim=(-1.05, 1.05),
Example #8
                           surrs[1][:, 0],
                           s=75,
                           edgecolor=np.array([60, 60, 60]) / 255,
                           facecolor=np.array([223, 121, 122]) / 255)
                for side in ['right', 'top']:
                    ax.spines[side].set_visible(False)
                ax.set(xlabel='with medial wall', ylabel='without medial wall')
                l, h = ax.get_xlim()
                ax.plot([l, h], [l, h], zorder=0)
                ax.figure.savefig(figdir / f'{scale}.svg',
                                  bbox_inches='tight',
                                  transparent=True)
                plt.close(fig=fig)

                # save correlations b/w surrogates
                corrs = nnstats.efficient_pearsonr(*surrs)[0]
                fname = OUTDIR / name / method / f'{scale}.csv'
                fname.parent.mkdir(exist_ok=True, parents=True)
                np.savetxt(fname, corrs, fmt='%.10f')

            # make distplot of the correlations b/w surrogates for methods
            fig, ax = plt.subplots(1, 1)
            for method in METHODS:
                corrs = np.loadtxt(OUTDIR / name / method / f'{scale}.csv')
                ax = sns.kdeplot(corrs, label=method, shade=True, ax=ax)
            sns.despine(ax=ax, left=True)
            ax.set(xlim=(0, ax.get_xlim()[1]), xticks=[0, 0.5, 1], yticks=[])
            fname = FIGDIR / name / 'correlations' / f'{scale}.svg'
            fname.parent.mkdir(exist_ok=True)
            fig.savefig(fname, transparent=True, bbox_inches='tight')
            plt.close(fig=fig)
Example #9
def main():
    keys = [
        'cortical_thickness', 'subcortical_volume', 'dat_scans', 'csf_assays',
        'behavioral_measures'
    ]

    # load processed data
    fname = op.join(directories.snf, 'scale500_deterministic.h5')
    hdf = structures.Frog(fname)
    data = [hdf.load(f'/processed/pd_{key}') for key in keys]

    # also load the gridsearch results back into memory.
    # here, fusion is shape (K, M, N, N),
    #        zrand is shape (C, K, M)
    # where `K` is the nearest-neighbors parameter of SNF
    #       `M` is the scaling (mu) parameter of SNF, and
    #       `N` is PD patients
    fusion = hdf.load('/snf/processed/all/sqeuclidean/gridsearch/fusion')
    zrand = hdf.load('/snf/processed/all/sqeuclidean/gridsearch/zrand')
    consensus = hdf.load('/snf/processed/all/sqeuclidean/gridsearch/consensus')

    print('=' * 80)
    print('Calculating variance explained by diffusion map embedding\n')
    mask = get_zrand_mask(zrand)
    embedding, realigned = get_embedding_variance(fusion[mask])

    print('\n' + '=' * 80)
    print('Calculating prediction model performance\n')
    run_prediction_models(hdf, feats=['pigd', 'tremor'])

    print('\n' + '=' * 80)
    print('Calculating diffusion map embedding dimension correlations\n')
    fig, corr_idxs = gen_scatterplots(data, embedding)
    if SAVE_FIGS:
        fname = op.join(directories.figs, 'diffusion_correlations')
        utils.savefig(fname, fig)

    # load demographics information
    demographics = hdf.load('/raw/pd_demographics').reset_index()
    demographics = demographics.assign(cluster=consensus)
    pdatrophy = run_pdatrophy_anova(demographics,
                                    verbose=False,
                                    run_tukey=False)['atrophy']
    fig = gen_figure(data, embedding, realigned, consensus, pdatrophy,
                     corr_idxs)
    if SAVE_FIGS:
        fname = op.join(directories.figs, 'diffusion_embedding')
        utils.savefig(fname, fig)

    # compare with PCA on concatenated data
    embedding = hdf['/snf/processed/all/sqeuclidean/gridsearch/embedding']
    consensus = hdf['/snf/processed/all/sqeuclidean/gridsearch/consensus']

    zdata = sstats.zscore(np.column_stack(data), ddof=1)
    u, s, v = np.linalg.svd(zdata, full_matrices=False)
    v = v.T
    pc_scores = zdata @ v

    # make figure for plot
    fig, axes = plt.subplots(2, 5, figsize=(25, 10))
    axes[0, 0].remove()
    axes[0, -1].remove()

    # first, we'll see how the pc_scores look plotted against one another
    # we'll use SNF-derived clusters to examine the distribution of patients
    axes[0, 1].scatter(pc_scores[:, 0],
                       pc_scores[:, 1],
                       c=consensus,
                       rasterized=True,
                       cmap=ListedColormap(defaults.three_cluster_cmap),
                       edgecolor=defaults.edgegray,
                       s=60,
                       linewidth=0.5)
    sns.despine(ax=axes[0, 1])
    axes[0, 1].set(xticklabels=[], yticklabels=[], xlabel='pc1', ylabel='pc2')

    # then, let's check how well pc_scores correlate with embedding scores
    corrs = efficient_pearsonr(pc_scores[:, :10], embedding)[0]
    for n, ax in enumerate(axes[0, 2:4]):
        sns.regplot(pc_scores[:, n],
                    embedding[:, n],
                    ax=ax,
                    ci=None,
                    scatter_kws=scatter_kws,
                    line_kws=line_kws)
        ax.set(xlabel=f'pc{n + 1}',
               ylabel=f'embedding dimension {n + 1}',
               xticklabels=[],
               yticklabels=[])
        sns.despine(ax=ax)
        ax.set_title(f'r = {corrs[n]:.2f}')

    for n, (ax, dt) in enumerate(zip(axes[1], data)):
        zdt = sstats.zscore(dt, ddof=1)
        u, s, v = np.linalg.svd(zdt, full_matrices=False)
        dt_scores = (zdt @ v.T)[:, 0]
        sns.regplot(dt_scores,
                    pc_scores[:, 0],
                    ax=ax,
                    ci=None,
                    scatter_kws=scatter_kws,
                    line_kws=line_kws)
        ax.set(ylabel='pc1\n(all data)' if n == 0 else '',
               xlabel=f'pc1\n({keys[n].replace("_", " ")})',
               xticklabels=[],
               yticklabels=[])
        sns.despine(ax=ax)
        ax.set_title(
            f'r = {np.corrcoef(pc_scores[:, 0], dt_scores)[0, 1]:.2f}')

    fig.tight_layout()
    if SAVE_FIGS:
        fname = op.join(directories.figs, 'principal_components')
        utils.savefig(fname, fig)
def run_null(parcellation, scale, spatnull, alpha):
    """
    Runs spatial null models for given combination of inputs

    Parameters
    ----------
    parcellation : str
        Name of parcellation to be used
    scale : str
        Scale of `parcellation` to be used
    spatnull : str
        Name of spin method to be used
    alpha : str
        Spatial autocorrelation parameter (directory name, e.g. 'alpha-1.0') to be used
    """

    print(f'{time.ctime()}: {parcellation} {scale} {spatnull} {alpha} ',
          flush=True)

    # filenames (for I/O)
    spins_fn = SPDIR / parcellation / spatnull / f'{scale}_spins.csv'
    pvals_fn = (SIMDIR / alpha / parcellation / 'nulls' / spatnull
                / f'{scale}_nulls.csv')
    perms_fn = pvals_fn.parent / f'{scale}_perms.csv'

    if SHUFFLE:
        pvals_fn = pvals_fn.parent / f'{scale}_nulls_shuffle.csv'
        perms_fn = perms_fn.parent / f'{scale}_perms_shuffle.csv'

    if pvals_fn.exists() and perms_fn.exists():
        return

    # load simulated data
    alphadir = SIMDIR / alpha
    if parcellation == 'vertex':
        x, y = simnulls.load_vertex_data(alphadir, n_sim=N_SIM)
    else:
        x, y = simnulls.load_parc_data(alphadir, parcellation, scale,
                                       n_sim=N_SIM)

    # if we're computing info on SHUFFLED data, get the appropriate random `y`
    if SHUFFLE:
        y = _get_ysim(y, np.random.default_rng(1).permutation(N_SIM))

    # calculate the null p-values
    if spatnull == 'naive-para':
        pvals = nnstats.efficient_pearsonr(x, y, nan_policy='omit')[1]
        perms = np.array([np.nan])
    elif spatnull == 'cornblath':
        fn = SPDIR / 'vertex' / 'vazquez-rodriguez' / 'fsaverage5_spins.csv'
        x, y = np.asarray(x), np.asarray(y)
        spins = simnulls.load_spins(fn, n_perm=N_PERM)
        fetcher = getattr(nndata, f"fetch_{parcellation.replace('atl-', '')}")
        annot = fetcher('fsaverage5', data_dir=ROIDIR)[scale]
        out = Parallel(n_jobs=N_PROC, max_nbytes=None)(
            delayed(_cornblath)(x[:, sim], y[:, sim], spins, annot)
            for sim in putils.trange(x.shape[-1], desc='Running simulations')
        )
        pvals, perms = zip(*out)
    elif spatnull == 'baum':
        x, y = np.asarray(x), np.asarray(y)
        spins = simnulls.load_spins(spins_fn, n_perm=N_PERM)
        out = Parallel(n_jobs=N_PROC, max_nbytes=None)(
            delayed(_baum)(x[:, sim], y[:, sim], spins)
            for sim in putils.trange(x.shape[-1], desc='Running simulations')
        )
        pvals, perms = zip(*out)
    elif spatnull in ('burt2018', 'burt2020', 'moran'):
        xarr = np.asarray(x)
        out = Parallel(n_jobs=N_PROC, max_nbytes=None)(
            delayed(_genmod)(xarr[:, sim], _get_ysim(y, sim),
                             parcellation, scale, spatnull)
            for sim in putils.trange(x.shape[-1], desc='Running simulations')
        )
        pvals, perms = zip(*out)
    else:  # vazquez-rodriguez, vasa, hungarian, naive-nonpara
        x, y = np.asarray(x), np.asarray(y)
        spins = simnulls.load_spins(spins_fn, n_perm=N_PERM)
        out = Parallel(n_jobs=N_PROC, max_nbytes=None)(
            delayed(simnulls.calc_pval)(x[:, sim], y[:, sim], y[spins, sim])
            for sim in putils.trange(x.shape[-1], desc='Running simulations')
        )
        pvals, perms = zip(*out)

    # save to disk
    putils.save_dir(perms_fn, np.atleast_1d(perms), overwrite=False)
    putils.save_dir(pvals_fn, np.atleast_1d(pvals), overwrite=False)
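
# `_get_ysim()` is used above but not defined in this excerpt; a plausible
# stand-in simply selects the requested simulation column(s) of `y`, working
# for both a single index and an array of permuted indices:
def _get_ysim_sketch(y, sim):
    return np.asarray(y)[:, sim]
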
def run_null(parcellation, scale, spatnull, alpha, sim):
    """
    Runs spatial null models for given combination of inputs

    Parameters
    ----------
    parcellation : str
        Name of parcellation to be used
    scale : str
        Scale of `parcellation` to be used
    spatnull : str
        Name of spin method to be used
    alpha : str
        Spatial autocorrelation parameter (directory name, e.g. 'alpha-1.0') to be used
    sim : int
        Which simulation to run
    """

    print(
        f'{time.ctime()}: {parcellation} {scale} {spatnull} {alpha} '
        f'sim-{sim} ',
        flush=True)

    # filenames (for I/O)
    spins_fn = SPDIR / parcellation / spatnull / f'{scale}_spins.csv'
    pvals_fn = (SIMDIR / alpha / parcellation / 'nulls' / spatnull / 'pvals' /
                f'{scale}_nulls_{sim:04d}.csv')
    perms_fn = pvals_fn.parent / f'{scale}_perms_{sim:04d}.csv'
    moran_fn = pvals_fn.parent / f'{scale}_moran_{sim:04d}.csv'

    # load simulated data
    alphadir = SIMDIR / alpha
    if parcellation == 'vertex':
        loadfn = functools.partial(simnulls.load_vertex_data, alphadir)
    else:
        loadfn = functools.partial(simnulls.load_parc_data, alphadir,
                                   parcellation, scale)
    x, y = loadfn(sim=sim)

    # if we're computing info on SHUFFLED data, get the appropriate random `y`
    if SHUFFLE:
        _, y = loadfn(sim=np.random.default_rng(1).permutation(N_SIM)[sim])
        pvals_fn = pvals_fn.parent / f'{scale}_nulls_shuffle_{sim:04d}.csv'
        perms_fn = pvals_fn.parent / f'{scale}_perms_shuffle_{sim:04d}.csv'
        moran_fn = pvals_fn.parent / f'{scale}_moran_shuffle_{sim:04d}.csv'

    # if we're going to run moran for this simulation, pre-load distmat
    if RUN_MORAN and not moran_fn.exists():
        dist = simnulls.load_full_distmat(y, DISTDIR, parcellation, scale)

    # calculate the null p-values
    nulls = None
    if pvals_fn.exists() and perms_fn.exists():
        pvals, perms = np.loadtxt(pvals_fn), np.loadtxt(perms_fn)
    elif spatnull == 'naive-para':
        pvals = nnstats.efficient_pearsonr(x, y, nan_policy='omit')[1]
        perms = np.array([np.nan])
    elif spatnull == 'cornblath':
        fn = SPDIR / 'vertex' / 'vazquez-rodriguez' / 'fsaverage5_spins.csv'
        x, y = np.asarray(x), np.asarray(y)
        spins = simnulls.load_spins(fn, n_perm=N_PERM)
        fetcher = getattr(nndata, f"fetch_{parcellation.replace('atl-', '')}")
        annot = fetcher('fsaverage5', data_dir=ROIDIR)[scale]
        nulls = nnsurf.spin_data(y,
                                 version='fsaverage5',
                                 lhannot=annot.lh,
                                 rhannot=annot.rh,
                                 spins=spins,
                                 n_rotate=spins.shape[-1])
        pvals, perms = simnulls.calc_pval(x, y, nulls)
    elif spatnull == 'baum':
        x, y = np.asarray(x), np.asarray(y)
        spins = simnulls.load_spins(spins_fn, n_perm=N_PERM)
        nulls = y[spins]
        nulls[spins == -1] = np.nan
        pvals, perms = simnulls.calc_pval(x, y, nulls)
    elif spatnull in ('burt2018', 'burt2020', 'moran'):
        nulls = make_surrogates(y, parcellation, scale, spatnull)
        pvals, perms = simnulls.calc_pval(x, y, nulls)
    else:  # vazquez-rodriguez, vasa, hungarian, naive-nonpara
        x, y = np.asarray(x), np.asarray(y)
        spins = simnulls.load_spins(spins_fn, n_perm=N_PERM)
        nulls = y[spins]
        pvals, perms = simnulls.calc_pval(x, y, nulls)

    # save to disk
    putils.save_dir(perms_fn, np.atleast_1d(perms), overwrite=False)
    putils.save_dir(pvals_fn, np.atleast_1d(pvals), overwrite=False)

    # if we're running moran, do it now
    if RUN_MORAN and not moran_fn.exists() and nulls is not None:
        moran = simnulls.calc_moran(dist, nulls, n_jobs=N_PROC)
        putils.save_dir(moran_fn, np.atleast_1d(moran), overwrite=False)