def calc_pval(x, y, nulls):
    """
    Calculates p-values for simulations in `x` and `y` using `spatnull`

    Parameters
    ----------
    {x, y} : (N,) array_like
        Simulated GRF brain maps
    nulls : (N, P) array_like
        Null versions of `y` GRF brain map

    Returns
    -------
    pval : float
        P-value of correlation for `x` and `y` against `nulls`
    perms : np.ndarray
        Correlations of `x` with `nulls`
    """
    x, y, nulls = np.asanyarray(x), np.asanyarray(y), np.asanyarray(nulls)

    # calculate real + permuted correlation coefficients
    real = nnstats.efficient_pearsonr(x, y, nan_policy='omit')[0]
    perms = nnstats.efficient_pearsonr(x, nulls, nan_policy='omit')[0]
    pval = (np.sum(np.abs(perms) >= np.abs(real)) + 1) / (len(perms) + 1)

    return pval, perms
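# --- hedged usage sketch (not part of the original module) ------------------
# Demonstrates the bias-corrected, two-tailed permutation p-value computed by
# `calc_pval` above on synthetic data. Plain `np.corrcoef` stands in for
# `nnstats.efficient_pearsonr`; shapes follow the docstring ((N,) brain maps,
# (N, P) nulls). All names below are illustrative only.
import numpy as np

rng = np.random.default_rng(1234)
x_demo = rng.standard_normal(100)               # (N,) simulated map
y_demo = rng.standard_normal(100)               # (N,) second map
nulls_demo = rng.standard_normal((100, 1000))   # (N, P) null versions of `y`

real_demo = np.corrcoef(x_demo, y_demo)[0, 1]
perms_demo = np.array([np.corrcoef(x_demo, nulls_demo[:, p])[0, 1]
                       for p in range(nulls_demo.shape[1])])
# same formula as `calc_pval` above
pval_demo = (np.sum(np.abs(perms_demo) >= np.abs(real_demo)) + 1) \
    / (len(perms_demo) + 1)
print(f'p = {pval_demo:.4f}')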
def get_runtime(parcellation, scale, spatnull):
    """
    Runs spatial null model for given combination of inputs and times it

    Parameters
    ----------
    parcellation : str
        Name of parcellation to be used
    scale : str
        Scale of `parcellation` to be used
    spatnull : str
        Name of spin method to be used
    """
    # filenames (for I/O)
    fn = SPDIR / parcellation / spatnull / f'{scale}_spins.csv'

    # load simulated data
    alphadir = SIMDIR / ALPHA
    if parcellation == 'vertex':
        x, y = simnulls.load_vertex_data(alphadir, sim=0)
    else:
        x, y = simnulls.load_parc_data(alphadir, parcellation, scale, sim=0)

    # start timer (after loading data--accounts for diff b/w vertex/parc)
    start = time.time()

    # calculate the null p-values
    if spatnull == 'naive-para':
        # parametric p-value; output is discarded since we only need runtime
        nnstats.efficient_pearsonr(x, y, nan_policy='omit')[1]
        nulls = None
    elif spatnull == 'naive-nonpara':
        nulls = naive_nonpara(y, fn=fn)
    elif spatnull == 'vazquez-rodriguez':
        nulls = vazquez_rodriguez(y, parcellation, scale, fn=fn)
    elif spatnull == 'vasa':
        nulls = vasa(y, parcellation, scale, fn=fn)
    elif spatnull == 'hungarian':
        nulls = hungarian(y, parcellation, scale, fn=fn)
    elif spatnull == 'cornblath':
        fn = SPDIR / 'vertex' / 'vazquez-rodriguez' / 'fsaverage5_spins.csv'
        nulls = cornblath(y, parcellation, scale, fn=fn)
    elif spatnull == 'baum':
        nulls = baum(y, parcellation, scale, fn=fn)
    elif spatnull in ('burt2018', 'burt2020', 'moran'):
        nulls = make_surrogates(y, parcellation, scale, spatnull, fn=fn)
    else:
        raise ValueError(f'Invalid spatnull: {spatnull}')

    if nulls is not None:
        simnulls.calc_pval(x, y, nulls)

    end = time.time()
    ct = CompTime(parcellation, scale, spatnull, end - start)
    print(ct)

    return asdict(ct)
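# --- hedged sketch (not shown in this section) -------------------------------
# `CompTime` is constructed with four positional arguments and passed to
# `dataclasses.asdict` above, so it is presumably a small dataclass along
# these lines; the name of the final field is an assumption.
from dataclasses import dataclass

@dataclass
class CompTime:
    parcellation: str
    scale: str
    spatnull: str
    runtime: float   # elapsed seconds (assumed field name)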
def combine_nulls(parcellation, scale, spatnull, alpha):
    """
    Combines outputs of all simulations into single files for provided inputs

    Parameters
    ----------
    parcellation : str
        Name of parcellation to be used
    scale : str
        Scale of `parcellation` to be used
    spatnull : str
        Name of spin method to be used
    alpha : float
        Spatial autocorrelation parameter to be used
    """
    print(f'{spatnull} {alpha} {parcellation} {scale}')

    nulldir = SIMDIR / alpha / parcellation / 'nulls' / spatnull
    pvals_fn = nulldir / f'{scale}_nulls.csv'
    perms_fn = nulldir / f'{scale}_perms.csv'

    # only some of the spatial null models were run in serial mode; these are
    # the ones that are missing the top-level file and whose outputs we need
    # to combine. do that here.
    if not pvals_fn.exists():
        pvals, perms = np.zeros(N_SIM), np.zeros((N_PERM, N_SIM))
        for sim in range(N_SIM):
            pvals[sim] = \
                np.loadtxt(nulldir / 'pvals' / f'{scale}_nulls_{sim:04d}.csv')
            perms[:, sim] = \
                np.loadtxt(nulldir / 'pvals' / f'{scale}_perms_{sim:04d}.csv')
        putils.save_dir(pvals_fn, pvals, overwrite=False)
        putils.save_dir(perms_fn, perms, overwrite=False)
    else:
        pvals = np.loadtxt(pvals_fn)

    # grab the empirical correlations for each simulation---good to have
    if parcellation == 'vertex':
        x, y = simnulls.load_vertex_data(SIMDIR / alpha, n_sim=N_SIM)
    else:
        x, y = simnulls.load_parc_data(SIMDIR / alpha, parcellation, scale,
                                       n_sim=N_SIM)
    corrs = nnstats.efficient_pearsonr(x, y, nan_policy='omit')[0]

    return pd.DataFrame(
        dict(parcellation=parcellation, scale=scale, spatnull=spatnull,
             alpha=alpha, corr=corrs, sim=range(len(pvals)), pval=pvals)
    )
def pval_by_subsets(parcellation, scale, spatnull, alpha):
    """
    Calculates p-value differences when using subsets of the null distribution

    Parameters
    ----------
    parcellation : str
        Name of parcellation to be used
    scale : str
        Scale of `parcellation` to be used
    spatnull : str
        Name of spin method to be used
    alpha : float
        Spatial autocorrelation parameter to be used

    Returns
    -------
    pvals : pd.DataFrame
    """
    print(spatnull, alpha, parcellation, scale)

    if spatnull == 'naive-para':
        return

    # load simulated data
    alphadir = SIMDIR / alpha
    if parcellation == 'vertex':
        x, y = simnulls.load_vertex_data(alphadir, sim=SIM)
    else:
        x, y = simnulls.load_parc_data(alphadir, parcellation, scale, sim=SIM)

    corr = nnstats.efficient_pearsonr(x, y, nan_policy='omit')[0]
    perms = np.loadtxt(alphadir / parcellation / 'nulls' / spatnull
                       / 'pvals' / f'{scale}_perms_{SIM}.csv')
    orig = pval_from_perms(corr, perms)

    pvals = defaultdict(list)
    for subset in [100, 500, 1000, 5000]:
        rs = np.random.default_rng(SEED)
        for n in range(N_PVALS):
            # select `subset` correlations from `perms` and calculate p-value
            # store the p-value and repeat `N_PVALS` times
            sub = rs.choice(perms, size=subset, replace=False)
            pvals[subset].append(pval_from_perms(corr, sub) - orig)
        # arrays are nicer than lists
        pvals[subset] = np.asarray(pvals[subset])

    df = pd.melt(pd.DataFrame(pvals), var_name='n_nulls',
                 value_name='d(pval)')
    # attach metadata for the provided inputs
    df = df.assign(parcellation=parcellation, scale=scale,
                   spatnull=spatnull, alpha=alpha)

    return df[['parcellation', 'scale', 'spatnull', 'alpha',
               'n_nulls', 'd(pval)']]
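# --- hedged sketch (not shown in this section) -------------------------------
# `pval_from_perms` is referenced above but not defined here; assuming it
# applies the same bias-corrected, two-tailed formula as `calc_pval`, a
# minimal version might look like this (illustrative only):
import numpy as np

def pval_from_perms(real, perms):
    """ Two-tailed permutation p-value of `real` against `perms` """
    perms = np.asarray(perms)
    return (np.sum(np.abs(perms) >= np.abs(real)) + 1) / (len(perms) + 1)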
def test_efficient_pearsonr_errors():
    with pytest.raises(ValueError):
        stats.efficient_pearsonr(range(4), range(5))

    assert all(np.isnan(a) for a in stats.efficient_pearsonr([], []))
def test_efficient_pearsonr(x, y, expected):
    assert np.allclose(stats.efficient_pearsonr(x, y), expected)
xl = fg.axes[0, 0].get_xlim()
for ax in fg.axes.flat:
    ax.hlines(0.05, *xl, linestyle='dashed', color='black', linewidth=1.0)
savefig(fg.fig, FIGDIR / 'prob' / f'{parc}_probp05.svg')

# plot shuffled correlation distributions as func of SA (vertex only)
data = pd.DataFrame(columns=['alpha', 'corrs'])
alphas = ['alpha-0.0', 'alpha-1.0', 'alpha-2.0', 'alpha-3.0']
for alpha in alphas:
    x, y = load_vertex_data(SIMDIR / alpha, n_sim=1000)
    y = y[:, np.random.default_rng(1).permutation(1000)]
    corrs = nnstats.efficient_pearsonr(x, y, nan_policy='omit')[0]
    data = data.append(pd.DataFrame({'alpha': alpha, 'corrs': corrs}),
                       ignore_index=True)

colors = np.asarray(sns.color_palette('Greens', 10, desat=0.5))
ax = sns.kdeplot(x='corrs', hue='alpha', data=data, legend=False,
                 palette=list(colors[[2, 4, 6, 8]]), hue_order=alphas,
                 clip=[-1, 1], linewidth=3.0)
ax.set(xlim=(-1.05, 1.05),
               surrs[1][:, 0], s=75,
               edgecolor=np.array([60, 60, 60]) / 255,
               facecolor=np.array([223, 121, 122]) / 255)
for side in ['right', 'top']:
    ax.spines[side].set_visible(False)
ax.set(xlabel='with medial wall', ylabel='without medial wall')
l, h = ax.get_xlim()
ax.plot([l, h], [l, h], zorder=0)
ax.figure.savefig(figdir / f'{scale}.svg', bbox_inches='tight',
                  transparent=True)
plt.close(fig=fig)

# save correlations b/w surrogates
corrs = nnstats.efficient_pearsonr(*surrs)[0]
fname = OUTDIR / name / method / f'{scale}.csv'
fname.parent.mkdir(exist_ok=True, parents=True)
np.savetxt(fname, corrs, fmt='%.10f')

# make distplot of the correlations b/w surrogates for methods
fig, ax = plt.subplots(1, 1)
for method in METHODS:
    corrs = np.loadtxt(OUTDIR / name / method / f'{scale}.csv')
    ax = sns.kdeplot(corrs, label=method, shade=True, ax=ax)
sns.despine(ax=ax, left=True)
ax.set(xlim=(0, ax.get_xlim()[1]), xticks=[0, 0.5, 1], yticks=[])
fname = FIGDIR / name / 'correlations' / f'{scale}.svg'
fname.parent.mkdir(exist_ok=True)
fig.savefig(fname, transparent=True, bbox_inches='tight')
plt.close(fig=fig)
def main():
    keys = [
        'cortical_thickness', 'subcortical_volume', 'dat_scans',
        'csf_assays', 'behavioral_measures'
    ]

    # load processed data
    fname = op.join(directories.snf, 'scale500_deterministic.h5')
    hdf = structures.Frog(fname)
    data = [hdf.load(f'/processed/pd_{key}') for key in keys]

    # also load the gridsearch results back into memory.
    # here, fusion is shape (K, M, N, N) and zrand is shape (C, K, M),
    # where `K` is the nearest-neighbors parameter of SNF,
    #       `M` is the scaling (mu) parameter of SNF, and
    #       `N` is the number of PD patients
    fusion = hdf.load('/snf/processed/all/sqeuclidean/gridsearch/fusion')
    zrand = hdf.load('/snf/processed/all/sqeuclidean/gridsearch/zrand')
    consensus = hdf.load('/snf/processed/all/sqeuclidean/gridsearch/consensus')

    print('=' * 80)
    print('Calculating variance explained by diffusion map embedding\n')
    mask = get_zrand_mask(zrand)
    embedding, realigned = get_embedding_variance(fusion[mask])

    print('\n' + '=' * 80)
    print('Calculating prediction model performance\n')
    run_prediction_models(hdf, feats=['pigd', 'tremor'])

    print('\n' + '=' * 80)
    print('Calculating diffusion map embedding dimension correlations\n')
    fig, corr_idxs = gen_scatterplots(data, embedding)
    if SAVE_FIGS:
        fname = op.join(directories.figs, 'diffusion_correlations')
        utils.savefig(fname, fig)

    # load demographics information
    demographics = hdf.load('/raw/pd_demographics').reset_index()
    demographics = demographics.assign(cluster=consensus)
    pdatrophy = run_pdatrophy_anova(demographics, verbose=False,
                                    run_tukey=False)['atrophy']

    fig = gen_figure(data, embedding, realigned, consensus, pdatrophy,
                     corr_idxs)
    if SAVE_FIGS:
        fname = op.join(directories.figs, 'diffusion_embedding')
        utils.savefig(fname, fig)

    # compare with PCA on concatenated data
    embedding = hdf['/snf/processed/all/sqeuclidean/gridsearch/embedding']
    consensus = hdf['/snf/processed/all/sqeuclidean/gridsearch/consensus']
    zdata = sstats.zscore(np.column_stack(data), ddof=1)
    u, s, v = np.linalg.svd(zdata, full_matrices=False)
    v = v.T
    pc_scores = zdata @ v

    # make figure for plot
    fig, axes = plt.subplots(2, 5, figsize=(25, 10))
    axes[0, 0].remove()
    axes[0, -1].remove()

    # first, we'll see how the pc_scores look plotted against one another
    # we'll use SNF-derived clusters to examine the distribution of patients
    axes[0, 1].scatter(pc_scores[:, 0], pc_scores[:, 1], c=consensus,
                       rasterized=True,
                       cmap=ListedColormap(defaults.three_cluster_cmap),
                       edgecolor=defaults.edgegray, s=60, linewidth=0.5)
    sns.despine(ax=axes[0, 1])
    axes[0, 1].set(xticklabels=[], yticklabels=[], xlabel='pc1', ylabel='pc2')

    # then, let's check how well pc_scores correlate with embedding scores
    corrs = efficient_pearsonr(pc_scores[:, :10], embedding)[0]
    for n, ax in enumerate(axes[0, 2:4]):
        sns.regplot(pc_scores[:, n], embedding[:, n], ax=ax, ci=None,
                    scatter_kws=scatter_kws, line_kws=line_kws)
        ax.set(xlabel=f'pc{n + 1}', ylabel=f'embedding dimension {n + 1}',
               xticklabels=[], yticklabels=[])
        sns.despine(ax=ax)
        ax.set_title(f'r = {corrs[n]:.2f}')

    for n, (ax, dt) in enumerate(zip(axes[1], data)):
        zdt = sstats.zscore(dt, ddof=1)
        u, s, v = np.linalg.svd(zdt, full_matrices=False)
        dt_scores = (zdt @ v.T)[:, 0]
        sns.regplot(dt_scores, pc_scores[:, 0], ax=ax, ci=None,
                    scatter_kws=scatter_kws, line_kws=line_kws)
        ax.set(ylabel='pc1\n(all data)' if n == 0 else '',
               xlabel=f'pc1\n({keys[n].replace("_", " ")})',
               xticklabels=[], yticklabels=[])
        sns.despine(ax=ax)
        ax.set_title(
            f'r = {np.corrcoef(pc_scores[:, 0], dt_scores)[0, 1]:.2f}')

    fig.tight_layout()
    if SAVE_FIGS:
        fname = op.join(directories.figs, 'principal_components')
        utils.savefig(fname, fig)
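# --- hedged aside (not part of the original script) --------------------------
# In `main()` above, `pc_scores = zdata @ v` is the standard PCA-via-SVD
# projection: for zdata = U @ diag(S) @ Vt, projecting onto Vt.T yields U * S,
# so the PC scores could equivalently come from the left singular vectors.
# A self-contained check on random data (names below are illustrative only):
import numpy as np

_zdemo = np.random.default_rng(0).standard_normal((50, 8))
_zdemo = (_zdemo - _zdemo.mean(axis=0)) / _zdemo.std(axis=0, ddof=1)
_u, _s, _vt = np.linalg.svd(_zdemo, full_matrices=False)
assert np.allclose(_zdemo @ _vt.T, _u * _s)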
def run_null(parcellation, scale, spatnull, alpha):
    """
    Runs spatial null models for given combination of inputs

    Parameters
    ----------
    parcellation : str
        Name of parcellation to be used
    scale : str
        Scale of `parcellation` to be used
    spatnull : str
        Name of spin method to be used
    alpha : float
        Spatial autocorrelation parameter to be used
    """
    print(f'{time.ctime()}: {parcellation} {scale} {spatnull} {alpha} ',
          flush=True)

    # filenames (for I/O)
    spins_fn = SPDIR / parcellation / spatnull / f'{scale}_spins.csv'
    pvals_fn = (SIMDIR / alpha / parcellation / 'nulls' / spatnull
                / f'{scale}_nulls.csv')
    perms_fn = pvals_fn.parent / f'{scale}_perms.csv'

    if SHUFFLE:
        pvals_fn = pvals_fn.parent / f'{scale}_nulls_shuffle.csv'
        perms_fn = perms_fn.parent / f'{scale}_perms_shuffle.csv'

    if pvals_fn.exists() and perms_fn.exists():
        return

    # load simulated data
    alphadir = SIMDIR / alpha
    if parcellation == 'vertex':
        x, y = simnulls.load_vertex_data(alphadir, n_sim=N_SIM)
    else:
        x, y = simnulls.load_parc_data(alphadir, parcellation, scale,
                                       n_sim=N_SIM)

    # if we're computing info on SHUFFLED data, get the appropriate random `y`
    if SHUFFLE:
        y = _get_ysim(y, np.random.default_rng(1).permutation(N_SIM))

    # calculate the null p-values
    if spatnull == 'naive-para':
        pvals = nnstats.efficient_pearsonr(x, y, nan_policy='omit')[1]
        perms = np.array([np.nan])
    elif spatnull == 'cornblath':
        fn = SPDIR / 'vertex' / 'vazquez-rodriguez' / 'fsaverage5_spins.csv'
        x, y = np.asarray(x), np.asarray(y)
        spins = simnulls.load_spins(fn, n_perm=N_PERM)
        fetcher = getattr(nndata, f"fetch_{parcellation.replace('atl-', '')}")
        annot = fetcher('fsaverage5', data_dir=ROIDIR)[scale]
        out = Parallel(n_jobs=N_PROC, max_nbytes=None)(
            delayed(_cornblath)(x[:, sim], y[:, sim], spins, annot)
            for sim in putils.trange(x.shape[-1], desc='Running simulations')
        )
        pvals, perms = zip(*out)
    elif spatnull == 'baum':
        x, y = np.asarray(x), np.asarray(y)
        spins = simnulls.load_spins(spins_fn, n_perm=N_PERM)
        out = Parallel(n_jobs=N_PROC, max_nbytes=None)(
            delayed(_baum)(x[:, sim], y[:, sim], spins)
            for sim in putils.trange(x.shape[-1], desc='Running simulations')
        )
        pvals, perms = zip(*out)
    elif spatnull in ('burt2018', 'burt2020', 'moran'):
        xarr = np.asarray(x)
        out = Parallel(n_jobs=N_PROC, max_nbytes=None)(
            delayed(_genmod)(xarr[:, sim], _get_ysim(y, sim),
                             parcellation, scale, spatnull)
            for sim in putils.trange(x.shape[-1], desc='Running simulations')
        )
        pvals, perms = zip(*out)
    else:  # vazquez-rodriguez, vasa, hungarian, naive-nonparametric
        x, y = np.asarray(x), np.asarray(y)
        spins = simnulls.load_spins(spins_fn, n_perm=N_PERM)
        out = Parallel(n_jobs=N_PROC, max_nbytes=None)(
            delayed(simnulls.calc_pval)(x[:, sim], y[:, sim], y[spins, sim])
            for sim in putils.trange(x.shape[-1], desc='Running simulations')
        )
        pvals, perms = zip(*out)

    # save to disk
    putils.save_dir(perms_fn, np.atleast_1d(perms), overwrite=False)
    putils.save_dir(pvals_fn, np.atleast_1d(pvals), overwrite=False)
def run_null(parcellation, scale, spatnull, alpha, sim):
    """
    Runs spatial null models for given combination of inputs

    Parameters
    ----------
    parcellation : str
        Name of parcellation to be used
    scale : str
        Scale of `parcellation` to be used
    spatnull : str
        Name of spin method to be used
    alpha : float
        Spatial autocorrelation parameter to be used
    sim : int
        Which simulation to run
    """
    print(
        f'{time.ctime()}: {parcellation} {scale} {spatnull} {alpha} '
        f'sim-{sim} ', flush=True)

    # filenames (for I/O)
    spins_fn = SPDIR / parcellation / spatnull / f'{scale}_spins.csv'
    pvals_fn = (SIMDIR / alpha / parcellation / 'nulls' / spatnull
                / 'pvals' / f'{scale}_nulls_{sim:04d}.csv')
    perms_fn = pvals_fn.parent / f'{scale}_perms_{sim:04d}.csv'
    moran_fn = pvals_fn.parent / f'{scale}_moran_{sim:04d}.csv'

    # load simulated data
    alphadir = SIMDIR / alpha
    if parcellation == 'vertex':
        loadfn = functools.partial(simnulls.load_vertex_data, alphadir)
    else:
        loadfn = functools.partial(simnulls.load_parc_data, alphadir,
                                   parcellation, scale)
    x, y = loadfn(sim=sim)

    # if we're computing info on SHUFFLED data, get the appropriate random `y`
    if SHUFFLE:
        _, y = loadfn(sim=np.random.default_rng(1).permutation(N_SIM)[sim])
        pvals_fn = pvals_fn.parent / f'{scale}_nulls_shuffle_{sim:04d}.csv'
        perms_fn = pvals_fn.parent / f'{scale}_perms_shuffle_{sim:04d}.csv'
        moran_fn = pvals_fn.parent / f'{scale}_moran_shuffle_{sim:04d}.csv'

    # if we're going to run moran for this simulation, pre-load distmat
    if RUN_MORAN and not moran_fn.exists():
        dist = simnulls.load_full_distmat(y, DISTDIR, parcellation, scale)

    # calculate the null p-values
    nulls = None
    if pvals_fn.exists() and perms_fn.exists():
        pvals, perms = np.loadtxt(pvals_fn), np.loadtxt(perms_fn)
    elif spatnull == 'naive-para':
        pvals = nnstats.efficient_pearsonr(x, y, nan_policy='omit')[1]
        perms = np.array([np.nan])
    elif spatnull == 'cornblath':
        fn = SPDIR / 'vertex' / 'vazquez-rodriguez' / 'fsaverage5_spins.csv'
        x, y = np.asarray(x), np.asarray(y)
        spins = simnulls.load_spins(fn, n_perm=N_PERM)
        fetcher = getattr(nndata, f"fetch_{parcellation.replace('atl-', '')}")
        annot = fetcher('fsaverage5', data_dir=ROIDIR)[scale]
        nulls = nnsurf.spin_data(y, version='fsaverage5',
                                 lhannot=annot.lh, rhannot=annot.rh,
                                 spins=spins, n_rotate=spins.shape[-1])
        pvals, perms = simnulls.calc_pval(x, y, nulls)
    elif spatnull == 'baum':
        x, y = np.asarray(x), np.asarray(y)
        spins = simnulls.load_spins(spins_fn, n_perm=N_PERM)
        nulls = y[spins]
        nulls[spins == -1] = np.nan
        pvals, perms = simnulls.calc_pval(x, y, nulls)
    elif spatnull in ('burt2018', 'burt2020', 'moran'):
        nulls = make_surrogates(y, parcellation, scale, spatnull)
        pvals, perms = simnulls.calc_pval(x, y, nulls)
    else:  # vazquez-rodriguez, vasa, hungarian, naive-nonparametric
        x, y = np.asarray(x), np.asarray(y)
        spins = simnulls.load_spins(spins_fn, n_perm=N_PERM)
        nulls = y[spins]
        pvals, perms = simnulls.calc_pval(x, y, nulls)

    # save to disk
    putils.save_dir(perms_fn, np.atleast_1d(perms), overwrite=False)
    putils.save_dir(pvals_fn, np.atleast_1d(pvals), overwrite=False)

    # if we're running moran, do it now
    if RUN_MORAN and not moran_fn.exists() and nulls is not None:
        moran = simnulls.calc_moran(dist, nulls, n_jobs=N_PROC)
        putils.save_dir(moran_fn, np.atleast_1d(moran), overwrite=False)
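# --- hedged sketch (illustrative, not from the original scripts) -------------
# The spin-based branches above build null maps by fancy-indexing `y` with an
# (N, P) array of permuted parcel/vertex indices, so `y[spins]` has shape
# (N, P); in the Baum branch, assignments coded -1 in `spins` are masked to
# NaN before correlating. Toy data showing that indexing pattern:
import numpy as np

y_demo = np.arange(5, dtype=float)               # (N,) toy map
spins_demo = np.array([[1, 4],                   # (N, P) toy "spins"
                       [0, 3],
                       [2, -1],
                       [4, 0],
                       [3, 2]])
nulls_demo = y_demo[spins_demo]                  # shape (5, 2) null maps
nulls_demo[spins_demo == -1] = np.nan            # Baum-style masking
print(nulls_demo)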