def main():
    keys = [
        'cortical_thickness', 'subcortical_volume', 'dat_scans',
        'csf_assays', 'behavioral_measures'
    ]

    # load processed data
    fname = op.join(directories.snf, 'scale500_deterministic.h5')
    hdf = structures.Frog(fname)
    data = [hdf.load(f'/processed/pd_{key}') for key in keys]

    # compare clustering + embedding results for ALL vs NO-MRI data
    print('=' * 80)
    print('Comparing SNF outputs with and without MRI features\n')
    all_consensus, nomri_consensus = compare_nomri_clusters(hdf)

    # generate demographic dataframe with NO-MRI cluster labels
    demographics = hdf.load('/raw/pd_demographics').reset_index()
    demographics = demographics.assign(
        cluster=pd.Categorical(nomri_consensus))

    # run one-way and two-way ANOVAs to assess cluster discriminability
    print('\n' + '=' * 80)
    print('Testing no-MRI cluster differences in PD-ICA atrophy score\n')
    run_pdatrophy_anova(demographics)

    print('\n' + '=' * 80)
    print('Running mass-univariate ANOVA for no-MRI cluster differences\n')
    run_univariate_anova(data, demographics, run_tukey=False)

    print('\n' + '=' * 80)
    print('Running two-way ANOVA comparing SNF w/ and w/o MRI features\n')
    run_twoway_anova(data, all_consensus, nomri_consensus)
def main():
    # load HDF file
    fname = op.join(directories.snf, 'scale500_deterministic.h5')
    hdf = structures.Frog(fname)

    print('=' * 80)
    print('Calculating supplementary cluster demographic information\n')
    get_cluster_demographics(hdf)

    print('\n' + '=' * 80)
    print('Comparing Fereshtehnejad et al., 2017 clustering results\n')
    compare_fereshtehnejad2017(hdf)

    print('\n' + '=' * 80)
    print('Comparing alternative distance metrics\n')
    compare_alt_distance(hdf)

    print('\n' + '=' * 80)
    print('Comparing SNF results across regions of hyperparameter space\n')
    compare_hyperparameter_variation(hdf)

    print('\n' + '=' * 80)
    print('Comparing SNF results of HC & PD clustering with ground truth\n')
    fig = compare_hcpd_snf(hdf)
    if SAVE_FIGS:
        fname = op.join(directories.figs, 'pdhc_clustering')
        utils.savefig(fname, fig)
def load_and_residualize_data(hdf, groups=None):
    """
    Loads raw data and returns residualized outputs

    Parameters
    ----------
    hdf : structures.Frog
        HDF5 file with saved data
    groups : list of str, optional
        Which groups to load. Must be in ['pd', 'hc']. If not specified, all
        groups are loaded. Default: None

    Returns
    -------
    data : list of pandas.DataFrame
        Residualized data
    """

    if isinstance(hdf, str):
        hdf = structures.Frog(hdf)
    if groups is None:
        groups = ['pd', 'hc']

    raw_data = [
        pd.concat([hdf[f'/raw/{group}_{key}'] for group in groups])
        for key in KEYS
    ]
    regressors = [
        pd.concat([hdf[f'/regressors/{group}_{key}'] for group in groups])
        for key in KEYS
    ]

    proc_data = []
    for data, reg in zip(raw_data, regressors):
        resid = pd.DataFrame(stats.residualize(reg, data, normalize=False),
                             index=data.index, columns=data.columns)
        proc_data.append(resid)

    return proc_data
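# Hypothetical usage sketch (not part of the original pipeline): pull the
# residualized PD-only tables from the deterministic scale-500 store used
# elsewhere in these scripts and report their shapes. The helper name is
# illustrative only.
def _example_residualized_shapes():
    example_hdf = op.join(directories.snf, 'scale500_deterministic.h5')
    pd_only = load_and_residualize_data(example_hdf, groups=['pd'])
    for key, df in zip(KEYS, pd_only):
        print(f'{key}: {df.shape[0]} participants x {df.shape[1]} features')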
def main():
    # N.B. this will NOT work unless you set the environmental variables
    # $PPMI_USER and $PPMI_PASSWORD prior to running this script.
    # these variables must be the username and password you received when
    # registering for the PPMI. for more information on data access see:
    # https://www.ppmi-info.org/access-data-specimens/download-data/
    pypmi.fetch_studydata('all', path=directories.ppmi, overwrite=False)

    # load demographic data and keep only individuals with PD and healthy
    # individuals. we'll use the information in this data frame to
    # residualize our data against different variables (e.g., age, gender)
    print('Loading demographics information...')
    demographics = pypmi.load_demographics(directories.ppmi) \
                        .query('diagnosis in ["pd", "hc"]') \
                        .set_index('participant')
    demographics['family_history'] = \
        demographics['family_history'].astype(bool)

    # load all non-MRI data
    print('Loading all non-MRI data (this step may take some time)...')
    datscan = pypmi.load_datscan(directories.ppmi, measures='all')
    biospec = pypmi.load_biospecimen(directories.ppmi, measures='all')
    behavior = pypmi.load_behavior(directories.ppmi, measures='all')

    # sometimes, because of how PPMI data were collected, there are slight
    # variations in the recorded date for the same visit, resulting in scores
    # for a single visit being split across two or more rows in the dataframe
    # (i.e., one row might have MoCA scores for visit "V01" and the other has
    # UPDRS scores for visit "V01").
    # to remedy this we use the pandas `DataFrame.combine_first()` method,
    # merging scores from both rows and retaining the earliest date as the
    # "true" date (dates were generally only ~1 month apart, and if that
    # difference makes a significant impact on our results then I quit)
    print('Wrangling non-MRI data into a usable format...')
    first = behavior.drop_duplicates(['participant', 'visit'], 'first') \
                    .reset_index(drop=True)
    last = behavior.drop_duplicates(['participant', 'visit'], 'last') \
                   .reset_index(drop=True)
    behavior = first.combine_first(last)

    # get first visit scores for non-MRI data
    datscan, dat_date = get_visit(datscan, list(demographics.index),
                                  visit='SC')
    biospec, bio_date = get_visit(biospec, list(demographics.index),
                                  visit='BL')

    # behavioral data acquisition was split across screening + baseline
    # visits, so we need to take the earliest visit for each measure; that
    # is, not all measures were collected at screening, so we need to use
    # the baseline visit scores for those measures.
    # unfortunately, which visit various measures were initially collected
    # at DIFFERED for PD and HC individuals, so we need to do this
    # separately for the two groups and then merge them back
    # together... ¯\_(ツ)_/¯
    beh, beh_dates = [], []
    for diagnosis in ['pd', 'hc']:
        participants = demographics.query(f'diagnosis == "{diagnosis}"').index
        beh_sc, beh_date = get_visit(behavior, list(participants), visit='SC')
        beh_bl, _ = get_visit(behavior, list(participants), visit='BL')
        drop = np.intersect1d(beh_sc.columns, beh_bl.columns)
        beh += [pd.merge(beh_sc, beh_bl.drop(drop, axis=1), on='participant')]
        beh_dates += [beh_date]
    behavior = pd.concat(beh, join='inner')
    beh_date = pd.concat(beh_dates, join='inner')

    # iterate through all combinations of cortical + subcortical
    # parcellations.
    # note: there's only one subcortical parcellation (we had considered
    # doing more but the number of good subcortical parcellations
    # is...limited)
    cth_data = sorted(glob.glob(op.join(directories.parcels,
                                        '*thickness.npy')))
    vol_data = sorted(glob.glob(op.join(directories.parcels, '*volume.npy')))
    for cth, vol in itertools.product(cth_data, vol_data):
        # determine which cortical / subcortical parcellation combo we're
        # using; this will determine the name of the output file.
        # the specific details include the resolution of the cortical
        # parcellation and the datatype of the subcortical parcellation
        (scale, ) = re.search(r'res-(\d+)', cth).groups()
        (dtype, ) = re.search(r'_hemi-both_(\S+)_', vol).groups()
        hdf = structures.Frog(op.join(directories.snf,
                                      f'scale{scale}_{dtype}.h5'))
        print(f'Loading MRI data for {op.basename(hdf.filename)}...')

        # load parcellated cortical thickness data
        ct_parc = nndata.fetch_cammoun2012(data_dir=directories.rois,
                                           verbose=0)['info']
        ct_parc = pd.read_csv(ct_parc).query(f'scale == "scale{scale}" '
                                             '& structure == "cortex"')
        ct_parc['label'] = (ct_parc['label'] + '_'
                            + ct_parc['hemisphere'].apply(str.lower))
        cortthick, cth_date = get_parcels(cth, session=1, return_date=True,
                                          parcellation=ct_parc)

        # load parcellated subcortical volume data
        sv_parc = nndata.fetch_pauli2018(data_dir=directories.rois,
                                         verbose=0)['info']
        sv_parc = pd.read_csv(sv_parc)
        subvolume, vol_date = get_parcels(vol, session=1, return_date=True,
                                          parcellation=sv_parc)

        # perform batch correction on MRI data.
        # first, grab the demographics of subjects for whom we have neuro
        # data. then, remove all sites where we only have data from one
        # subject, since we cannot generate batch correction parameters in
        # those instances. finally, perform the actual batch correction
        # using `neurocombat`
        cortthick, subvolume, demo = \
            preprocess.intersect_subjects(cortthick, subvolume, demographics)
        sites, counts = np.unique(demo['site'], return_counts=True)
        demo = demo[demo['site'].isin(sites[counts > 1])]
        cortthick, subvolume, demo = \
            preprocess.intersect_subjects(cortthick, subvolume, demo)
        cortthick.iloc[:, :] = batch_correct(cortthick, demo)
        subvolume.iloc[:, :] = batch_correct(subvolume, demo)

        # only keep subjects for whom we have all datatypes.
        # we preprocess HC and PD data separately because part of the
        # process involves imputation and we want to impute missing data
        # using values from each diagnostic group, separately
        data = [cortthick, subvolume, datscan, biospec, behavior]
        *data, demo = preprocess.intersect_subjects(*data, demo)
        hc_data, hc_demo = snfprep(data, demo.query('diagnosis == "hc"'))
        pd_data, pd_demo = snfprep(data, demo.query('diagnosis == "pd"'))

        # only keep features for which we have both PD and HC data
        for n, (hc_dtype, pd_dtype) in enumerate(zip(hc_data, pd_data)):
            cols = np.intersect1d(hc_dtype.columns, pd_dtype.columns)
            hc_data[n], pd_data[n] = hc_data[n][cols], pd_data[n][cols]

        # "regress out" age, gender, age x gender interactions (and total
        # estimated intracranial volume, if MRI data) from all data.
        # we also want to save all this data to disk so we can load it
        # easily in the future! do that for all the raw data, regressor
        # matrices, and processed (i.e., residualized) data.
        # we do this because we don't want these sorts of things to bias
        # our initial analyses when creating the fused networks
        keys = [
            'cortical_thickness', 'subcortical_volume', 'dat_scans',
            'csf_assays', 'behavioral_measures'
        ]
        dates = [cth_date, vol_date, dat_date, bio_date, beh_date]
        for grp, dataset, demo in zip(['pd', 'hc'], [pd_data, hc_data],
                                      [pd_demo, hc_demo]):
            hdf.save(demo, f'/raw/{grp}_demographics', overwrite=False)
            for n, (df, key, date) in enumerate(zip(dataset, keys, dates)):
                reg = gen_regressors(date, demo)
                # get comparative regressors / data (this is always healthy
                # individuals -- we use them to estimate the betas for the
                # residualization process)
                comp_reg, comp_df = gen_regressors(date, hc_demo), hc_data[n]
                resid = nnstats.residualize(reg, df, comp_reg, comp_df,
                                            normalize=False)
                resid = pd.DataFrame(resid, index=df.index,
                                     columns=df.columns)
                hdf.save(df, f'/raw/{grp}_{key}', overwrite=False)
                hdf.save(reg, f'/regressors/{grp}_{key}', overwrite=False)
                hdf.save(resid, f'/processed/{grp}_{key}', overwrite=False)
def get_nmi_mod(method, print_summary=True):
    """
    Gets normalized mutual information and modularity for `method`

    Parameters
    ----------
    method : {'snf', 'rbf'}
        Method to use for calculating metrics
    print_summary : bool, optional
        Whether to print summary statistics (mean, SD, ranges) for generated
        metrics

    Returns
    -------
    nmi : numpy.ndarray
        Normalized mutual information
    mod : numpy.ndarray
        Modularity estimates
    """

    methods = ['snf', 'rbf']
    if method not in methods:
        raise ValueError(f'Provided `method` {method} invalid.')

    scales = [f'scale{f}' for f in ['033', '060', '125', '250', '500']]
    keys = [
        'cortical_thickness', 'subcortical_volume', 'dat_scans',
        'csf_assays', 'behavioral_measures', 'all'
    ]

    # iterate over all CT dimensionalities and generate NMI / mod estimates
    nmi, mod = [], []
    for scale in scales:
        # get data for provided scale
        fname = op.join(directories.snf, f'{scale}_deterministic.h5')
        hdf = structures.Frog(fname)
        pd_data = [hdf.load(f'/processed/pd_{key}') for key in keys[:-1]]

        # generate affinity matrices and cluster labels.
        # if we're using SNF we can just pre-load the matrices + labels
        if method == 'snf':
            path = '/snf/processed/{}/sqeuclidean/gridsearch/{}'
            affinities = [
                hdf.load(path.format(key, 'fusion_avg')) for key in keys
            ]
            labels = [hdf.load(path.format(key, 'consensus')) for key in keys]
        # otherwise, we have to generate the affinities using cosine
        # similarity and then use spectral clustering to generate the labels
        elif method == 'rbf':
            affinities = [
                metrics.pairwise.cosine_similarity(sstats.zscore(f)) + 1
                for f in pd_data
            ] + [
                metrics.pairwise.cosine_similarity(
                    sstats.zscore(np.column_stack(pd_data))) + 1
            ]
            labels = [
                spectral_clustering(aff, n_clusters=3, random_state=1234)
                for aff in affinities
            ]

        # get NMI + modularity estimates
        nmi.append(snf.metrics.nmi(labels)[-1, :-1])
        mod.append(list(gen_mod(affinities[:-1], labels[-1])))

    nmi, mod = np.asarray(nmi), np.asarray(mod)

    if print_summary:
        _print_summary(nmi, 'NMI')
        print()
        _print_summary(mod, 'modularity')
        print()

    return nmi, mod
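# Hypothetical usage sketch (not part of the original script): compare the
# NMI / modularity profiles of the SNF-derived networks against the cosine
# similarity + spectral clustering baseline. Both returned arrays have one
# row per parcellation scale and one column per data modality; the helper
# name is illustrative only.
def _example_compare_methods():
    snf_nmi, snf_mod = get_nmi_mod('snf', print_summary=False)
    rbf_nmi, rbf_mod = get_nmi_mod('rbf', print_summary=False)
    print('Mean NMI (SNF vs baseline):', snf_nmi.mean(), rbf_nmi.mean())
    print('Mean modularity (SNF vs baseline):',
          snf_mod.mean(), rbf_mod.mean())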
def main():
    keys = [
        'cortical_thickness', 'subcortical_volume', 'dat_scans',
        'csf_assays', 'behavioral_measures'
    ]

    # load processed data
    fname = op.join(directories.snf, 'scale500_deterministic.h5')
    hdf = structures.Frog(fname)
    data = [hdf.load(f'/processed/pd_{key}') for key in keys]

    # also load the gridsearch results back into memory.
    # here, fusion is shape (K, M, N, N) and zrand is shape (C, K, M),
    # where `K` is the nearest-neighbors parameter of SNF,
    #       `M` is the scaling (mu) parameter of SNF,
    #       `C` is the different cluster solutions (2, 3, & 4 clusters), and
    #       `N` is the number of PD patients
    fusion = hdf.load('/snf/processed/all/sqeuclidean/gridsearch/fusion')
    zrand = hdf.load('/snf/processed/all/sqeuclidean/gridsearch/zrand')
    consensus = hdf.load('/snf/processed/all/sqeuclidean/gridsearch/consensus')

    print('=' * 80)
    print('Calculating variance explained by diffusion map embedding\n')
    mask = get_zrand_mask(zrand)
    embedding, realigned = get_embedding_variance(fusion[mask])

    print('\n' + '=' * 80)
    print('Calculating prediction model performance\n')
    run_prediction_models(hdf, feats=['pigd', 'tremor'])

    print('\n' + '=' * 80)
    print('Calculating diffusion map embedding dimension correlations\n')
    fig, corr_idxs = gen_scatterplots(data, embedding)
    if SAVE_FIGS:
        fname = op.join(directories.figs, 'diffusion_correlations')
        utils.savefig(fname, fig)

    # load demographics information
    demographics = hdf.load('/raw/pd_demographics').reset_index()
    demographics = demographics.assign(cluster=consensus)
    pdatrophy = run_pdatrophy_anova(demographics, verbose=False,
                                    run_tukey=False)['atrophy']

    fig = gen_figure(data, embedding, realigned, consensus, pdatrophy,
                     corr_idxs)
    if SAVE_FIGS:
        fname = op.join(directories.figs, 'diffusion_embedding')
        utils.savefig(fname, fig)

    # compare with PCA on concatenated data
    embedding = hdf['/snf/processed/all/sqeuclidean/gridsearch/embedding']
    consensus = hdf['/snf/processed/all/sqeuclidean/gridsearch/consensus']
    zdata = sstats.zscore(np.column_stack(data), ddof=1)
    u, s, v = np.linalg.svd(zdata, full_matrices=False)
    v = v.T
    pc_scores = zdata @ v

    # make figure for plot
    fig, axes = plt.subplots(2, 5, figsize=(25, 10))
    axes[0, 0].remove()
    axes[0, -1].remove()

    # first, we'll see how the pc_scores look plotted against one another;
    # we'll use SNF-derived clusters to examine the distribution of patients
    axes[0, 1].scatter(pc_scores[:, 0], pc_scores[:, 1], c=consensus,
                       rasterized=True,
                       cmap=ListedColormap(defaults.three_cluster_cmap),
                       edgecolor=defaults.edgegray, s=60, linewidth=0.5)
    sns.despine(ax=axes[0, 1])
    axes[0, 1].set(xticklabels=[], yticklabels=[], xlabel='pc1', ylabel='pc2')

    # then, let's check how well pc_scores correlate with embedding scores
    corrs = efficient_pearsonr(pc_scores[:, :10], embedding)[0]
    for n, ax in enumerate(axes[0, 2:4]):
        sns.regplot(pc_scores[:, n], embedding[:, n], ax=ax, ci=None,
                    scatter_kws=scatter_kws, line_kws=line_kws)
        ax.set(xlabel=f'pc{n + 1}', ylabel=f'embedding dimension {n + 1}',
               xticklabels=[], yticklabels=[])
        sns.despine(ax=ax)
        ax.set_title(f'r = {corrs[n]:.2f}')

    for n, (ax, dt) in enumerate(zip(axes[1], data)):
        zdt = sstats.zscore(dt, ddof=1)
        u, s, v = np.linalg.svd(zdt, full_matrices=False)
        dt_scores = (zdt @ v.T)[:, 0]
        sns.regplot(dt_scores, pc_scores[:, 0], ax=ax, ci=None,
                    scatter_kws=scatter_kws, line_kws=line_kws)
        ax.set(ylabel='pc1\n(all data)' if n == 0 else '',
               xlabel=f'pc1\n({keys[n].replace("_", " ")})',
               xticklabels=[], yticklabels=[])
        sns.despine(ax=ax)
        ax.set_title(
            f'r = {np.corrcoef(pc_scores[:, 0], dt_scores)[0, 1]:.2f}')

    fig.tight_layout()
    if SAVE_FIGS:
        fname = op.join(directories.figs, 'principal_components')
        utils.savefig(fname, fig)
def main():
    keys = [
        'cortical_thickness', 'subcortical_volume', 'dat_scans',
        'csf_assays', 'behavioral_measures'
    ]

    # load processed data
    fname = op.join(directories.snf, 'scale500_deterministic.h5')
    hdf = structures.Frog(fname)
    data = [hdf.load(f'/processed/pd_{key}') for key in keys]

    # load demographics information
    demographics = hdf.load('/raw/pd_demographics')
    demographics['gender'] = \
        demographics['gender'].cat.remove_unused_categories()

    # also load the gridsearch results back into memory.
    # here, labels is shape (K, M, C, N) and zrand is shape (C, K, M),
    # where `K` is the nearest-neighbors parameter of SNF,
    #       `M` is the scaling (mu) parameter of SNF,
    #       `C` is the different cluster solutions (2, 3, & 4 clusters), and
    #       `N` is the number of PD patients
    labels = hdf.load('/snf/processed/all/sqeuclidean/gridsearch/labels')
    zrand = hdf.load('/snf/processed/all/sqeuclidean/gridsearch/zrand')

    # find consensus clusters and update demographics table
    print('=' * 80)
    print('Generating consensus clustering assignments\n')
    assignments, consensus, agreement = get_consensus_clusters(labels, zrand)
    demographics = demographics.assign(cluster=pd.Categorical(consensus))

    # run all the different tests assessing the clusters
    print('\n' + '=' * 80)
    print('Testing modularity of agreement matrix for consensus clusters\n')
    run_modularity_test(agreement, consensus)

    print('\n' + '=' * 80)
    print('Testing cluster differences for confounding variables\n')
    run_confound_tests(demographics)

    print('\n' + '=' * 80)
    print('Running mass-univariate ANOVA for cluster differences\n')
    run_univariate_anova(data, demographics, run_tukey=False)

    print('\n' + '=' * 80)
    print('Testing cluster differences in PD-ICA atrophy score\n')
    run_pdatrophy_anova(demographics.reset_index())

    print('\n' + '=' * 80)
    print('Running longitudinal models for PIGD + tremor scores\n')
    run_lme(demographics, 'pigd')
    run_lme(demographics, 'tremor')

    # use all this info to generate what will serve as the basis for figure 4
    fig = gen_figure(data, demographics, agreement, consensus, assignments,
                     zrand)
    if SAVE_FIGS:
        fname = op.join(directories.figs, 'patient_clusters')
        utils.savefig(fname, fig)

    # figures comparing longitudinal outcomes from SNF-derived biotypes with
    # biotypes derived from other subsets of data / methodologies.
    # first, start with biotypes produced from all data w/ SNF
    path = '/snf/processed/{}/sqeuclidean/gridsearch/consensus'
    clusters = [
        hdf[path.format('all')],
        hdf[path.format('behavioral_measures')],
        spectral_clustering(metrics.pairwise.cosine_similarity(
            sstats.zscore(np.column_stack(data), ddof=1)) + 1,
            n_clusters=3, random_state=1234)
    ]
    for clust, fn in zip(clusters, ['snf', 'behavior', 'concatenate']):
        demographics = demographics.assign(cluster=pd.Categorical(clust))
        fig = supplementary_longitudinal_outcomes(demographics)
        run_lme(demographics, 'pigd')
        run_lme(demographics, 'tremor')
        if SAVE_FIGS:
            utils.savefig(op.join(directories.figs, 'supp_long', fn), fig)
        plt.close(fig=fig)
def main():
    # grab all the HDF5 files that exist
    for hdf in sorted(glob.glob(op.join(directories.snf, '*.h5'))):
        # prepare HDF file and pre-load data
        hdf = structures.Frog(hdf)
        data = [hdf.load(f'/processed/pd_{key}') for key in KEYS]

        # the only gridsearches we need to run for all the resolutions are
        # the basic ones where we save out `fusion_avg` and `consensus`.
        # we need ALL the data modalities, and each data modality
        # independently
        run_gridsearch(data=data, hdf=hdf, path='processed/all',
                       saveall=False, metrics='sqeuclidean')
        for n, key in enumerate(KEYS):
            run_gridsearch(data=[data[n]], hdf=hdf, path=f'processed/{key}',
                           saveall=False, metrics='sqeuclidean')

    # for the highest resolution data we want to run a BUNCH of auxiliary
    # analyses, though
    hdf = op.join(directories.snf, 'scale500_deterministic.h5')
    hdf = structures.Frog(hdf)
    data = [hdf.load(f'/processed/pd_{key}') for key in KEYS]

    # SNF for all non-MRI data
    run_gridsearch(data=data[2:], hdf=hdf, path='processed/nomri',
                   saveall=True, metrics='sqeuclidean')

    for n, key in enumerate(KEYS):
        # SNF for all modalities except one
        run_gridsearch(data=[d for i, d in enumerate(data) if i != n],
                       hdf=hdf, path=f'processed/no_{key}',
                       saveall=False, metrics='sqeuclidean')

    # SNF removing "holdout" behavioral variables (all + non-MRI)
    if 'behavioral_measures' in KEYS:
        idx = KEYS.index('behavioral_measures')
        data[idx] = data[idx].drop(['tremor', 'pigd'], axis=1)
    run_gridsearch(data=data, hdf=hdf, path='processed/holdout/all',
                   saveall=False, metrics='sqeuclidean')
    run_gridsearch(data=data[2:], hdf=hdf, path='processed/holdout/nomri',
                   saveall=False, metrics='sqeuclidean')

    # finally, run SNF for combined HC / PD subjects
    data = load_and_residualize_data(hdf)
    run_gridsearch(data=data, hdf=hdf, path='processed/pdhc',
                   saveall=False, metrics='sqeuclidean')
def run_gridsearch(data, hdf, path, metrics=None, saveall=True):
    """
    Runs gridsearch on `data` and saves outputs to `hdf`[`path`]

    Parameters
    ----------
    data : list of array_like
        Data on which to run SNF gridsearch
    hdf : str or structures.Frog
        Filepath to, or loaded, structures.Frog object in which to save
        output data
    path : str
        Will be inserted into "/snf/{path}/{metric}/gridsearch", specifying
        the path in `hdf` to which gridsearch results will be saved
    metrics : list of str, optional
        Which distance metrics SNF should be run with. If not specified,
        uses ['sqeuclidean', 'cityblock', 'cosine']. Default: None
    saveall : bool, optional
        Whether to save all outputs of the gridsearch (i.e., all fused
        matrices, all clustering assignments, z-rand convolved similarity
        matrices, AND consensus clustering assignments) instead of only the
        consensus clusters, average fused matrix, and agreement matrix.
        Default: True

    Returns
    -------
    hdf : structures.Frog
        Same as provided input but with new gridsearch results!
    """

    if metrics is None:
        metrics = ['sqeuclidean', 'cityblock', 'cosine']
    elif isinstance(metrics, str):
        metrics = [metrics]

    if isinstance(hdf, str):
        hdf = structures.Frog(hdf)

    n_subj = len(data[0])
    fname = op.basename(hdf.filename)
    print(f'Running grid-search for {fname} with {len(data)} datatypes; '
          f'saving to path "{path}"')

    # set K / mu (hyperparameters) that we'll explore (10,000 combinations)
    K = np.arange(5, 105)
    mu = np.logspace(np.log10(0.3), np.log10(10), 100)
    # only consider two-, three-, and four-cluster solutions in this space
    n_clusters = [2, 3, 4]

    for metric in metrics:
        # check that the gridsearch wasn't already run for this combination;
        # no need to repeat needless computations!
        mpath = f'/snf/{path}/{metric}/gridsearch'
        if mpath in hdf.groups():
            check = ['consensus', 'fusion_avg', 'agreement', 'embedding']
            if saveall:
                check += ['fusion', 'labels', 'zrand']
            if all(op.join(mpath, p) in hdf.keys() for p in check):
                continue

        # generate fused networks + cluster assignments for all parameters
        print(f'Generating outputs from gridsearch with {metric} distance')
        fuse = delayed(fuse_and_label)
        gridres = Parallel(n_jobs=N_PROC)(
            fuse(data, k, m, n_clusters, metric)
            for k, m in tqdm.tqdm(list(itertools.product(K, mu))))

        # wrangle outputs from gridsearch and reshape
        fusion, labels = [np.stack(f, axis=0) for f in zip(*gridres)]
        fusion = fusion.reshape(len(K), len(mu), n_subj, n_subj)
        labels = labels.reshape(len(K), len(mu), len(n_clusters), n_subj)

        # don't parallelize zrand_convolve across cluster solutions because
        # it's already parallelizing at a lower level
        print('Convolving cluster assignments with z-Rand kernel')
        zrand_avg = [
            zrand_convolve(labels[..., n, :], n_proc=N_PROC)
            for n in range(len(n_clusters))
        ]

        # make a record of all the gridsearch outputs, if desired
        if saveall:
            results = dict(fusion=fusion, labels=labels, zrand=zrand_avg)
        else:
            results = dict()

        # we'll use the 95%ile of all the z-rand scores to threshold the
        # similarity matrices and extract "stable" regions as a mask of the
        # hyperparameter space
        zrand_thr = np.percentile(zrand_avg, 95)
        mask = [cluster_img_2d(z, zrand_thr)[1] != 0 for z in zrand_avg]
        zrand_mask = np.sum(mask, axis=0) > 0

        # only keep assignments / fused networks from stable regions
        stable_idx = np.where(zrand_mask)
        labels, fusion = labels[stable_idx], fusion[stable_idx]

        # extract stable community assignments and make consensus
        comms = labels.reshape(-1, labels.shape[-1]).T
        cons, ag = cluster.find_consensus(comms, return_agreement=True,
                                          seed=1234)
        results['consensus'] = cons

        # run diffusion map embedding and generate average, aligned embedding
        embeddings = [dme(network, n_components=10) for network in fusion]
        realigned, xfms = align.iterative_alignment(embeddings, n_iters=1)
        results['embedding'] = np.mean(realigned, axis=0)

        # we'll keep the average fused network and the agreement matrix to
        # use for calculating modularity
        results['fusion_avg'] = np.mean(fusion, axis=0)
        results['agreement'] = ag

        hdf.save(results, mpath, overwrite=True)

    return hdf
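# Hypothetical usage sketch (not part of the original script): once the
# gridsearch has run for a given store, the averaged fusion network,
# consensus assignments, and mean diffusion embedding saved above can be
# read back from the same group path. The helper name is illustrative only.
def _example_load_gridsearch_outputs():
    hdf = structures.Frog(op.join(directories.snf,
                                  'scale500_deterministic.h5'))
    base = '/snf/processed/all/sqeuclidean/gridsearch'
    fusion_avg = hdf.load(f'{base}/fusion_avg')
    consensus = hdf.load(f'{base}/consensus')
    embedding = hdf.load(f'{base}/embedding')
    print(fusion_avg.shape, consensus.shape, embedding.shape)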