def get_consensus_clusters(labels, zrand, verbose=True, seed=None):
    """
    Generates consensus cluster assignments from `labels` and `zrand`

    Parameters
    ----------
    labels : (K, M, C, N) array_like
        Cluster labels of `N` patients across SNF parameter space
    zrand : (C, K, M) array_like
        Local similarity of clustering solutions in SNF parameter space
    verbose : bool, optional
        Whether to print info about number of subjects in generated consensus
        clusters. Default: True
    seed : {None, int, np.random.RandomState}, optional
        Random seed for permutations. If not provided will use `SEED`
        specified at top of script. Default: None

    Returns
    -------
    assignments : (N, A) numpy.ndarray
        Cluster labels of `N` patients across `A` "stable" assignments
    consensus : (N,) numpy.ndarray
        Consensus clustering assignments for `N` patients
    agreement : (N, N) numpy.ndarray
        Agreement matrix containing probability of two patients being
        assigned to the same cluster across all `A` assignments
    """
    rs = np.random.RandomState(SEED if seed is None else seed)

    # only keep community assignments from stable regions
    mask = get_zrand_mask(zrand)
    assignments = labels[mask].reshape(-1, labels.shape[-1]).T

    # generate consensus assignments
    consensus, agreement = cluster.find_consensus(assignments,
                                                  null_func=np.mean,
                                                  return_agreement=True,
                                                  seed=rs)
    if verbose:
        grps, nums = np.unique(consensus, return_counts=True)
        print(f'Consensus clustering found {len(grps)} clusters: {nums}')

    return assignments, consensus, agreement
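
# `get_zrand_mask` is called above but not defined in this excerpt. The sketch
# below is an assumption that mirrors the thresholding logic used inline in
# `run_gridsearch` further down (a 95th-percentile cutoff plus the
# `cluster_img_2d` helper); treat it as illustrative, not the canonical
# implementation.
def get_zrand_mask(zrand, percentile=95):
    """ Returns (K, M) boolean mask of "stable" regions of parameter space """
    # threshold all z-Rand maps at a single percentile of the pooled scores
    zrand_thr = np.percentile(zrand, percentile)
    # retain contiguous supra-threshold clusters for each C-cluster solution
    stable = [cluster_img_2d(z, zrand_thr)[1] != 0 for z in zrand]
    # a (K, M) parameter combination is "stable" if any solution marks it so
    return np.sum(stable, axis=0) > 0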
def perform_clustering(phenotypes, patient_phen, adult_patients,
                       pediatric_patients, logger):
    # get the indices of unique phenotypes in the phenotype DataFrame
    mat_phen_ind = get_unique_phenotype(phenotypes)
    matrix_phen = phenotypes.drop("Patient ID", axis=1)

    # transform the phenotype dataframe to obtain a matrix of unique
    # phenotypes, keeping only patients that have been evaluated, with 1 if
    # the phenotype is positively present and 0 if negative or NaN
    mat_phen_adult = matrix_phen.iloc[:, mat_phen_ind]
    mat_phen_adult = mat_phen_adult.loc[adult_patients]
    mat_phen_adult = mat_phen_adult.replace(to_replace={
        "Positive": 1,
        "Negative": 0,
        np.nan: 0
    })

    mat_phen_pediatric = matrix_phen.iloc[:, mat_phen_ind]
    mat_phen_pediatric = mat_phen_pediatric.loc[pediatric_patients]
    mat_phen_pediatric = mat_phen_pediatric.replace(to_replace={
        "Positive": 1,
        "Negative": 0,
        np.nan: 0
    })

    logger.info("Computing Jaccard similarity matrices.")
    # compute the Jaccard similarity matrices for the adult and pediatric
    # phenotypic matrices
    jac_sim_un_adult = 1 - pairwise_distances(mat_phen_adult,
                                              metric="jaccard")
    jac_sim_un_pediatric = 1 - pairwise_distances(mat_phen_pediatric,
                                                  metric="jaccard")

    # create networkx graphs for the adult and pediatric networks
    logger.info("Creating networkx graphs (long step).")
    # positions can be used to plot the graphs in a python environment
    graph_un_adult, pos_un_adult = graph_of_patients_js(
        adult_patients, jac_sim_un_adult)
    graph_un_pediatric, pos_un_pediatric = graph_of_patients_js(
        pediatric_patients, jac_sim_un_pediatric)

    # write the computed graphs in gml format so they can be analyzed
    # further with Gephi
    logger.info("Writing the graphs in gml files for analysis with Gephi.")
    nx.write_gml(graph_un_adult, "graph_un_adult.gml")
    nx.write_gml(graph_un_pediatric, "graph_un_pediatric.gml")

    # perform clustering with the Louvain method, resolution 2 for adult
    # and 1.2 for pediatric patients
    logger.info("Performing clustering (long step).")
    consensus_matrix_ad, df_louvain_ad = get_consensus_matrix(
        adult_patients, graph_un_adult, 2, 10, logger)
    consensus_clustering_labels_ad = cluster.find_consensus(
        consensus_matrix_ad.values, seed=1234)
    consensus_matrix_ped, df_louvain_ped = get_consensus_matrix(
        pediatric_patients, graph_un_pediatric, 1.2, 10, logger)
    consensus_clustering_labels_ped = cluster.find_consensus(
        consensus_matrix_ped.values, seed=1234)

    # map each consensus cluster label to the list of patients it contains
    # (`clust` avoids shadowing the imported `cluster` module)
    clusters_ad = {
        clust: [] for clust in np.unique(consensus_clustering_labels_ad)
    }
    for i, pat in enumerate(list(df_louvain_ad.index)):
        clusters_ad[consensus_clustering_labels_ad[i]].append(pat)
    clusters_ped = {
        clust: [] for clust in np.unique(consensus_clustering_labels_ped)
    }
    for i, pat in enumerate(list(df_louvain_ped.index)):
        clusters_ped[consensus_clustering_labels_ped[i]].append(pat)

    # get indices of clusters for analysis (> 5 patients) and of outliers
    # (<= 3 patients)
    ind_groups_adult = [
        clust for clust in clusters_ad if len(clusters_ad[clust]) > 5
    ]
    ind_groups_ped = [
        clust for clust in clusters_ped if len(clusters_ped[clust]) > 5
    ]
    ind_outliers_adult = [
        clust for clust in clusters_ad if len(clusters_ad[clust]) <= 3
    ]
    ind_outliers_ped = [
        clust for clust in clusters_ped if len(clusters_ped[clust]) <= 3
    ]

    return clusters_ad, clusters_ped, ind_groups_adult, ind_groups_ped, \
        ind_outliers_adult, ind_outliers_ped, \
        consensus_clustering_labels_ad, consensus_clustering_labels_ped
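
# `graph_of_patients_js` is called above but not defined in this excerpt. A
# minimal sketch of the presumed behavior (skipping zero-similarity pairs and
# the choice of spring layout are assumptions): build a weighted networkx
# graph whose nodes are patients and whose edge weights are the pairwise
# Jaccard similarities, plus node positions for plotting.
def graph_of_patients_js(patients, similarity):
    graph = nx.Graph()
    graph.add_nodes_from(patients)
    # add one weighted edge per pair of patients with nonzero similarity;
    # this O(N^2) pass is why graph creation is flagged as a "long step"
    patients = list(patients)
    for i, pat_i in enumerate(patients):
        for j in range(i + 1, len(patients)):
            if similarity[i, j] > 0:
                graph.add_edge(pat_i, patients[j], weight=similarity[i, j])
    # positions for plotting the graph in a python environment
    pos = nx.spring_layout(graph, seed=1234)
    return graph, pos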
def test_find_consensus(assignments, clusters):
    assert np.all(cluster.find_consensus(assignments) == clusters)
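
# A sketch of fixtures that would satisfy the test above (hypothetical values,
# not the project's real conftest): three noisy variants of one two-cluster
# solution that `find_consensus` should collapse back to the clean labels.
# Note the expected values assume `find_consensus` returns 0-based labels;
# adjust if the relabeling convention differs.
import numpy as np
import pytest

from netneurotools import cluster


@pytest.fixture
def assignments():
    return np.array([[0, 0, 0],
                     [0, 0, 1],
                     [0, 0, 0],
                     [1, 1, 1],
                     [1, 1, 0],
                     [1, 1, 1]])


@pytest.fixture
def clusters():
    return np.array([0, 0, 0, 1, 1, 1])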
###############################################################################
# The Louvain algorithm is greedy so different instantiations will return
# different community assignments. We can run the algorithm ~100 times to see
# this discrepancy:

ci = [bct.community_louvain(nonegative, gamma=1.5)[0] for n in range(100)]

fig, ax = plt.subplots(1, 1, figsize=(6.4, 2))
ax.imshow(ci, cmap='Set1')
ax.set(ylabel='Assignments', xlabel='ROIs', xticklabels=[], yticklabels=[])

###############################################################################
# We'll provide these different assignments to our consensus-finding algorithm
# which will generate one final community assignment vector:

from netneurotools import cluster

consensus = cluster.find_consensus(np.column_stack(ci), seed=1234)
plotting.plot_mod_heatmap(corr, consensus, cmap='viridis')

###############################################################################
# The :func:`netneurotools.modularity.consensus_modularity` function provides
# a wrapper for this process of generating multiple community assignments via
# the Louvain algorithm and finding a consensus. It also generates and returns
# some metrics for assessing the quality of the community assignments.
#
# Nevertheless, the :func:`~.cluster.find_consensus` function is useful for
# generating a consensus clustering solution from the results of *any*
# clustering algorithm (not just Louvain).
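
###############################################################################
# As a quick sketch of that generality (not part of the original example), we
# could hand :func:`~.cluster.find_consensus` a stack of k-means solutions
# generated with different random initializations instead:

from sklearn.cluster import KMeans

km = [KMeans(n_clusters=3, n_init=1, random_state=n).fit_predict(corr)
      for n in range(100)]
km_consensus = cluster.find_consensus(np.column_stack(km), seed=1234)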
def run_gridsearch(data, hdf, path, metrics=None, saveall=True):
    """
    Runs gridsearch on `data` and saves outputs to `hdf`[`path`]

    Parameters
    ----------
    data : list of array_like
        Data on which to run SNF gridsearch
    hdf : str or structures.Frog
        Filepath to or loaded structures.Frog object to save output data
    path : str
        Will be inserted into "/snf/{path}/{metric}/gridsearch", specifying
        the path in `hdf` to which gridsearch results will be saved
    metrics : list of str, optional
        Which distance metrics SNF should be run with. If not specified will
        use ['sqeuclidean', 'cityblock', 'cosine']. Default: None
    saveall : bool, optional
        Whether to save all outputs of gridsearch (i.e., all fused matrices,
        all clustering assignments, z-rand convolved similarity matrices, AND
        consensus clustering assignments) instead of only consensus clusters,
        average fused matrix, and agreement matrix. Default: True

    Returns
    -------
    hdf : structures.Frog
        Same as provided input but with new gridsearch results!
    """
    if metrics is None:
        metrics = ['sqeuclidean', 'cityblock', 'cosine']
    elif isinstance(metrics, str):
        metrics = [metrics]

    if isinstance(hdf, str):
        hdf = structures.Frog(hdf)
    n_subj = len(data[0])

    fname = op.basename(hdf.filename)
    print(f'Running grid-search for {fname} with {len(data)} datatypes; '
          f'saving to path "{path}"')

    # set K / mu (hyperparameters) that we'll explore (10,000 combinations)
    K = np.arange(5, 105)
    mu = np.logspace(np.log10(0.3), np.log10(10), 100)
    # only consider two-, three-, and four-cluster solutions in this space
    n_clusters = [2, 3, 4]

    for metric in metrics:
        # check that the gridsearch wasn't already run for this combination.
        # no need to repeat needless computations!
        mpath = f'/snf/{path}/{metric}/gridsearch'
        if mpath in hdf.groups():
            check = ['consensus', 'fusion_avg', 'agreement', 'embedding']
            if saveall:
                check += ['fusion', 'labels', 'zrand']
            if all(op.join(mpath, p) in hdf.keys() for p in check):
                continue

        # generate fused networks + cluster assignments for all the parameters
        print(f'Generating outputs from gridsearch with {metric} distance')
        fuse = delayed(fuse_and_label)
        gridres = Parallel(n_jobs=N_PROC)(
            fuse(data, k, m, n_clusters, metric)
            for k, m in tqdm.tqdm(list(itertools.product(K, mu)))
        )

        # wrangle outputs from gridsearch and reshape
        fusion, labels = [np.stack(f, axis=0) for f in zip(*gridres)]
        fusion = fusion.reshape(len(K), len(mu), n_subj, n_subj)
        labels = labels.reshape(len(K), len(mu), len(n_clusters), n_subj)

        # don't parallelize zrand_convolve across cluster solutions because
        # it's already parallelizing at a lower level
        print('Convolving cluster assignments with z-Rand kernel')
        zrand_avg = [
            zrand_convolve(labels[..., n, :], n_proc=N_PROC)
            for n in range(len(n_clusters))
        ]

        # make a record of all the gridsearch outputs if desired
        if saveall:
            results = dict(fusion=fusion, labels=labels, zrand=zrand_avg)
        else:
            results = dict()

        # we'll use the 95%ile of all the z-rand scores to threshold the
        # similarity matrices and extract "stable" regions as a mask of the
        # hyperparameter space
        zrand_thr = np.percentile(zrand_avg, 95)
        mask = [cluster_img_2d(z, zrand_thr)[1] != 0 for z in zrand_avg]
        zrand_mask = np.sum(mask, axis=0) > 0

        # only keep assignments / fused networks from stable regions
        stable_idx = np.where(zrand_mask)
        labels, fusion = labels[stable_idx], fusion[stable_idx]

        # extract stable community assignments and make consensus
        comms = labels.reshape(-1, labels.shape[-1]).T
        cons, ag = cluster.find_consensus(comms, return_agreement=True,
                                          seed=1234)
        results['consensus'] = cons
        # run diffusion map embedding and generate average, aligned embedding
        embeddings = [dme(network, n_components=10) for network in fusion]
        realigned, xfms = align.iterative_alignment(embeddings, n_iters=1)
        results['embedding'] = np.mean(realigned, axis=0)

        # we'll keep the average fused network and the agreement matrix to
        # use for calculating modularity
        results['fusion_avg'] = np.mean(fusion, axis=0)
        results['agreement'] = ag

        hdf.save(results, mpath, overwrite=True)

    return hdf
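
# Example invocation, as a sketch: the HDF5 filename and the synthetic arrays
# below are placeholders rather than files or variables defined in this
# script, and a real run over all 10,000 (K, mu) combinations is
# compute-intensive.
if __name__ == '__main__':
    rng = np.random.default_rng(1234)
    # two synthetic "datatypes" for 200 subjects, just to exercise the search
    data = [rng.standard_normal((200, 50)), rng.standard_normal((200, 75))]
    hdf = run_gridsearch(data, 'example_subjects.h5', path='synthetic',
                         metrics='sqeuclidean', saveall=False)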