def get_consensus_clusters(labels, zrand, verbose=True, seed=None):
    """
    Generates consensus cluster assignments from `labels` and `zrand`

    Parameters
    ----------
    labels : (K, M, C, N) array_like
        Cluster labels of `N` patients across SNF parameter space
    zrand : (C, K, M) array_like
        Local similarity of clustering solutions in SNF parameter space
    verbose : bool, optional
        Whether to print info about number of subjects in generated consensus
        clusters. Default: True
    seed : {None, int, np.random.RandomState}, optional
        Random seed for permutations. If not provided will use `SEED` specified
        at top of script. Default: None

    Returns
    -------
    assignments : (N, A) numpy.ndarray
        Cluster labels of `N` patients across `A` "stable" assignments
    consensus : (N,) numpy.ndarray
        Consensus clustering assignments for `N` patients
    agreement : (N, N) numpy.ndarray
        Agreement matrix containing probability of two patients being assigned
        to the same cluster across all `A` assignments
    """

    rs = np.random.RandomState(SEED if seed is None else seed)

    # only keep community assignments from stable regions
    mask = get_zrand_mask(zrand)
    assignments = labels[mask].reshape(-1, labels.shape[-1]).T

    # generate consensus assignments
    consensus, agreement = cluster.find_consensus(assignments,
                                                  null_func=np.mean,
                                                  return_agreement=True,
                                                  seed=rs)

    if verbose:
        grps, nums = np.unique(consensus, return_counts=True)
        print(f'Consensus clustering found {len(grps)} clusters: {nums}')

    return assignments, consensus, agreement
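# `get_zrand_mask` is not defined in this snippet. A minimal sketch, assuming
# it applies the same 95th-percentile thresholding used in `run_gridsearch`
# below, where `cluster_img_2d` labels contiguous supra-threshold regions of
# the (K, M) hyperparameter space:
def get_zrand_mask(zrand, percentile=95):
    zrand_thr = np.percentile(zrand, percentile)
    mask = [cluster_img_2d(z, zrand_thr)[1] != 0 for z in zrand]
    return np.sum(mask, axis=0) > 0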
def perform_clustering(phenotypes, patient_phen, adult_patients,
                       pediatric_patients, logger):
    """Cluster adult and pediatric patients based on phenotype similarity."""
    # get the indices of unique phenotypes in the phenotype DataFrame
    mat_phen_ind = get_unique_phenotype(phenotypes)
    matrix_phen = phenotypes.drop("Patient ID", axis=1)

    # transform the phenotype DataFrame into a matrix of unique phenotypes,
    # restricted to patients that have been evaluated, with 1 if the phenotype
    # is positively present and 0 if negative or NaN

    mat_phen_adult = matrix_phen.iloc[:, mat_phen_ind]
    mat_phen_adult = mat_phen_adult.loc[adult_patients]
    mat_phen_adult = mat_phen_adult.replace(to_replace={
        "Positive": 1,
        "Negative": 0,
        np.nan: 0
    })

    mat_phen_pediatric = matrix_phen.iloc[:, mat_phen_ind]
    mat_phen_pediatric = mat_phen_pediatric.loc[pediatric_patients]
    mat_phen_pediatric = mat_phen_pediatric.replace(to_replace={
        "Positive": 1,
        "Negative": 0,
        np.nan: 0
    })
    logger.info("Computing jaccard similarity matrix.")
    # we compute the jaccard similarity matrix for the phenotypic matrix, adult patients
    jac_sim_un_adult = 1 - pairwise_distances(mat_phen_adult, metric="jaccard")

    # we compute the jaccard similarity matrix for the phenotypic matrix, pediatric patients
    jac_sim_un_pediatric = 1 - pairwise_distances(mat_phen_pediatric,
                                                  metric="jaccard")

    # create networkx graphs for the adult and pediatric networks
    logger.info("Creating networkx graphs (long step).")
    # positions can be used to plot the graphs in a Python environment
    graph_un_adult, pos_un_adult = graph_of_patients_js(
        adult_patients, jac_sim_un_adult)
    graph_un_pediatric, pos_un_pediatric = graph_of_patients_js(
        pediatric_patients, jac_sim_un_pediatric)

    # write the computed graphs in GML format so they can be analyzed further
    # with Gephi
    logger.info("Writing the graphs in a gml file for analysis with Gephi.")
    nx.write_gml(graph_un_adult, "graph_un_adult.gml")
    nx.write_gml(graph_un_pediatric, "graph_un_pediatric.gml")

    # perform consensus clustering with the Louvain method, resolutions 2
    # (adult) and 1.2 (pediatric)
    logger.info("Performing clustering (long step).")
    consensus_matrix_ad, df_louvain_ad = get_consensus_matrix(
        adult_patients, graph_un_adult, 2, 10, logger)
    consensus_clustering_labels_ad = cluster.find_consensus(
        consensus_matrix_ad.values, seed=1234)
    consensus_matrix_ped, df_louvain_ped = get_consensus_matrix(
        pediatric_patients, graph_un_pediatric, 1.2, 10, logger)
    consensus_clustering_labels_ped = cluster.find_consensus(
        consensus_matrix_ped.values, seed=1234)

    clusters_ad = {
        cluster: []
        for cluster in np.unique(consensus_clustering_labels_ad)
    }
    for i, pat in enumerate(list(df_louvain_ad.index)):
        clusters_ad[consensus_clustering_labels_ad[i]].append(pat)

    clusters_ped = {
        cluster: []
        for cluster in np.unique(consensus_clustering_labels_ped)
    }
    for i, pat in enumerate(list(df_louvain_ped.index)):
        clusters_ped[consensus_clustering_labels_ped[i]].append(pat)

    # get indices of clusters for analysis (> 5 patients) and outliers
    # (<= 3 patients); clusters of 4-5 patients fall into neither set
    ind_groups_adult = [
        cluster for cluster in clusters_ad if len(clusters_ad[cluster]) > 5
    ]
    ind_groups_ped = [
        cluster for cluster in clusters_ped if len(clusters_ped[cluster]) > 5
    ]
    ind_outliers_adult = [
        cluster for cluster in clusters_ad if len(clusters_ad[cluster]) <= 3
    ]
    ind_outliers_ped = [
        cluster for cluster in clusters_ped if len(clusters_ped[cluster]) <= 3
    ]

    return (clusters_ad, clusters_ped, ind_groups_adult, ind_groups_ped,
            ind_outliers_adult, ind_outliers_ped,
            consensus_clustering_labels_ad, consensus_clustering_labels_ped)
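# The Jaccard step above reduces each patient to a binary phenotype vector; a
# toy check on hypothetical data (not from the study) of what
# `1 - pairwise_distances(..., metric="jaccard")` produces:
import numpy as np
from sklearn.metrics import pairwise_distances

toy = np.array([[1, 1, 0],
                [1, 0, 0],
                [0, 1, 1]], dtype=bool)
jac_sim = 1 - pairwise_distances(toy, metric="jaccard")
# jac_sim[0, 1] == 0.5: patients 0 and 1 share one of their two combined
# positive phenotypes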
Example #3
def test_find_consensus(assignments, clusters):
    assert np.all(cluster.find_consensus(assignments) == clusters)
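# `assignments` and `clusters` are pytest fixtures defined in the project's
# conftest.py (not shown here). A minimal, hypothetical pair with the shapes
# `find_consensus` expects: samples x solutions in, one label per sample out.
# The real fixtures (and their label numbering) may differ:
import numpy as np
import pytest

@pytest.fixture
def assignments():
    # three clustering solutions (columns) that agree up to label permutation
    return np.array([[1, 1, 2],
                     [1, 1, 2],
                     [1, 1, 2],
                     [2, 2, 1],
                     [2, 2, 1],
                     [2, 2, 1]])

@pytest.fixture
def clusters():
    return np.array([1, 1, 1, 2, 2, 2])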
Example #4
###############################################################################
# The Louvain algorithm is greedy so different instantiations will return
# different community assignments. We can run the algorithm ~100 times to see
# this discrepancy:

import bct
import matplotlib.pyplot as plt

# `nonegative` is the non-negative connectivity matrix constructed earlier in
# this example
ci = [bct.community_louvain(nonegative, gamma=1.5)[0] for _ in range(100)]

fig, ax = plt.subplots(1, 1, figsize=(6.4, 2))
ax.imshow(ci, cmap='Set1')
ax.set(ylabel='Assignments', xlabel='ROIs', xticklabels=[], yticklabels=[])

###############################################################################
# We'll provide these different assignments to our consensus-finding algorithm
# which will generate one final community assignment vector:

from netneurotools import cluster, plotting

consensus = cluster.find_consensus(np.column_stack(ci), seed=1234)
# `corr` is the correlation matrix this example computed earlier
plotting.plot_mod_heatmap(corr, consensus, cmap='viridis')

###############################################################################
# The :func:`netneurotools.modularity.consensus_modularity` function provides a
# wrapper for this process of generating multiple community assignments via the
# Louvain algorithm and finding a consensus. It also generates and returns some
# metrics for assessing the quality of the community assignments.
#
# Nevertheless, the :func:`~.cluster.find_consensus` function is useful for
# generating a consensus clustering solution from the results of *any*
# clustering algorithm (not just Louvain).
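###############################################################################
# As a rough sketch of that wrapper (the ``gamma`` and ``repeats`` values here
# are illustrative, and the exact signature may vary across netneurotools
# versions), it can be run on the same non-negative matrix used above:

from netneurotools import modularity

consensus, Q_all, zrand_all = modularity.consensus_modularity(nonegative,
                                                              gamma=1.5,
                                                              repeats=100)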
def run_gridsearch(data, hdf, path, metrics=None, saveall=True):
    """
    Runs gridsearch on `data` and saves outputs to `hdf`[`path`]

    Parameters
    ----------
    data : list of array_like
        Data on which to run SNF gridsearch
    hdf : str or structures.Frog
        Filepath to or loaded structures.Frog object to save output data
    path : str
        Will be inserted into "/snf/{path}/{metric}/gridsearch", specifying
        the path in `hdf` to which gridsearch results will be saved
    metrics : list of str, optional
        Which distance metrics SNF should be run with. If not specified will
        use ['sqeuclidean', 'cityblock', 'cosine']. Default: None
    saveall : bool, optional
        Whether to save all outputs of gridsearch (i.e., all fused matrices,
        all clustering assignments, z-rand convolved similarity matrices, AND
        consensus clustering assignments) instead of only consensus clusters,
        average fused matrix, and agreement matrix. Default: True

    Returns
    -------
    hdf : structures.Frog
        Same as provided input but with new gridsearch results!
    """

    if metrics is None:
        metrics = ['sqeuclidean', 'cityblock', 'cosine']
    elif isinstance(metrics, str):
        metrics = [metrics]

    if isinstance(hdf, str):
        hdf = structures.Frog(hdf)

    n_subj = len(data[0])
    fname = op.basename(hdf.filename)
    print(f'Running grid-search for {fname} with {len(data)} datatypes; '
          f'saving to path "{path}"')

    # set K / mu (hyperparameters) that we'll explore (10,000 combinations)
    K = np.arange(5, 105)
    mu = np.logspace(np.log10(0.3), np.log10(10), 100)
    # only consider two-, three-, and four-cluster solutions in this space
    n_clusters = [2, 3, 4]

    for metric in metrics:
        # check whether the gridsearch was already run for this combination;
        # no need to repeat expensive computations!
        mpath = f'/snf/{path}/{metric}/gridsearch'
        if mpath in hdf.groups():
            check = ['consensus', 'fusion_avg', 'agreement', 'embedding']
            if saveall:
                check += ['fusion', 'labels', 'zrand']
            if all(op.join(mpath, p) in hdf.keys() for p in check):
                continue

        # generate fused networks + cluster assignments for all the parameters
        print(f'Generating outputs from gridsearch with {metric} distance')
        fuse = delayed(fuse_and_label)
        gridres = Parallel(n_jobs=N_PROC)(
            fuse(data, k, m, n_clusters, metric)
            for k, m in tqdm.tqdm(list(itertools.product(K, mu))))

        # wrangle outputs from gridsearch and reshape
        fusion, labels = [np.stack(f, axis=0) for f in zip(*gridres)]
        fusion = fusion.reshape(len(K), len(mu), n_subj, n_subj)
        labels = labels.reshape(len(K), len(mu), len(n_clusters), n_subj)

        # don't parallelize zrand_convolve across cluster solutions because
        # it's already parallelizing at a lower level
        print('Convolving cluster assignments with z-Rand kernel')
        zrand_avg = [
            zrand_convolve(labels[..., n, :], n_proc=N_PROC)
            for n in range(len(n_clusters))
        ]

        # make a record of all the gridsearch outputs if desired
        if saveall:
            results = dict(fusion=fusion, labels=labels, zrand=zrand_avg)
        else:
            results = dict()

        # we'll use the 95%ile of all the z-rand scores to threshold the
        # similarity matrices and extract "stable" regions as a mask of the
        # hyperparameter space
        zrand_thr = np.percentile(zrand_avg, 95)
        mask = [cluster_img_2d(z, zrand_thr)[1] != 0 for z in zrand_avg]
        zrand_mask = np.sum(mask, axis=0) > 0

        # only keep assignments / fused networks from stable regions
        stable_idx = np.where(zrand_mask)
        labels, fusion = labels[stable_idx], fusion[stable_idx]

        # extract stable community assignments and make consensus
        comms = labels.reshape(-1, labels.shape[-1]).T
        cons, ag = cluster.find_consensus(comms,
                                          return_agreement=True,
                                          seed=1234)
        results['consensus'] = cons

        # run diffusion map embedding and generate average, aligned embedding
        embeddings = [dme(network, n_components=10) for network in fusion]
        realigned, xfms = align.iterative_alignment(embeddings, n_iters=1)
        results['embedding'] = np.mean(realigned, axis=0)

        # we'll keep the average fused network and the agreement matrix to
        # use for calculating modularity
        results['fusion_avg'] = np.mean(fusion, axis=0)
        results['agreement'] = ag

        hdf.save(results, mpath, overwrite=True)

    return hdf
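# A minimal, hypothetical invocation on random data (the filename and path are
# placeholders, and the project's `structures.Frog` HDF5 wrapper plus the
# helpers above must be importable for this to actually run):
if __name__ == '__main__':
    rng = np.random.default_rng(1234)
    data = [rng.standard_normal((100, 50)),   # e.g., cortical thickness
            rng.standard_normal((100, 25))]   # e.g., clinical scores
    hdf = run_gridsearch(data, 'snf_gridsearch.h5', path='example')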