def leiden_clustering(umap_res, resolution_range=(0, 1), random_state=2, kdtree_dist='euclidean'):
    """Compute a CPM resolution profile on a nearest-neighbour graph of an embedding.

    A KD-tree is built on ``umap_res``; every point is linked to its (up to)
    25 nearest neighbours, and the resulting graph is handed to
    ``leidenalg.Optimiser.resolution_profile`` with ``CPMVertexPartition``.

    Args:
        umap_res: 2-D array, one row per point (only ``shape[0]`` and row
            access are used here).
        resolution_range: (low, high) range of resolution parameters to scan.
        random_state: seed given to the Leiden optimiser.
        kdtree_dist: metric name passed to the KD-tree.

    Returns:
        The profile (sequence of partitions) returned by
        ``Optimiser.resolution_profile``.
    """
    n_points = umap_res.shape[0]
    # Never request more neighbours than there are points.
    k = min(25, n_points)

    tree = neighbors.KDTree(umap_res, metric=kdtree_dist)
    # One batched query instead of a Python loop of single-point queries.
    dist, ind = tree.query(umap_res, k=k)

    vals = dist.ravel()
    i = numpy.repeat(numpy.arange(n_points), k)
    j = ind.ravel()
    print(len(vals))

    ginput = sps.csc_matrix((vals, (i, j)), shape=(n_points, n_points))
    # nonzero() ignores zero distances, i.e. a point matched with itself or
    # with an exact duplicate does not produce an edge.
    sources, targets = ginput.nonzero()
    edgelist = list(zip(sources.tolist(), targets.tolist()))
    # Pass the vertex count explicitly so vertex ids always equal row indices,
    # even if the highest-numbered rows end up without any edge.
    G = ig.Graph(n=n_points, edges=edgelist)

    optimiser = leidenalg.Optimiser()
    optimiser.set_rng_seed(random_state)
    profile = optimiser.resolution_profile(G, leidenalg.CPMVertexPartition,
                                           resolution_range=resolution_range,
                                           number_iterations=0)
    print([len(elt) for elt in profile])
    return profile
def fit(self):
    '''Compute communities from a matrix with fixed nodes

    The first ``len(self.annotations)`` vertices of ``self.graph`` are treated
    as fixed (annotated) nodes; all other vertices start as singletons and are
    clustered with Leiden while the fixed nodes keep their community.

    Returns:
        None, but the membership attribute is set as an array of str with
        size N - n_fixed with the community/cluster membership of all
        columns except the first n fixed ones. Members of a community that
        contains annotated nodes get that annotation as label, the others
        get the community number as a string.

    Raises:
        ValueError: if ``self.metric`` is neither 'cpm' nor 'modularity'.
    '''
    self._parse_graph()

    aa = self.annotations
    n_fixed = len(aa)
    g = self.graph
    N = g.vcount()

    fixed_nodes = [int(i < n_fixed) for i in range(N)]

    # NOTE: initial membership is singletons except for atlas nodes, which
    # get the membership they have.
    aau = list(np.unique(aa))
    aaun = len(aau)
    type_index = {ct: idx for idx, ct in enumerate(aau)}
    initial_membership = [
        type_index[aa[j]] if j < n_fixed else aaun + (j - n_fixed)
        for j in range(N)
    ]

    if self.metric == 'cpm':
        partition = leidenalg.CPMVertexPartition(
            g,
            resolution_parameter=self.resolution_parameter,
            initial_membership=initial_membership,
        )
    elif self.metric == 'modularity':
        # ModularityVertexPartition has no resolution_parameter argument;
        # RBConfigurationVertexPartition is modularity with a resolution
        # parameter (identical for resolution_parameter=1).
        partition = leidenalg.RBConfigurationVertexPartition(
            g,
            resolution_parameter=self.resolution_parameter,
            initial_membership=initial_membership,
        )
    else:
        raise ValueError('clustering_metric not understood: {:}'.format(
            self.metric))

    # Run modified Leiden here
    opt = leidenalg.Optimiser()
    opt.optimise_partition(partition, fixed_nodes=fixed_nodes)

    # Extract result
    membership = partition.membership[n_fixed:]

    # Convert the known cell types. Each id is mapped exactly once, so a
    # label can never be rewritten by a later replacement, and the string
    # width covers both type names and numeric ids (no silent truncation).
    labels = [aau[m] if m < aaun else str(m) for m in membership]
    width = max([len(ct) for ct in aau] + [len(lab) for lab in labels] + [1])
    self.membership = np.array(labels, dtype='U{:}'.format(width))
def _partition_graph(self, resolution):
    """Partition ``self.graph`` with the bipartite CPM construction.

    Three layer partitions are created by ``CPMVertexPartition.Bipartite``
    and optimised jointly with layer weights 1, -1, -1 for 100 iterations.

    Args:
        resolution: value passed as ``resolution_parameter_01``.

    Returns:
        The first of the three layer partitions.
    """
    # https://github.com/vtraag/4TU-CSS/
    joint, class0, class1 = la.CPMVertexPartition.Bipartite(
        self.graph,
        resolution_parameter_01=resolution,
    )
    layer_partitions = [joint, class0, class1]

    optimiser = la.Optimiser()
    optimiser.optimise_partition_multiplex(
        layer_partitions,
        layer_weights=[1, -1, -1],
        n_iterations=100,
    )
    return joint
def CPM_Bipartite(g_original, resolution_parameter_01, resolution_parameter_0=0, resolution_parameter_1=0,
                  degree_as_node_size=False, seed=0):
    """
    CPM_Bipartite is the extension of CPM to bipartite graphs

    :param g_original: a networkx/igraph object
    :param resolution_parameter_01: Resolution parameter for in between two classes.
    :param resolution_parameter_0: Resolution parameter for class 0.
    :param resolution_parameter_1: Resolution parameter for class 1.
    :param degree_as_node_size: If ``True`` use degree as node size instead of 1, to mimic modularity
    :param seed: the random seed to be used in CPM method to keep results/partitions replicable
    :return: BiNodeClustering object
    :raises ModuleNotFoundError: if igraph or leidenalg is not installed

    :Example:

    >>> from cdlib import algorithms
    >>> import networkx as nx
    >>> G = nx.algorithms.bipartite.generators.random_graph(100, 20, 0.5)
    >>> coms = algorithms.CPM_Bipartite(G, 1)

    :References:

    Barber, M. J. (2007). Modularity and community detection in bipartite networks. Physical Review E, 76(6), 066102. 10.1103/PhysRevE.76.066102

    .. note:: Reference implementation: https://leidenalg.readthedocs.io/en/stable/multiplex.html?highlight=bipartite#bipartite
    """
    if ig is None or leidenalg is None:
        raise ModuleNotFoundError("Optional dependency not satisfied: install igraph and leidenalg to use the "
                                  "selected feature.")

    g = convert_graph_formats(g_original, ig.Graph)

    # Give every vertex a 'name' attribute (its index) when the converted
    # graph does not carry one already.
    if 'name' not in g.vertex_attributes():
        g.vs['name'] = [v.index for v in g.vs]

    optimiser = leidenalg.Optimiser()
    optimiser.set_rng_seed(seed)

    p_01, p_0, p_1 = leidenalg.CPMVertexPartition.Bipartite(g,
                                                            resolution_parameter_01=resolution_parameter_01,
                                                            resolution_parameter_0=resolution_parameter_0,
                                                            resolution_parameter_1=resolution_parameter_1,
                                                            degree_as_node_size=degree_as_node_size)
    optimiser.optimise_partition_multiplex([p_01, p_0, p_1], layer_weights=[1, -1, -1])

    # Group vertex indices by community; read the membership list once.
    coms = defaultdict(list)
    for vertex_index, community in enumerate(p_01.membership):
        coms[community].append(vertex_index)

    # The original dict literal used the key "resolution_parameter_0" twice,
    # which silently dropped the value of resolution_parameter_01.
    return BiNodeClustering(list(coms.values()), [], g_original, "CPM_Bipartite",
                            method_parameters={"resolution_parameter_01": resolution_parameter_01,
                                               "resolution_parameter_0": resolution_parameter_0,
                                               "resolution_parameter_1": resolution_parameter_1,
                                               "degree_as_node_size": degree_as_node_size,
                                               "seed": seed})
def community_consensus_iterative(self, C):
    """Find the consensus of a given set of partitions.

    Refer to the paper 'Robust detection of dynamic community structure in
    networks', Danielle S. Bassett, Mason A. Porter, Nicholas F. Wymbs,
    Scott T. Grafton, Jean M. Carlson et al.

    Args:
        C: 2-D numpy array of shape (npart, m); row ``i`` holds the community
            label of each of the ``m`` nodes in partition ``i``. Labels are
            compared after conversion to int.

    Returns:
        A ``ModularityVertexPartition`` of the thresholded nodal association
        graph, optimised until no further improvement.
    """
    npart, m = C.shape

    # Nodal association matrix: X[p, k] counts in how many partitions nodes
    # p and k share a community. X_rand3 is the same count for a version of
    # each partition with its labels randomly permuted (the null model).
    # They must be two distinct arrays; sharing one array would add the
    # null counts to the observed ones.
    X = np.zeros((m, m))
    X_rand3 = np.zeros((m, m))

    for i in range(npart):
        labels = C[i, :].astype(int)
        shuffled = labels[np.random.permutation(m)]
        X += labels[:, None] == labels[None, :]
        X_rand3 += shuffled[:, None] == shuffled[None, :]

    # Thresholding: keep only associations that occur more often than
    # half of the strongest off-diagonal association in the random data.
    threshold = np.max(np.triu(X_rand3, 1)) / 2
    X_new3 = np.where(X > threshold, X, 0.0)

    # Turn the thresholded nodal association matrix into an igraph graph.
    # Every non-zero entry becomes an edge, so (as before) both (i, j) and
    # (j, i) as well as diagonal entries are added.
    rows, cols = np.nonzero(X_new3)
    edge_list = list(zip(rows.tolist(), cols.tolist()))
    weight_list = X_new3[rows, cols].tolist()

    G = ig.Graph()
    G.add_vertices(m)
    G.add_edges(edge_list)
    G.es['weight'] = weight_list
    G.vs['id'] = list(range(m))

    optimiser = la.Optimiser()
    partition = la.ModularityVertexPartition(G, weights='weight')
    optimiser.optimise_partition(partition, n_iterations=-1)

    return partition
def test_optimiser_with_fixed_nodes(self):
    """Optimise a 3-clique with the first node fixed and check the result.

    Node 0 starts (and is fixed) in community 2; after optimisation all
    three nodes are expected to be in community 2.
    """
    graph = ig.Graph.Full(3)
    start_membership = [2, 1, 0]
    partition = leidenalg.CPMVertexPartition(
        graph,
        resolution_parameter=0.01,
        initial_membership=start_membership)
    # Equivalent to setting initial membership
    #partition.set_membership([2, 1, 2])

    optimiser = leidenalg.Optimiser()
    pinned = [True, False, False]
    optimiser.optimise_partition(partition, fixed_nodes=pinned)

    expected = [2, 2, 2]
    self.assertListEqual(
        partition.membership,
        expected,
        msg="After optimising partition with fixed nodes failed to recover initial fixed memberships")
def leiden(self, G, interslice, resolution):
    """Jointly optimise partitions over time slices.

    Args:
        G: the time slices, passed unchanged to ``la.time_slices_to_layers``.
        interslice: value used as ``interslice_weight``.
        resolution: resolution parameter of the per-layer partitions (the
            interslice partition uses resolution 0).

    Returns:
        Tuple ``(partitions, interslice_partition)``: the list of per-layer
        ``RBConfigurationVertexPartition`` objects and the partition of the
        interslice layer, after multiplex optimisation.
    """
    layers, interslice_layer, _ = la.time_slices_to_layers(
        G, interslice_weight=interslice)

    partitions = []
    for layer in layers:
        layer_partition = la.RBConfigurationVertexPartition(
            layer, weights='weight', resolution_parameter=resolution)
        partitions.append(layer_partition)

    interslice_partition = la.RBConfigurationVertexPartition(
        interslice_layer, weights='weight', resolution_parameter=0)

    all_partitions = partitions + [interslice_partition]
    la.Optimiser().optimise_partition_multiplex(all_partitions)

    return partitions, interslice_partition
def _partition_graph(self, resolution: float, seed: int) -> ig.VertexClustering:
    """Partition ``self.graph`` with Leiden.

    A non-bipartite graph is partitioned by modularity through
    ``la.find_partition`` (``resolution`` is not used on this path). A
    bipartite graph uses the three-layer bipartite CPM construction,
    optimised jointly with layer weights 1, -1, -1. Both paths use the edge
    attribute ``"weight"``, the given ``seed``, and iterate until no further
    improvement (``n_iterations=-1``).

    Args:
        resolution: ``resolution_parameter_01`` of the bipartite CPM layers.
        seed: random seed for the optimiser.

    Returns:
        The resulting partition (first layer in the bipartite case).
    """
    if not self.graph.is_bipartite():
        return la.find_partition(
            self.graph,
            la.ModularityVertexPartition,
            weights="weight",
            n_iterations=-1,
            seed=seed,
        )

    joint, class0, class1 = la.CPMVertexPartition.Bipartite(
        self.graph, resolution_parameter_01=resolution, weights="weight"
    )
    optimiser = la.Optimiser()
    optimiser.set_rng_seed(seed)
    optimiser.optimise_partition_multiplex(
        [joint, class0, class1], layer_weights=[1, -1, -1], n_iterations=-1
    )
    return joint
def runCommunityWithMultiplex(self):
    """
    Primary method for now
    TODO: update with more features
    Currently takes all forests in hypha (defined by nodes on shared edge set)
    and finds communities

    Each forest contributes one layer: the subgraph of ``self.interactome``
    spanned by the forest's edges (all vertices kept). The layers are
    clustered jointly with ``la.find_partition_multiplex`` using modularity.

    Returns:
        Tuple ``(red_dict, red_graph)``. ``red_dict`` maps each community
        with more than 5 nodes to the list of its node names; ``red_graph``
        maps the same communities to the corresponding induced subgraph of
        ``self.interactome``.
    """
    print('running community detection')
    interactome = self.interactome
    # Read the name list once instead of once per vertex inside the loop.
    names = interactome.vs['name']

    all_nodes = set()
    for vals in self.forests.values():
        all_nodes.update(names[i] for i in vals['vert'])
    print("Have", len(all_nodes), 'total nodes')

    netlist = []
    for forest in self.forests.values():
        # Keep every vertex so all layers share the interactome's vertex set.
        layer = interactome.subgraph_edges(forest['edge'],
                                           delete_vertices=False)
        netlist.append(layer)
        print('Added network of', len(layer.vs), 'vertices and',
              len(layer.es), 'edges')

    membership, _ = la.find_partition_multiplex(netlist,
                                                la.ModularityVertexPartition)

    comm_df = pd.DataFrame({'Node': names,
                            'Community': membership})
    nodes_by_comm = comm_df.groupby("Community")['Node']
    comm_counts = nodes_by_comm.count()
    comm_dict = dict(nodes_by_comm.apply(list))

    # Only communities with more than 5 members are reported.
    large = comm_counts.index[comm_counts > 5]
    red_dict = {comm: comm_dict[comm] for comm in large}

    red_graph = {}
    for comm, vals in red_dict.items():
        rgraph = interactome.subgraph(vals)
        red_graph[comm] = rgraph
        print("Community", comm, " graph has", len(vals), 'proteins and',
              len(rgraph.components()), 'component')
    return red_dict, red_graph
def run_alg(Gs, alg, gamma=1.0, sample=1.0, layer_weights=None):
    '''
    Run community detection algorithm with a resolution parameter. Right now only use RB in Louvain/Leiden

    Parameters
    ----------
    Gs : a list of igraph.Graph
    alg : str
        choose between 'louvain' and 'leiden'
    gamma : float
        resolution parameter
    sample : if smaller than 1, randomly delete a fraction of edges each time
    layer_weights: a list of float
        specifying layer weights in the multilayer setting

    Returns
    ------
    C: scipy.sparse.csr_matrix
        a matrix recording the membership of each cluster

    Raises
    ------
    ValueError
        if `alg` is not 'louvain' or 'leiden', or if `Gs` is empty
    '''
    # Fail early with a clear message; previously an unknown algorithm left
    # `partitions` unbound and an empty input ended in an IndexError.
    if alg not in ('louvain', 'leiden'):
        raise ValueError(
            "alg must be 'louvain' or 'leiden', got {!r}".format(alg))
    if len(Gs) == 0:
        raise ValueError('Gs must contain at least one graph')

    backend = louvain if alg == 'louvain' else leidenalg
    partition_type = backend.RBConfigurationVertexPartition

    if len(Gs) == 1:
        G = Gs[0]
        # Work on a perturbed graph or on a copy, never on the input itself.
        G1 = network_perturb(G, sample) if sample < 1 else G.copy()
        partition = backend.find_partition(G1, partition_type,
                                           resolution_parameter=gamma)
        partitions = [partition]
    else:  # multiplex mode
        if layer_weights is None:
            layer_weights = [1.0 for _ in Gs]
        assert len(layer_weights) == len(
            Gs), 'layer weights inconsistent with the number of input networks'
        if sample < 1:
            Gs1 = [network_perturb(G, sample) for G in Gs]
        else:
            Gs1 = [G.copy() for G in Gs]

        optimiser = backend.Optimiser()
        partitions = [
            partition_type(G, resolution_parameter=gamma) for G in Gs1
        ]
        if alg == 'louvain':
            optimiser.optimise_partition_multiplex(
                partitions, layer_weights=layer_weights)
        else:
            # -1 means iterate until no further optimization
            optimiser.optimise_partition_multiplex(
                partitions, n_iterations=-1, layer_weights=layer_weights)

    LOGGER.info('Resolution: {:.4f}; find {} clusters'.format(
        gamma, len(partitions[0])))

    return partition_to_membership_matrix(partitions[0])
def spectral_leiden(
    data: MultimodalData,
    rep: str = "pca",
    resolution: float = 1.3,
    rep_kmeans: str = "diffmap",
    n_clusters: int = 30,
    n_clusters2: int = 50,
    n_init: int = 10,
    n_jobs: int = -1,
    random_state: int = 0,
    class_label: str = "spectral_leiden_labels",
) -> None:
    """Cluster the data using Spectral Leiden algorithm. [Li20]_

    Cells are first grouped by a two-level KMeans on ``rep_kmeans``; these
    groups are the initial membership of a Leiden partition on the affinity
    graph, which is aggregated, optimised and projected back to cells.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        Representation whose affinity matrix ``'W_' + rep`` (in
        ``data.obsp``) defines the graph to cluster.

    resolution: ``float``, optional, default: ``1.3``
        Resolution factor. Higher resolution tends to find more clusters.

    rep_kmeans: ``str``, optional, default: ``"diffmap"``
        The embedding representation on which the KMeans runs; ``'X_' +
        rep_kmeans`` must exist in ``data.obsm``. If it is missing, PCA
        coordinates are used instead.

    n_clusters: ``int``, optional, default: ``30``
        The number of first level clusters.

    n_clusters2: ``int``, optional, default: ``50``
        The number of second level clusters.

    n_init: ``int``, optional, default: ``10``
        Number of kmeans tries for the first level clustering.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use for the KMeans step.

    random_state: ``int``, optional, default: ``0``
        Random seed for reproducing results.

    class_label: ``str``, optional, default: ``"spectral_leiden_labels"``
        Key name for storing cluster labels in ``data.obs``.

    Returns
    -------
    ``None``

    Update ``data.obs``:
        * ``data.obs[class_label]``: Cluster labels for cells as categorical data.

    Examples
    --------
    >>> pg.spectral_leiden(data)
    """
    try:
        import leidenalg
    except ImportError:
        import sys

        logger.error("Need leidenalg! Try 'pip install leidenalg'.")
        sys.exit(-1)

    # Fall back to PCA when the requested KMeans representation is missing.
    if f"X_{rep_kmeans}" not in data.obsm.keys():
        logger.warning(
            f"{rep_kmeans} is not calculated, switch to pca instead.")
        rep_kmeans = "pca"
        if f"X_{rep_kmeans}" not in data.obsm.keys():
            raise ValueError(f"Please run {rep_kmeans} first!")

    affinity_key = f"W_{rep}"
    if affinity_key not in data.obsp:
        raise ValueError(
            "Cannot find affinity matrix. Please run neighbors first!")

    kmeans_labels = partition_cells_by_kmeans(
        data.obsm[f"X_{rep_kmeans}"],
        n_clusters,
        n_clusters2,
        n_init,
        n_jobs,
        random_state,
    )

    graph = construct_graph(data.obsp[affinity_key])
    partition = leidenalg.RBConfigurationVertexPartition(
        graph,
        resolution_parameter=resolution,
        weights="weight",
        initial_membership=kmeans_labels,
    )
    # Optimise on the aggregated graph, then map the result back to cells.
    coarse = partition.aggregate_partition()

    optimiser = leidenalg.Optimiser()
    optimiser.set_rng_seed(random_state)
    optimiser.optimise_partition(coarse, -1)
    partition.from_coarse_partition(coarse)

    # Cluster labels are 1-based strings.
    cluster_labels = np.array([str(x + 1) for x in partition.membership])
    data.obs[class_label] = pd.Categorical(
        values=cluster_labels,
        categories=natsorted(np.unique(cluster_labels)),
    )
    data.register_attr(class_label, "cluster")

    n_found = data.obs[class_label].cat.categories.size
    logger.info(
        f"Spectral Leiden clustering is done. Get {n_found} clusters.")
def spectral_leiden(
    data: AnnData,
    rep: str = "pca",
    resolution: float = 1.3,
    rep_kmeans: str = "diffmap",
    n_clusters: int = 30,
    n_clusters2: int = 50,
    n_init: int = 10,
    n_jobs: int = -1,
    random_state: int = 0,
    class_label: str = "spectral_leiden_labels",
) -> None:
    """Cluster the data using Spectral Leiden algorithm.

    Cells are first grouped by a two-level KMeans on ``rep_kmeans``; these
    groups are the initial membership of a Leiden partition on the affinity
    graph, which is aggregated, optimised and projected back to cells.

    Parameters
    ----------
    data: ``anndata.AnnData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        Representation whose affinity matrix ``'W_' + rep`` (in
        ``data.uns``) defines the graph to cluster.

    resolution: ``float``, optional, default: ``1.3``
        Resolution factor. Higher resolution tends to find more clusters.

    rep_kmeans: ``str``, optional, default: ``"diffmap"``
        The embedding representation on which the KMeans runs; ``'X_' +
        rep_kmeans`` must exist in ``data.obsm``. If it is missing, PCA
        coordinates are used instead.

    n_clusters: ``int``, optional, default: ``30``
        The number of first level clusters.

    n_clusters2: ``int``, optional, default: ``50``
        The number of second level clusters.

    n_init: ``int``, optional, default: ``10``
        Number of kmeans tries for the first level clustering.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    random_state: ``int``, optional, default: ``0``
        Random seed for reproducing results.

    class_label: ``str``, optional, default: ``"spectral_leiden_labels"``
        Key name for storing cluster labels in ``data.obs``.

    Returns
    -------
    ``None``

    Update ``data.obs``:
        * ``data.obs[class_label]``: Cluster labels for cells as categorical data.

    Examples
    --------
    >>> pg.spectral_leiden(adata)
    """
    t_start = time.time()

    # Fall back to PCA when the requested KMeans representation is missing.
    if "X_" + rep_kmeans not in data.obsm.keys():
        logger.warning("{} is not calculated, switch to pca instead.".format(rep_kmeans))
        rep_kmeans = "pca"
        if "X_" + rep_kmeans not in data.obsm.keys():
            raise ValueError("Please run {} first!".format(rep_kmeans))

    affinity_key = "W_" + rep
    if affinity_key not in data.uns:
        raise ValueError("Cannot find affinity matrix. Please run neighbors first!")

    kmeans_labels = partition_cells_by_kmeans(
        data, rep_kmeans, n_jobs, n_clusters, n_clusters2, n_init, random_state,
    )

    graph = construct_graph(data.uns[affinity_key])
    partition = leidenalg.RBConfigurationVertexPartition(
        graph,
        resolution_parameter=resolution,
        weights="weight",
        initial_membership=kmeans_labels,
    )
    # Optimise on the aggregated graph, then map the result back to cells.
    coarse = partition.aggregate_partition()

    optimiser = leidenalg.Optimiser()
    optimiser.set_rng_seed(random_state)
    optimiser.optimise_partition(coarse, -1)
    partition.from_coarse_partition(coarse)

    # Cluster labels are 1-based strings.
    cluster_labels = np.array([str(x + 1) for x in partition.membership])
    data.obs[class_label] = pd.Categorical(
        values=cluster_labels,
        categories=natsorted(np.unique(cluster_labels)),
    )

    elapsed = time.time() - t_start
    logger.info(
        "Spectral Leiden clustering is done. Time spent = {:.2f}s.".format(elapsed)
    )
def compute_communities(self):
    '''Compute communities from a matrix with fixed nodes

    The graph is built from ``self.neighbors`` (one list of neighbour
    indices per node). The first ``self.n_fixed`` nodes are fixed and start
    in the community of their cell type; all other nodes start as
    singletons.

    Returns:
        None, but SemiAnnotate.membership is set as an array of str with
        size N - n_fixed with the community/cluster membership of all
        columns except the first n_fixed ones. Members of a community that
        contains fixed nodes get that cell type as label, the others get
        the community number as a string.

    Raises:
        ImportError: if the installed leidenalg has no ``fixed_nodes``
            argument in ``Optimiser.optimise_partition``.
        ValueError: if ``self.clustering_metric`` is neither 'cpm' nor
            'modularity'.
    '''
    import inspect
    import igraph as ig
    import leidenalg

    # Check whether this version of Leiden has fixed nodes support
    opt = leidenalg.Optimiser()
    sig = inspect.getfullargspec(opt.optimise_partition)
    if 'fixed_nodes' not in sig.args:
        raise ImportError(
            'This version of the leidenalg module does not support fixed nodes. Please update to a later (development) version'
        )

    aa = self.cell_types
    # A list is needed: community ids are positions in this sorted list of
    # unique cell types (a numpy array has no .index method).
    aau = list(np.unique(aa))
    aaun = len(aau)
    n_fixed = self.n_fixed
    clustering_metric = self.clustering_metric
    resolution_parameter = self.resolution_parameter
    neighbors = self.neighbors

    _, N = self.matrix.shape

    # Construct graph from the lists of neighbors. Edges are undirected and
    # deduplicated; a node listed as its own neighbour is skipped because it
    # would not form a valid vertex pair.
    edges_d = set()
    for i, neis in enumerate(neighbors):
        for n in neis:
            if n != i:
                edges_d.add(frozenset((i, n)))
    edges = [tuple(e) for e in edges_d]
    g = ig.Graph(n=N, edges=edges, directed=False)

    # NOTE: initial membership is singletons except for atlas nodes, which
    # get the membership they have.
    type_index = {ct: idx for idx, ct in enumerate(aau)}
    initial_membership = [
        type_index[aa[j]] if j < n_fixed else aaun + (j - n_fixed)
        for j in range(N)
    ]

    # Compute communities with semi-supervised Leiden
    if clustering_metric == 'cpm':
        partition = leidenalg.CPMVertexPartition(
            g,
            resolution_parameter=resolution_parameter,
            initial_membership=initial_membership,
        )
    elif clustering_metric == 'modularity':
        # ModularityVertexPartition has no resolution_parameter argument;
        # RBConfigurationVertexPartition is modularity with a resolution
        # parameter (identical for resolution_parameter=1).
        partition = leidenalg.RBConfigurationVertexPartition(
            g,
            resolution_parameter=resolution_parameter,
            initial_membership=initial_membership,
        )
    else:
        raise ValueError('clustering_metric not understood: {:}'.format(
            clustering_metric))

    fixed_nodes = [int(i < n_fixed) for i in range(N)]
    opt.optimise_partition(partition, fixed_nodes=fixed_nodes)
    membership = partition.membership[n_fixed:]

    # Convert the known cell types. Community ids index the unique list
    # `aau` (that is how the initial membership was built). Each id is
    # mapped exactly once and the string width covers both type names and
    # numeric ids, so nothing is truncated or relabelled twice.
    labels = [aau[m] if m < aaun else str(m) for m in membership]
    width = max([len(ct) for ct in aau] + [len(lab) for lab in labels] + [1])
    self.membership = np.array(labels, dtype='U{:}'.format(width))
mob['month'] = [x.month for x in mob['date']] months = list(np.unique([x.month for x in mob['date']])) #%% mob = mob.groupby(['month', 'journey', 'start_quadkey', 'end_quadkey']).sum()['n_crisis'].reset_index() #%% #mob = mob.loc[[x in dates for x in mob['month']], :] mob = mob.groupby('month') mob = [mob.get_group(x) for x in mob.groups] assert len(months) == len(mob) #%% mob #%% hierarchy = {} for i, month in enumerate(months): optimiser = leidenalg.Optimiser() G = od_igraph(mob[i]) rp = optimiser.resolution_profile(G, leidenalg.CPMVertexPartition, min_diff_resolution = 0.01, resolution_range=(0,1), weights='weight') hierarchy[month] = rp #%% assert len(months) == len(hierarchy) hierarchy #%% #redo this - one at a time - then abstract #in functions not loop list compression is faster? def collapse_clusters(partition, G):
def community_detection(eID,
                        probe="both",
                        bin=0.02,
                        sensitivity=1,
                        visual=False,
                        feedbackType=None,
                        user_start="trial_start",
                        user_end="trial_end",
                        region_list=[],
                        difficulty=[-1, 1],
                        percentage=1,
                        data=None):
    """
    Function:
    Takes an experiment ID and makes community detection analysis

    Spikes inside the selected trial intervals are binned per cluster, the
    binned activity is correlated between clusters, negative correlations
    and the diagonal are set to zero, and the resulting graph is
    partitioned with the Leiden algorithm.

    Parameters:
    eID: experiment ID
    probe: name of the probe wanted or both for both probes
    bin: the size of the bin
    sensitivity: resolution parameter for the Leiden algorithm; with the
        value 1 plain modularity is optimised through ``la.find_partition``
    visual: a boolean on whether visualization is wanted
    feedbackType: value for feedback wanted
    user_start: the name of the type of start intervals
    user_end: the name of the type of end intervals
    region_list: passed to the data loader
    difficulty, percentage: passed to ``section_trial``
    data: optional ``(spikes, clusters, trials, locations)``; when given
        (and truthy) nothing is loaded

    Return:
    partition: ig graph vertex partition object
    partition_dictionary: a dictionary with keys for each community and
        sets as values with the indices of the clusters that belong to
        that community
    region_dict: dictionary keyed by community number and value of a
        dictionary with the names of the brain regions of that community
        and their frequency
    locations: a list of the locations for each cluster

    Example:
        community_detection(
            exp_ID,
            visual=True,
            probe="probe00",
            user_start="stimOn_times",
            user_end="response_times",
        )
    """
    if bool(data):
        spikes, clusters, trials, locations = data
    else:
        spikes, clusters, trials, locations = djl.loading(
            eID, probe, region_list)

    starts, ends = section_trial(user_start, user_end, trials, feedbackType,
                                 difficulty, percentage)
    spikes_interval, clusters_interval = spp.interval_selection(
        spikes, clusters, starts, ends)

    binned = bb.processing.bincount2D(
        spikes_interval,
        clusters_interval,
        xbin=bin,
        # xlim=[0, nclusters]
    )
    spikes_matrix = binned[0]
    spikes_matrix_fixed = spp.addition_of_empty_neurons(
        spikes_matrix, clusters, clusters_interval)

    # Keep only positive correlations and drop self-correlation.
    corr = np.corrcoef(spikes_matrix_fixed)
    corr[corr < 0] = 0
    np.fill_diagonal(corr, 0)

    neuron_graph = ig.Graph.Weighted_Adjacency(corr.tolist(),
                                               mode="UNDIRECTED")
    neuron_graph.vs["label"] = [str(i) for i in range(np.max(clusters))]

    # NOTE(review): neither partition is given a `weights` argument.
    if sensitivity == 1:
        partition = la.find_partition(neuron_graph,
                                      la.ModularityVertexPartition)
    else:
        partition = la.RBConfigurationVertexPartition(
            neuron_graph, resolution_parameter=sensitivity)
        la.Optimiser().optimise_partition(partition)

    if visual:
        visualization(neuron_graph, partition)

    partition_dictionary = dictionary_from_communities(partition)
    region_dict = location_dictionary(partition_dictionary, locations)

    return partition, partition_dictionary, region_dict, locations
def leiden(
        self,
        axis,
        edges,
        edge_weights=None,
        metric='cpm',
        resolution_parameter=0.001,
        initial_membership=None,
        fixed_nodes=None,
        ):
    '''Graph-based Leiden clustering

    Args:
        axis (string): It must be 'samples' or 'features'.
            The Dataset.counts matrix is used and either samples or
            features are clustered.
        edges (list of pairs): list of edges to make a graph used to
            cluster. Each member of a pair is an int referring to the
            index of the sample or feature in the sample/featuresheet.
        edge_weights (list of float or None): edge weights to use for
            clustering. If None, all edge weights are 1.
        metric (str): What metric to optimize. Can be 'modularity' or
            'cpm'.
        resolution_parameter (float): a number between 0 and 1 that sets
            how easy it is to call new clusters.
        initial_membership (str or None): name of a metadata column
            containing the initial membership vector for the clustering.
            If None (default), each samples starts as a singleton
        fixed_nodes (str or None): name of a metadata column containing
            a boolean vector for which nodes are not allowed to change
            cluster membership during the Leiden algorithm. Your version
            of leidenalg must support fixed nodes for this feature to work.

    Returns:
        pd.Series with the labels of the clusters.

    Raises:
        ValueError: if axis is not 'samples' or 'features', or if metric
            is not 'cpm' or 'modularity'.
    '''
    # Validate the plain arguments first so a bad call fails fast.
    if axis not in ('samples', 'features'):
        raise ValueError(
            'axis must be "samples" or "features": {:}'.format(axis))
    if metric not in ('cpm', 'modularity'):
        raise ValueError(
            'clustering_metric not understood: {:}'.format(metric))

    import igraph as ig
    import leidenalg

    if axis == 'samples':
        n_nodes = self.dataset.n_samples
        index = self.dataset.samplenames
    else:
        n_nodes = self.dataset.n_features
        index = self.dataset.featurenames

    def _int_column(name):
        # Metadata column of the clustered axis as a list of ints.
        if axis == 'samples':
            sheet = self.dataset.samplesheet
        else:
            sheet = self.dataset.featuresheet
        return list(sheet[name].values.astype(int))

    g = ig.Graph(n=n_nodes, edges=edges, directed=False)

    if initial_membership is not None:
        im = _int_column(initial_membership)
    else:
        im = list(range(n_nodes))

    partition_kwargs = {
        'resolution_parameter': resolution_parameter,
        'initial_membership': im,
    }
    if edge_weights is not None:
        g.es['weight'] = edge_weights
        # Storing the attribute is not enough: the partition only uses
        # edge weights when told which attribute holds them.
        partition_kwargs['weights'] = 'weight'

    if metric == 'cpm':
        partition = leidenalg.CPMVertexPartition(g, **partition_kwargs)
    else:
        # ModularityVertexPartition has no resolution_parameter argument;
        # RBConfigurationVertexPartition is modularity with a resolution
        # parameter (identical for resolution_parameter=1).
        partition = leidenalg.RBConfigurationVertexPartition(
            g, **partition_kwargs)

    opt = leidenalg.Optimiser()
    if fixed_nodes is not None:
        opt.optimise_partition(
            partition, fixed_nodes=_int_column(fixed_nodes))
    else:
        opt.optimise_partition(partition)

    communities = partition.membership
    labels = pd.Series(communities, index=index)
    return labels
def cluster_graph(self):
    '''Compute communities from a matrix with fixed nodes

    ``self.graph`` is clustered with Leiden. The first ``self.n_fixed``
    columns are atlas entries: entry ``i`` is expanded into
    ``self.sizes[i]`` graph nodes that all start (fixed) in community
    ``i``. Every remaining column is a single node starting as a singleton.

    Returns:
        None, but Averages.membership is set as an array with size
        N - n_fixed with the atlas cell types of all cells from the new
        dataset. Cells in a community without atlas nodes get the
        community number as a string.

    Raises:
        ImportError: if the installed leidenalg has no ``fixed_nodes``
            argument in ``Optimiser.optimise_partition``.
        ValueError: if the initial membership does not have
            ``sum(self.sizes)`` entries, or if ``self.clustering_metric``
            is neither 'cpm' nor 'modularity'.
    '''
    import inspect
    import leidenalg

    # Check whether this version of Leiden has fixed nodes support
    opt = leidenalg.Optimiser()
    sig = inspect.getfullargspec(opt.optimise_partition)
    if 'fixed_nodes' not in sig.args:
        raise ImportError('This version of the leidenalg module does not support fixed nodes. Please update to a later (development) version')

    sizes = self.sizes
    n_fixed = self.n_fixed
    clustering_metric = self.clustering_metric
    resolution_parameter = self.resolution_parameter
    g = self.graph

    _, N = self.matrix.shape
    # "Extended" counts: number of graph nodes rather than matrix columns.
    n_fixede = int(np.sum(sizes[:n_fixed]))
    Ne = int(np.sum(sizes))

    # NOTE: initial membership is singletons except for atlas nodes, which
    # get the membership they have.
    initial_membership = []
    for isi in range(N):
        n_copies = int(sizes[isi]) if isi < n_fixed else 1
        initial_membership.extend([isi] * n_copies)

    if len(initial_membership) != Ne:
        raise ValueError('initial_membership list has wrong length!')

    # Compute communities with semi-supervised Leiden
    if clustering_metric == 'cpm':
        partition = leidenalg.CPMVertexPartition(
            g,
            resolution_parameter=resolution_parameter,
            initial_membership=initial_membership,
        )
    elif clustering_metric == 'modularity':
        # ModularityVertexPartition has no resolution_parameter argument;
        # RBConfigurationVertexPartition is modularity with a resolution
        # parameter (identical for resolution_parameter=1).
        partition = leidenalg.RBConfigurationVertexPartition(
            g,
            resolution_parameter=resolution_parameter,
            initial_membership=initial_membership,
        )
    else:
        raise ValueError(
            'clustering_metric not understood: {:}'.format(clustering_metric))

    fixed_nodes = [int(i < n_fixede) for i in range(Ne)]
    opt.optimise_partition(partition, fixed_nodes=fixed_nodes)
    membership = partition.membership[n_fixede:]

    # Convert the known cell types. Each community id is mapped exactly
    # once and the string width covers both type names and numeric ids, so
    # a long id is never truncated into (and mistaken for) a shorter one.
    cell_types = list(self.cell_types)
    n_types = len(cell_types)
    labels = [cell_types[m] if m < n_types else str(m) for m in membership]
    width = max(
        [len(ct) for ct in cell_types] + [len(lab) for lab in labels] + [1])
    self.membership = np.array(labels, dtype='U{:}'.format(width))
def setUp(self):
    """Store a new ``leidenalg.Optimiser`` on the instance as ``self.optimiser``."""
    optimiser = leidenalg.Optimiser()
    self.optimiser = optimiser
def resolution_profile(N,
                       sources,
                       targets,
                       partition_type,
                       resolution_range,
                       weights=None,
                       min_diff_bisect_value=1,
                       min_diff_resolution=0.001,
                       linear_bisection=False,
                       number_iterations=1,
                       rng_seed=None):
    """Compute a Leiden resolution profile for a graph given as an edge list.

    Args:
        N: number of vertices.
        sources, targets: equally long sequences of edge end points; edge
            ``k`` goes from ``sources[k]`` to ``targets[k]`` (directed).
        partition_type: ``None`` or ``"RBC"`` (RBConfigurationVertexPartition),
            ``"RBERVertexPartition"`` or ``"CPM"``. Any other value prints
            a message and falls back to RBConfigurationVertexPartition.
        resolution_range: range of resolution parameters to scan.
        weights: optional edge weights, in edge order.
        min_diff_bisect_value, min_diff_resolution, linear_bisection,
        number_iterations: forwarded to ``Optimiser.resolution_profile``.
        rng_seed: optional seed for the optimiser.

    Returns:
        The profile returned by ``Optimiser.resolution_profile``.
    """
    # graph from adjacency matrix (passed in as sparse representation: N, sources, targets, weights)
    g = ig.Graph(directed=True)
    g.add_vertices(N)  # this adds adjacency.shape[0] vertices
    g.add_edges(list(zip(sources, targets)))

    # handle the possible partitions and their allowed args
    partition_kwargs = {}
    # Must be bound for every partition type: it is read below no matter
    # which branch is taken.
    node_sizes = None
    if partition_type is None or partition_type == "RBC":
        partition_type = la.RBConfigurationVertexPartition
    elif partition_type == "RBERVertexPartition":
        partition_type = la.RBERVertexPartition
    elif partition_type == "CPM":
        partition_type = la.CPMVertexPartition
    else:
        print(
            f"bad partition_type: {partition_type}. Using RBConfigurationVertexPartition"
        )
        partition_type = la.RBConfigurationVertexPartition

    if weights is not None:
        g.es['weight'] = weights

    if node_sizes is not None:
        partition_kwargs['node_sizes'] = node_sizes

    start_time = time.time()
    optimiser = la.Optimiser()
    if rng_seed is not None:
        optimiser.set_rng_seed(rng_seed)

    profile = optimiser.resolution_profile(
        g,
        partition_type,
        resolution_range,
        weights=weights,
        min_diff_bisect_value=min_diff_bisect_value,
        min_diff_resolution=min_diff_resolution,
        linear_bisection=linear_bisection,
        number_iterations=number_iterations,
        **partition_kwargs)
    print(f"leidenalg took {time.time()-start_time}s")

    return profile
if os.path.isfile(fn_anno): print('Load clusters from file') ds.samplesheet['community'] = pd.read_csv(fn_anno, sep='\t', index_col=0)['community'] else: print('Unsupervised clustering') import igraph as ig sys.path.insert(0, os.path.abspath('../../packages/')) import leidenalg G = ig.Graph(edges=edges) partition = partition = leidenalg.CPMVertexPartition( G, resolution_parameter=0.01, ) opt = leidenalg.Optimiser() opt.optimise_partition(partition) communities = partition.membership print('n. communities: {:}'.format(len(np.unique(communities)))) ds.samplesheet['community'] = communities print('Unsupervised clustering, rough') import igraph as ig sys.path.insert(0, os.path.abspath('../../packages/')) import leidenalg G = ig.Graph(edges=edges) partition = partition = leidenalg.CPMVertexPartition( G, resolution_parameter=0.002, ) opt = leidenalg.Optimiser()
def fit_transform(self, graphs):
    """Cluster several graphs ("views") jointly and return one membership array.

    Each input is converted to an igraph graph. The views are then clustered
    repeatedly with a multiplex Leiden optimisation; after every round the
    per-view resolutions and layer weights are re-estimated from
    ``self._calculate_edge_probabilities`` until both change by at most
    ``self.resolution_tol`` or ``self.max_clusterings`` rounds have run.

    Args:
        graphs: iterable of views; each is an ``ig.Graph``, a scipy sparse
            matrix, or something ``self._other_to_igraph`` can convert.

    Returns:
        ``self.best_clustering``: the membership (numpy array) of the first
        view's partition from the selected round.
    """
    # Normalise every view to an igraph graph.
    G=[]
    for graph in graphs:
        if type(graph) is ig.Graph:
            G.append(graph)
        elif issparse(graph):
            G.append(self._scipy_to_igraph(graph))
        else:
            G.append(self._other_to_igraph(graph))

    if self.verbose:
        for i in range(len(G)):
            print("View Graph {}: num_nodes: {}, num_edges: {}, directed: {}, num_components: {}, num_isolates: {}"
                  .format(i, G[i].vcount(), G[i].ecount(), G[i].is_directed(), len(G[i].components(mode='WEAK').sizes()), G[i].components(mode='WEAK').sizes().count(1)))

    # Per-round history and the currently selected ("best") result.
    self.weights = []
    self.resolutions =[]
    self.best_modularity =-np.inf
    self.best_clustering = None
    self.best_resolutions = None
    self.best_weights = None
    self.modularities =[]
    self.clusterings =[]
    self.final_iteration = 0
    self.best_iteration = 0
    # Every view starts with layer weight 1 and resolution 1.
    weights = [1]*len(G)
    resolutions =[1]*len(G)

    for iterate in range(self.max_clusterings):
        partitions = []
        for i in range(len(G)):
            partitions.append(la.RBConfigurationVertexPartition(G[i], resolution_parameter=resolutions[i]))

        optimiser = la.Optimiser()
        diff = optimiser.optimise_partition_multiplex(partitions, layer_weights = weights, n_iterations=self.n_iterations)
        # The first view's membership is recorded as the clustering of this round.
        self.clusterings.append(np.array(partitions[0].membership))
        # Quality divided by the edge count (directed) or twice the edge count (undirected).
        self.modularities.append([part.quality()/(part.graph.ecount() if part.graph.is_directed() else 2*part.graph.ecount()) for part in partitions])
        self.weights.append(weights.copy())
        self.resolutions.append(resolutions.copy())
        self.final_iteration +=1

        if self.verbose:
            print("--------")
            print("Iteration: {} \n Modularities: {} \n Resolutions: {} \n Weights: {}"
                  .format(self.final_iteration, self.modularities[-1], resolutions, weights))

        # The comparison below is disabled, so the latest round always
        # becomes the current "best" inside the loop.
        # if np.sum(np.array(self.weights[-1]) * np.array(self.modularities[-1])) > self.best_modularity:
        self.best_clustering = self.clusterings[-1]
        self.best_modularity = np.sum(np.array(self.weights[-1]) * np.array(self.modularities[-1]))
        self.best_resolutions = self.resolutions[-1]
        self.best_weights = self.weights[-1]
        self.best_iteration = self.final_iteration

        # Re-estimate resolution and weight of every view.
        # NOTE(review): the helper presumably reads the clustering just
        # appended above - confirm before reordering anything here.
        theta_in, theta_out = self._calculate_edge_probabilities(G)
        for i in range(len(G)):
            resolutions[i] = (theta_in[i] - theta_out[i])/ (np.log(theta_in[i]) - np.log(theta_out[i]))
            weights[i] = (np.log(theta_in[i]) - np.log(theta_out[i]))/(np.mean([np.log(theta_in[j]) - np.log(theta_out[j]) for j in range(len(G))]))

        # Converged when both the resolutions and the weights moved by at
        # most resolution_tol relative to the values used in this round.
        if (np.all(np.abs(np.array(self.resolutions[-1])-np.array(resolutions)) <= self.resolution_tol)
                and np.all(np.abs(np.array(self.weights[-1])-np.array(weights)) <= self.resolution_tol)):
            break
    else:
        # Runs only when the loop ended without `break` (no convergence):
        # pick the round with the highest weighted sum of modularities.
        # NOTE(review): `else` is read as belonging to the `for` loop, which
        # matches the "did not converge" message below - confirm.
        # NOTE(review): best_iteration is a 0-based index here but a
        # 1-based count inside the loop.
        best_iteration = np.argmax([np.sum(np.array(self.weights[i]) * np.array(self.modularities[i])) for i in range(len(self.modularities))])
        self.best_clustering = self.clusterings[best_iteration]
        self.best_modularity = np.sum(np.array(self.weights[best_iteration]) * np.array(self.modularities[best_iteration]))
        self.best_resolutions = self.resolutions[best_iteration]
        self.best_weights = self.weights[best_iteration]
        self.best_iteration = best_iteration

        if self.verbose:
            print("MVMC did not converge, best result found: Iteration: {}, Modularity: {}, Resolutions: {}, Weights: {}"
                  .format(self.best_iteration, self.best_modularity, self.best_resolutions, self.best_weights))

    return self.best_clustering