def identify_clusters(vlm, conn, correct_tags=False, tag_correction_list=[], method_name='ModularityVertexPartition', seed=360):
    """
    Label every node of the connectivity graph with a Louvain community.

    The graph is built from the dense form of ``conn`` and partitioned with
    ``louvain.find_partition`` using the partition class named by
    ``method_name`` (any attribute of the ``louvain`` module accepted by
    ``find_partition``). Clusters found manually (e.g. with
    visualize_protein_markers()) can be renumbered or merged through
    ``tag_correction_list``.

    Parameters
    ----------
    vlm : object that receives the ``cluster_ID`` and ``num_clusters`` attributes.
    conn : matrix exposing ``todense()`` and ``shape``; used as adjacency matrix.
    correct_tags : when true, each raw community index ``i`` is replaced by
        ``tag_correction_list[i]``.
    tag_correction_list : lookup table indexed by raw community index
        (only read, never modified).
    method_name : name of the partition class looked up on ``louvain``.
    seed : seed handed to ``louvain.set_rng_seed``.

    Returns
    -------
    list ``[cluster_ID, num_clusters]`` where ``cluster_ID`` is the per-node
    label list and ``num_clusters`` is ``max(cluster_ID) + 1``.
    """
    graph = ig.Graph.Adjacency(conn.todense().tolist())
    partition_type = getattr(louvain, method_name)
    louvain.set_rng_seed(seed)
    communities = louvain.find_partition(graph, partition_type)

    # Scatter each community index onto the positions of its member nodes.
    labels = np.zeros(conn.shape[0])
    for community_index, members in enumerate(communities):
        labels[members] = int(community_index)

    raw_ids = (int(label) for label in labels)
    if correct_tags:
        cluster_ID = [tag_correction_list[raw_id] for raw_id in raw_ids]
    else:
        cluster_ID = list(raw_ids)

    num_clusters = max(cluster_ID) + 1
    vlm.cluster_ID = cluster_ID
    vlm.num_clusters = int(num_clusters)
    return [cluster_ID, num_clusters]
def test_diff_move():
    """Check RBConfigurationVertexPartitionWeightedLayers against the
    single-layer RBConfigurationVertexPartition on a one-layer graph.

    Two properties are asserted:

    1. For 100 random node moves, ``diff_move()`` of the weighted-layers
       partition equals the observed change in ``quality()`` and equals the
       ``diff_move()`` of the single-layer partition, and both partitions
       keep the same quality after the move.
    2. For several resolution parameters and a shared random seed,
       optimising either partition type ends at the same quality.
    """
    graph = ig.Graph.Read_Ncol("multilayer_SBM_interslice_edges.csv",
                               directed=False)
    n_nodes = graph.vcount()
    single_layer = [0] * n_nodes           # every node lives in layer 0
    singletons = list(range(n_nodes))      # one community per node

    rbc = louvain.RBConfigurationVertexPartition(
        graph,
        resolution_parameter=1.0,
        initial_membership=singletons,
    )
    layered = louvain.RBConfigurationVertexPartitionWeightedLayers(
        graph,
        resolution_parameter=1.0,
        layer_vec=single_layer,
        initial_membership=singletons,
    )

    # diff_move() - quality() consistency across 100 random moves
    for _ in range(100):
        node = randint(0, n_nodes - 1)
        target = randint(0, n_nodes - 1)

        quality_before = layered.quality()
        predicted = layered.diff_move(node, target)
        layered.move_node(node, target)
        observed = layered.quality() - quality_before

        reference = rbc.diff_move(node, target)
        rbc.move_node(node, target)

        assert isclose(predicted, observed), \
            "WeightedLayers diff_move() inconsistent with quality()"
        assert isclose(predicted, reference), \
            "WeightedLayers diff_move() inconsistent with single-layer"
        assert isclose(layered.quality(), rbc.quality()), \
            "WeightedLayers quality() inconsistent with single-layer"

    def optimised(partition_cls, rng_seed, gamma, **extra):
        # Seed, build and optimise a fresh partition of ``graph``.
        louvain.set_rng_seed(rng_seed)
        fresh = partition_cls(graph, resolution_parameter=gamma, **extra)
        louvain.Optimiser().optimise_partition(partition=fresh)
        return fresh

    # rng consistency between RBConfigurationVertexPartition and its
    # WeightedLayers variant for various seeds and resolution parameters
    for gamma in np.linspace(0.5, 1.5, 10):
        shared_seed = randint(-1 << 31, (1 << 31) - 1)  # random int32

        layered = optimised(
            louvain.RBConfigurationVertexPartitionWeightedLayers,
            shared_seed, gamma, layer_vec=single_layer)
        rbc = optimised(
            louvain.RBConfigurationVertexPartition, shared_seed, gamma)

        layered_quality = layered.quality(resolution_parameter=gamma)
        rbc_quality = rbc.quality(resolution_parameter=gamma)
        assert isclose(layered_quality, rbc_quality), \
            "Intra-layer optimisation inconsistent with single-layer"
def louvain(i, j, val, dim, partition_method, initial_membership, weights,
            resolution, node_sizes, seed, verbose):
    """Optimise a Louvain partition of a graph given as sparse triplets.

    The triplets ``(i, j, val)`` are assembled into a ``dim``-shaped sparse
    matrix; every non-zero entry becomes one edge of an igraph graph, which
    is then partitioned with the quality function named by
    ``partition_method``.

    Parameters
    ----------
    i, j, val : row indices, column indices and values of the matrix entries.
    dim : shape of the matrix, passed to ``csc_matrix(..., shape=dim)``.
    partition_method : name of the partition class to use, one of
        'ModularityVertexPartition', 'RBConfigurationVertexPartition',
        'RBERVertexPartition', 'CPMVertexPartition',
        'SignificanceVertexPartition', 'SurpriseVertexPartition'.
    initial_membership : forwarded to the partition class.
    weights : forwarded to partition classes that take edge weights.
    resolution : forwarded as ``resolution_parameter`` where supported.
    node_sizes : forwarded to partition classes that take node sizes.
    seed : if not None, passed to ``louvain.set_rng_seed`` before optimising.
    verbose : accepted for interface compatibility; not used.

    Returns
    -------
    The optimised partition object.

    Raises
    ------
    ValueError
        If ``partition_method`` is not one of the supported names.
    """
    # Optional constructor arguments each supported partition class is given
    # (``initial_membership`` is always forwarded).
    forwarded_arguments = {
        'ModularityVertexPartition': ('weights',),
        'RBConfigurationVertexPartition': ('weights', 'resolution_parameter'),
        'RBERVertexPartition': ('weights', 'node_sizes',
                                'resolution_parameter'),
        'CPMVertexPartition': ('weights', 'node_sizes',
                               'resolution_parameter'),
        'SignificanceVertexPartition': ('node_sizes',),
        'SurpriseVertexPartition': ('weights', 'node_sizes'),
    }
    # Fail fast, before importing the heavy dependencies or building a graph.
    if partition_method not in forwarded_arguments:
        raise ValueError('partition_method ' + partition_method +
                         ' is NOT supported.')

    import louvain
    import igraph as ig
    from scipy.sparse import csc_matrix

    data = csc_matrix((val, (i, j)), shape=dim)
    sources, targets = data.nonzero()
    edges = list(zip(sources.tolist(), targets.tolist()))
    # Fix the vertex count explicitly: inferring it from the edge list would
    # drop trailing isolated nodes and break per-vertex inputs of length dim.
    G = ig.Graph(n=max(data.shape), edges=edges)

    available = {
        'weights': weights,
        'node_sizes': node_sizes,
        'resolution_parameter': resolution,
    }
    extra = {name: available[name]
             for name in forwarded_arguments[partition_method]}
    # Instantiate the class that was actually requested (previously every
    # branch built a CPMVertexPartition).
    partition_class = getattr(louvain, partition_method)
    partition = partition_class(
        G, initial_membership=initial_membership, **extra)

    if seed is not None:
        louvain.set_rng_seed(seed)
    louvain.Optimiser().optimise_partition(partition)
    return partition
def louvain_method(user_interaction_graph):
    '''
    Split the graph into communities with the Louvain method (modularity).

    Implementation: https://github.com/vtraag/louvain-igraph
    Reference: Fast unfolding of communities in large networks,
    Vincent D Blondel, Jean-Loup Guillaume, Renaud Lambiotte,
    Renaud Lefebvre, Journal of Statistical Mechanics: Theory and
    Experiment 2008(10), P10008 (12pp)

    The random seed is fixed (43) before partitioning.

    :param user_interaction_graph: igraph Graph whose vertices carry a
        'name' attribute
    :return: list of communities, each a list of vertex names
    '''
    louvain.set_rng_seed(43)
    vertices = user_interaction_graph.vs
    partition = louvain.find_partition(user_interaction_graph,
                                       louvain.ModularityVertexPartition)

    communities = []
    for members in partition:
        names = [vertices[vertex_id]['name'] for vertex_id in members]
        communities.append(names)
    return communities
def get_louvain(mknn, min_cluster_size=10, resolution_parameter=1.0, seed=0):
    """Cluster the graph described by ``mknn`` with the Louvain algorithm.

    An undirected igraph graph with ``mknn.shape[0]`` vertices is built from
    the ``(mknn.row, mknn.col)`` pairs and partitioned with
    ``RBConfigurationVertexPartition``. The resulting membership is passed
    through ``clean_labels`` (with ``min_cluster_size``) and wrapped in
    ``CellLabels``.

    Parameters
    ----------
    mknn : matrix exposing ``row``, ``col`` and ``shape``
        (presumably a scipy COO matrix of a mutual-kNN graph; confirm with caller).
    min_cluster_size : forwarded to ``clean_labels``.
    resolution_parameter : resolution of the RB configuration model.
    seed : seed handed to ``louvain.set_rng_seed``.
    """
    edge_list = list(zip(mknn.row, mknn.col))
    graph = ig.Graph(n=mknn.shape[0], edges=edge_list, directed=False)

    # Louvain clustering over the mKNN graph
    louvain.set_rng_seed(seed)
    partition = louvain.find_partition(
        graph,
        louvain.RBConfigurationVertexPartition,
        resolution_parameter=resolution_parameter,
    )

    cleaned = clean_labels(partition.membership,
                           min_cluster_size=min_cluster_size)
    return CellLabels(cleaned)
def optimal_modularity_community_detection(self, visual=True, name='optimal_modularity'):
    """
    Detect communities of ``self.G`` with the Louvain algorithm by
    maximising (edge-weighted) modularity.

    The membership is stored in the vertex attribute
    'community_optimal_modularity' and a summary is printed.

    Inputs:
        - visual: (Default = True) plot the communities and save the plot
        - name: base name of the exported .png file

    Returns:
        - ``comms`` when ``visual`` is false: an OrderedDict mapping each
          community id to the list of vertex labels it contains, sorted by
          community id
        - ``(comms, graph)`` when ``visual`` is true, ``graph`` being the
          igraph plot object
    """
    louvain.set_rng_seed(123456)
    partition = louvain.find_partition(self.G,
                                       louvain.ModularityVertexPartition,
                                       weights=self.G.es['weight'])
    self.G.vs['community_optimal_modularity'] = partition.membership

    print("The estimated number of communities is",
          len(set(partition.membership)))
    print('\n')
    print("Communities")
    for index, members in enumerate(partition):
        print('Community number', index, '- size:', len(members))

    # Map each channel (vertex label) to the community it belongs to.
    comm_detect = dict(zip(self.G.vs['label'],
                           self.G.vs['community_optimal_modularity']))
    print()
    print('The communities are:')
    print()

    # Invert the mapping: community id -> labels of its members.
    grouped = {}
    for label, community in comm_detect.items():
        grouped.setdefault(community, []).append(label)
    comms = OrderedDict(sorted(grouped.items(), key=lambda pair: pair[0]))
    print(comms.items())

    if not visual:
        return(comms)

    visual_style = {
        "vertex_size": 25,
        "vertex_label": self.G.vs["label"],
        "edge_width": 0.2,
        "layout": self.G.vs["coords"],
    }
    community_ids = self.G.vs['community_optimal_modularity']
    palette = igraph.drawing.colors.ClusterColoringPalette(
        len(set(community_ids)))
    visual_style["vertex_color"] = palette.get_many(
        self.G.vs['community_optimal_modularity'])
    self.G.es['arrow_size'] = [0.1] * len(self.G.es)

    graph = igraph.plot(self.G, bbox=(0, 0, 600, 600), **visual_style)
    graph.save(name + '.png')
    return(comms, graph)
def louvain_clusters(latent, k=10, rands=0, mutual=False):
    """Cluster the rows of ``latent`` by Louvain on their k-NN graph.

    Parameters
    ----------
    latent : array passed to ``kneighbors_graph``; one graph vertex is
        created per row (``latent.shape[0]``).
    k : number of neighbours requested from ``kneighbors_graph``.
    rands : seed handed to ``louvain.set_rng_seed``.
    mutual : when ``True``, keep only neighbour pairs that occur in both
        directions (each unordered pair must appear exactly twice).

    Returns
    -------
    numpy array with the community index of every row.
    """
    knn = kneighbors_graph(latent, k)
    sources, targets = knn.nonzero()

    if mutual == True:  # noqa: E712 - keeps the original comparison semantics
        # Put every pair in (low, high) order so that reciprocal edges
        # collapse onto the same row and can be counted.
        ordered = np.asarray(
            [(min(a, b), max(a, b)) for a, b in zip(sources, targets)])
        pairs, occurrences = np.unique(ordered, return_counts=True, axis=0)
        edges = pairs[occurrences == 2]
    else:
        edges = list(zip(sources, targets))

    graph = ig.Graph()
    graph.add_vertices(latent.shape[0])
    graph.add_edges(edges)

    louvain.set_rng_seed(rands)
    partition = louvain.find_partition(graph,
                                       louvain.ModularityVertexPartition)
    return np.asarray(partition.membership)
def louvain(
    adata: AnnData,
    resolution: Optional[float] = None,
    random_state: _utils.AnyRandom = 0,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    key_added: str = 'louvain',
    adjacency: Optional[spmatrix] = None,
    flavor: Literal['vtraag', 'igraph', 'rapids'] = 'vtraag',
    directed: bool = True,
    use_weights: bool = False,
    partition_type: Optional[Type[MutableVertexPartition]] = None,
    partition_kwargs: Mapping[str, Any] = MappingProxyType({}),
    neighbors_key: Optional[str] = None,
    obsp: Optional[str] = None,
    copy: bool = False,
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Blondel08]_ [Levine15]_ [Traag17]_.

    Cluster cells using the Louvain algorithm [Blondel08]_ in the
    implementation of [Traag17]_. The Louvain algorithm has been proposed
    for single-cell analysis by [Levine15]_.

    This requires having run :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first, or explicitly passing an
    ``adjacency`` matrix.

    Parameters
    ----------
    adata
        The annotated data matrix.
    resolution
        For the ``'vtraag'`` and ``'rapids'`` flavors, a resolution can be
        provided (higher resolution means finding more and smaller
        clusters). When ``None``, no resolution is passed on and the
        backend's own default applies. Has no effect for ``'igraph'``.
        See “Time as a resolution parameter” in [Lambiotte09]_.
    random_state
        Change the initialization of the optimization
        (used by the ``'vtraag'`` flavor).
    restrict_to
        Restrict the clustering to the categories within the key for sample
        annotation, tuple needs to contain ``(obs_key, list_of_categories)``.
    key_added
        Key under which to add the cluster labels. (default: ``'louvain'``;
        when ``restrict_to`` is given and the key is still ``'louvain'``,
        ``'louvain_R'`` is used.)
    adjacency
        Sparse adjacency matrix of the graph, defaults to neighbors
        connectivities.
    flavor
        Choose between the packages for computing the clustering:
        ``'vtraag'`` (the default), ``'igraph'``, ``'rapids'``, or the
        deprecated ``'taynaud'``.
    directed
        Interpret the ``adjacency`` matrix as directed graph?
        Forced to ``False`` for the ``'igraph'`` flavor.
    use_weights
        Use weights from knn graph.
    partition_type
        Type of partition to use.
        Only a valid argument if ``flavor`` is ``'vtraag'``.
    partition_kwargs
        Keyword arguments to pass to partitioning,
        if the ``vtraag`` method is being used.
    neighbors_key
        Use neighbors connectivities as adjacency.
        If not specified, louvain looks .obsp['connectivities'] for
        connectivities (default storage place for pp.neighbors).
        If specified, louvain looks
        .obsp[.uns[neighbors_key]['connectivities_key']] for connectivities.
    obsp
        Use .obsp[obsp] as adjacency. You can't specify both
        `obsp` and `neighbors_key` at the same time.
    copy
        Copy adata or modify it inplace.

    Returns
    -------
    :obj:`None`
        By default (``copy=False``), updates ``adata`` with the following
        fields:

        ``adata.obs[key_added]`` (:class:`pandas.Series`, dtype ``category``)
            Array of dim (number of samples) that stores the subgroup id
            (``'0'``, ``'1'``, ...) for each cell.
        ``adata.uns['louvain']['params']``
            The ``resolution`` and ``random_state`` that were used.

    :class:`~anndata.AnnData`
        When ``copy=True`` is set, a copy of ``adata`` with those fields is
        returned.

    Raises
    ------
    ValueError
        If ``partition_type`` is given for a flavor other than ``'vtraag'``,
        or if ``flavor`` is not a supported value.
    """
    # Work on a private dict: the default is an immutable mapping proxy and
    # the caller's mapping must not be modified.
    partition_kwargs = dict(partition_kwargs)
    start = logg.info('running Louvain clustering')
    if (flavor != 'vtraag') and (partition_type is not None):
        raise ValueError(
            '`partition_type` is only a valid argument '
            'when `flavor` is "vtraag"'
        )
    adata = adata.copy() if copy else adata
    if adjacency is None:
        adjacency = _choose_graph(adata, obsp, neighbors_key)
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    if flavor in {'vtraag', 'igraph'}:
        if flavor == 'igraph' and resolution is not None:
            logg.warning(
                '`resolution` parameter has no effect for flavor "igraph"')
        if directed and flavor == 'igraph':
            directed = False
        if not directed:
            logg.debug(' using the undirected graph')
        g = _utils.get_igraph_from_adjacency(adjacency, directed=directed)
        if use_weights:
            weights = np.array(g.es["weight"]).astype(np.float64)
        else:
            weights = None
        if flavor == 'vtraag':
            import louvain

            if partition_type is None:
                partition_type = louvain.RBConfigurationVertexPartition
            if resolution is not None:
                partition_kwargs["resolution_parameter"] = resolution
            if use_weights:
                partition_kwargs["weights"] = weights
            # louvain < 0.7.0 is seeded globally; newer versions take the
            # seed as an argument of find_partition.
            if version.parse(louvain.__version__) < version.parse("0.7.0"):
                louvain.set_rng_seed(random_state)
            else:
                partition_kwargs["seed"] = random_state
            logg.info(' using the "louvain" package of Traag (2017)')
            part = louvain.find_partition(
                g,
                partition_type,
                **partition_kwargs,
            )
            # adata.uns['louvain_quality'] = part.quality()
        else:
            part = g.community_multilevel(weights=weights)
        groups = np.array(part.membership)
    elif flavor == 'rapids':
        # nvLouvain only works with undirected graphs,
        # and `adjacency` must have a directed edge in both directions
        import cudf
        import cugraph

        offsets = cudf.Series(adjacency.indptr)
        indices = cudf.Series(adjacency.indices)
        if use_weights:
            sources, targets = adjacency.nonzero()
            weights = adjacency[sources, targets]
            if isinstance(weights, np.matrix):
                weights = weights.A1
            weights = cudf.Series(weights)
        else:
            weights = None
        g = cugraph.Graph()

        # Use whichever adjacency-list loader this cugraph object provides.
        if hasattr(g, 'add_adj_list'):
            g.add_adj_list(offsets, indices, weights)
        else:
            g.from_cudf_adjlist(offsets, indices, weights)

        logg.info(' using the "louvain" package of rapids')
        if resolution is not None:
            louvain_parts, _ = cugraph.louvain(g, resolution=resolution)
        else:
            louvain_parts, _ = cugraph.louvain(g)
        groups = (
            louvain_parts.to_pandas()
            .sort_values('vertex')[['partition']]
            .to_numpy()
            .ravel()
        )
    elif flavor == 'taynaud':
        # this is deprecated
        import networkx as nx
        import community

        g = nx.Graph(adjacency)
        partition = community.best_partition(g)
        groups = np.zeros(len(partition), dtype=int)
        for k, v in partition.items():
            groups[k] = v
    else:
        # List every flavor handled above, including "rapids".
        raise ValueError(
            '`flavor` needs to be "vtraag", "igraph", "rapids" or "taynaud".'
        )
    if restrict_to is not None:
        if key_added == 'louvain':
            key_added += '_R'
        groups = rename_groups(
            adata,
            key_added,
            restrict_key,
            restrict_categories,
            restrict_indices,
            groups,
        )
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype('U'),
        categories=natsorted(map(str, np.unique(groups))),
    )
    adata.uns['louvain'] = {}
    adata.uns['louvain']['params'] = dict(
        resolution=resolution,
        random_state=random_state,
    )
    logg.info(
        ' finished',
        time=start,
        deep=(
            f'found {len(np.unique(groups))} clusters and added\n'
            f' {key_added!r}, the cluster labels (adata.obs, categorical)'
        ),
    )
    return adata if copy else None
def louvain(adata, resolution=None, random_state=0, restrict_to=None, key_added=None, adjacency=None, flavor='vtraag', directed=True, copy=False):
    """Cluster cells into subgroups [Blondel08]_ [Levine15]_ [Traag17]_.

    Runs the Louvain algorithm [Blondel08]_ in the implementation of
    [Traag17]_ on the neighborhood graph. Louvain clustering was proposed
    for single-cell analysis by [Levine15]_.

    Unless ``adjacency`` is passed explicitly,
    :func:`~scanpy.api.pp.neighbors` has to be run first.

    Parameters
    ----------
    adata : :class:`~scanpy.api.AnnData`
        The annotated data matrix.
    resolution : `float` or `None`, optional (default: 1)
        For the default flavor ('vtraag'), a resolution can be given (higher
        resolution means finding more and smaller clusters); `None` is
        replaced by 1. See “Time as a resolution parameter” in
        [Lambiotte09]_.
    random_state : `int`, optional (default: 0)
        Change the initialization of the optimization.
    restrict_to : `tuple`, optional (default: None)
        Restrict the clustering to the categories within the key for sample
        annotation, tuple needs to contain (obs key, list of categories).
        Categories have to be strings.
    key_added : `str`, optional (default: 'louvain')
        Key under which to add the cluster labels. With ``restrict_to``,
        the default is the restriction key followed by ``'_R'``.
    adjacency : sparse matrix or `None`, optional (default: `None`)
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']`.
    flavor : {'vtraag', 'igraph'}
        Choose between two packages for computing the clustering. 'vtraag'
        is much more powerful. The deprecated 'taynaud' is accepted too.
    directed : `bool` (default: `True`)
        Build a directed graph; forced to `False` for flavor 'igraph'.
    copy : `bool` (default: `False`)
        Copy adata or modify it inplace.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following fields.

    louvain : `pd.Series` (``adata.obs``, dtype `category`)
        Array of dim (number of samples) that stores the subgroup id ('0',
        '1', ...) for each cell.
    """
    logg.info('running Louvain clustering', r=True)
    adata = adata.copy() if copy else adata

    if adjacency is None:
        if 'neighbors' not in adata.uns:
            raise ValueError(
                'You need to run `pp.neighbors` first to compute a neighborhood graph.'
            )
        adjacency = adata.uns['neighbors']['connectivities']

    restricted = restrict_to is not None
    if restricted:
        restrict_key, restrict_categories = restrict_to
        if not isinstance(restrict_categories[0], str):
            raise ValueError('You need to use strings to label categories, '
                             'e.g. \'1\' instead of 1.')
        known_categories = adata.obs[restrict_key].cat.categories
        for category in restrict_categories:
            if category not in known_categories:
                raise ValueError(
                    '\'{}\' is not a valid category for \'{}\''.format(
                        category, restrict_key))
        restrict_indices = adata.obs[restrict_key].isin(
            restrict_categories).values
        # Keep only the rows and columns of the selected cells.
        adjacency = adjacency[restrict_indices, :][:, restrict_indices]

    if flavor in {'vtraag', 'igraph'}:
        use_igraph = flavor == 'igraph'
        if use_igraph and resolution is not None:
            logg.warn(
                '`resolution` parameter has no effect for flavor "igraph"')
        if directed and use_igraph:
            directed = False
        if not directed:
            logg.m(' using the undirected graph', v=4)
        graph = utils.get_igraph_from_adjacency(adjacency, directed=directed)
        if use_igraph:
            part = graph.community_multilevel()
        else:
            import louvain
            if resolution is None:
                resolution = 1
            try:
                logg.info(' using the "louvain" package of Traag (2017)')
                louvain.set_rng_seed(random_state)
                part = louvain.find_partition(
                    graph, louvain.RBConfigurationVertexPartition,
                    resolution_parameter=resolution)
                # adata.uns['louvain_quality'] = part.quality()
            except AttributeError:
                # Older louvain releases lack the attributes used above;
                # fall back to the string-based interface.
                logg.warn('Did not find package louvain>=0.6, '
                          'the clustering result will therefore not '
                          'be 100% reproducible, '
                          'but still meaningful. '
                          'If you want 100% reproducible results, '
                          'update via "pip install louvain --upgrade".')
                part = louvain.find_partition(graph, method='RBConfiguration',
                                              resolution_parameter=resolution)
        groups = np.array(part.membership)
    elif flavor == 'taynaud':
        # this is deprecated
        import networkx as nx
        import community
        nx_graph = nx.Graph(adjacency)
        node_to_group = community.best_partition(nx_graph)
        groups = np.zeros(len(node_to_group), dtype=int)
        for node, group in node_to_group.items():
            groups[node] = group
    else:
        raise ValueError(
            '`flavor` needs to be "vtraag" or "igraph" or "taynaud".')

    unique_groups = np.unique(groups)
    n_clusters = len(unique_groups)
    if restricted:
        if key_added is None:
            key_added = restrict_key + '_R'
        # Start from the original annotation and overwrite the restricted
        # cells with "<categories>,<subcluster>" labels.
        all_groups = adata.obs[restrict_key].astype('U')
        prefix = '-'.join(restrict_categories) + ','
        all_groups.iloc[restrict_indices] = [
            prefix + label for label in groups.astype('U')]
        adata.obs[key_added] = pd.Categorical(
            values=all_groups,
            categories=natsorted(all_groups.unique()))
    else:
        groups = groups.astype('U')
        if key_added is None:
            key_added = 'louvain'
        adata.obs[key_added] = pd.Categorical(
            values=groups,
            categories=natsorted(unique_groups.astype('U')))

    adata.uns['louvain'] = {
        'params': {
            'resolution': resolution,
            'random_state': random_state,
        },
    }
    logg.info(' finished', time=True,
              end=' ' if settings.verbosity > 2 else '\n')
    logg.hint('found {} clusters and added\n'
              ' \'{}\', the cluster labels (adata.obs, categorical)'.format(
                  n_clusters, key_added))
    return adata if copy else None
def getNclusters2(
    adata,
    G,
    n_clusters,
    seed,
    cluster_func,
    flavor,
    weights,
    range_min=0,
    range_max=3,
    max_steps=20,
):
    """Bisect the resolution until the clustering has ``n_clusters`` clusters.

    At each step the midpoint of the current resolution interval is tried:
    too many clusters shrinks the interval from above, too few from below.

    Parameters
    ----------
    adata : annotated data, clustered in place when ``flavor == "scanpy"``.
    G : graph handed to ``find_partition`` when ``flavor == "base"``.
    n_clusters : target number of clusters.
    seed : random seed for the clustering algorithm.
    cluster_func : ``"louvain"`` or ``"leiden"``.
    flavor : ``"scanpy"`` (``sc.tl.louvain`` / ``sc.tl.leiden`` on ``adata``)
        or ``"base"`` (``louvain`` / ``leidenalg`` directly on ``G``).
    weights : edge weights for the "base" flavor, or ``None`` for an
        unweighted clustering. The "scanpy" flavor uses edge weights exactly
        when ``weights`` is not ``None``.
    range_min, range_max : initial resolution interval.
    max_steps : maximum number of bisection steps (at least 1).

    Returns
    -------
    (clus, info) : the cluster labels of the last clustering and
        ``dict(resolution=..., succeeded=...)``; ``succeeded`` is ``False``
        when ``n_clusters`` was not reached within ``max_steps``.

    Raises
    ------
    ValueError
        For an unknown ``cluster_func`` or ``flavor``, or ``max_steps < 1``.
    """
    # Validate up front: previously an unknown flavor or max_steps < 1 only
    # surfaced as an UnboundLocalError further down.
    if cluster_func not in ("louvain", "leiden"):
        raise ValueError(
            "incorrect cluster_func, choose 'leiden' or 'louvain'")
    if flavor not in ("scanpy", "base"):
        raise ValueError("incorrect flavor, choose 'scanpy' or 'base'")
    if max_steps < 1:
        raise ValueError("max_steps must be at least 1")

    this_min = float(range_min)
    this_max = float(range_max)
    # Mirror the "base" flavor, where weights=None means unweighted
    # (the flag used to be inverted: ``weights is None``).
    use_weights = weights is not None

    for _ in range(max_steps):
        this_resolution = this_min + ((this_max - this_min) / 2)

        if flavor == "scanpy":
            if cluster_func == "louvain":
                sc.tl.louvain(
                    adata,
                    resolution=this_resolution,
                    random_state=seed,
                    use_weights=use_weights,
                )
            else:
                sc.tl.leiden(
                    adata,
                    resolution=this_resolution,
                    random_state=seed,
                    use_weights=use_weights,
                )
            # Both scanpy functions are called with their default key,
            # which equals the algorithm name.
            clus = np.array(adata.obs[cluster_func]).astype(int)
            this_clusters = adata.obs[cluster_func].nunique()
        else:
            if cluster_func == "louvain":
                louvain.set_rng_seed(seed)
                partition = louvain.find_partition(
                    graph=G,
                    partition_type=louvain.RBConfigurationVertexPartition,
                    weights=weights,
                    resolution_parameter=this_resolution,
                )
            else:
                partition = leidenalg.find_partition(
                    graph=G,
                    partition_type=leidenalg.RBConfigurationVertexPartition,
                    weights=weights,
                    resolution_parameter=this_resolution,
                    seed=seed,
                )
            clus = np.array(partition.membership)
            this_clusters = len(np.unique(clus))

        if this_clusters > n_clusters:
            this_max = this_resolution
        elif this_clusters < n_clusters:
            this_min = this_resolution
        else:
            return clus, dict(resolution=this_resolution, succeeded=True)

    print("Cannot find the number of clusters")
    print("Clustering solution from last iteration is used:" +
          str(this_clusters) + " at resolution " + str(this_resolution))
    return clus, dict(resolution=this_resolution, succeeded=False)
def _elapsed_minutes(start):
    """Return the minutes elapsed since ``start``, rounded to 2 decimals."""
    return round((time.time() - start) / 60, 2)


def _neighbors_with_fallback(adata, n_neighbors, metric, use_rep, method):
    """Compute the scanpy neighbor graph, retrying with ``knn=False`` on error."""
    neighbors_kwargs = dict(
        n_neighbors=n_neighbors + 1,
        metric=metric,
        use_rep=use_rep,
        method=method,
    )
    try:
        sc.pp.neighbors(adata, **neighbors_kwargs)
    except Exception:
        # Deliberate best-effort retry; ``Exception`` (not a bare except)
        # so that KeyboardInterrupt / SystemExit still propagate.
        sc.pp.neighbors(adata, knn=False, **neighbors_kwargs)


def _default_resolution_partitions(G, weights, seed):
    """Run louvain and leiden on ``G`` at resolution 1.0; return (labels, info)."""
    louvain.set_rng_seed(seed)
    part_louvain = louvain.find_partition(
        graph=G,
        partition_type=louvain.RBConfigurationVertexPartition,
        weights=weights,
        resolution_parameter=1.0,
    )
    part_leiden = leidenalg.find_partition(
        graph=G,
        partition_type=leidenalg.RBConfigurationVertexPartition,
        weights=weights,
        resolution_parameter=1.0,
        seed=seed,
    )
    labels = {
        "louvain": np.array(part_louvain.membership),
        "leiden": np.array(part_leiden.membership),
    }
    info = {
        algo: dict(n_clusters=len(np.unique(membership)))
        for algo, membership in labels.items()
    }
    return labels, info


def _target_size_partitions(adata, G, n_clusters, seed, flavor, weights):
    """Search louvain then leiden clusterings with ``n_clusters`` clusters."""
    labels, info = {}, {}
    for cluster_func in ("louvain", "leiden"):
        labels[cluster_func], info[cluster_func] = getNclusters2(
            adata,
            G,
            n_clusters=n_clusters,
            seed=seed,
            cluster_func=cluster_func,
            flavor=flavor,
            weights=weights,
        )
    return labels, info


def clustering_analysis(
    adata,
    true_labels,
    do_norm,
    norm_scale,
    do_log,
    do_pca,
    n_neighbors,
    n_clusters,
    metric,
    weighted,  # weighted adjmat for louvain/leiden clustering ?
    seed,
    n_comps=50,
    hubness_methods={
        "mp_normal": ("mp", {
            "method": "normal"
        }),
        "ls": ("ls", None),
        "ls_nicdm": ("ls", {
            "method": "nicdm"
        }),
        "dsl": ("dsl", None),
    },
    retained_cells_idx=None,
):
    """Benchmark louvain/leiden clusterings with and without hubness reduction.

    Pipeline (progress and timings are printed along the way):

    1. Optional normalisation recipe, log-transform and PCA of ``adata``.
    2. Clustering inputs (graph, weights, hubness scores) for the PCA space.
    3. scanpy ("umap"/"gauss" neighbors) and direct ("base") louvain/leiden
       clusterings, once at resolution 1.0 and once searching for
       ``n_clusters`` clusters.
    4. The same direct clusterings for every entry of ``hubness_methods``.
    5. Scoring of every clustering with ``get_scores``.

    Parameters
    ----------
    adata : annotated data; modified in place (``X``, neighbors, ``obs``).
    true_labels : reference labels handed to ``get_scores``.
    do_norm : ``"seurat"``, ``"zheng17"`` or ``"duo"`` to apply that recipe;
        any non-string value skips normalisation.
    norm_scale, do_log : forwarded to the recipe; ``do_log`` alone triggers
        ``sc.pp.log1p`` when no recipe is applied.
    do_pca : compute the PCA; otherwise ``adata.obsm["X_pca"]`` must exist.
    n_neighbors, metric, weighted, seed : clustering-input settings.
    n_clusters : number of clusters to search for.
    n_comps : upper bound on the number of principal components.
    hubness_methods : mapping ``name -> (hubness, hubness_params)``;
        only read, never modified.
    retained_cells_idx : indices handed to ``get_scores``;
        defaults to ``range(len(adata.X))``.

    Returns
    -------
    dict with the key ``"params"`` (the call settings) and ``"X_pca"``
    (hubness scores, cluster labels, cluster info and cluster scores).

    Raises
    ------
    ValueError
        If ``do_norm`` is a string other than the supported recipes.
    """
    results_dict = {}
    results_dict["params"] = dict(
        do_norm=do_norm,
        norm_scale=norm_scale,
        do_log=do_log,
        do_pca=do_pca,
        n_neighbors=n_neighbors,
        n_clusters=n_clusters,
        metric=metric,
        weighted=weighted,
        seed=seed,
        n_comps=n_comps,
    )
    start = time.time()

    ### preprocess, prepare clustering input ###
    if retained_cells_idx is None:
        retained_cells_idx = range(len(adata.X))
    use_recipe = isinstance(do_norm, str)
    if use_recipe:
        adata.X = scipy.sparse.csr_matrix(adata.X)
        if do_norm == "seurat":
            recipe_seurat(adata, do_log, norm_scale)
            print(f"\t\tseurat norm retained {adata.X.shape[1]} genes")
        elif do_norm == "zheng17":
            recipe_zheng17(adata, do_log, norm_scale, n_top_genes=5000)
            print(f"\t\tzheng norm retained {adata.X.shape[1]} genes")
        elif do_norm == "duo":
            recipe_duo(adata, do_log, renorm=norm_scale)
            print(f"\t\tduo norm retained {adata.X.shape[1]} genes")
        else:
            raise ValueError("do_norm not in 'duo', 'seurat', 'zheng17'")
        if scipy.sparse.issparse(adata.X):
            adata.X = adata.X.toarray()
    if do_log and not use_recipe:
        print("\t\tlog_transformed data")
        sc.pp.log1p(adata)
    if do_pca:
        sc.pp.pca(adata,
                  n_comps=min(adata.X.shape[1] - 1,
                              min(len(adata.X) - 1, n_comps)))
    # With do_pca=False the PCA is expected to be already computed.
    use_rep = "X_pca"
    X = adata.obsm["X_pca"]
    res_key = results_dict["X_pca"] = {}
    print("\t\t\tPreprocessing done:", _elapsed_minutes(start), "mn")
    start = time.time()

    adjmat, affinity_matrix, G, weights, scores = generate_clustering_inputs2(
        X,
        metric=metric,
        n_neighbors=n_neighbors,
        weighted=weighted,
        seed=seed,
        hubness=None,
        hubness_params=None,
    )
    print("\t\t\tInputs generated:", _elapsed_minutes(start), "mn")
    start = time.time()
    res_key["hubness_df"] = pd.DataFrame.from_dict(data=scores,
                                                   orient="index",
                                                   columns=["base"])

    ### base and scanpy clustering ###
    # main dictionaries
    res_key["clus"] = {}
    res_key["clus_info"] = {}
    res_key["clus_scores"] = {}
    # sub dictionaries
    clus_methods_keys = [
        "scanpy_default_umap",
        "scanpy_default_gauss",
        "base_default",
        "scanpy_umap",
        "scanpy_gauss",
        "base",
    ]
    for k in clus_methods_keys:
        res_key["clus"][k] = {}
        res_key["clus_info"][k] = {}

    # cluster with default params
    # scanpy
    for method in ["umap", "gauss"]:
        _neighbors_with_fallback(adata, n_neighbors, metric, use_rep, method)
        key = "scanpy_default_" + method

        sc.tl.louvain(adata,
                      resolution=1.0,
                      random_state=seed,
                      use_weights=weighted)
        louvain_labels = np.array(adata.obs["louvain"]).astype(int)
        res_key["clus"][key]["louvain"] = louvain_labels
        res_key["clus_info"][key]["louvain"] = dict(
            n_clusters=len(np.unique(louvain_labels)))

        sc.tl.leiden(adata,
                     resolution=1.0,
                     random_state=seed,
                     use_weights=weighted)
        leiden_labels = np.array(adata.obs["leiden"]).astype(int)
        res_key["clus"][key]["leiden"] = leiden_labels
        res_key["clus_info"][key]["leiden"] = dict(
            n_clusters=len(np.unique(leiden_labels)))
    print("\t\t\tScanpy louvain/leiden clus:", _elapsed_minutes(start), "mn")
    start = time.time()

    # base
    labels, info = _default_resolution_partitions(G, weights, seed)
    res_key["clus"]["base_default"].update(labels)
    res_key["clus_info"]["base_default"].update(info)
    print("\t\t\tLouvain/leiden clus:", _elapsed_minutes(start), "mn")
    start = time.time()

    # cluster with ground truth number of clusters
    # scanpy
    for method in ["umap", "gauss"]:
        _neighbors_with_fallback(adata, n_neighbors, metric, use_rep, method)
        labels, info = _target_size_partitions(
            adata, G, n_clusters, seed, "scanpy", weights)
        res_key["clus"]["scanpy_" + method].update(labels)
        res_key["clus_info"]["scanpy_" + method].update(info)
    print(
        "\t\t\tScanpy louvain/leiden clus, searching ground truth:",
        _elapsed_minutes(start),
        "mn",
    )
    start = time.time()

    # base
    labels, info = _target_size_partitions(
        adata, G, n_clusters, seed, "base", weights)
    res_key["clus"]["base"].update(labels)
    res_key["clus_info"]["base"].update(info)
    print(
        "\t\t\tBase louvain/leiden clus, searching ground truth:",
        _elapsed_minutes(start),
        "mn",
    )
    start = time.time()

    # store scores, info
    for k in clus_methods_keys:
        res_key["clus_scores"][k] = get_scores(true_labels,
                                               res_key["clus"][k],
                                               retained_cells_idx, X, metric,
                                               seed)
        res_key["clus_info"][k] = pd.DataFrame(res_key["clus_info"][k])
    print("\t\t\tScoring:", _elapsed_minutes(start), "mn")
    start = time.time()
    # Release the base inputs before building the hubness-reduced ones.
    del adjmat, affinity_matrix, G, weights, scores

    ### hubness-aware clustering ###
    for method_name, (hubness, hubness_params) in hubness_methods.items():
        default_key = method_name + "_default"
        (
            hub_adjmat,
            hub_affinity_matrix,
            hub_G,
            hub_weights,
            hub_scores,
        ) = generate_clustering_inputs(
            X,
            metric=metric,
            n_neighbors=n_neighbors,
            weighted=weighted,
            seed=seed,
            hubness=hubness,
            hubness_params=hubness_params,
        )
        # store hubness information
        res_key["hubness_df"] = pd.concat(
            (
                res_key["hubness_df"],
                pd.DataFrame.from_dict(data=hub_scores,
                                       orient="index",
                                       columns=[method_name]),
            ),
            axis=1,
        )
        # cluster with default params
        (
            res_key["clus"][default_key],
            res_key["clus_info"][default_key],
        ) = _default_resolution_partitions(hub_G, hub_weights, seed)
        # cluster with ground truth number of clusters
        (
            res_key["clus"][method_name],
            res_key["clus_info"][method_name],
        ) = _target_size_partitions(
            adata, hub_G, n_clusters, seed, "base", hub_weights)
        # store clustering scores, info
        res_key["clus_scores"][default_key] = get_scores(
            true_labels,
            res_key["clus"][default_key],
            retained_cells_idx,
            X,
            metric,
            seed,
        )
        res_key["clus_scores"][method_name] = get_scores(
            true_labels,
            res_key["clus"][method_name],
            retained_cells_idx,
            X,
            metric,
            seed,
        )
        res_key["clus_info"][default_key] = pd.DataFrame(
            res_key["clus_info"][default_key])
        res_key["clus_info"][method_name] = pd.DataFrame(
            res_key["clus_info"][method_name])
    print(
        "\t\t\tHubness methods full pipeline:",
        _elapsed_minutes(start),
        "mn",
    )
    return results_dict
def _read_edge_list(path):
    """Parse a whitespace-separated edge-list file.

    Each row is ``source target [weight]``; source and target are converted
    to ``int``. The first row decides whether the whole file is treated as
    weighted (exactly three columns) or unweighted.

    :param path: path of the edge-list file
    :return: ``(edges, weighted)`` where ``edges`` is a list of tuples, or
        ``None`` as soon as a negative weight is encountered
    """
    with open(path, 'r') as f:
        lines = f.read().splitlines()
    weighted = len(lines[0].split()) == 3
    edges = []
    for line in lines:
        elts = line.split()
        for j in range(2):
            elts[j] = int(elts[j])
        if weighted:
            elts[2] = float(elts[2])
            if elts[2] < 0:
                return None, weighted
        edges.append(tuple(elts))
    return edges, weighted


def run_louvain(graph,
                config_model='Default',
                overlap=False,
                directed=False,
                deep=False,
                interslice_weight=0.1,
                resolution_parameter=0.1,
                seed=None):
    """
    Run Louvain community detection on one or several edge-list files and
    write the resulting cluster hierarchy to standard output.

    Output records have the form ``parent,child,type;`` where ``type`` is
    ``c-m`` (cluster -> member node) or ``c-c`` (cluster -> sub-cluster).

    :param graph: path of an edge-list file, or a list of such paths
        (a list triggers multiplex optimisation)
    :param config_model: 'RB', 'RBER', 'CPM', 'Surprise', 'Significance';
        anything else falls back to plain modularity
    :param overlap: bool, whether to enable overlapping community detection
        (a single input is then replicated into four layers)
    :param directed: bool, whether the edge list is directed
    :param deep: bool, whether to output the full hierarchy of partitions
    :param interslice_weight: coupling weight between layers (multiplex only)
    :param resolution_parameter: resolution passed to every partition type
        except plain modularity
    :param seed: optional seed for the louvain random number generator
    :return: 0 on success, 1 when an edge has a negative weight or no
        cluster was found
    """
    if seed is not None:
        louvain.set_rng_seed(seed)

    def louvain_hierarchy_output(partition):
        # Record the node partition after every aggregation level that
        # still moves at least one node.
        optimiser = louvain.Optimiser()
        partition_agg = partition.aggregate_partition()
        partition_layers = []
        while optimiser.move_nodes(partition_agg) > 0:
            partition.from_coarse_partition(partition_agg)
            partition_agg = partition_agg.aggregate_partition()
            partition_layers.append(list(partition))
        return partition_layers

    def louvain_multiplex(graphs, partition_type, interslice_weight,
                          resolution_parameter):
        layers, interslice_layer, G_full = louvain.time_slices_to_layers(
            graphs, vertex_id_attr='name', interslice_weight=interslice_weight)
        # ModularityVertexPartition is the one type built without a
        # resolution parameter.
        if partition_type == louvain.ModularityVertexPartition:
            partitions = [partition_type(H) for H in layers]
            interslice_partition = partition_type(interslice_layer,
                                                  weights='weight')
        else:
            partitions = [
                partition_type(H, resolution_parameter=resolution_parameter)
                for H in layers
            ]
            interslice_partition = partition_type(
                interslice_layer,
                resolution_parameter=resolution_parameter,
                weights='weight')
        optimiser = louvain.Optimiser()
        optimiser.optimise_partition_multiplex(partitions +
                                               [interslice_partition])
        quality = sum(p.quality()
                      for p in partitions + [interslice_partition])
        return partitions[0], quality

    def partition_to_clust(graphs, partition, min_size_cut=2):
        # Translate vertex indices into node names, drop clusters smaller
        # than ``min_size_cut`` and order the rest by decreasing size.
        clusts = []
        node_names = []
        if not isinstance(graphs, list):
            graphs = [graphs]
        for g in graphs:
            node_names.extend(g.vs['name'])
        for i in range(len(partition)):
            clust = list(set(node_names[idx] for idx in partition[i]))
            if len(clust) < min_size_cut:
                continue
            clust.sort()
            clusts.append(clust)
        return sorted(clusts, key=len, reverse=True)

    multi = isinstance(graph, list)
    # ``overlap``/``deep`` keep their explicit comparisons with True/False
    # so that non-bool caller values behave exactly as before.
    if overlap == True and not multi:
        multi = True
        graph = [graph for _ in range(4)]

    if multi and deep == True:
        sys.stderr.write(
            'louvain does not support hierarchical clustering with overlapped communities'
        )
        sys.exit()

    if config_model == 'RB':
        partition_type = louvain.RBConfigurationVertexPartition
    elif config_model == 'RBER':
        partition_type = louvain.RBERConfigurationVertexPartition
    elif config_model == 'CPM':
        partition_type = louvain.CPMVertexPartition
    elif config_model == 'Surprise':
        partition_type = louvain.SurpriseVertexPartition
    elif config_model == "Significance":
        partition_type = louvain.SignificanceVertexPartition
    else:
        sys.stderr.write("Not specifying the configuration model; "
                         "perform simple Louvain.")
        partition_type = louvain.ModularityVertexPartition

    weighted = False
    if multi:
        wL = []
        G = []
        for path in graph:
            edges, weighted = _read_edge_list(path)
            if edges is None:
                sys.stderr.write("negative edge weight not allowed")
                return 1
            G.append(
                igraph.Graph.TupleList(edges,
                                       directed=directed,
                                       weights=weighted))
            wL.append(weighted)
        if True in wL and False in wL:
            raise Exception('all graphs should follow the same format')
        if partition_type == louvain.CPMVertexPartition and directed is True:
            raise Exception('graph for CPMVertexPartition must be undirected')
        if partition_type == louvain.SignificanceVertexPartition and weighted is True:
            raise Exception('SignificanceVertexPartition only support '
                            'unweighted graphs')
        partition, _ = louvain_multiplex(G, partition_type,
                                         interslice_weight,
                                         resolution_parameter)
    else:
        edges, weighted = _read_edge_list(graph)
        if edges is None:
            sys.stderr.write("negative edge weight not allowed")
            return 1
        G = igraph.Graph.TupleList(edges,
                                   directed=directed,
                                   weights=weighted)
        weights = G.es['weight'] if weighted else None
        if partition_type == louvain.ModularityVertexPartition:
            partition = partition_type(G, weights=weights)
        else:
            partition = partition_type(
                G, weights=weights, resolution_parameter=resolution_parameter)
        if deep == False:
            optimiser = louvain.Optimiser()
            optimiser.optimise_partition(partition)

    if deep == False:
        clusts = partition_to_clust(G, partition)
        if len(clusts) == 0:
            sys.stderr.write(
                "No cluster; Resolution parameter may be too extreme")
            return 1

        # Cluster ids are allocated above the largest node id.
        maxNode = 0
        for clust in clusts:
            maxNode = max(maxNode, max(clust))

        root_id = maxNode + len(partition) + 1
        for i, clust in enumerate(clusts):
            sys.stdout.write(
                str(root_id) + ',' + str(maxNode + i + 1) + ',' + 'c-c' + ';')
            for n in clust:
                sys.stdout.write(
                    str(maxNode + i + 1) + ',' + str(n) + ',' + 'c-m' + ';')
    else:
        partitions = louvain_hierarchy_output(partition)
        clusts_layers = [partition_to_clust(G, p) for p in partitions]
        # An empty hierarchy (no node moved at all) used to crash with an
        # IndexError on ``clusts_layers[0]``; report it like "no cluster".
        if not clusts_layers or len(clusts_layers[0]) == 0:
            sys.stderr.write(
                "No cluster; Resolution parameter may be too extreme")
            return 1

        maxNode = 0
        for clust in clusts_layers[0]:
            maxNode = max(maxNode, max(clust))

        # Finest level: cluster -> member records.
        for i in range(len(clusts_layers[0])):
            for n in clusts_layers[0][i]:
                sys.stdout.write(
                    str(maxNode + i + 1) + ',' + str(n) + ',' + 'c-m' + ';')
        maxNode = maxNode + len(clusts_layers[0])

        # Each coarser level: link every cluster of the previous level to
        # the first cluster of this level that contains all its members.
        for i in range(1, len(clusts_layers)):
            for j in range(len(clusts_layers[i - 1])):
                for k in range(len(clusts_layers[i])):
                    if all(x in clusts_layers[i][k]
                           for x in clusts_layers[i - 1][j]):
                        sys.stdout.write(
                            str(maxNode + k + 1) + ',' +
                            str(maxNode - len(clusts_layers[i - 1]) + j + 1) +
                            ',' + 'c-c' + ';')
                        break
            maxNode = maxNode + len(clusts_layers[i])

        # Attach the coarsest clusters to a single root.
        for i in range(len(clusts_layers[-1])):
            sys.stdout.write(
                str(maxNode + 1) + ',' +
                str(maxNode - len(clusts_layers[-1]) + i + 1) + ',' + 'c-c' +
                ';')

    sys.stdout.flush()
    return 0
def louvain(
    adata: AnnData,
    resolution: Optional[float] = None,
    random_state: Optional[Union[int, RandomState]] = 0,
    log_fname: str = '',
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    key_added: Optional[str] = 'louvain',
    adjacency: Optional[spmatrix] = None,
    flavor: str = 'vtraag',
    directed: bool = True,
    use_weights: bool = False,
    partition_type: Optional[Type[MutableVertexPartition]] = None,
    partition_kwargs: Optional[Mapping[str, Any]] = None,
    copy: bool = False,
) -> Optional[AnnData]:
    """Cluster cells into subgroups [Blondel08]_ [Levine15]_ [Traag17]_.

    Cluster cells using the Louvain algorithm [Blondel08]_ in the
    implementation of [Traag17]_. The Louvain algorithm has been proposed
    for single-cell analysis by [Levine15]_.

    This requires having ran :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first, or explicitly passing a
    ``adjacency`` matrix.

    Parameters
    ----------
    adata
        The annotated data matrix.
    resolution
        For the default flavor (``'vtraag'``), you can provide a resolution
        (higher resolution means finding more and smaller clusters), which
        defaults to 1.0. See “Time as a resolution parameter” in
        [Lambiotte09]_.
    random_state
        Change the initialization of the optimization.
    log_fname
        Forwarded unchanged as ``log_fname`` to ``louvain.find_partition``
        (``'vtraag'`` flavor only).
    restrict_to
        Restrict the clustering to the categories within the key for sample
        annotation, tuple needs to contain ``(obs_key, list_of_categories)``.
    key_added
        Key under which to add the cluster labels. (default: ``'louvain'``)
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        ``adata.uns['neighbors']['connectivities']``.
    flavor : {``'vtraag'``, ``'igraph'``}
        Choose between to packages for computing the clustering.
        ``'vtraag'`` is much more powerful, and the default.
    directed
        Interpret the ``adjacency`` matrix as directed graph?
    use_weights
        Use weights from knn graph.
    partition_type
        Type of partition to use.
        Only a valid argument if ``flavor`` is ``'vtraag'``.
    partition_kwargs
        Key word arguments to pass to partitioning,
        if ``vtraag`` method is being used. The mapping is copied, never
        modified.
    copy
        Copy adata or modify it inplace.

    Returns
    -------
    :obj:`None`
        By default (``copy=False``), updates ``adata`` with the following
        fields:

        ``adata.obs['louvain']`` (:class:`pandas.Series`, dtype ``category``)
            Array of dim (number of samples) that stores the subgroup id
            (``'0'``, ``'1'``, ...) for each cell.

    :class:`~anndata.AnnData`
        When ``copy=True`` is set, a copy of ``adata`` with those fields is
        returned.
    """
    start = logg.info('running Louvain clustering')
    if (flavor != 'vtraag') and (partition_type is not None):
        raise ValueError(
            '`partition_type` is only a valid argument when `flavour` is "vtraag"'
        )
    adata = adata.copy() if copy else adata
    if adjacency is None and 'neighbors' not in adata.uns:
        raise ValueError(
            'You need to run `pp.neighbors` first to compute a neighborhood graph.'
        )
    if adjacency is None:
        adjacency = adata.uns['neighbors']['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata, restrict_key, restrict_categories, adjacency)
    if flavor in {'vtraag', 'igraph'}:
        if flavor == 'igraph' and resolution is not None:
            logg.warning(
                '`resolution` parameter has no effect for flavor "igraph"')
        if directed and flavor == 'igraph':
            directed = False
        if not directed:
            logg.debug(' using the undirected graph')
        g = utils.get_igraph_from_adjacency(adjacency, directed=directed)
        if use_weights:
            weights = np.array(g.es["weight"]).astype(np.float64)
        else:
            weights = None
        if flavor == 'vtraag':
            import louvain
            # Work on a private copy: the caller's mapping may be immutable
            # and must not pick up the keys added below.
            partition_kwargs = (
                {} if partition_kwargs is None else dict(partition_kwargs))
            if partition_type is None:
                partition_type = louvain.RBConfigurationVertexPartition
            if resolution is not None:
                partition_kwargs["resolution_parameter"] = resolution
            if use_weights:
                partition_kwargs["weights"] = weights
            logg.info(' using the "louvain" package of Traag (2017)')
            louvain.set_rng_seed(random_state)
            part = louvain.find_partition(
                g,
                partition_type,
                log_fname=log_fname,
                **partition_kwargs,
            )
        else:
            part = g.community_multilevel(weights=weights)
        groups = np.array(part.membership)
    elif flavor == 'taynaud':
        # this is deprecated
        import networkx as nx
        import community
        g = nx.Graph(adjacency)
        partition = community.best_partition(g)
        groups = np.zeros(len(partition), dtype=int)
        for k, v in partition.items():
            groups[k] = v
    else:
        raise ValueError(
            '`flavor` needs to be "vtraag" or "igraph" or "taynaud".')
    if restrict_to is not None:
        # Only the default key gets the '_R' suffix; a custom key is kept.
        if key_added == 'louvain':
            key_added += '_R'
        groups = rename_groups(adata, key_added, restrict_key,
                               restrict_categories, restrict_indices, groups)
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype('U'),
        categories=natsorted(np.unique(groups).astype('U')),
    )
    adata.uns['louvain'] = {}
    adata.uns['louvain']['params'] = {
        'resolution': resolution,
        'random_state': random_state
    }
    logg.info(
        ' finished',
        time=start,
    )
    return adata if copy else None
def temporal_louvain(net,
                     iter_n=1,
                     resolution_parameter=1,
                     interslice_weight=1,
                     quality_function='NewmanGirvan2004',
                     seed=100):
    """
    Temporal louvain clustering run for iter_n times.

    Parameters
    ----------
    net : array, dict
        network representation (contact sequences or graphlet)
    iter_n : int
        number of repeated louvain clusterings
    resolution_parameter : int
        Spatial resolution parameter. Default=1. Only used by the quality
        functions ReichardtBornholdt2006 and TraagVanDoorenNesterov2011.
    interslice_weight : int
        The weight that connects the different graphlets/snapshots to each
        other. Default=1. If 0, every snapshot is clustered independently.
    quality_function : str
        What type of louvain clustering is done. Options: NewmanGirvan2004,
        TraagVanDoorenNesterov2011, ReichardtBornholdt2006,
        TraagKringsVanDooren2013, TraagAldecoaDelvenne2015
    seed : int
        Seed for reproducibility

    Returns
    -------
    communities : array
        Louvain clustering. Dimensions: [iter_n], communities, time
        (singleton dimensions are squeezed out). When ``net`` is a dict, the
        contact representation is returned with the clustering stored under
        ``'communities'``.

    Raises
    ------
    ValueError
        If ``quality_function`` is not one of the supported names.

    Quality function sources
    --------------------------
    NewmanGirvan2004 : Newman, M. E. J., & Girvan, M. (2004). Finding and
        evaluating community structure in networks. Physical Review E,
        69(2), 026113. 10.1103/PhysRevE.69.026113
        `Read more <http://louvain-igraph.readthedocs.io/en/latest/reference.html#modularityvertexpartition>`_
    ReichardtBornholdt2006 : Reichardt, J., & Bornholdt, S. (2006).
        Statistical mechanics of community detection. Physical Review E,
        74(1), 016110. 10.1103/PhysRevE.74.016110
        `Read more <http://louvain-igraph.readthedocs.io/en/latest/reference.html#rbconfigurationvertexpartition>`_
    TraagVanDoorenNesterov2011 : Traag, V. A., Van Dooren, P., & Nesterov,
        Y. (2011). Narrow scope for resolution-limit-free community
        detection. Physical Review E, 84(1), 016114.
        10.1103/PhysRevE.84.016114
        `Read more <http://louvain-igraph.readthedocs.io/en/latest/reference.html#cpmvertexpartition>`_
    TraagKringsVanDooren2013 : Traag, V. A., Krings, G., & Van Dooren, P.
        (2013). Significant scales in community structure. Scientific
        Reports, 3, 2930. 10.1038/srep02930
        `Read more <http://louvain-igraph.readthedocs.io/en/latest/reference.html#significancevertexpartition>`_
    TraagAldecoaDelvenne2015 : Traag, V. A., Aldecoa, R., & Delvenne, J.-C.
        (2015). Detecting communities using asymptotical surprise. Physical
        Review E, 92(2), 022816. 10.1103/PhysRevE.92.022816
        `Read more <http://louvain-igraph.readthedocs.io/en/latest/reference.html#surprisevertexpartition>`_

    Dependencies
    ------------
    These functions make use of iGraph (http://igraph.org/python/) and
    louvain-igraph (http://louvain-igraph.readthedocs.io/en/latest/)

    Note
    ----
    At the moment input should generally only be positive edges.
    """
    # quality function name -> (louvain partition class name, takes resolution)
    partition_specs = {
        'TraagVanDoorenNesterov2011': ('CPMVertexPartition', True),
        'ReichardtBornholdt2006': ('RBConfigurationVertexPartition', True),
        'NewmanGirvan2004': ('ModularityVertexPartition', False),
        'TraagKringsVanDooren2013': ('SignificanceVertexPartition', False),
        'TraagAldecoaDelvenne2015': ('SurpriseVertexPartition', False),
    }
    # Fail early with a clear message; an unknown name used to surface much
    # later as an unbound-variable error.
    if quality_function not in partition_specs:
        raise ValueError(
            'Unknown quality_function {!r}; expected one of: {}'.format(
                quality_function, ', '.join(sorted(partition_specs))))

    dict_input = isinstance(net, dict)
    if teneto.utils.checkInput(net, conMat=1) != 'M':
        net, netinfo = teneto.utils.process_input(net, ['C', 'G', 'TO'])

    partition_name, uses_resolution = partition_specs[quality_function]
    louvain_alg = getattr(louvain, partition_name)
    if uses_resolution:
        louvain_kwags = {'resolution_parameter': resolution_parameter}
    else:
        louvain_kwags = {}

    # One weighted igraph graph per snapshot.
    g_to_ig = []
    if len(net.shape) == 3:
        for i in range(net.shape[-1]):
            g_to_ig.append(ig.Graph.Weighted_Adjacency(net[:, :, i].tolist()))
        # Tag every vertex with its node index in each snapshot.
        for n in range(net.shape[0]):
            for t in range(net.shape[-1]):
                g_to_ig[t].vs[n]['id'] = n
    elif len(net.shape) == 2:
        g_to_ig.append(ig.Graph.Weighted_Adjacency(net.tolist()))

    membership = []
    louvain.set_rng_seed(seed)
    if interslice_weight != 0:
        # Coupled snapshots: one temporal optimisation per repetition.
        for n in range(0, iter_n):
            mem, improvement = louvain.find_partition_temporal(
                g_to_ig,
                louvain_alg,
                interslice_weight=interslice_weight,
                **louvain_kwags)
            membership.append(mem)
        com_membership = np.array(membership).transpose([0, 2, 1])
    else:
        # Uncoupled snapshots: cluster each one on its own.
        com_membership = []
        for n in range(0, iter_n):
            membership = []
            for snapshot in g_to_ig:
                mem = louvain.find_partition(snapshot, louvain_alg,
                                             **louvain_kwags)
                membership.append(mem.membership)
            com_membership.append(membership)
        com_membership = np.array(com_membership).transpose([0, 2, 1])

    if dict_input:
        C = teneto.utils.graphlet2contact(net, netinfo)
        C['communities'] = np.squeeze(com_membership)
        return C
    else:
        return np.squeeze(com_membership)
def louvain(adata,
            n_neighbors=None,
            resolution=None,
            n_pcs=50,
            random_state=0,
            restrict_to=None,
            key_added=None,
            flavor='vtraag',
            directed=True,
            recompute_pca=False,
            recompute_distances=False,
            recompute_graph=False,
            n_dcs=None,
            n_jobs=None,
            copy=False):
    """Cluster cells into subgroups [Blondel08]_ [Levine15]_ [Traag17]_.

    Cluster cells using the Louvain algorithm [Blondel08]_ in the
    implementation of [Traag17]_. The Louvain algorithm has been proposed
    for single-cell analysis by [Levine15]_.

    Parameters
    ----------
    adata : :class:`~scanpy.api.AnnData`
        The annotated data matrix.
    n_neighbors : `int`, optional (default: 30)
        Number of neighbors to use for construction of knn graph.
    resolution : `float` or `None`, optional (default: 1)
        For the default flavor ('vtraag'), you can provide a resolution
        (higher resolution means finding more and smaller clusters), which
        defaults to 1.0.
    n_pcs : int, optional (default: 50)
        Number of PCs to use for computation of data point graph.
    random_state : int, optional (default: 0)
        Change the initialization of the optimization.
    key_added : str, optional (default: `None`)
        Key under which to add the cluster labels. Defaults to
        ``'louvain_groups'``, or to ``restrict_key + '_R'`` when
        `restrict_to` is given.
    restrict_to : tuple, optional (default: None)
        Restrict the clustering to the categories within the key for sample
        annotation, tuple needs to contain (smp key, list of categories).
    flavor : {'vtraag', 'igraph'}
        Choose between to packages for computing the clustering. 'vtraag'
        is much more powerful.
    copy : `bool` (default: False)
        Copy adata or modify it inplace.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following
    fields.

    louvain_groups : `pd.Series` (``adata.smp``, dtype `category`)
        Array of dim (number of samples) that stores the subgroup id ('0',
        '1', ...) for each cell. Stored under `key_added` when that is given.
    """
    logg.info('running Louvain clustering', r=True)
    adata = adata.copy() if copy else adata
    add_or_update_graph_in_adata(adata,
                                 n_neighbors=n_neighbors,
                                 n_pcs=n_pcs,
                                 n_dcs=n_dcs,
                                 recompute_pca=recompute_pca,
                                 recompute_distances=recompute_distances,
                                 recompute_graph=recompute_graph,
                                 n_jobs=n_jobs)
    adjacency = adata.uns['data_graph_norm_weights']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        if not isinstance(restrict_categories[0], str):
            raise ValueError('You need to use strings to label categories, '
                             'e.g. \'1\' instead of 1.')
        restrict_indices = adata.smp[restrict_key].isin(
            restrict_categories).values
        # Keep only rows/columns of the selected cells.
        adjacency = adjacency[restrict_indices, :]
        adjacency = adjacency[:, restrict_indices]
    if flavor in {'vtraag', 'igraph'}:
        if flavor == 'igraph' and resolution is not None:
            logg.warn(
                '`resolution` parameter has no effect for flavor "igraph"')
        if directed and flavor == 'igraph':
            directed = False
        if not directed:
            logg.m(' using the undirected graph', v=4)
        g = utils.get_igraph_from_adjacency(adjacency, directed=directed)
        if flavor == 'vtraag':
            import louvain
            if resolution is None:
                resolution = 1
            try:
                logg.info(' using the "louvain" package of Traag (2017)')
                louvain.set_rng_seed(random_state)
                part = louvain.find_partition(
                    g,
                    louvain.RBConfigurationVertexPartition,
                    resolution_parameter=resolution)
            except AttributeError:
                # Older louvain releases lack set_rng_seed and the partition
                # classes; fall back to their string-based API.
                logg.warn('Did not find package louvain>=0.6, '
                          'the clustering result will therefore not '
                          'be 100% reproducible, '
                          'but still meaningful. '
                          'If you want 100% reproducible results, '
                          'update via "pip install louvain --upgrade".')
                part = louvain.find_partition(g,
                                              method='RBConfiguration',
                                              resolution_parameter=resolution)
        elif flavor == 'igraph':
            part = g.community_multilevel()
        groups = np.array(part.membership)
    elif flavor == 'taynaud':
        # this is deprecated
        import networkx as nx
        import community
        g = nx.Graph(adata.uns['data_graph_distance_local'])
        partition = community.best_partition(g)
        groups = np.zeros(len(partition), dtype=int)
        for k, v in partition.items():
            groups[k] = v
    else:
        raise ValueError(
            '`flavor` needs to be "vtraag" or "igraph" or "taynaud".')
    unique_groups = np.unique(groups)
    n_clusters = len(unique_groups)
    if restrict_to is None:
        # Resolve the key first so the labels land where the log message
        # below says they do (they used to always go to 'louvain_groups').
        key_added = 'louvain_groups' if key_added is None else key_added
        groups = groups.astype('U')
        adata.smp[key_added] = pd.Categorical(
            values=groups, categories=natsorted(unique_groups.astype('U')))
    else:
        key_added = restrict_key + '_R' if key_added is None else key_added
        # Sub-cluster ids start at 1; '0' marks cells outside the restriction.
        groups += 1
        adata.smp[key_added] = adata.smp[restrict_key].astype('U')
        adata.smp[key_added] += ','
        adata.smp[key_added].iloc[restrict_indices] += groups.astype('U')
        adata.smp[key_added].iloc[~restrict_indices] += '0'
        adata.smp[key_added] = adata.smp[key_added].astype(
            'category', categories=natsorted(adata.smp[key_added].unique()))
    adata.uns['louvain_params'] = np.array((
        resolution,
        random_state,
    ),
                                           dtype=[('resolution', float),
                                                  ('random_state', int)])
    logg.info(' finished', t=True, end=': ')
    logg.info(
        'found {} clusters and added\n'
        ' \'{}\', the cluster labels (adata.smp, dtype=category)'.format(
            n_clusters, key_added))
    return adata if copy else None
def graph_clustering(adata,
                     resolution=None,
                     random_state=0,
                     restrict_to=None,
                     key_added=None,
                     adjacency=None,
                     flavor='vtraag',
                     directed=True,
                     use_weights=False,
                     partition_type=None,
                     partition_kwargs=None,
                     copy=False):
    """Cluster cells into subgroups [Blondel08]_ [Levine15]_ [Traag17]_.

    Cluster cells using the Louvain algorithm [Blondel08]_ in the
    implementation of [Traag17]_, or the Leiden algorithm via ``leidenalg``.
    The Louvain algorithm has been proposed for single-cell analysis by
    [Levine15]_.

    This requires to run :func:`~scanpy.api.pp.neighbors`, first.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        The annotated data matrix.
    resolution : `float` or `None`, optional (default: 1)
        For the default flavor ('vtraag'), you can provide a resolution
        (higher resolution means finding more and smaller clusters), which
        defaults to 1.0. See “Time as a resolution parameter” in
        [Lambiotte09]_. It is forwarded as ``resolution_parameter`` to the
        partition type, which must accept it.
    random_state : `int`, optional (default: 0)
        Change the initialization of the optimization.
    restrict_to : `tuple`, optional (default: None)
        Restrict the clustering to the categories within the key for sample
        annotation, tuple needs to contain (obs key, list of categories).
    key_added : `str`, optional (default: 'clustering')
        Key under which to add the cluster labels.
    adjacency : sparse matrix or `None`, optional (default: `None`)
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']`.
    flavor : {'vtraag', 'igraph','leiden'}
        Choose between to packages for computing the clustering. 'vtraag'
        is much more powerful than igraph, and the default.
    directed : `bool`, optional (default: `True`)
        Interpret the adjacency matrix as a directed graph; always treated
        as undirected for flavor 'igraph'.
    use_weights : `bool`, optional (default: `False`)
        Use weights from knn graph.
    partition_type : `~louvain.MutableVertexPartition`, optional (default: `None`)
        Type of partition to use. Only a valid argument if `flavor` is
        `'vtraag'` or `'leiden'`.
    partition_kwargs : `dict`, optional (default: `None`)
        Key word arguments to pass to partitioning, if the `vtraag` or
        `leiden` method is being used. The dict is copied, never modified.
    copy : `bool` (default: `False`)
        Copy adata or modify it inplace.

    Returns
    -------
    None
        By default (`copy=False`), updates ``adata`` with the following
        fields:

        louvain : :class:`pandas.Series` (``adata.obs``, dtype `category`)
            Array of dim (number of samples) that stores the subgroup id
            ('0', '1', ...) for each cell.

    AnnData
        When `copy=True` is set, a copy of ``adata`` with those fields is
        returned.
    """
    logg.info('running clustering', r=True)
    if (flavor not in {'vtraag', 'leiden'}) and (partition_type is not None):
        raise ValueError(
            '`partition_type` is only a valid argument when `flavour` is "vtraag" or "leiden"'
        )
    adata = adata.copy() if copy else adata
    if adjacency is None and 'neighbors' not in adata.uns:
        raise ValueError(
            'You need to run `pp.neighbors` first to compute a neighborhood graph.'
        )
    if adjacency is None:
        adjacency = adata.uns['neighbors']['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        if not isinstance(restrict_categories[0], str):
            raise ValueError('You need to use strings to label categories, '
                             'e.g. \'1\' instead of 1.')
        for c in restrict_categories:
            if c not in adata.obs[restrict_key].cat.categories:
                raise ValueError(
                    '\'{}\' is not a valid category for \'{}\''.format(
                        c, restrict_key))
        restrict_indices = adata.obs[restrict_key].isin(
            restrict_categories).values
        # Keep only rows/columns of the selected cells.
        adjacency = adjacency[restrict_indices, :]
        adjacency = adjacency[:, restrict_indices]
    if flavor in {'vtraag', 'igraph', 'leiden'}:
        if flavor == 'igraph' and resolution is not None:
            logg.warn(
                '`resolution` parameter has no effect for flavor "igraph"')
        if directed and flavor == 'igraph':
            directed = False
        if not directed:
            logg.m(' using the undirected graph', v=4)
        g = utils.get_igraph_from_adjacency(adjacency, directed=directed)
        if use_weights:
            weights = np.array(g.es["weight"]).astype(np.float64)
        else:
            weights = None
        if flavor == 'leiden':
            import leidenalg
            # Private copy so the caller's dict is left untouched.
            partition_kwargs = (
                {} if partition_kwargs is None else dict(partition_kwargs))
            if partition_type is None:
                partition_type = leidenalg.ModularityVertexPartition
            if resolution is not None:
                partition_kwargs["resolution_parameter"] = resolution
            if use_weights:
                partition_kwargs["weights"] = weights
            # Make the run reproducible through `random_state`, unless the
            # caller already supplied an explicit seed.
            partition_kwargs.setdefault("seed", random_state)
            logg.info(' using the "leiden" package of Traag (2018)')
            # Honour the requested partition type (it used to be ignored in
            # favour of a hard-coded ModularityVertexPartition).
            part = leidenalg.find_partition(g, partition_type,
                                            **partition_kwargs)
            adata.uns['leiden_quality'] = part.quality()
        elif flavor == 'vtraag':
            import louvain
            partition_kwargs = (
                {} if partition_kwargs is None else dict(partition_kwargs))
            if partition_type is None:
                partition_type = louvain.RBConfigurationVertexPartition
            if resolution is not None:
                partition_kwargs["resolution_parameter"] = resolution
            if use_weights:
                partition_kwargs["weights"] = weights
            logg.info(' using the "louvain" package of Traag (2017)')
            louvain.set_rng_seed(random_state)
            part = louvain.find_partition(g, partition_type,
                                          **partition_kwargs)
        elif flavor == 'igraph':
            part = g.community_multilevel(weights=weights)
        groups = np.array(part.membership)
    elif flavor == 'taynaud':
        # this is deprecated
        import networkx as nx
        import community
        g = nx.Graph(adjacency)
        partition = community.best_partition(g)
        groups = np.zeros(len(partition), dtype=int)
        for k, v in partition.items():
            groups[k] = v
    else:
        raise ValueError(
            '`flavor` needs to be "leiden" or "vtraag" or "igraph" or "taynaud".'
        )
    unique_groups = np.unique(groups)
    n_clusters = len(unique_groups)
    if restrict_to is None:
        groups = groups.astype('U')
        key_added = 'clustering' if key_added is None else key_added
        adata.obs[key_added] = pd.Categorical(values=groups,
                                              categories=natsorted(
                                                  unique_groups.astype('U')))
    else:
        key_added = restrict_key + '_R' if key_added is None else key_added
        all_groups = adata.obs[restrict_key].astype('U')
        # Sub-cluster labels read '<cat1>-<cat2>-...,<id>'.
        prefix = '-'.join(restrict_categories) + ','
        new_groups = [prefix + label for label in groups.astype('U')]
        all_groups.iloc[restrict_indices] = new_groups
        adata.obs[key_added] = pd.Categorical(values=all_groups,
                                              categories=natsorted(
                                                  all_groups.unique()))
    adata.uns['clustering'] = {}
    adata.uns['clustering']['params'] = {
        'resolution': resolution,
        'random_state': random_state
    }
    logg.info(' finished',
              time=True,
              end=' ' if settings.verbosity > 2 else '\n')
    logg.hint('found {} clusters and added\n'
              ' \'{}\', the cluster labels (adata.obs, categorical)'.format(
                  n_clusters, key_added))
    return adata if copy else None
# Collect one (source ccode, target ccode, year, maxflow value) record for
# every ordered pair of distinct vertices in each yearly graph.
maxflows = [];
for year, H in zip(years, G_t):
    print('\tCalculating maxflow for {0}'.format(year))
    # The edge attribute 'weight' is used as the capacity.
    maxflow = [(s['ccode'], t['ccode'], year, H.maxflow_value(s.index, t.index, capacity='weight')) for s in H.vs for t in H.vs if s != t]
    maxflows.extend(maxflow)

#%% Write maxflow to file
maxflow_df = pd.DataFrame(maxflows, columns=['source_ccode', 'target_ccode', 'year', 'maxflow'])
# output_dir is concatenated as-is, so it must already end with a separator.
maxflow_df.to_csv(output_dir + 'maxflow.csv', index=False)

#%% Set seed for random number generator
louvain.set_rng_seed(0)

#%% Convert slices to layers
# An interslice weight of 0 is passed to the layer construction below.
interslice_weight = 0;
G_intraslice, G_interslice, G_all = louvain.time_slices_to_layers(G_t, interslice_weight=interslice_weight, slice_attr='t', vertex_id_attr='ccode')

#%% Do community detection
print('\nDoing community detection...')
n_repl = 100
resolutions = [0.6, 1.1, 1.7]
for resolution in resolutions:
    # NOTE(review): only this first statement of the loop body is visible
    # here; presumably the replicate runs follow — confirm in the full script.
    memberships = []
def compare_algorithms(n, network_dict, log_file_name):
    """Run five community-detection algorithms ``n`` times and collect results.

    For every run ``i`` (also used as the random seed) the directed Louvain,
    Leiden, undirected Louvain, Infomap and Oslom algorithms are executed on
    the network. The partition, its modularity and its number of communities
    are recorded, and the elapsed time in minutes of each algorithm is
    appended to ``log_file_name + ".txt"``.

    :param n: number of repetitions
    :param network_dict: dict holding the network under ``'igraph'`` and the
        input passed to Oslom under ``'tuple'``
    :param log_file_name: log file path without the ``.txt`` extension
    :return: dict with ``'size_table'`` and ``'modularity_table'``
        (DataFrames indexed by run) and ``'all_partitions'`` (dict of lists)
    """
    network = network_dict['igraph']
    network_oslom = network_dict['tuple']
    algorithm_names = ('Louvain', 'Directed Louvain', 'Leiden', 'Infomap',
                       'Oslom')
    all_partitions = {name: [] for name in algorithm_names}
    modularity_table = pd.DataFrame()
    size_table = pd.DataFrame()

    def _log_elapsed(label, run, started):
        # Append the minutes elapsed since ``started`` for this algorithm/run.
        finished = time.time()
        with open(log_file_name + ".txt", "a") as log:
            log.write('CD - ' + label + ' -: ' + str(run) + ' TIME: ' +
                      str(round((finished - started) / 60, 4)) + '\n')

    for i in range(0, n):  # run alg n times
        louvain.set_rng_seed(i)

        # 1) directed Louvain
        tic = time.time()
        partition_dl = run_louvain(network)
        all_partitions['Directed Louvain'].append(partition_dl)
        modularity_table.at[i, 'Directed Louvain'] = partition_dl.quality()
        size_table.at[i, 'Directed Louvain'] = len(partition_dl)
        _log_elapsed('dir_louvain', i, tic)

        # 2) directed Leiden
        tic = time.time()
        partition_lei = run_leiden(network, i)
        all_partitions['Leiden'].append(partition_lei)
        modularity_table.at[i, 'Leiden'] = partition_lei.quality()
        size_table.at[i, 'Leiden'] = len(partition_lei)
        _log_elapsed('dir_leiden', i, tic)

        # 3) undirected Louvain, on an undirected copy for comparison
        tic = time.time()
        network_ud = directed_to_undirected(network)
        partition_l = run_louvain(network_ud)
        all_partitions['Louvain'].append(partition_l)
        modularity_table.at[i, 'Louvain'] = partition_l.quality()
        size_table.at[i, 'Louvain'] = len(partition_l)
        _log_elapsed('undir_louvain', i, tic)

        # 4) directed infomap; its modularity comes from get_modularity
        tic = time.time()
        partition_i = network.community_infomap(
            edge_weights=network.es['weight'], trials=1)
        all_partitions['Infomap'].append(partition_i)
        size_table.at[i, 'Infomap'] = len(set(partition_i.membership))
        community_dict_infomap = get_community_dict(
            partition_i, network, filter_eu_members=False)['mod_dict']
        modularity_table.at[i, 'Infomap'] = get_modularity(
            network, community_dict_infomap)
        _log_elapsed('infomap', i, tic)

        # 5) directed oslom; only the first returned element is used
        tic = time.time()
        clusters = run_oslom(network_oslom, i)
        first_level = clusters[0]
        all_partitions['Oslom'].append(first_level)
        # number of clusters found
        size_table.at[i, 'Oslom'] = first_level['num_found']
        community_dict_oslom = get_community_dict_oslom(
            first_level, network, filter_eu_members=False)['mod_dict']
        modularity_table.at[i, 'Oslom'] = get_modularity(
            network, community_dict_oslom)
        _log_elapsed('oslom', i, tic)

    return {
        'size_table': size_table,
        'modularity_table': modularity_table,
        'all_partitions': all_partitions,
    }
def louvain(adata,
            n_neighbors=30,
            resolution=None,
            n_pcs=50,
            random_state=0,
            flavor='vtraag',
            directed=True,
            recompute_pca=False,
            recompute_distances=False,
            recompute_graph=False,
            n_dcs=15,
            n_jobs=None,
            copy=False):
    """Cluster cells into subgroups [Blondel08]_ [Levine15]_ [Traag17]_.

    `[source] <tl.louvain_>`__ Cluster cells using the Louvain algorithm
    [Blondel08]_ in the implementation of [Traag17]_. The Louvain algorithm
    has been proposed for single-cell analysis by [Levine15]_.

    *Examples:* See this `use case <17-05-05_>`__.

    .. _tl.louvain: https://github.com/theislab/scanpy/tree/master/scanpy/tools/louvain.py

    Parameters
    ----------
    adata : AnnData
        The annotated data matrix.
    n_neighbors : int, optional (default: 30)
        Number of neighbors to use for construction of knn graph.
    resolution : float or None, optional
        For the default flavor ('vtraag'), you can provide a resolution
        (higher resolution means finding more and smaller clusters), which
        defaults to 1.0. Ignored (with a warning) for flavor 'igraph'.
    n_pcs : int, optional (default: 50)
        Number of PCs to use for computation of data point graph.
    random_state : int, optional (default: 0)
        Change the initialization of the optimization.
    flavor : {'vtraag', 'igraph', 'taynaud'}
        Choose between the packages for computing the clustering. 'vtraag'
        is much more powerful; 'taynaud' is deprecated.
    directed : bool, optional (default: True)
        Whether to build a directed graph from the adjacency matrix. Forced
        to False for flavor 'igraph'.
    recompute_pca, recompute_distances, recompute_graph, n_dcs, n_jobs
        Forwarded unchanged to ``add_or_update_graph_in_adata``.
    copy : bool (default: False)
        Work on (and return) a copy of ``adata`` instead of modifying it.

    Returns
    -------
    The annotated copy if ``copy`` is true, otherwise None. The following
    entries are written: ``adata.smp['louvain_groups']``,
    ``adata.add['louvain_groups_order']``, ``adata.add['louvain_params']``
    and, when the seeded 'vtraag' path succeeds,
    ``adata.add['louvain_quality']``.

    Raises
    ------
    ValueError
        If ``flavor`` is none of the supported values.

    References
    ----------
    - implementation of Louvain algorithm: Traag, doi:10.5281/zenodo.35117 (2017)
    - Louvain algorithm: Blondel et al., J. Stat. Mech., P10008 (2008)
    - base graph package: Csardi et al., InterJournal Complex Systems, 1695 (2006)
    - basic suggestion for single-cell: Levine et al., Cell 162, 184-197 (2015)
    - combination with "attachedness" matrix: Wolf et al., bioRxiv (2017)
    """
    logg.info('running Louvain clustering', r=True)
    if copy:
        adata = adata.copy()
    add_or_update_graph_in_adata(
        adata,
        n_neighbors=n_neighbors,
        n_pcs=n_pcs,
        n_dcs=n_dcs,
        recompute_pca=recompute_pca,
        recompute_distances=recompute_distances,
        recompute_graph=recompute_graph,
        n_jobs=n_jobs)
    adjacency = adata.add['data_graph_norm_weights']

    if flavor in {'vtraag', 'igraph'}:
        if flavor == 'igraph':
            if resolution is not None:
                logg.warn(
                    '`resolution` parameter has no effect for flavor "igraph"')
            if directed:
                # the igraph multilevel routine is run on the undirected graph
                directed = False
        if not directed:
            logg.m(' using the undirected graph', v=4)
        graph = utils.get_igraph_from_adjacency(adjacency, directed=directed)
        if flavor == 'vtraag':
            import louvain
            if resolution is None:
                resolution = 1
            try:
                # Every statement here may raise AttributeError on an old
                # `louvain` package, which triggers the unseeded fallback.
                logg.info(' using the "louvain" package of Traag (2017)')
                louvain.set_rng_seed(random_state)
                communities = louvain.find_partition(
                    graph,
                    louvain.RBConfigurationVertexPartition,
                    resolution_parameter=resolution)
                adata.add['louvain_quality'] = communities.quality()
            except AttributeError:
                fallback_notice = (
                    'Did not find package louvain>=0.6, '
                    'the clustering result will therefore not be 100% reproducible, '
                    'but still meaningful! '
                    'If you want 100% reproducible results, but louvain 0.6 is not yet '
                    'available via "pip install louvain", '
                    'either get the latest (development) version from '
                    'https://github.com/vtraag/louvain-igraph or use the option '
                    '`flavor=igraph` in sc.tl.louvain(). '
                    'The latter does not provide a `resolution` parameter, though.'
                )
                logg.warn(fallback_notice)
                communities = louvain.find_partition(
                    graph,
                    method='RBConfiguration',
                    resolution_parameter=resolution)
        elif flavor == 'igraph':
            communities = graph.community_multilevel()
        groups = np.array(communities.membership, dtype='U')
    elif flavor == 'taynaud':
        # this is deprecated
        import networkx as nx
        import community
        nx_graph = nx.Graph(adata.add['data_graph_distance_local'])
        node_to_group = community.best_partition(nx_graph)
        numeric_groups = np.zeros(len(node_to_group), dtype=int)
        for node, label in node_to_group.items():
            numeric_groups[node] = label
        groups = numeric_groups.astype('U')
    else:
        raise ValueError(
            '`flavor` needs to be "vtraag" or "igraph" or "taynaud".')

    adata.smp['louvain_groups'] = groups
    from natsort import natsorted
    unique_labels = np.unique(groups)
    adata.add['louvain_groups_order'] = np.array(natsorted(unique_labels))
    adata.add['louvain_params'] = np.array(
        (resolution, ), dtype=[('resolution', float)])
    logg.m(' finished', t=True, end=' ')
    logg.m(
        'and found', len(adata.add['louvain_groups_order']),
        'clusters, added\n'
        ' "louvain_groups", the cluster labels (adata.smp)\n'
        ' "louvain_groups_order", the unique cluster labels (adata.add)')
    if copy:
        return adata
    return None
def louvain(graph):
    """Return a modularity-based partition of ``graph`` using a random seed.

    The ``lv`` random number generator is seeded with a random integer drawn
    from ``[1, 100000]`` before ``lv.find_partition`` is called with
    ``lv.ModularityVertexPartition``; the resulting partition is returned
    unchanged.
    """
    # Call lv.set_rng_seed(0) here instead to use a fixed seed.
    seed = random.randint(1, 100000)
    lv.set_rng_seed(seed)
    return lv.find_partition(graph, lv.ModularityVertexPartition)
def run_louvain(graph,
                config_model='Default',
                overlap=False,
                directed=False,
                deep=False,
                interslice_weight=0.1,
                resolution_parameter=0.1,
                seed=None):
    """
    Run Louvain community detection on one or more edge-list files and write
    the resulting cluster hierarchy to standard output.

    Each input file is read as whitespace-separated rows ``source target`` or
    ``source target weight``; the first row decides whether the file is
    treated as weighted. Source and target are converted to ``int`` and the
    weight to ``float``.

    The result is printed to stdout as a sequence of
    ``<parent>,<child>,<type>;`` records where ``<type>`` is ``c-m`` when the
    child is a leaf of the hierarchy and ``c-c`` otherwise.

    :param graph: path of an edge-list file, or a list of such paths (a list
        selects the multiplex code path)
    :param config_model: 'RB', 'RBER', 'CPM', 'Surprise', 'Significance';
        any other value falls back to plain modularity
    :param overlap: bool; when True and ``graph`` is a single path, the same
        path is used four times and handled through the multiplex code path
    :param directed: passed to ``igraph.Graph.TupleList``
    :param deep: bool; when True the intermediate aggregation levels are
        reported as a multi-level hierarchy (single-graph mode only)
    :param interslice_weight: passed to ``louvain.time_slices_to_layers``
        (multiplex mode only)
    :param resolution_parameter: passed to every partition type except
        ``ModularityVertexPartition``
    :param seed: if not None, used to seed the ``louvain`` random generator
    :return: 0 on success, otherwise a non-zero code after a message on
        stderr: 1 = multiplex/overlap combined with ``deep``; 2 = negative
        weight (multiplex mode); 3 = input is not a file, or negative weight
        (single-graph mode); 4 = input file is empty, or no cluster left
        after filtering (non-deep); 5 = no hierarchy level produced (deep);
        6 = first hierarchy level has no cluster (deep)
    :raises Exception: in multiplex mode, when the files mix weighted and
        unweighted formats, when CPM is requested with ``directed is True``,
        or when Significance is requested on weighted input
    """
    # NOTE(review): `seed is not None` would be the idiomatic spelling.
    if seed != None:
        louvain.set_rng_seed(seed)

    def louvain_hierarchy_output(partition):
        # Repeatedly move nodes on the aggregated partition and record the
        # membership of `partition` after each level that moved something.
        optimiser = louvain.Optimiser()
        partition_agg = partition.aggregate_partition()
        partition_layers = []
        while optimiser.move_nodes(partition_agg) > 0:
            partition.from_coarse_partition(partition_agg)
            partition_agg = partition_agg.aggregate_partition()
            partition_layers.append(list(partition))
        return partition_layers

    def louvain_multiplex(graphs, partition_type, interslice_weight,
                          resolution_parameter):
        # Build one partition per layer plus one for the interslice layer and
        # optimise them jointly. G_full is not used afterwards.
        layers, interslice_layer, G_full = louvain.time_slices_to_layers(
            graphs, vertex_id_attr='name', interslice_weight=interslice_weight)
        if partition_type == louvain.ModularityVertexPartition:
            # plain modularity is constructed without a resolution parameter
            partitions = [partition_type(H) for H in layers]
            interslice_partition = partition_type(interslice_layer,
                                                  weights='weight')
        else:
            partitions = [
                partition_type(H, resolution_parameter=resolution_parameter)
                for H in layers
            ]
            interslice_partition = partition_type(
                interslice_layer,
                resolution_parameter=resolution_parameter,
                weights='weight')
        optimiser = louvain.Optimiser()
        optimiser.optimise_partition_multiplex(partitions +
                                               [interslice_partition])
        quality = sum(
            [p.quality() for p in partitions + [interslice_partition]])
        # Only the first layer's partition is handed back to the caller.
        return partitions[0], quality

    def partition_to_clust(graphs, partition, min_size_cut=2):
        # Translate vertex indices of each community into vertex names,
        # de-duplicate, drop communities smaller than `min_size_cut`, and
        # return the remaining ones sorted by decreasing size.
        clusts = []
        node_names = []
        if not isinstance(graphs, list):
            graphs = [graphs]
        for g in graphs:
            # names of all graphs are concatenated in input order
            node_names.extend(g.vs['name'])
        for i in range(len(partition)):
            clust = [node_names[id] for id in partition[i]]
            clust = list(set(clust))
            if len(clust) < min_size_cut:
                continue
            clust.sort()
            clusts.append(clust)
        clusts = sorted(clusts, key=lambda x: len(x), reverse=True)
        return clusts

    # A list input, or overlap on a single input, selects the multiplex path.
    multi = False
    if isinstance(graph, list):
        multi = True
    if overlap == True and multi == False:
        multi = True
        net = graph
        graph = []
        # the same input is used as four identical slices
        for i in range(4):
            graph.append(net)
    if multi == True and deep == True:
        sys.stderr.write('louvain does not support hierarchical '
                         'clustering with overlapped communities\n')
        return 1

    # Map the configuration-model name onto the louvain partition class.
    if config_model == 'RB':
        partition_type = louvain.RBConfigurationVertexPartition
    elif config_model == 'RBER':
        partition_type = louvain.RBERConfigurationVertexPartition
    elif config_model == 'CPM':
        partition_type = louvain.CPMVertexPartition
    elif config_model == 'Surprise':
        partition_type = louvain.SurpriseVertexPartition
    elif config_model == "Significance":
        partition_type = louvain.SignificanceVertexPartition
    else:
        sys.stderr.write("Configuration model not set "
                         "performing simple Louvain.\n")
        partition_type = louvain.ModularityVertexPartition

    weighted = False
    if multi:
        wL = []  # per-file weighted flag
        G = []  # per-file igraph graph
        for file in graph:
            with open(file, 'r') as f:
                lines = f.read().splitlines()
            # The first row decides whether this file carries weights.
            elts = lines[0].split()
            if len(elts) == 3:
                weighted = True
            else:
                weighted = False
            for i in range(len(lines)):
                elts = lines[i].split()
                for j in range(2):
                    elts[j] = int(elts[j])
                if weighted == True:
                    elts[2] = float(elts[2])
                    if elts[2] < 0:
                        sys.stderr.write('encountered a negative edge weight '
                                         'on row ' + str(i) + ' (' +
                                         str(lines[i]) +
                                         ') which is not allowed\n')
                        return 2
                lines[i] = tuple(elts)
            g = igraph.Graph.TupleList(lines,
                                       directed=directed,
                                       weights=weighted)
            G.append(g)
            wL.append(weighted)
            # NOTE(review): redundant, the `with` block already closed `f`.
            f.close()
        if True in wL and False in wL:
            raise Exception('all graphs should follow the same format')
        if partition_type == louvain.CPMVertexPartition and directed is True:
            raise Exception('graph for CPMVertexPartition must be undirected')
        if partition_type == louvain.SignificanceVertexPartition and weighted is True:
            raise Exception('SignificanceVertexPartition only support '
                            'unweighted graphs')
        # `quality` is computed but not used below.
        partition, quality = louvain_multiplex(G, partition_type,
                                               interslice_weight,
                                               resolution_parameter)
    else:
        if not os.path.isfile(graph):
            sys.stderr.write(str(graph) + ' is not a file\n')
            return 3
        if os.path.getsize(graph) == 0:
            sys.stderr.write(str(graph) + ' is an empty file\n')
            return 4
        with open(graph, 'r') as f:
            lines = f.read().splitlines()
        # The first row decides whether the file carries weights.
        elts = lines[0].split()
        if len(elts) == 3:
            weighted = True
        else:
            weighted = False
        for i in range(len(lines)):
            elts = lines[i].split()
            for j in range(2):
                elts[j] = int(elts[j])
            if weighted is True:
                elts[2] = float(elts[2])
                if elts[2] < 0:
                    sys.stderr.write('encountered a negative edge weight '
                                     'on row ' + str(i) + ' (' +
                                     str(lines[i]) +
                                     ') which is not allowed\n')
                    # NOTE(review): same code as "is not a file" above, while
                    # the multiplex path uses 2 for this condition.
                    return 3
            lines[i] = tuple(elts)
        # NOTE(review): redundant, the `with` block already closed `f`.
        f.close()
        G = igraph.Graph.TupleList(lines, directed=directed, weights=weighted)
        if weighted is False:
            weights = None
        else:
            weights = G.es['weight']
        if partition_type == louvain.ModularityVertexPartition:
            partition = partition_type(G, weights=weights)
        else:
            partition = partition_type(
                G, weights=weights, resolution_parameter=resolution_parameter)
        # In deep mode the optimisation is driven level by level further
        # down (louvain_hierarchy_output) instead of in a single call here.
        if deep == False:
            optimiser = louvain.Optimiser()
            optimiser.optimise_partition(partition)

    # `lines` is reused from here on as a list of "parent<TAB>child" edges
    # describing the hierarchy. Cluster ids are allocated above the largest
    # node name so they do not collide with the (integer) node names.
    lines = []
    if deep == False:
        clusts = partition_to_clust(G, partition)
        if len(clusts) == 0:
            sys.stderr.write(DEFAULT_ERR_MSG)
            # NOTE(review): same code as "is an empty file" above.
            return 4
        maxNode = 0
        for clust in clusts:
            maxNode = max(maxNode, max(clust))
        for i in range(len(clusts)):
            # root (maxNode + len(partition) + 1) -> cluster i
            lines.append(
                str(maxNode + len(partition) + 1) + '\t' +
                str(maxNode + i + 1))
            # cluster i -> each of its member nodes
            for n in clusts[i]:
                lines.append(str(maxNode + i + 1) + '\t' + str(n))
    else:
        partitions = louvain_hierarchy_output(partition)
        clusts_layers = []
        for p in partitions:
            clusts_layers.append(partition_to_clust(G, p))
        if len(clusts_layers) == 0:
            sys.stderr.write(DEFAULT_ERR_MSG)
            return 5
        if len(clusts_layers[0]) == 0:
            sys.stderr.write(DEFAULT_ERR_MSG)
            return 6
        maxNode = 0
        for clust in clusts_layers[0]:
            maxNode = max(maxNode, max(clust))
        # first level: cluster i -> member nodes
        for i in range(len(clusts_layers[0])):
            for n in clusts_layers[0][i]:
                lines.append(str(maxNode + i + 1) + '\t' + str(n))
        # `maxNode` now points at the last id used by the first level.
        maxNode = maxNode + len(clusts_layers[0])
        for i in range(1, len(clusts_layers)):
            for j in range(len(clusts_layers[i - 1])):
                for k in range(len(clusts_layers[i])):
                    # Link cluster j of the previous level to the first
                    # cluster k of this level that contains all its members.
                    if all(x in clusts_layers[i][k]
                           for x in clusts_layers[i - 1][j]):
                        lines.append(
                            str(maxNode + k + 1) + '\t' +
                            str(maxNode - len(clusts_layers[i - 1]) + j + 1))
                        break
            maxNode = maxNode + len(clusts_layers[i])
        # root (maxNode + 1) -> every cluster of the last level
        for i in range(len(clusts_layers[-1])):
            lines.append(
                str(maxNode + 1) + '\t' +
                str(maxNode - len(clusts_layers[-1]) + i + 1))

    # trim the hierarchy to remove contigs
    # down_tree maps parent -> children, up_tree maps child -> parents
    # (ids are the strings obtained by splitting each edge line).
    up_tree = {}
    down_tree = {}
    for line in lines:
        elts = line.split()
        down_tree.setdefault(elts[0], [])
        down_tree[elts[0]].append(elts[1])
        up_tree.setdefault(elts[1], [])
        up_tree[elts[1]].append(elts[0])

    # store root and leaves
    set1 = set(down_tree.keys())
    set2 = set(up_tree.keys())
    root_l = list(set1.difference(set2))  # parents that are never a child
    leaf_l = list(set2.difference(set1))  # children that are never a parent
    node_l = list(set1.union(set2))

    # find all contigs in the DAG
    Contigs = []
    # NOTE(review): `work_list` aliases `root_l` until it is rebound below,
    # so the first pop(0) calls also shrink `root_l`.
    work_list = root_l
    visited = {}
    for node in node_l:
        visited[node] = 0
    work_path = []
    new_path = False
    while work_list:
        key = work_list.pop(0)
        if new_path == False:
            work_path.append(key)
        else:
            # start the path at the parent selected by the visit counter
            work_path.append(up_tree[key][visited[key]])
            work_path.append(key)
        if key in leaf_l:
            new_path = True
            Contigs.append(work_path)
            work_path = []
        elif len(down_tree[key]) > 1 or visited[key] > 0:
            # branching node, or a node reached again: close the path here
            new_path = True
            Contigs.append(work_path)
            work_path = []
        if visited[key] == 0 and key not in leaf_l:
            # children are put in front of the queue (depth-first order)
            work_list = down_tree[key] + work_list
        visited[key] += 1

    # write trimmed DAG
    # The first recorded path is skipped; each remaining path is emitted as
    # "<first>,<last>,<type>;".
    for path in Contigs[1:]:
        sys.stdout.write(path[0] + ',' + path[-1] + ',')
        if path[-1] in leaf_l:
            sys.stdout.write('c-m' + ';')
        else:
            sys.stdout.write('c-c' + ';')

    sys.stdout.flush()
    return 0