def markov_clustering(distance_mat, inflation):
    """
    Runs the Markov Clustering algorithm on the input distance matrix.

    Inputs:
        DISTANCE_MAT: A (neurons x neurons) numpy matrix calculated by some
            distance metric.
        INFLATION: An int; the Hadamard power to take during the inflation
            step. In general, values from 1.1 to 10.0 can be tried, with
            higher values generally resulting in more clusters. Inflation
            boosts the probabilities of intra-cluster walks and demotes
            inter-cluster walks.

    Outputs:
        CLUSTERS: A list of tuples of node indices, one tuple per cluster.
        Q: A float in [-1, 1]; the modularity score associated with this
            clustering. Modularity measures the density of in-cluster edges
            relative to out-of-cluster edges. Specifically, it is the
            fraction of edges that fall within the clusters minus the
            expected fraction if edges were randomly distributed.
    """
    G = nx.from_numpy_matrix(distance_mat)
    sparse_G = nx.to_scipy_sparse_matrix(G)
    result = mc.run_mcl(sparse_G, inflation=inflation)
    clusters = mc.get_clusters(result)
    Q = mc.modularity(matrix=result, clusters=clusters)
    return clusters, Q

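# A minimal usage sketch for the markov_clustering() helper above, assuming
# numpy, networkx (< 3.0, for from_numpy_matrix/to_scipy_sparse_matrix), and
# the markov-clustering package are imported as np, nx, and mc, matching the
# function body. The random symmetric matrix here is a stand-in for a real
# (neurons x neurons) distance matrix.
import numpy as np

rng = np.random.default_rng(0)
sym = rng.random((10, 10))
distance_mat = (sym + sym.T) / 2   # symmetrize so the graph is undirected
np.fill_diagonal(distance_mat, 0)  # no self-distances

clusters, Q = markov_clustering(distance_mat, inflation=2)
print("clusters:", clusters)
print("modularity:", Q)
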
def test_modularity():
    source = np.matrix(test_matrices[4][0])
    target = test_matrices[4][1]

    clusters = mc.get_clusters(mc.run_mcl(source))
    quality = mc.modularity(source, clusters)

    assert np.isclose(quality, target)

def mcl(graph, viz=False):
    mat = nx.to_numpy_matrix(graph)
    mod = -1
    # sweep inflation values and keep the clustering with the best modularity
    for val in np.arange(1.2, 3, 0.1):
        res = mc.run_mcl(mat, inflation=val)
        clust = mc.get_clusters(res)
        q = mc.modularity(matrix=np.asmatrix(res), clusters=clust)
        if q > mod:
            mod = q
            clusters = clust
    if not viz:
        labels = dict(zip(range(len(graph)), graph.nodes()))
        return [[labels.get(item) for item in clust] for clust in clusters]
    else:
        plt.figure(num=None, figsize=(20, 20), dpi=50)
        pos = nx.spring_layout(graph)
        mc.draw_graph(mat, clusters, node_size=200, with_labels=False,
                      edge_color="silver")

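# Hedged usage sketch for mcl() above. The karate-club graph ships with
# networkx, so this runs without external data; nx.to_numpy_matrix in the
# function body assumes networkx < 3.0. viz=True additionally needs
# matplotlib.pyplot imported as plt.
import networkx as nx

graph = nx.karate_club_graph()
named_clusters = mcl(graph, viz=False)
print(named_clusters)  # node labels grouped by the best-modularity clustering
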
def clusters_mcl(oid, evs, inf=inf):
    # MCL clustering: create network
    logging.info("%s Create network" % oid)
    evs_e = evs[["in_gene", "out_gene", "branch_support"]]
    evs_e = evs_e[evs_e["branch_support"] > min_support]
    evs_n = nx.convert_matrix.from_pandas_edgelist(
        evs_e, source="in_gene", target="out_gene", edge_attr="branch_support")
    evs_n_nodelist = [node for i, node in enumerate(evs_n.nodes())]
    evs_m = nx.to_scipy_sparse_matrix(evs_n, nodelist=evs_n_nodelist)

    # MCL clustering: run clustering
    # inf,_ = optimise_inflation(matrix=evs_m)
    logging.info("%s MCL clustering, inflation = %f" % (oid, inf))
    mcl_m = markov_clustering.run_mcl(evs_m, inflation=inf)
    mcl_c = markov_clustering.get_clusters(mcl_m)
    logging.info("%s MCL clustering, num clusters = %i" % (oid, len(mcl_c)))
    # plt.figure(figsize=(10,10))
    # markov_clustering.draw_graph(mcl_m, mcl_c, node_size=20, with_labels=False, edge_color="silver", cmap="Accent")
    # plt.savefig('graph.pdf')

    # MCL clustering: save output
    mcl_c_clu = [i for i, cluster in enumerate(mcl_c) for node in cluster]
    mcl_c_noi = [node for i, cluster in enumerate(mcl_c) for node in cluster]
    clu = pd.DataFrame(
        {
            "node": [evs_n_nodelist[i] for i in mcl_c_noi],
            "cluster": mcl_c_clu,
        },
        columns=["node", "cluster"])
    clu["cluster"] = clu["cluster"].astype(str)
    logging.info("%s MCL clustering, num clustered genes = %i" % (oid, len(clu)))
    return clu

def cluster(self, collection, vectorizer) -> List[Cluster]:
    articles = list(collection.articles())
    texts = ['{} {}'.format(a.title, a.text) for a in articles]
    try:
        X = vectorizer.transform(texts)
    except Exception:
        X = vectorizer.fit_transform(texts)
    times = [a.time for a in articles]

    print('temporal graph...')
    S = self.temporal_graph(X, times)
    #print('S shape:', S.shape)

    print('run markov clustering...')
    result = mc.run_mcl(S)
    print('done')
    idx_clusters = mc.get_clusters(result)
    idx_clusters.sort(key=lambda c: len(c), reverse=True)

    print(f'times: {len(set(times))} articles: {len(articles)} '
          f'clusters: {len(idx_clusters)}')

    clusters = []
    for c in idx_clusters:
        c_vectors = [X[i] for i in c]
        c_articles = [articles[i] for i in c]
        Xc = sparse.vstack(c_vectors)
        centroid = sparse.csr_matrix(Xc.mean(axis=0))
        cluster = Cluster(c_articles, c_vectors, centroid=centroid)
        clusters.append(cluster)

    return clusters

def score(G):
    FF = G.copy()
    # invert edge weights so that stronger ties become shorter distances
    for k, v in FF.edges.items():
        FF.edges[k]['weight'] = 1. / FF.edges[k]['weight']

    M = nx.adjacency_matrix(FF).todense()
    N = M.shape[0]

    SC = SpectralClustering(n_clusters=2, affinity='precomputed')
    clustering = SC.fit(M)
    y_sc = clustering.labels_ + 1

    chinese_whispers(FF, iterations=100)

    result = mc.run_mcl(M, inflation=2)
    clusters = mc.get_clusters(result)
    y_mc = mc_pred(FF, clusters)

    y = np.zeros_like(clustering.labels_)
    y_cw = []
    for i, x in enumerate(FF.nodes):
        if FF.nodes[x]['color'] == 'blue':
            y[i] = 1
        elif FF.nodes[x]['color'] == 'red':
            y[i] = 2
        y_cw.append(FF.nodes[x]['label'])

    labels = {x: i for i, x in enumerate(set(y_cw))}
    y_cw = [labels[x] for x in y_cw]
    y_cw = np.array(y_cw)

    print(*scores(y, y_cw), nmi(y, y_cw))
    print(*scores(y, y_sc), nmi(y, y_sc))
    print(*scores(y, y_mc), nmi(y, y_mc))

    return ((*scores(y, y_cw), nmi(y, y_cw)),
            (*scores(y, y_sc), nmi(y, y_sc)),
            (*scores(y, y_mc), nmi(y, y_mc)))

def clusters_mcl(phy_fn, phy_id, evou_d):
    # MCL clustering: create network
    logging.info("%s Create network" % phy_id)
    evou_e = evou_d[["in_gene", "out_gene", "branch_support"]]
    evou_n = networkx.convert_matrix.from_pandas_edgelist(
        evou_e, source="in_gene", target="out_gene", edge_attr="branch_support")
    evou_n_nodelist = [node for i, node in enumerate(evou_n.nodes())]
    evou_m = networkx.to_scipy_sparse_matrix(evou_n, nodelist=evou_n_nodelist)

    # MCL clustering: run clustering (inf is taken from module scope)
    # inf,_ = optimise_inflation(matrix=evou_m)
    logging.info("%s MCL clustering, inflation = %f" % (phy_id, inf))
    mcl_m = markov_clustering.run_mcl(evou_m, inflation=inf)
    mcl_c = markov_clustering.get_clusters(mcl_m)
    logging.info("%s MCL clustering, num clusters = %i" % (phy_id, len(mcl_c)))
    # markov_clustering.draw_graph(mcl_m, mcl_c, node_size=50, with_labels=True, edge_color="k", cmap="Accent")

    # MCL clustering: save output
    mcl_c_clu = [i for i, cluster in enumerate(mcl_c) for node in cluster]
    mcl_c_noi = [node for i, cluster in enumerate(mcl_c) for node in cluster]
    mcl_c_out = pd.DataFrame(
        {
            "node": [evou_n_nodelist[i] for i in mcl_c_noi],
            "cluster": mcl_c_clu,
        },
        columns=["node", "cluster"])
    mcl_c_out["cluster"] = mcl_c_out["cluster"].astype(str)
    logging.info("%s MCL clustering, num clustered genes = %i" % (phy_id, len(mcl_c_out)))
    return mcl_c_out

def fit(self, inflation=2.0):
    if self.result is None:
        self.result = mc.run_mcl(self.matrix, inflation=inflation)
        self.clusters = mc.get_clusters(self.result)
    print(self.clusters[0][0])
    print("number of clusters:", len(self.clusters))
    return self.clusters

def mcl_parameter_QC(network, range_from=15, range_to=26):
    matrix = nx.to_scipy_sparse_matrix(network)
    # perform clustering using different inflation values between 1.5 and 2.5;
    # for each clustering run, calculate the modularity
    for inflation in [i / 10 for i in range(range_from, range_to)]:
        result = mc.run_mcl(matrix, inflation=inflation)
        clusters = mc.get_clusters(result)
        Q = mc.modularity(matrix=result, clusters=clusters)
        print("inflation:", inflation, "modularity:", Q)

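# Usage sketch for mcl_parameter_QC(), assuming networkx (< 3.0) as nx and
# markov_clustering as mc, as in the function body. A random geometric graph
# gives a quick sanity check of how modularity varies with inflation.
import networkx as nx

network = nx.random_geometric_graph(100, 0.2, seed=1)
mcl_parameter_QC(network)  # prints modularity for inflation 1.5 .. 2.5
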
def MCL(cdr3, edgelist=None, mcl_hyper=[1.2, 2], outfile=None):
    """
    Perform clustering on a network of CDR3 amino acid sequences with a
    known Hamming distance, using the Markov clustering (MCL) algorithm.
    For more info about the inflation and expansion parameters, visit:
    https://micans.org/mcl/

    Parameters
    ----------
    cdr3 : array-like
        CDR3 amino acid sequences; used to build the edgelist when none
        is provided.
    edgelist : set, optional
        Tab-separated edgelist. The default is None.
    mcl_hyper : list, optional
        MCL hyperparameters: inflation and expansion.
        The default is [1.2, 2].
    outfile : str, optional
        Name of outfile. The default is None.

    Returns
    -------
    clusters : pd.DataFrame
        pd.DataFrame containing two columns: 'CDR3' and 'cluster'.
        The first column contains CDR3 sequences, the second column
        contains the corresponding cluster ids.
    """
    if edgelist is None:
        edgelist = create_edgelist(cdr3)

    try:
        G = nx.parse_adjlist(edgelist, nodetype=str)
        m = nx.to_scipy_sparse_array(G)

        # Run MCL
        result = mcl.run_mcl(m, inflation=mcl_hyper[0], expansion=mcl_hyper[1])
        mcl_output = mcl.get_clusters(result)
        identifiers = list(G.nodes())

        # Map cluster ids back to seqs
        cluster_ids = dict()
        for i in range(len(mcl_output)):
            cluster_ids[i] = list(identifiers[i] for i in mcl_output[i])

        # Generate nodelist
        clusters = {"CDR3": [], "cluster": []}
        for c in cluster_ids:
            for seq in cluster_ids[c]:
                clusters["CDR3"].append(seq)
                clusters["cluster"].append(c)
        clusters = pd.DataFrame(data=clusters)

        # Write to file
        if outfile is not None:
            clusters.to_csv(outfile, sep="\t", index=False)
    except nx.NetworkXError:
        clusters = pd.DataFrame({"CDR3": [], "cluster": []})

    return clusters

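# Usage sketch for MCL() above with an explicit edgelist, sidestepping the
# create_edgelist() helper (not shown here). The sequences are made up;
# nx.parse_adjlist() reads each line as "node neighbour1 neighbour2 ...".
edges = [
    "CASSLAPGATNEKLFF CASSLAPGATNEKLFG",
    "CASSLAPGATNEKLFG CASSLAPGATNEKLFH",
    "CASSIRSSYEQYF CASSIRSSYEQYV",
]
clusters = MCL(None, edgelist=edges)
print(clusters)  # expect two clusters: the CASSLAP... triple and the CASSIRS... pair
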
def cluster_graph(graph, infl):
    """
    A naive Markov clustering based only on inflation value.
    """
    matrix = nx.to_scipy_sparse_matrix(graph)  # get adjacency matrix in sparse form
    m = normalize(matrix, norm='l1', axis=1)   # row-normalize to transition probabilities
    result = mc.run_mcl(m, inflation=infl)     # MCL algorithm with the given inflation
    # result = mc.run_mcl(matrix, inflation=1.5)  # inflation of 1.5 gives a coarser clustering
    clusters = mc.get_clusters(result)         # get clusters
    return m, clusters, infl

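# Usage sketch for cluster_graph(), assuming networkx (< 3.0) as nx,
# markov_clustering as mc, and sklearn.preprocessing.normalize imported as
# normalize, matching the names the function body relies on.
import networkx as nx

g = nx.les_miserables_graph()  # any weighted or unweighted graph works
m, clusters, infl = cluster_graph(g, infl=2.0)
print(len(clusters), "clusters at inflation", infl)
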
def optimise_inflation(matrix, start=1.1, end=2.5, step=0.1):
    I_lis = np.arange(start, end, step).tolist()
    Q_lis = np.zeros(len(I_lis))
    for n, I in enumerate(I_lis):
        result = markov_clustering.run_mcl(matrix, inflation=I)
        clusters = markov_clustering.get_clusters(result)
        Q = markov_clustering.modularity(matrix=result, clusters=clusters)
        Q_lis[n] = Q
    max_Q_index = np.argmax(Q_lis)
    # return the inflation with maximum modularity and that modularity
    return I_lis[max_Q_index], Q_lis[max_Q_index]

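# Usage sketch for optimise_inflation(), assuming numpy as np and the
# markov_clustering package imported under its full name, as in the function
# body. Any sparse adjacency matrix can be substituted for the toy graph.
import networkx as nx

g = nx.karate_club_graph()
matrix = nx.to_scipy_sparse_matrix(g)  # networkx < 3.0 API
best_inf, best_Q = optimise_inflation(matrix)
print("best inflation:", best_inf, "modularity:", best_Q)
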
def clusters_mclw(evs, node_list, inf=inflation, verbose=True):
    import markov_clustering
    import networkx as nx

    if len(evs) > 0:
        # MCL clustering: create network
        if verbose:
            logging.info("Create network")
        evs_e = evs[["in_gene", "out_gene", "branch_support"]]
        evs_e.columns = ["in_gene", "out_gene", "weight"]
        evs_n = nx.convert_matrix.from_pandas_edgelist(
            evs_e, source="in_gene", target="out_gene", edge_attr="weight")
        evs_n.add_nodes_from(node_list)
        evs_n_nodelist = [node for i, node in enumerate(evs_n.nodes())]
        evs_m = nx.to_scipy_sparse_matrix(evs_n, nodelist=evs_n_nodelist)

        # MCL clustering: run clustering
        if verbose:
            logging.info("MCL weighted clustering, inflation = %.3f" % (inf))
        mcl_m = markov_clustering.run_mcl(
            evs_m, inflation=inf,
            pruning_threshold=0)  # why does the pruning threshold HAVE to be zero?
        mcl_c = markov_clustering.get_clusters(mcl_m)
        if verbose:
            logging.info("MCL weighted clustering, num clusters = %i" % (len(mcl_c)))

        # MCL clustering: save output
        mcl_c_clu = [i for i, cluster in enumerate(mcl_c) for node in cluster]
        mcl_c_noi = [node for i, cluster in enumerate(mcl_c) for node in cluster]
        mcl_c_nod = [evs_n_nodelist[i] for i in mcl_c_noi]
    else:
        # no network to build: each node becomes its own cluster
        if verbose:
            logging.info("There are no speciation events in this tree.")
        mcl_c_nod = node_list
        mcl_c_clu = [i for i in range(len(node_list))]

    # output
    clu = pd.DataFrame({
        "node": mcl_c_nod,
        "cluster": mcl_c_clu,
    }, columns=["node", "cluster"])
    if verbose:
        logging.info("MCL clustering, num clustered genes = %i" % (len(clu)))

    return clu

def markov_clustering(JSONin):
    nx_graph = LengthCluster.make_graph_from_JSONin(JSONin)
    matrix = nx.to_scipy_sparse_matrix(nx_graph)
    print('clustering...')
    result = mc.run_mcl(matrix)  # run MCL with default parameters
    clusters = mc.get_clusters(result)
    transformed_clusters = LengthCluster.cluster_transform(
        cluster_list=clusters, nx_graph=nx_graph)
    # print(clusters)
    # print('drawing...')
    # plt.rcParams["figure.figsize"] = (40, 40)
    # mc.draw_graph(matrix, clusters)
    return transformed_clusters

def cluster_traces(traces: pd.Series, expansion: int, inflation: float) -> list:
    corpus = create_corpus(traces)
    trace2vec_fn = partial(vectorize_trace, corpus)
    # activity_ids, activities, max_len = vectorize_activities(traces, corpus)
    mat = create_similarity_matrix(corpus, traces, trace2vec_fn)
    result = mcl.run_mcl(mat, expansion=expansion, inflation=inflation, loop_value=0)
    clusters = mcl.get_clusters(result)  # get clusters
    return clusters

def community_detection(edge_index, num_nodes, edge_attr=None, method='mcl'):
    """Detects clusters of nodes based on the edge attributes (distances)

    Args:
        edge_index (Tensor): Edge index
        num_nodes (int): Number of nodes
        edge_attr (Tensor, optional): Edge attributes. Defaults to None.
        method (str, optional): Clustering method. Defaults to 'mcl'.

    Raises:
        ValueError: Requires a valid clustering method ('mcl' or 'louvain')

    Returns:
        Tensor: cluster index for each node
    """
    # make the networkX graph
    g = nx.Graph()
    g.add_nodes_from(range(num_nodes))
    for iedge, (i, j) in enumerate(edge_index.transpose(0, 1).tolist()):
        if edge_attr is None:
            g.add_edge(i, j)
        else:
            g.add_edge(i, j, weight=edge_attr[iedge])

    # get the device
    device = edge_index.device

    # detect the communities using Louvain detection
    if method == 'louvain':
        cluster = community.best_partition(g)
        return torch.tensor([v for k, v in cluster.items()]).to(device)

    # detect the communities using MCL detection
    elif method == 'mcl':
        matrix = nx.to_scipy_sparse_matrix(g)
        result = mc.run_mcl(matrix)  # run MCL with default parameters
        clusters = mc.get_clusters(result)  # get clusters

        index = np.zeros(num_nodes).astype('int')
        for ic, c in enumerate(clusters):
            index[list(c)] = ic
        return torch.tensor(index).to(device)

    else:
        raise ValueError('Clustering method %s not supported' % method)

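# Usage sketch for community_detection(), assuming torch is available and
# edge_index follows the PyTorch Geometric convention of a (2, num_edges)
# tensor. A triangle {0, 1, 2} plus an isolated edge {3, 4} should come out
# as two communities.
import torch

edge_index = torch.tensor([[0, 1, 2, 3],
                           [1, 2, 0, 4]])
cluster = community_detection(edge_index, num_nodes=5, method='mcl')
print(cluster)  # e.g. tensor([0, 0, 0, 1, 1])
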
def get_clusters(df, inflation=1.7, p=95, ntopics=5):
    """
    Takes as input the dataframe containing articles with their abstracts,
    vectorizes it, then returns the labels as well as the coordinates of the
    cluster centers.

    Inputs:
    ------
    df: the dataframe containing the data for the articles
    inflation: parameter of the clustering algorithm. Increase the value
        for more clusters; the default typically yields ~10 clusters.
    p: percentile for the threshold.
    ntopics: the number of keywords for each cluster

    Outputs:
    ------
    adjacency_matrix: the adjacency matrix
    df_vectorized: dataframe with each abstract cleaned and vectorized.
    df_label: the original dataframe with a new column containing the labels
    most_frequentwords_dict: a dict relating each label to its highest-idf
        words with their weights
    """
    df_vectorized = prepare_vector(df)
    df_label = df.copy()
    cosine_distance_matrix = get_distance_matrix(df_vectorized)
    adjacency_matrix = get_adjacency_matrix(cosine_distance_matrix, p=p)
    df_adjacency_matrix = pd.DataFrame(adjacency_matrix,
                                       columns=df_label.index,
                                       index=df_label.index)

    G = nx.from_numpy_matrix(adjacency_matrix)
    matrix = nx.to_scipy_sparse_matrix(G)
    result = mc.run_mcl(matrix, inflation=inflation)
    clusters = mc.get_clusters(result)

    index_df = df_vectorized.index
    clusters_recid = [index_df[np.array(cluster)] for cluster in clusters]
    for i, cluster in enumerate(clusters_recid):
        df_label.loc[cluster, 'Label'] = i

    most_frequentwords_dict = get_highest_idf_words_title(df_label, ntopics=ntopics)

    return df_adjacency_matrix, df_vectorized, df_label, most_frequentwords_dict

def export_node_list_withimage(encode_data, name_data, matrix_form=False,
                               thresh=5, inflation=1.5, header=True,
                               delim=',', label=None,
                               filename='node_list.csv'):
    print('building graph')
    if matrix_form:
        numnodes = encode_data.shape[0]
        matrix = encode_data
    else:
        numnodes = len(encode_data)
        positions = {i: encode_data[i] for i in range(numnodes)}
        # use networkx to generate the graph
        network = nx.random_geometric_graph(numnodes, thresh, pos=positions)
        # then get the adjacency matrix (in sparse form)
        matrix = nx.to_scipy_sparse_matrix(network)

    print('running mcl')
    result = mc.run_mcl(matrix, inflation=inflation)
    clusters = mc.get_clusters(result)
    mc.draw_graph(matrix, clusters, node_size=50, with_labels=False,
                  edge_color="silver")

    if label is None:
        label = [str(i) for i in range(numnodes)]

    f = open(filename, 'w')
    if header:
        f.write("Id,Label,Cluster-ID,image\n")
    i = 1
    for j in clusters:
        for node in j:
            pos = label[node].find('!')
            pos2 = name_data[node].find('RTW')
            sent = "\"" + label[node] + "\"" + delim + "\"" + \
                   label[node][:pos] + "\"" + delim + str(i) + delim + "\"" + \
                   name_data[node][pos2 + 4:-4] + ".png" + "\"" + "\n"
            f.write(sent)
        i = i + 1
    f.close()

def graph_clustering(graph, cluster_rate=1.5, draw=False):
    AS_Num = 0
    n_Matrix = nx.to_scipy_sparse_matrix(graph)
    result = mc.run_mcl(n_Matrix, inflation=cluster_rate)
    clusters = mc.get_clusters(result)
    print("Number of AS: " + str(len(clusters)))
    graph.graph['Total_AS'] = len(clusters)
    for c in clusters:
        for n_id in c:
            graph.add_node(n_id, AS_N=AS_Num)
        AS_Num += 1
    if draw:
        mc.draw_graph(n_Matrix, clusters, node_size=10, with_labels=False,
                      edge_color="black", width=0.2)
        plt.show()

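# Usage sketch for graph_clustering(), assuming networkx (< 3.0) as nx,
# markov_clustering as mc, and matplotlib.pyplot as plt (only needed when
# draw=True). Each node ends up tagged with its cluster id in 'AS_N'.
import networkx as nx

g = nx.barabasi_albert_graph(200, 2, seed=7)
graph_clustering(g, cluster_rate=1.8, draw=False)
print(g.graph['Total_AS'], g.nodes[0]['AS_N'])
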
def community_detection_per_batch(edge_index, batch, num_nodes,
                                  edge_attr=None, method='mcl'):
    # make the networkX graph
    g = nx.Graph()
    g.add_nodes_from(range(num_nodes))
    for iedge, (i, j) in enumerate(edge_index.transpose(0, 1).tolist()):
        if edge_attr is None:
            g.add_edge(i, j)
        else:
            g.add_edge(i, j, weight=edge_attr[iedge])

    num_batch = max(batch) + 1
    all_index = range(num_nodes)
    cluster, ncluster = [], 0

    for iB in range(num_batch):
        index = torch.tensor(all_index)[batch == iB].tolist()
        subg = g.subgraph(index)

        # detect the communities using the Louvain method
        if method == 'louvain':
            c = community.best_partition(subg)
            cluster += [v + ncluster for k, v in c.items()]
            ncluster = max(cluster)

        # detect communities using MCL
        elif method == 'mcl':
            matrix = nx.to_scipy_sparse_matrix(subg)
            result = mc.run_mcl(matrix)  # run MCL with default parameters
            mc_clust = mc.get_clusters(result)  # get clusters
            index = np.zeros(subg.number_of_nodes()).astype('int')
            for ic, c in enumerate(mc_clust):
                index[list(c)] = ic + ncluster
            cluster += index.tolist()
            ncluster = max(cluster)

        else:
            raise ValueError('Clustering method %s not supported' % method)

    # return
    device = edge_index.device
    return torch.tensor(cluster).to(device)

def community_detection(edge_index, num_nodes, edge_attr=None, method='mcl'):
    # make the networkX graph
    g = nx.Graph()
    g.add_nodes_from(range(num_nodes))
    for iedge, (i, j) in enumerate(edge_index.transpose(0, 1).tolist()):
        if edge_attr is None:
            g.add_edge(i, j)
        else:
            g.add_edge(i, j, weight=edge_attr[iedge])

    # get the device
    device = edge_index.device

    # detect the communities using Louvain detection
    if method == 'louvain':
        cluster = community.best_partition(g)
        return torch.tensor([v for k, v in cluster.items()]).to(device)

    # detect the communities using MCL detection
    elif method == 'mcl':
        #t0 = time()
        matrix = nx.to_scipy_sparse_matrix(g)
        #print('__ scipy %f' % (time() - t0))

        #t0 = time()
        result = mc.run_mcl(matrix)  # run MCL with default parameters
        #print('__ run %f' % (time() - t0))

        #t0 = time()
        clusters = mc.get_clusters(result)  # get clusters
        #print('__ cluster %f' % (time() - t0))

        #t0 = time()
        index = np.zeros(num_nodes).astype('int')
        for ic, c in enumerate(clusters):
            index[list(c)] = ic
        #print('__ process %f' % (time() - t0))

        return torch.tensor(index).to(device)

    else:
        raise ValueError('Clustering method %s not supported' % method)

def cluster(self, collection, vectorizer, sbert=False) -> List[Cluster]:
    articles = list(collection.articles())
    texts = ['{} {}'.format(a.title, a.text) for a in articles]
    if sbert:
        X = vectorizer.encode(texts, batch_size=128, show_progress_bar=True,
                              device='cpu', num_workers=24)
    else:
        try:
            X = vectorizer.transform(texts)
        except Exception:
            X = vectorizer.fit_transform(texts)
    times = [a.time for a in articles]

    print('temporal graph...')
    S = self.temporal_graph(X, times)
    #print('S shape:', S.shape)

    print('run markov clustering...')
    result = mc.run_mcl(S)
    print('done')
    idx_clusters = mc.get_clusters(result)
    idx_clusters.sort(key=lambda c: len(c), reverse=True)

    print(f'times: {len(set(times))} articles: {len(articles)} '
          f'clusters: {len(idx_clusters)}')

    clusters = []
    for c in idx_clusters:
        c_vectors = [X[i] for i in c]
        c_articles = [articles[i] for i in c]
        # check the vectors themselves, not the list wrapping them
        if sparse.issparse(c_vectors[0]):
            Xc = sparse.vstack(c_vectors)
            centroid = sparse.csr_matrix(Xc.mean(axis=0))
        else:
            Xc = np.vstack(c_vectors)
            centroid = np.mean(Xc, axis=0)
        cluster = Cluster(c_articles, c_vectors, centroid=centroid)
        clusters.append(cluster)

    return clusters

def export_cluster_dirs(encode_data, name_data, matrix_form=False, thresh=5,
                        inflation=1.5):
    print('building graph')
    if matrix_form:
        numnodes = encode_data.shape[0]
        matrix = encode_data
    else:
        numnodes = len(encode_data)
        positions = {i: encode_data[i] for i in range(numnodes)}
        # use networkx to generate the graph
        network = nx.random_geometric_graph(numnodes, thresh, pos=positions)
        # then get the adjacency matrix (in sparse form)
        matrix = nx.to_scipy_sparse_matrix(network)

    print('running mcl')
    result = mc.run_mcl(matrix, inflation=inflation)
    clusters = mc.get_clusters(result)

    i = 1
    import shutil, os
    try:
        shutil.rmtree("cluster")
    except OSError:
        pass
    os.makedirs("cluster")
    for j in clusters:
        if len(j) == 1:
            continue
        file = "cluster/{}".format(i)
        os.makedirs(file)
        for node in j:
            pos = name_data[node].find('RTW')
            shutil.copyfile(
                name_data[node],
                os.path.join(file, str(node) + name_data[node][pos + 4:]))
        i = i + 1

def clusters_mcl(evd, inf=1.1):
    # MCL clustering: create network
    evou_e = evd[["in_gene", "out_gene", "branch_support"]]
    evou_n = networkx.convert_matrix.from_pandas_edgelist(
        evou_e, source="in_gene", target="out_gene", edge_attr="branch_support")
    evou_n_nodelist = [node for i, node in enumerate(evou_n.nodes())]
    evou_m = networkx.to_scipy_sparse_matrix(evou_n, nodelist=evou_n_nodelist)

    # MCL clustering: run clustering
    mcl_m = markov_clustering.run_mcl(evou_m, inflation=inf)
    mcl_c = markov_clustering.get_clusters(mcl_m)

    # MCL clustering: save output
    mcl_c_clu = [i for i, cluster in enumerate(mcl_c) for node in cluster]
    mcl_c_noi = [node for i, cluster in enumerate(mcl_c) for node in cluster]
    clu = pd.DataFrame(
        {
            "node": [evou_n_nodelist[i] for i in mcl_c_noi],
            "cluster": mcl_c_clu,
        },
        columns=["node", "cluster"])
    clu["cluster"] = clu["cluster"].astype(str)
    return clu

def markov_cluster(mark_array):
    # Send square matrix to the Markov clustering algorithm
    result = mc.run_mcl(mark_array, inflation=1.5)
    clusters = mc.get_clusters(result)
    #print("results of cluster: ", result)
    print("clusters: ", clusters)

    cluster_array = np.asarray(clusters)
    print("size of cluster: ", cluster_array.shape)

    # Test to choose the best inflation value:
    # for inflation in [i / 10 for i in range(15, 26)]:
    #     result = mc.run_mcl(mark_array, inflation=inflation)
    #     clusters = mc.get_clusters(result)
    #     q = mc.modularity(matrix=result, clusters=clusters)
    #     print("Inflation: ", inflation, "modularity: ", q)
    #mc.draw_graph(mark_array, clusters, node_size=50, with_labels=True, edge_color="silver")
    print("Successful MCL")

def cluster_plot(adjacent, title, pos, inflation=1.3, filename=None,
                 labels=None, label_flag=True, node_size=150, figsize=(6, 6),
                 use_nodeaslabel=False, width=1):
    fig, ax = plt.subplots(1, 1, sharey=False, sharex=False, figsize=figsize)
    result = mc.run_mcl(adjacent, inflation=inflation)
    clusters = mc.get_clusters(result)  # get clusters
    plt.title(title)
    graph = nx.Graph(adjacent)
    clusters = complete_cluster(clusters, graph.nodes())
    if pos is None:
        pos = nx.spring_layout(graph, iterations=100)
    mc.draw_graph(adjacent, clusters, pos=pos, with_labels=False,
                  edge_color="silver", node_size=node_size, width=width)
    if labels is None:
        if use_nodeaslabel:
            labels = {n: n for ni, n in enumerate(graph.nodes())}
        else:
            labels = {n: ci for ci, c in enumerate(clusters) for n in c}
    if label_flag:
        nx.draw_networkx_labels(graph, pos, labels=labels)
    if filename is not None:
        plt.savefig(filename, bbox_inches='tight')
    else:
        plt.show(block=False)
    return clusters, pos, labels

def cluster_main(args: argparse.Namespace):
    paf = PAF.from_file(args.inpaf)
    G = paf_to_graph(paf, min_cov=args.min_cov)

    nodes = list(G.nodes())
    matrix = nx.to_scipy_sparse_matrix(G, nodelist=nodes)
    result = mc.run_mcl(matrix, expansion=args.expansion,
                        inflation=args.inflation)
    clusters = mc.get_clusters(result)
    named_clusters = [[nodes[i] for i in cl] for cl in clusters]

    for i, cluster_members in enumerate(named_clusters, 1):
        for member in cluster_members:
            print(f"{i}\t{member}", file=args.outfile)

    if args.plot is not None:
        plot_clusters(args.plot, matrix, clusters, args.plot_height,
                      args.plot_width, args.plot_dpi)
    return

def test_clustering_structure(n_runs=20):
    nmis_gt = []
    nmis_mcl = []
    nmis_louvain = []
    for run in range(n_runs):
        print("Run number {0}".format(run))
        ensemble_density_huge("file.csv", "\\t")
        dist_dense = pd.read_csv("./matrix.csv", delimiter="\t",
                                 header=None).values
        dist_dense = dist_dense[:, :-1]
        scaler = QuantileTransformer(n_quantiles=10)
        dist_dense_scaled = scaler.fit_transform(dist_dense)
        results_dense = TSNE(metric="precomputed").fit_transform(dist_dense_scaled)

        model_kmeans = KMeans(n_clusters=len(set(true)))
        labels_dense_kmeans = model_kmeans.fit_predict(results_dense)

        clusters_mcl = [0 for _ in range(len(adj))]
        result_mcl = mc.run_mcl(adj)  # run MCL with default parameters
        clusters = mc.get_clusters(result_mcl)  # get clusters
        cid = 0
        for cluster in clusters:
            for j in cluster:
                clusters_mcl[j] = cid
            cid += 1

        partition = louvain.best_partition(G)
        labels_louvain = [v for k, v in partition.items()]

        nmis_gt.append(nmi(labels_dense_kmeans, true, average_method="arithmetic"))
        nmis_mcl.append(nmi(clusters_mcl, true, average_method="arithmetic"))
        nmis_louvain.append(nmi(labels_louvain, true, average_method="arithmetic"))

    print("GT : {0}, {1}".format(np.mean(nmis_gt), np.std(nmis_gt)))
    print("MCL : {0}, {1}".format(np.mean(nmis_mcl), np.std(nmis_mcl)))
    print("Louvain : {0}, {1}".format(np.mean(nmis_louvain), np.std(nmis_louvain)))
    return (nmis_gt, nmis_mcl, nmis_louvain)

def RunMCL(graph):
    """ run Markov clustering once """
    # get adjacency matrix and mapping, then run mcl
    adj_mat, idx_to_token_mapping = AdjacencyMatrix(graph)
    adj_mat = adj_mat.toarray()  # from sparse to np array
    mcl_clustering = mc.run_mcl(adj_mat, inflation=2)
    clusters = mc.get_clusters(mcl_clustering)

    # get token representation of the clusters
    token_clusters = list()
    for cl in clusters:
        cl_tokens = list()
        for idx in cl:
            cl_tokens.append(idx_to_token_mapping[idx])
        token_clusters.append(cl_tokens)

    # check if repeating nodes exist due to isomorphic graph structures
    num_nodes_in_clusters = sum(len(c) for c in token_clusters)
    if graph.number_of_nodes() != num_nodes_in_clusters:
        token_clusters, modularity = EnforceOneToOneMapping(graph, token_clusters)
    else:
        modularity = Modularity(graph, token_clusters)

    return token_clusters, modularity

def find_clusters(embeddings, similarity_threshold, inflation):
    """
    Identify clusters within embedded data.

    :param embeddings: embedded data
    :type embeddings: numpy array-like
    :param similarity_threshold: Can be used to tune the clustering
        performance.
    :type similarity_threshold: real number
    :param inflation: Markov clustering inflation. Used to control the
        granularity of the clustering. Low values give fewer, larger
        clusters; higher values give more, smaller clusters.
    :type inflation: real number
    :return: The identified clusters, sorted by size (largest first)
    :rtype: list
    """
    dists = distances_from_embeddings(embeddings)
    similarity = similarity_from_dists(dists, similarity_threshold)
    results = mc.run_mcl(similarity, inflation=inflation)
    clusters = sorted(mc.get_clusters(results), key=lambda x: len(x), reverse=True)
    return clusters

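# find_clusters() relies on the distances_from_embeddings() and
# similarity_from_dists() helpers, which are not shown here. This sketch
# reproduces the same idea directly, under the assumption that the helpers
# build a thresholded similarity matrix: pairwise distances are turned into
# similarities, distant pairs are zeroed out, and the result goes to
# mc.run_mcl().
import numpy as np
import markov_clustering as mc
from scipy.spatial.distance import cdist

embeddings = np.random.default_rng(3).random((50, 8))
dists = cdist(embeddings, embeddings)
similarity = np.where(dists < 0.5, 1.0 - dists, 0.0)  # 0.5 plays the role of similarity_threshold
np.fill_diagonal(similarity, 0)

results = mc.run_mcl(similarity, inflation=1.8)
clusters = sorted(mc.get_clusters(results), key=len, reverse=True)
print(len(clusters), "clusters; largest has", len(clusters[0]), "points")
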