def compute_eval_metrics(G, array_connection, clustering, labels=None):
    """Evaluate a clustering of `G`, optionally against ground-truth labels.

    Parameters
    ----------
    G : NetworkX graph
        Graph whose partition is evaluated.
    array_connection : adjacency matrix
        Forwarded unchanged to ``modularity_density``.
    clustering : indexable
        Community label of every node. When `labels` is given it is looked
        up by node key and re-ordered to follow the node order of `G`.
    labels : indexable, optional
        Ground-truth class of every node, looked up by node key.

    Returns
    -------
    tuple
        ``(modularity, modularity_density, n_communities)`` when `labels`
        is None. Otherwise ``(structural, (adjusted_rand,
        adjusted_mutual_info))`` where ``structural`` is the triple above.
    """
    if labels is not None:
        # Imported lazily: scikit-learn is only needed for this branch.
        from sklearn.metrics import (adjusted_mutual_info_score,
                                     adjusted_rand_score)

        # Iterating a graph yields its node keys on every NetworkX release;
        # the ``G.node`` attribute used here before was removed in 2.4.
        node_keys = list(G)
        clustering = [clustering[key] for key in node_keys]
        labels = np.array([labels[key] for key in node_keys], dtype=int)

    communities = np.unique(clustering)
    structural = (
        nx.community.quality.modularity(G, get_partitions(G, clustering)),
        modularity_density(array_connection, clustering, communities),
        len(communities),
    )

    if labels is None:
        return structural

    supervised = (
        adjusted_rand_score(labels, clustering),
        adjusted_mutual_info_score(labels, clustering),
    )
    return (structural, supervised)
def detect(rel_df, resolution=0.0, density=False):
    """Detect communities in the graph described by an edge-list DataFrame.

    Parameters
    ----------
    rel_df : pandas.DataFrame
        Edge list. Column names are lower-cased on a copy before use; the
        result must provide ``weight`` and ``change`` columns plus the
        endpoint columns expected by ``nx.from_pandas_edgelist`` (its
        defaults are ``source`` and ``target``).
    resolution : float, optional
        Resolution ``r`` forwarded to ``fine_tuned_clustering_q`` and
        ``metrics.modularity_r``.
    density : bool, optional
        When true, cluster with ``fine_tuned_clustering_qds`` instead of
        ``fine_tuned_clustering_q``. Defaults to False, which was the only
        reachable mode before this flag became a parameter.

    Returns
    -------
    dict
        ``'graph'`` (the graph with original node labels, each node tagged
        with a ``modularity_class`` string attribute), ``'communities'``
        (the label array), ``'Q'`` and ``'D'`` (the two quality scores).
    """
    df = rel_df.copy(deep=True)
    df = df.rename(mapper=lambda name: name.lower(), axis='columns')

    G_original = nx.from_pandas_edgelist(df, edge_attr=['weight', 'change'])
    # The clustering routines work on integer node ids.
    G = nx.relabel.convert_node_labels_to_integers(G_original)

    # NOTE: the graph is built from the edge list only, so an entity that
    # appears in no edge is not represented in the result.
    adj = nx.to_scipy_sparse_matrix(G)

    if density:
        c = fine_tuned_clustering_qds(G)
    else:
        c = fine_tuned_clustering_q(G, r=resolution)

    community_ids = np.unique(c)
    Q = metrics.modularity_r(adj, c, community_ids, r=resolution)
    D = metrics.modularity_density(adj, c, community_ids)

    print("Communities:", c)
    print("#comms: ", len(community_ids), 'given by', community_ids)
    print("Modularity Q:", Q)
    print("Modularity D:", D)
    print("Mode: ", 'D' if density else 'Q')

    # Position i of `c` is taken to describe the i-th node of G_original
    # (assumes the integer relabelling kept the node order -- its default).
    # ``add_node`` on an existing node only updates its attributes.
    for i, node in enumerate(G_original.nodes()):
        G_original.add_node(node, modularity_class=str(c[i]))

    return {
        'graph': G_original,
        'communities': c,
        'Q': Q,
        'D': D,
    }
def forced_split_communities_qds(adj, c, cluster_size, normalize,
                                 evd_method, tolerence, seed):
    """Force-split every community that is larger than the threshold.

    Each oversized, connected community is bipartitioned along its Fiedler
    vector; among the cuts that leave both halves connected, the one with
    the highest modularity density is kept.

    Parameters
    ----------
    adj : SciPy sparse matrix (csr or csc)
        The N x N Adjacency matrix of the graph.
    c : Integer array
        Current array of community labels for the nodes in the graph as
        ordered by the adjacency matrix.
    cluster_size : integer
        Threshold/maximum size (number of nodes) of a cluster.
    normalize : bool
        Whether the normalized Laplacian matrix is used.
    evd_method : string
        Method of eigenvalue computation. It should be one of 'tracemin'
        (TraceMIN), 'lanczos' (Lanczos iteration) and 'lobpcg' (LOBPCG).
    tolerence : float
        Tolerance of relative residual in eigenvalue computation.
    seed : integer, random_state, or None
        Indicator of random number generation state.

    Returns
    -------
    Integer array
        Array of community labels, as a result of splitting, for the nodes
        in the graph as ordered by the adjacency matrix. `c` itself is not
        modified.
    """
    # Array of unique community labels
    unique_clusters = np.unique(c)

    # Boolean membership mask of each community
    dict_bool = {label: (c == label) for label in unique_clusters}

    # Labels of the communities adjacent to each community
    dict_connected = {label: set() for label in unique_clusters}

    for i, comm1 in enumerate(unique_clusters[:-1]):
        adj_comm1 = adj[dict_bool[comm1]]
        for comm2 in unique_clusters[i + 1:]:
            # Total edge weight between 'comm1' and 'comm2'
            indicator = dict_bool[comm2].astype(int)
            if adj_comm1.dot(indicator).sum() != 0:
                dict_connected[comm1].add(comm2)
                dict_connected[comm2].add(comm1)

    # NOTE(review): 'dict_bool' and 'dict_connected' describe the input
    # labelling and are not refreshed after a community has been split;
    # confirm that 'modularity_density' tolerates this for later splits.

    # Work on a copy of the cluster labels
    c_new = c.copy()

    for cluster_num in unique_clusters:
        bool_r = dict_bool[cluster_num]

        # Subgraph induced by the nodes of 'cluster_num'
        sub_adj = adj[bool_r].T[bool_r]
        g = nx.from_scipy_sparse_matrix(sub_adj)
        len_g = len(g)

        # Connectivity is tested once and reused for warning and guard
        connected = nx.is_connected(g)
        if not connected:
            print("Warning: Check your data as an earlier iteration "
                  "resulted in a cluster with "
                  "internal disconnected components")

        # Skip singletons, communities within the threshold and
        # communities that are internally disconnected
        if len_g == 1 or len_g <= cluster_size or not connected:
            continue

        # Fiedler vector of the subgraph 'g'
        f_vector = fiedler_vector(g, weight='weight',
                                  normalized=normalize, tol=tolerence,
                                  method=evd_method, seed=seed)

        # Local node indices in decreasing order of their Fiedler entry
        ranked = sorted(zip(f_vector, np.arange(len_g)), reverse=True)
        nodeIds = [index for _, index in ranked]

        # 'c_new' is untouched until the final choice, so the new label and
        # the neighbourhood of the two halves are the same for every cut
        new_label = c_new.max() + 1
        conn_clusters = np.array(
            list(dict_connected[cluster_num] | {cluster_num, new_label}))

        # Labels of the nodes of 'cluster_num', in subgraph order
        c_sub = np.zeros(len_g, dtype=int)

        # Scratch copies used to score each candidate cut
        c_latest = c_new.copy()
        dict_bool_copy = dict_bool.copy()

        # Maps modularity density -> cut position (ties keep the last cut)
        split_info = {}

        for j in range(len(nodeIds) - 1):
            first_community = nodeIds[:j + 1]
            second_community = nodeIds[j + 1:]

            # Only cuts that leave both halves connected are admissible
            g1 = g.subgraph(first_community)
            g2 = g.subgraph(second_community)
            if not (nx.is_connected(g1) and nx.is_connected(g2)):
                continue

            # Relabel the two halves and project them onto the full graph
            c_sub[first_community] = cluster_num
            c_sub[second_community] = new_label
            c_latest[bool_r] = c_sub
            dict_bool_copy[cluster_num] = (c_latest == cluster_num)
            dict_bool_copy[new_label] = (c_latest == new_label)

            # Modularity density after this candidate split
            div_metric = modularity_density(adj, c_latest,
                                            np.unique(c_sub),
                                            dict_bool_copy, conn_clusters)
            split_info[div_metric] = j

        # Delete to save memory
        del c_latest
        del dict_bool_copy

        if not split_info:
            print("No split possible for cluster num: {}, "
                  "as any further split results in disconnected modules"
                  .format(cluster_num))
            continue

        # Apply the admissible cut with the highest modularity density
        best_split = split_info[max(split_info)]
        c_sub[nodeIds[:best_split + 1]] = cluster_num
        c_sub[nodeIds[best_split + 1:]] = new_label
        c_new[bool_r] = c_sub

    # Community labels after splitting, ordered as the adjacency matrix
    return c_new
def constrained_merge_communities_qds(adj, c, cluster_size):
    """Merge pairs of communities when that improves modularity density.

    A pair is only considered when the two communities are connected and
    the merged community would not exceed the size threshold. Candidate
    merges are applied greedily, best improvement first, and a community
    takes part in at most one merge per call.

    Parameters
    ----------
    adj : SciPy sparse matrix (csr or csc)
        The N x N Adjacency matrix of the graph.
    c : Integer array
        Current array of community labels for the nodes in the graph as
        ordered by the adjacency matrix.
    cluster_size : integer
        Threshold/maximum size (number of nodes) of a cluster.

    Returns
    -------
    Integer array
        Array of community labels, as a result of merging, for the nodes in
        the graph as ordered by the adjacency matrix. `c` itself is not
        modified.
    """
    # Array of unique community labels
    unique_clusters = np.unique(c)

    # Boolean membership mask of each community
    dict_bool = {label: (c == label) for label in unique_clusters}

    # Labels of the candidate communities adjacent to each community
    dict_connected = {label: set() for label in unique_clusters}

    # Candidates: communities below the size threshold that have at least
    # one edge leaving them. The cheap size test runs first.
    candidates = []
    for comm in unique_clusters:
        members = dict_bool[comm]
        if np.count_nonzero(members) >= cluster_size:
            continue
        outside = (~members).astype(int)
        if adj[members].dot(outside).sum() != 0:
            candidates.append(comm)
    unique_clusters2 = np.array(candidates)

    # Record which candidate communities are connected to each other
    for i, comm1 in enumerate(unique_clusters2[:-1]):
        adj_comm1 = adj[dict_bool[comm1]]
        for comm2 in unique_clusters2[i + 1:]:
            indicator = dict_bool[comm2].astype(int)
            if adj_comm1.dot(indicator).sum() != 0:
                dict_connected[comm1].add(comm2)
                dict_connected[comm2].add(comm1)

    # Contribution of each candidate community to modularity density,
    # stored in the same order as 'unique_clusters2'
    comm_metric = np.array([
        modularity_density(adj, c, [cluster_num], dict_bool,
                           np.array(list(dict_connected[cluster_num])))
        for cluster_num in unique_clusters2
    ])

    # Maps improvement in modularity density -> (comm1, comm2).
    # NOTE(review): two pairs with exactly the same improvement share a
    # key, so only the pair visited last is kept.
    merging_info = {}

    for i, comm1 in enumerate(unique_clusters2[:-1]):
        metric_1 = comm_metric[i]
        bool_1 = dict_bool[comm1]

        for k, comm2 in enumerate(unique_clusters2[i + 1:], start=i + 1):
            # Only connected pairs may merge
            if comm2 not in dict_connected[comm1]:
                continue

            # The merged community must respect the size threshold
            merged_mask = bool_1 | dict_bool[comm2]
            if np.count_nonzero(merged_mask) > cluster_size:
                continue

            metric_2 = comm_metric[k]
            merged_label = min(comm1, comm2)

            # Labelling and masks as they would look after the merge
            c_latest = c.copy()
            c_latest[merged_mask] = merged_label
            dict_bool_copy = dict_bool.copy()
            dict_bool_copy[merged_label] = merged_mask

            # Neighbours of the merged community, excluding its two parts
            conn_clusters = np.array(list(
                (dict_connected[comm1] | dict_connected[comm2])
                - {comm1, comm2}))

            # Change in modularity density caused by the merge
            div_metric = modularity_density(adj, c_latest,
                                            np.array([merged_label]),
                                            dict_bool_copy,
                                            conn_clusters) - \
                (metric_1 + metric_2)

            # Keep the merge only if it improves modularity density
            if div_metric > 0:
                merging_info[div_metric] = (comm1, comm2)

            # Delete to save memory
            del c_latest
            del dict_bool_copy

    # Work on a copy of the cluster labels
    c_new = c.copy()

    # Communities that have already taken part in a merge
    merged = set()

    # Apply the merges greedily, best improvement first
    for div_metric in sorted(merging_info, reverse=True):
        comm1, comm2 = merging_info[div_metric]
        if comm1 in merged or comm2 in merged:
            continue
        merged.update((comm1, comm2))
        c_new[dict_bool[comm1] | dict_bool[comm2]] = min(comm1, comm2)

    # Community labels after merging, ordered as the adjacency matrix
    return c_new
def mapname(name):
    """Print a column name and return it lower-cased."""
    print(name)
    return name.lower()


# Load the edge list and normalise its column names to lower case.
df = pd.read_csv('cc9_rel_undirected_nozeroes.csv')
df = df.rename(mapper=mapname, axis='columns')
print(df)

# Build the graph; 'weight' and 'change' are carried as edge attributes.
# (nx.les_miserables_graph() can be swapped in here as a built-in test graph.)
G = nx.from_pandas_edgelist(df, edge_attr=['weight', 'change'])
G = nx.relabel.convert_node_labels_to_integers(G)
print(G)
adj = nx.to_scipy_sparse_matrix(G)

# List the nodes of every connected component.
# nx.connected_component_subgraphs was removed in NetworkX 2.4; building the
# subgraph from nx.connected_components is the documented replacement.
for component in nx.connected_components(G):
    gr = G.subgraph(component)
    # Nodes of the subgraph 'gr'
    nodes_gr = list(gr)
    print(nodes_gr)

# Cluster the whole graph and report both quality scores.
c = fine_tuned_clustering_q(G)
print(c)
Q = metrics.modularity_r(adj, c, np.unique(c), r=0)
D = metrics.modularity_density(adj, c, np.unique(c))
print("Modularity Q:", Q)
print("Modularity D:", D)