def evaluate_conductance(graph: nx.Graph, subgraphs, tau):
    """Return the lowest CoreCut and vanilla conductance over a clustering.

    For every cluster ``S`` with complement ``S^c`` in ``graph``::

        vanilla  = cut(S, S^c) / min(vol(S), vol(S^c))
        core_cut = (cut(S, S^c) + tau / |V| * |S| * |S^c|)
                   / (min(vol(S), vol(S^c)) + tau * |S|)

    :param graph: the graph being evaluated
    :param subgraphs: mapping whose values are the node collections of the
        K clusters (the keys are ignored)
    :param tau: tuning parameter; with tau = 0 the core cut reduces to the
        vanilla conductance
    :return: ``(core_cut, vanilla_conductance)``, each the minimum over all
        clusters
    :raises ValueError: if ``subgraphs`` is empty (``min`` of an empty list)
    :raises ZeroDivisionError: if a cluster or its complement has zero volume
    """
    # Loop invariants: the node set and node count never change per cluster.
    all_nodes = set(graph)
    node_count = len(graph)

    vanilla_conductances = []
    core_cuts = []
    for nodes in subgraphs.values():
        # Keep only nodes that really are in the graph (the filtering that
        # graph.subgraph() performed) without copying a whole subgraph.
        members = all_nodes.intersection(nodes)
        complement = all_nodes - members

        cut = nx.cut_size(graph, members, complement)
        volume_div = min(nx.volume(graph, members),
                         nx.volume(graph, complement))

        vanilla_conductances.append(cut / volume_div)
        core_cuts.append(
            (cut + ((tau / node_count) * len(members) * len(complement)))
            / (volume_div + (tau * len(members))))

    vanilla_conductance = min(vanilla_conductances)
    core_cut = min(core_cuts)
    logging.debug('Vanilla graph conductance: %f', vanilla_conductance)
    logging.debug('CoreCut graph conductance: %f', core_cut)
    return core_cut, vanilla_conductance
def self_entropy(G, T, alpha):
    """
    Estimate the entropy contributed by one node of a codetree.

    param G: the given graph
    param T: a codetree of the given graph G
    param alpha: the id of a (non-root) node in T
    return: the self entropy of the node with id alpha; 0 (after printing an
            error message) when alpha is the root of T
    """
    if T.get_node(alpha).is_root():
        print("Error paramator in function self_entropy: alpha is illegel.")
        return 0

    parent_id = T.get_node(alpha).bpointer

    # Graph vertices covered by the node and by its parent are the
    # identifiers of the leaves below each of them.
    own_leaves = [leaf.identifier for leaf in T.leaves(alpha)]
    parent_leaves = [leaf.identifier for leaf in T.leaves(parent_id)]

    # Cut between the node's vertices and every other vertex of G.
    outside = list(set(nx.nodes(G)).difference(set(own_leaves)))
    g_node = nx.cut_size(G, own_leaves, outside)

    v_G = nx.volume(G, nx.nodes(G))
    v_node = nx.volume(G, own_leaves)
    v_parent = nx.volume(G, parent_leaves)

    return -g_node / v_G * math.log2(v_node / v_parent)
def normalized_cut_size(G, c1, c2, weight=None):
    """Returns the normalized cut size between two containers of nodes.

    The normalized cut size is the cut size times the sum of the
    reciprocals of the volumes of the two node sets. [1]

    Parameters
    ----------
    G : NetworkX Graph

    c1, c2 : container
        containers of nodes

    weight : keyword, optional
        default=None
        keyword for weight on edges

    Returns
    -------
    normalized_cut_size : float

    Raises
    ------
    ZeroDivisionError
        If the volume of `c1` or of `c2` is zero.

    See Also
    --------
    cut_size
    volume
    conductance
    expansion

    References
    ----------
    .. [1] David Gleich.
           'Hierarchical Directed Spectral Graph Partitioning'.
           Website report.
           http://www.stanford.edu/~dgleich/publications/directed-spectral.pdf
    """
    # volume() needs the graph as its first argument; the previous code
    # passed the node container as the graph and the weight key as the nodes.
    volume_c1 = nx.volume(G, c1, weight=weight)
    volume_c2 = nx.volume(G, c2, weight=weight)
    cut = nx.cut_size(G, c1, c2, weight=weight)
    return cut * (1. / volume_c1 + 1. / volume_c2)
def prefilter_true_variant(G):
    """Return positions of nodes whose weighted/unweighted volume ratio is below the median.

    For each node the "weight"-weighted volume is divided by the unweighted
    volume. The result holds integer positions in the iteration order of
    ``set(G.nodes())``, not node identifiers.

    NOTE(review): a node with zero unweighted volume raises
    ZeroDivisionError here — confirm callers never pass isolated nodes.
    """
    ratios = []
    for node in set(G.nodes()):
        weighted = nx.volume(G, {node}, "weight")
        unweighted = nx.volume(G, {node})
        ratios.append(weighted / unweighted)

    cutoff = np.median(ratios)
    below_cutoff = np.where(ratios < cutoff)
    return below_cutoff[0]
def compute_vanilla_sc(G, resf, debug=False):
    """Sweep a node sequence for the prefix with the lowest conductance.

    The second returned eigenvector of the normalized Laplacian is scaled by
    ``1 / sqrt(degree)``, nodes are sorted by that value, the sequence is
    shuffled, and every prefix with non-zero volume on both sides is scored
    with ``nx.algorithms.conductance``.

    :param G: graph to partition; every node needs a non-zero degree
        (``1 / sqrt(degree)`` is evaluated for each node)
    :param resf: open text stream that timing and result lines are printed to
    :param debug: unused; kept for interface compatibility
    :return: ``(min_conduct, min_seq)``; ``min_seq`` is ``None`` and
        ``min_conduct`` is 1 when no prefix has conductance below 1
    """
    degree_of = G.degree()  # build the degree view once, not once per node
    D = np.array([1 / math.sqrt(degree_of[node]) for node in G.nodes()])
    L = nx.normalized_laplacian_matrix(G)

    start_time = time.time()
    # NOTE(review): eigsh defaults to which='LM' (largest magnitude), so
    # column 1 may not be the Fiedler vector — confirm this is intended.
    vals, vecs = sp_linalg.eigsh(L, k=8)
    print("vals", vals)
    elapsed = time.time() - start_time
    print("train vanilla time", elapsed, file=resf)
    # NOTE(review): `dict` is presumably a module-level results mapping that
    # shadows the builtin; item assignment on the builtin type would fail.
    dict['van_running_time'] = elapsed

    vecs = vecs[:, 1]  # second eigenvector
    y_vecs = D * vecs

    yns = sorted(([y_vecs[i], n] for i, n in enumerate(G.nodes())),
                 key=lambda tup: tup[0])
    total_seq = [el[1] for el in yns]
    # NOTE(review): shuffling discards the spectral ordering computed above —
    # confirm this is a deliberate baseline.
    random.shuffle(total_seq)

    min_conduct = 1
    # Initialised so the prints/return below cannot hit an unbound local when
    # no prefix improves on the initial conductance of 1.
    min_cardinal = None
    min_seq = None
    for i in tqdm(range(len(y_vecs) - 1),
                  mininterval=10,
                  leave=False,
                  desc='  - (Vanilla)   '):
        seq = total_seq[:(i + 1)]
        rest_seq = total_seq[(i + 1):]
        if nx.volume(G, seq) != 0 and nx.volume(G, rest_seq) != 0:
            conduct = nx.algorithms.conductance(G=G, S=seq)
            if conduct < min_conduct:
                min_cardinal = min(len(seq), len(rest_seq))
                min_conduct = conduct
                min_seq = seq  # the slice is already a fresh list
        else:
            print("Volume is 0")
            print("seq", seq)

    print("train vanilla min_conduct", min_conduct, file=resf)
    print("train vanilla min_cardinal", min_cardinal, file=resf)
    dict['van_balance'] = min_cardinal
    return min_conduct, min_seq
def calc_sum_conductance(column_name):
    """Sum the conductance of every cluster labelled in ``column_name``.

    Reads the module-level ``df_with_clusters_classified`` (cluster labels
    and a 'node ID' column), graph ``G`` and node set ``nodes_G``. Clusters
    whose conductance denominator is zero are skipped.
    """
    labels = df_with_clusters_classified[column_name]
    total = 0
    for cluster_name in list(set(labels)):
        members = df_with_clusters_classified[labels == cluster_name]
        S = set(members['node ID'].values)
        try:
            cut = nx.cut_size(G, S)
            smaller_volume = min(nx.volume(G, S), nx.volume(G, nodes_G - S))
            conductance = cut / smaller_volume
        except ZeroDivisionError:
            continue
        total += conductance
    return total
def getCommuteDistace(G):
    """
    Returns the matrix of commute distance

    Entry (i, j), for i != j, is
    ``vol * (P[i, i] + P[j, j] - 2 * P[i, j])`` where ``P`` is the
    pseudo-inverse of ``L + (1/n) * ones`` (``L`` the graph Laplacian) and
    ``vol`` the volume of all nodes. The diagonal is zero and the matrix is
    made symmetric by mirroring its upper triangle. Rows/columns follow the
    order of ``G.nodes``.
    """
    verts = list(G.nodes)
    n = len(verts)
    vol = nx.volume(G, verts)

    # use NetworkX to get Laplacian
    L = nx.laplacian_matrix(G).todense()
    Gamma = L + (1/n) * np.ones([n, n])

    # get Moore-Penrose pseudo inverse (as a plain ndarray for broadcasting)
    Gamma_pinv = np.asarray(np.linalg.pinv(Gamma, rcond=1e-4))

    # Same arithmetic as the former element-by-element double loop,
    # evaluated for all pairs at once.
    diag = np.diag(Gamma_pinv)
    all_pairs = vol * (diag[:, None] + diag[None, :] - 2 * Gamma_pinv)

    # Only i < j entries are computed directly; the lower triangle mirrors
    # them and the diagonal stays zero.
    upper = np.triu(all_pairs, k=1)
    return upper + upper.T
def score_conductance(nodes, graph):
    """Score a candidate community by inverse conductance, or 0 if it is rejected.

    A candidate is rejected (score 0) when it has fewer than 4 nodes, the
    graph has no edges, fewer than 4 of its nodes are in the graph, or its
    average degree — in the full graph or inside the induced subgraph — is
    below ``sqrt(number of member nodes)``.

    :param nodes: set of candidate node identifiers
    :param graph: NetworkX graph
    :return: ``inverse_conductance(graph, members)`` for accepted candidates,
        0 otherwise (including when scoring raises)
    """
    if len(nodes) < 4:
        return 0
    # optim
    if len(graph.edges) == 0:
        return 0

    nodes_in_graph = nodes & set(graph.nodes())
    member_count = len(nodes_in_graph)
    if member_count < 4:
        return 0

    threshold = np.sqrt(member_count)

    total_degree = nx.volume(graph, nodes_in_graph)
    if total_degree / member_count < threshold:
        return 0

    subgraph = nx.subgraph(graph, nodes_in_graph)
    avg_deg = np.average([val for (node, val) in subgraph.degree()])
    if avg_deg < threshold:
        return 0

    try:
        # inverse_conductance takes (G, S); the former third argument raised a
        # TypeError that the bare except turned into a constant score of 0.
        return inverse_conductance(graph, nodes_in_graph)
    except Exception:
        # Best-effort scoring: any failure counts as "no score", but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return 0
def inverse_conductance(G, S):
    """Return ``1 - cut(S, T) / min(vol(T) + |T|, vol(S))`` with ``T = V \\ S``.

    Cut size and volumes use the "weight" edge attribute.

    :param G: NetworkX graph
    :param S: collection of nodes forming the community
    :return: the score, or 0 when it is undefined: ``S`` covers every node
        of ``G``, or the smaller volume is zero
    """
    weight = "weight"
    T = set(G) - set(S)
    if len(T) == 0:
        # if all nodes in the community, bad conductance (avoid /0)
        return 0

    volume_S = nx.volume(G, S, weight=weight)
    # If only a few nodes outside the community, poor score (trivial solution),
    # but if many nodes outside the community, return good score.
    volume_T = nx.volume(G, T, weight=weight) + len(T)

    smaller_volume = min(volume_T, volume_S)
    if smaller_volume == 0:
        # volume_S can still be zero (e.g. only isolated nodes in S); treat it
        # like the other degenerate case instead of dividing by zero.
        return 0

    num_cut_edges = nx.cut_size(G, S, T, weight=weight)
    return 1 - num_cut_edges / smaller_volume
def getAmp(G):
    """
    Returns the matrix of amplified commute distance

    For every node pair i < j (in ``G.nodes`` order) the entry is
    ``c_ij / vol - 1/d_i - 1/d_j + 2/(d_i * d_j)`` where
    ``c_ij = vol * (e_i - e_j)^T L^+ (e_i - e_j)``, ``L^+`` is the
    pseudo-inverse of the Laplacian and ``d`` are node degrees. The matrix is
    symmetric with a zero diagonal.
    """
    verts = list(G.nodes)
    n = len(verts)
    vol = nx.volume(G, verts)

    # use NetworkX to get Laplacian, then its Moore-Penrose pseudo inverse
    L = nx.laplacian_matrix(G).todense()
    L_MP_pseudo_inv = np.linalg.pinv(L, rcond=1e-5, hermitian=True)

    C_AMP = np.zeros([n, n])
    for i in tqdm(range(n-1)):
        deg_i = G.degree(verts[i])
        for j in range(i+1, n):
            deg_j = G.degree(verts[j])

            # Indicator difference e_i - e_j as a column vector.
            diff = np.zeros([n, 1])
            diff[i, 0] = 1
            diff[j, 0] = -1

            projected = L_MP_pseudo_inv @ diff
            c_ij = (vol * (np.transpose(diff) @ projected))[0, 0]

            amplified = (c_ij / vol) - (1/deg_i) - (1/deg_j) + (2/(deg_i * deg_j))
            C_AMP[i, j] = amplified
            C_AMP[j, i] = amplified
    return C_AMP
def calc_sum_ncut(column_name):
    """Sum ``cut_size(S) / volume(S)`` over every cluster labelled in ``column_name``.

    Reads the module-level ``df_with_clusters_classified`` (cluster labels
    and a 'node ID' column) and graph ``G``. Clusters with zero volume are
    skipped.
    """
    labels = df_with_clusters_classified[column_name]
    total = 0
    for cluster_name in list(set(labels)):
        members = df_with_clusters_classified[labels == cluster_name]
        S = set(members['node ID'].values)
        try:
            cut = nx.cut_size(G, S)
            ncut = cut / nx.volume(G, S)
        except ZeroDivisionError:
            continue
        total += ncut
    return total
def get_corecut(G, S, tau, n):
    """Return the CoreCut value of node set ``S``.

    Computes ``(cut + tau/n * |S| * (n - |S|)) / (vol + tau * |S|)`` where
    ``cut`` and ``vol`` are the cut size and volume of ``S`` in ``G``. When
    the denominator is zero the terms are printed and 1 is returned.

    :param G: NetworkX graph
    :param S: sized collection of nodes
    :param tau: regularisation parameter
    :param n: node count used for the complement size and the tau/n factor
    """
    vol = nx.volume(G, S)
    cut = nx.cut_size(G, S)

    inside = len(S)
    outside = n - inside

    up = cut + (tau / n) * inside * outside
    down = vol + tau * inside

    if down != 0:
        return up / down

    print("cut: {cut} , up: {up}, vol: {vol}, down: {down} ".format(
        cut=cut, up=up, vol=vol, down=down))
    return 1
def shannon_entropy(G):
    """
    shannon_entropy is a function that estimates the 1-dimensional structural
    entropy (i.e. Shannon entropy) of graph G

    param G: graph
    return: the entropy and the structure None. Nodes of degree 0 contribute
            nothing (0 * log 0 is taken as 0), and a graph with zero volume
            has entropy 0.

    note: defined by Angsheng Li and Yicheng Pan in [1], definition 1.

    [1] Angsheng Li, Yicheng Pan: Structural Information and Dynamical
        Complexity of Networks. IEEE Trans. Information Theory 62(6):
        3290-3339 (2016)
    """
    vol = nx.volume(G, nx.nodes(G))
    if vol == 0:
        # No edges: every p would be 0/0; report zero entropy instead.
        return 0, None

    entropy = 0
    for node in nx.nodes(G):
        degree = nx.degree(G, node)
        if degree == 0:
            # math.log2(0) raises; an isolated node adds no entropy.
            continue
        p = degree / vol
        entropy += -p * math.log2(p)
    return entropy, None
def compute_cond(G, normalized):
    """Sweep vertices by descending score and return the best conductance cut.

    Vertices are visited in order of decreasing ``normalized`` value and moved
    from side ``B`` to side ``A`` while the volumes of both sides and the
    number of crossing edges are updated incrementally.

    :param G: NetworkX graph
    :param normalized: sequence of per-vertex scores; the position in the
        sequence is used as the vertex identifier
        (assumes nodes are labelled 0..len(normalized)-1 — TODO confirm)
    :return: ``(min_cond, min_set)``; ``(sys.maxsize, None)`` if no cut was
        evaluated

    NOTE(review): the nesting below was reconstructed from a collapsed
    one-line source; in particular, evaluating ``cond`` only in the
    "value changed" branch is the most plausible reading — confirm against
    the original file.
    """
    # (position, score) pairs, highest score first.
    indices = list(enumerate(normalized))
    indices = sorted(indices, key = lambda x : x[1], reverse = True)
    B = set(list(G.nodes))
    A = set()
    min_cond = sys.maxsize
    min_set = None
    volA = 0
    volB = nx.volume(G, G)
    cross = 0
    prev = None
    for tup in indices:
        vertex = tup[0]
        val = tup[1]
        if prev == None:
            # NOTE(review): the first vertex only seeds `prev`; it is never
            # moved into A — confirm this is intended.
            prev = val
        else:
            if prev == val:
                # Same score as the previous vertex: move it across but do
                # not evaluate a cut.
                A.add(vertex)
                B.remove(vertex)
                if len(B) == 0:
                    break
                d = nx.degree(G, vertex)
                volA += d
                volB -= d
                vol = min(volA, volB)
                # Edges to A stop crossing; edges to the rest start crossing.
                for u in G.neighbors(vertex):
                    if u in A:
                        cross -= 1
                    else:
                        cross += 1
            else:
                # New score level: move the vertex and score the cut.
                A.add(vertex)
                B.remove(vertex)
                if len(B) == 0:
                    break
                d = nx.degree(G, vertex)
                volA += d
                volB -= d
                vol = min(volA, volB)
                for u in G.neighbors(vertex):
                    if u in A:
                        cross -= 1
                    else:
                        cross += 1
                # NOTE(review): raises ZeroDivisionError when vol is 0.
                cond = cross / vol
                if cond < min_cond:
                    min_cond = cond
                    min_set = A.copy()
        prev = val
    return (min_cond, min_set)
def lovaszSimonovits(myp, G, data):
    """Plot and return probability mass against volume along a sweep.

    Nodes with non-zero entries in ``myp`` are ordered by ``myp / degree``
    (descending, degrees taken as row sums of ``data.adj``). For each growing
    prefix of that order the volume in ``G`` and the summed ``myp`` mass are
    recorded, then plotted with ``plt.plot``.

    :return: ``(probmass, vols)`` — one entry per prefix
    """
    supp = torch.nonzero(myp).squeeze().tolist()
    degs = data.adj[supp, :].sum(-1)
    ranking = torch.argsort(myp[supp] / degs, descending=True).squeeze().tolist()
    support = [supp[position] for position in ranking]

    sweepset, vols, probmass = [], [], []
    for node in support:
        sweepset.append(node)
        vols.append(nx.volume(G, sweepset))
        probmass.append(myp[sweepset].sum())

    plt.plot(vols, probmass)
    return probmass, vols
def sweep(myp, G, data):
    """Return the sweep-cut prefix with the lowest conductance.

    Nodes with non-zero entries in ``myp`` are ordered by ``myp / degree``
    (descending, degrees taken as row sums of ``data.adj``); every prefix of
    that order is scored with ``nx.conductance``.

    :param myp: 1-D tensor of per-node values
    :param G: NetworkX graph
    :param data: object exposing the adjacency as ``data.adj``
    :return: ``(bestset, bestconductance, bestvolume)``; ``([], 1000, 0)``
        when no prefix scores below the initial bound of 1000
    """
    supp = torch.nonzero(myp).squeeze().tolist()
    degs = data.adj[supp, :].sum(-1)
    sortedsupp = torch.argsort(myp[supp] / degs, descending=True).squeeze().tolist()
    support = [supp[i] for i in sortedsupp]

    sweepset = []
    bestconductance = 1000
    bestvolume = 0
    bestset = []
    for node in support:
        sweepset.append(node)
        volume = nx.volume(G, sweepset)
        try:
            conductance = nx.conductance(G, sweepset)
        except ZeroDivisionError:
            # Undefined when the prefix or its complement has zero volume
            # (e.g. the prefix already covers the whole graph); skip it.
            continue
        if conductance < bestconductance:
            bestconductance = conductance
            bestvolume = volume
            # Snapshot the prefix: sweepset keeps growing, so storing the
            # list itself would always return the full support.
            bestset = list(sweepset)
    return bestset, bestconductance, bestvolume
def compute_regularised_sc(G, resf, debug=False):
    """Sweep a regularised spectral ordering for the lowest CoreCut split.

    Builds ``L = I - D A D`` with ``D = diag(1 / sqrt(degree + tau))`` and
    ``tau`` the average degree, orders nodes by the scaled second returned
    eigenvector, and evaluates ``get_corecut`` on the lower-volume side of
    every split of that ordering.

    :param G: graph to partition
    :param resf: open text stream that timing and result lines are printed to
    :param debug: unused; kept for interface compatibility
    :return: ``(min_corecut, min_seq)``; ``min_seq`` is ``None`` and
        ``min_corecut`` is 1 when no split has a core cut below 1
    """
    degrees = [val for (node, val) in G.degree()]
    sum_deg = sum(degrees)
    n = len(degrees)
    tau = sum_deg / n

    # NOTE(review): to_scipy_sparse_matrix is not available in every
    # networkx release — confirm the pinned version provides it.
    A = (nx.to_scipy_sparse_matrix(G)).astype(float)

    row = np.arange(n)
    col = row
    data = [1 / math.sqrt(d + tau) for d in degrees]
    D = csr_matrix((data, (row, col)), shape=(n, n))  # degree matrix
    I = csr_matrix((np.ones(n), (row, col)), shape=(n, n))  # identity matrix
    L = I - (D @ A) @ D

    start_time = time.time()
    # NOTE(review): eigsh defaults to which='LM' (largest magnitude), so
    # column 1 may not be the second-smallest eigenvector — confirm.
    vals, vecs = sp_linalg.eigsh(L, k=6)
    elapsed = time.time() - start_time
    print("train regularised time", elapsed, file=resf)
    # NOTE(review): `dict` is presumably a module-level results mapping that
    # shadows the builtin; item assignment on the builtin type would fail.
    dict['reg_running_time'] = elapsed

    vecs = vecs[:, 1]  # second eigenvector
    y_vecs = D * vecs

    yns = sorted(([y_vecs[i], node] for i, node in enumerate(G.nodes())),
                 key=lambda tup: tup[0])
    total_seq = [el[1] for el in yns]

    min_corecut = 1
    # Initialised so the prints/return below cannot hit an unbound local when
    # no split improves on the initial core cut of 1.
    min_cardinal = None
    min_seq = None
    for i in tqdm(range(len(y_vecs) - 1),
                  mininterval=3,
                  leave=False,
                  desc='  - (Regularised)   '):
        seq = total_seq[:(i + 1)]
        rest_seq = total_seq[(i + 1):]

        # Score the side with the strictly smaller volume (ties -> rest_seq).
        if nx.volume(G, seq) < nx.volume(G, rest_seq):
            candidate = seq
        else:
            candidate = rest_seq

        corecut = get_corecut(G=G, S=candidate, tau=tau, n=n)
        if corecut < min_corecut:
            min_cardinal = min(len(seq), len(rest_seq))
            min_corecut = corecut
            min_seq = candidate  # slices are already fresh lists

    print("train regularised min_corecut", min_corecut, file=resf)
    print("train regularised min_cardinal", min_cardinal, file=resf)
    dict['reg_balance'] = min_cardinal
    return min_corecut, min_seq
def test_multidigraph(self):
    """Volume of {0, 1} in a directed 4-cycle with every edge doubled is 4."""
    cycle_edges = [(0, 1), (1, 2), (2, 3), (3, 0)]
    doubled = nx.MultiDiGraph(cycle_edges + cycle_edges)
    assert_equal(nx.volume(doubled, {0, 1}), 4)
print("Scores for Cut Distance: ", cutDistance[a]) print("\n") # Modularity Score print("Modularity: ", ModScore) print("\n") modularityEnsemble.update({cutDistance[a]: ModScore}) # Conductance Score sumOfCond = [] for i in range(len(clusterL)): if clusterL[i] == []: # catch Division by zero continue if nx.volume(original_graph, clusterL[i]) == 0: continue # calculate Cond for current Cluster in list of Clusters currentCond = conductance(original_graph, clusterL[i]) sumOfCond.append(currentCond) print("Conductance for: ", clusterL[i], " = ", currentCond) overallCond = min(sumOfCond) print("Overall Conductance: ", overallCond) print("\n") conductanceEnsemble.update({cutDistance[a]: overallCond}) # edge betweenness centrality Score print("Edge Betweenness Centrality Score: ", averageEdge) print("Edge Betweenness Centrality Score: ", totalEdge)
def test_multidigraph(self):
    """Volume of {0, 1} in a directed 4-cycle with every edge doubled is 4."""
    cycle_edges = [(0, 1), (1, 2), (2, 3), (3, 0)]
    doubled = nx.MultiDiGraph(cycle_edges + cycle_edges)
    observed = nx.volume(doubled, {0, 1})
    assert observed == 4
def test_barbell(self):
    """Each triangle of the 3-3 barbell graph (no path between) has volume 7."""
    barbell = nx.barbell_graph(3, 0)
    left_bell = {0, 1, 2}
    right_bell = {3, 4, 5}
    assert nx.volume(barbell, left_bell) == 7
    assert nx.volume(barbell, right_bell) == 7
def test_digraph(self):
    """Volume of {0, 1} in a directed 4-cycle is 2."""
    directed_cycle = nx.DiGraph([(0, 1), (1, 2), (2, 3), (3, 0)])
    observed = nx.volume(directed_cycle, {0, 1})
    assert observed == 2
def test_multigraph(self):
    """Volume of {0, 1} in an undirected 4-cycle with every edge doubled is 8."""
    cycle_edges = list(nx.cycle_graph(4).edges())
    doubled = nx.MultiGraph(cycle_edges + cycle_edges)
    observed = nx.volume(doubled, {0, 1})
    assert observed == 8
def test_graph(self):
    """Volume of {0, 1} in an undirected 4-cycle is 4."""
    cycle = nx.cycle_graph(4)
    assert_equal(nx.volume(cycle, {0, 1}), 4)
def test_graph(self):
    """Volume of {0, 1} in an undirected 4-cycle is 4."""
    cycle = nx.cycle_graph(4)
    observed = nx.volume(cycle, {0, 1})
    assert_equal(observed, 4)
def test_digraph(self):
    """Volume of {0, 1} in a directed 4-cycle is 2."""
    arcs = [(0, 1), (1, 2), (2, 3), (3, 0)]
    directed_cycle = nx.DiGraph(arcs)
    assert_equal(nx.volume(directed_cycle, {0, 1}), 2)
def test_multigraph(self):
    """Volume of {0, 1} in an undirected 4-cycle with every edge doubled is 8."""
    cycle_edges = list(nx.cycle_graph(4).edges())
    doubled = nx.MultiGraph(cycle_edges + cycle_edges)
    assert_equal(nx.volume(doubled, {0, 1}), 8)
def test_multidigraph(self):
    """Volume of {0, 1} in a directed 4-cycle with every edge doubled is 4."""
    arcs = [(0, 1), (1, 2), (2, 3), (3, 0)]
    doubled = nx.MultiDiGraph(arcs + arcs)
    observed = nx.volume(doubled, {0, 1})
    assert_equal(observed, 4)
def test_multigraph(self):
    """Volume of {0, 1} in an undirected 4-cycle with every edge doubled is 8."""
    single_edges = list(nx.cycle_graph(4).edges())
    doubled = nx.MultiGraph(single_edges + single_edges)
    observed = nx.volume(doubled, {0, 1})
    assert_equal(observed, 8)
def test_graph(self):
    """Volume of {0, 1} in an undirected 4-cycle is 4."""
    cycle = nx.cycle_graph(4)
    observed = nx.volume(cycle, {0, 1})
    assert observed == 4
print("mynew modscore: ", myNewModScore) print("\n") print("My Modularity: ", myModScore) print("\n") modularityEnsemble.update({cutDistance[a]: ModScore}) modY.append(ModScore) # Conductance Score sumOfCond = [] for i in range(len(clusterL)): if clusterL[i] == []: # catch Division by zero continue if nx.volume(original_graph, clusterL[i]) == 0: continue # calculate Cond for current Cluster in list of Clusters currentDist = cutDistance[a] cutCounter = 0 for c in Z[:, 2]: if currentDist < c: cutCounter = cutCounter + 1 currentCond = cutCounter / nx.volume(original_graph, clusterL[i]) # currentCond = conductance(original_graph,clusterL[i]) sumOfCond.append(currentCond) print("Conductance for: ", clusterL[i], " = ", currentCond, "cut-size: ", cutCounter) overallCond = min(sumOfCond)
print("Precision@{}: {}".format(k, len(right_set)/k)) print("Recall@{}: {}".format(k, len(right_set)/len(labelList))) if outPath: f_r = open('degree_top{}'.format(k), 'w') for t in top_k_dict.items(): f_r.write(t[0]+'\t'+str(t[1])+'\n') f_r.close() if __name__ == "__main__": args = parse_args() g = Graph() start_time = time.time() if args.graph_format == 'adjlist': g.read_adjlist(filename=args.input) elif args.graph_format == 'edgelist': g.read_edgelist(filename=args.input, weighted=args.weighted, directed=args.directed) cluster = read_cluster(args.cluster) print("File read done, elapsed time {}s".format(time.time()-start_time)) print("Node Size: {}".format(g.G.number_of_nodes())) print("Edge Size: {}".format(g.G.number_of_edges())) start_time = time.time() print("S size {}, volume {}".format(len(cluster[0]), nx.volume(g.G, cluster[0], weight='weight'))) print("T size {}, volume {}".format(len(cluster[1]), nx.volume(g.G, cluster[1], weight='weight'))) cond = nx.conductance(g.G, cluster[0], cluster[1], weight='weight') print("Conductance {}, elapsed time {}s".format(cond, time.time()-start_time))
def test_digraph(self):
    """Volume of {0, 1} in a directed 4-cycle is 2."""
    arcs = [(0, 1), (1, 2), (2, 3), (3, 0)]
    directed_cycle = nx.DiGraph(arcs)
    observed = nx.volume(directed_cycle, {0, 1})
    assert_equal(observed, 2)