def test_five_clique_ring():
    """Check that five 4-cliques joined in a ring are found as five communities.

    The graph is built clique by clique (nodes ``<n>a`` .. ``<n>d`` for
    n = 1..5) and then closed into a ring by linking node ``a`` of every
    clique to node ``c`` of the following one.  Edges are inserted in the
    same order as a hand-written edge list would be, because the seeded run
    of ``asyn_fluidc`` is compared against an exact expected partition.
    """
    test = Graph()
    suffixes = 'abcd'
    clique_ids = [str(number) for number in range(1, 6)]

    # Intra-clique edges: every unordered pair of members, clique after clique.
    for cid in clique_ids:
        members = [cid + suffix for suffix in suffixes]
        for pos, left in enumerate(members):
            for right in members[pos + 1:]:
                test.add_edge(left, right)

    # Ring connections: 1a-2c, 2a-3c, 3a-4c, 4a-5c, 5a-1c.
    for pos, cid in enumerate(clique_ids):
        following = clique_ids[(pos + 1) % len(clique_ids)]
        test.add_edge(cid + 'a', following + 'c')

    # Ground truth: each clique is exactly one community.
    ground_truth = {
        frozenset(cid + suffix for suffix in suffixes) for cid in clique_ids
    }

    communities = asyn_fluidc(test, 5, seed=9)
    result = {frozenset(c) for c in communities}
    assert result == ground_truth
def test_two_nodes():
    """Check that a single edge split into two communities gives two singletons."""
    graph = Graph()
    graph.add_edges_from([("a", "b")])

    # Ground truth: every node ends up alone in its own community.
    expected = {frozenset({"a"}), frozenset({"b"})}

    found = {frozenset(group) for group in asyn_fluidc(graph, 2)}
    assert found == expected
def test_single_node():
    """Check that a one-node graph with k=1 yields that node as the only community."""
    graph = Graph()
    graph.add_nodes_from(["a"])

    # Ground truth: the lone node forms the single community.
    expected = {frozenset({"a"})}

    found = {frozenset(group) for group in asyn_fluidc(graph, 1)}
    assert found == expected
def test_two_nodes():
    """Check that two linked nodes, asked for two communities, are separated."""
    pair = Graph()
    pair.add_edges_from([('a', 'b')])

    # Ground truth: one singleton community per node.
    expected = {frozenset(node) for node in ('a', 'b')}

    detected = asyn_fluidc(pair, 2)
    assert {frozenset(members) for members in detected} == expected
def algorithm_asyn_fluidc(G, gt_communities_count):
    """
    Run the asynchronous Fluid Communities detection algorithm.

    Reference: Parés F., Garcia-Gasulla D. et al. “Fluid Communities: A
    Competitive and Highly Scalable Community Detection Algorithm”.
    https://arxiv.org/pdf/1703.09307.pdf

    :param G: graph handed to ``asyn_fluidc``.
    :param gt_communities_count: number of communities to look for.
    :return: list of communities, each one a list of nodes.
    """
    # max_iter and seed are fixed here; seed=None leaves the run unseeded.
    detected = asyn_fluidc(G, gt_communities_count, max_iter=100, seed=None)
    return [list(members) for members in detected]
def _community_labels(communities, num_nodes):
    """Convert an iterable of node collections into a per-node label vector.

    Entry ``labels[node]`` receives the position of the community that
    contains ``node``; nodes are used directly as array indices.
    """
    labels = np.zeros((num_nodes,))
    for label, members in enumerate(communities):
        for node in members:
            labels[node] = label
    return labels


def get_benchmark_amis(G, gt, k=2):
    """Score four community-detection baselines against ground-truth labels.

    Runs Louvain, Fluid Communities, FastGreedy (greedy modularity) and
    Infomap on ``G`` and compares each result with ``gt`` using the adjusted
    mutual information score.

    Parameters
    ----------
    G : graph
        Graph to partition.  Node identifiers are used directly as indices
        into the label vectors, so they are expected to be the integers
        ``0 .. number_of_nodes - 1``.
    gt : sequence
        Ground-truth labels, passed as first argument to
        ``metrics.adjusted_mutual_info_score``.  Presumably ordered by node
        id, like the estimated label vectors -- confirm against the caller.
    k : int, optional
        Number of communities requested from ``asyn_fluidc``.  Defaults to 2,
        the value that used to be hard-coded.

    Returns
    -------
    dict
        Adjusted mutual information per method, under the keys ``'Louvain'``,
        ``'Fluid'``, ``'FastGreedy'`` and ``'Infomap'``.
    """
    num_nodes = nx.number_of_nodes(G)

    # Louvain: place every label at its node's index (rather than collecting
    # them in dict-iteration order) so that it lines up with ``gt`` exactly
    # like the other three baselines do.
    louv = community.best_partition(G)
    louv_idx = np.zeros((num_nodes,))
    for node, label in louv.items():
        louv_idx[node] = label
    louv_ami = metrics.adjusted_mutual_info_score(gt, louv_idx)

    # Fluid communities
    fluid_idx = _community_labels(asyn_fluidc(G, k), num_nodes)
    fluid_ami = metrics.adjusted_mutual_info_score(gt, fluid_idx)

    # FastGreedy
    fg_idx = _community_labels(greedy_modularity_communities(G), num_nodes)
    fg_ami = metrics.adjusted_mutual_info_score(gt, fg_idx)

    # Infomap: every edge is registered in both directions.
    im = Infomap()
    for node in G.nodes:
        im.add_node(node)
    for edge in G.edges:
        im.add_link(edge[0], edge[1])
        im.add_link(edge[1], edge[0])
    # Run the Infomap search algorithm to find optimal modules
    im.run()
    im_idx = np.zeros((num_nodes,))
    for node in im.tree:
        if node.is_leaf:
            im_idx[node.node_id] = node.module_id
    im_ami = metrics.adjusted_mutual_info_score(gt, im_idx)

    return {
        'Louvain': louv_ami,
        'Fluid': fluid_ami,
        'FastGreedy': fg_ami,
        'Infomap': im_ami,
    }
def add_edges_between_centroids(self, edge_types, num_centroids, rng, attribute_name='distance'):
    """
    Link the central nodes of the graph with new edges.

    The graph is split into ``num_centroids`` clusters with the Fluid
    Communities algorithm, the highest-degree node of every cluster is taken
    as its centroid, and the edges of a minimum spanning tree over the
    centroids (weighted by ``self.distance``) are added for every edge type.

    :param iter edge_types: Types of the edges to add.
    :param int num_centroids: Number of centroids.
    :param rng: Random number generator.
    :type rng: :py:class:`RandomGenerator<city_graph.utils.RandomGenerator>`
    :param str attribute_name: Name of the edge attribute that holds the
        distance between two centroids; it is the spanning-tree weight and is
        forwarded to ``self.add_edge`` with the other edge attributes.
    """
    print("[Topology] Starting building edges between %s central nodes." % num_centroids)

    # Calculate clusters
    # TODO: I think it would make sense to instead use e.g. a k-means for two reasons:
    # * this algo assumes that the clusters have the same density, which is not necessarily
    #   applicable to cities (some areas are more crowded)
    # * the implementation needs the graph to be fully connected to begin with. It seems
    #   to be a limitation
    detected = asyn_fluidc(self.graph, k=num_centroids, seed=rng.rand_int())

    # Extract centroids: we take the node with the highest degree
    # (the first such node when several share the maximum).
    centroids = []
    for cluster in detected:
        members = list(cluster)
        member_degrees = [self.graph.degree[node] for node in members]
        centroids.append(members[int(np.argmax(member_degrees))])

    # Create temporary complete graph over the centroids.
    # Combinations are used because the graph is undirected and we do not want self-edges.
    centroid_graph = Graph()
    centroid_graph.add_nodes_from(centroids)
    for first, second in combinations(centroids, 2):
        distance_attr = {attribute_name: self.distance(first, second)}
        centroid_graph.add_edge(first, second, **distance_attr)

    # Calculate subgraph with the minimum sum of edge weights
    # TODO: to investigate why we do this here...
    spanning_tree = minimum_spanning_tree(centroid_graph, weight=attribute_name)

    # Build edges, reusing the distances already stored on the spanning tree.
    edges_before = self.num_of_edges
    for first, second in spanning_tree.edges:
        edge_attributes = spanning_tree[first][second]
        for edge_type in edge_types:
            # TODO: exception might be raised here because we now check
            # that there is no outgoing/incoming edges.
            # Should we fix when we use an actual triangular matrix
            with suppress(RuntimeError):
                self.add_edge(first, second, edge_type, **edge_attributes)

    # Inform that edges have been built
    print("[Topology] %i edges have been created" % (self.num_of_edges - edges_before))
def apply(frequents_label, frequents_encodings, parameters=None):
    """
    Cluster the sequences by running ``asyn_fluidc`` on a complete graph
    built from their encodings and labels

    Every sequence becomes a node; every pair of sequences is joined by an
    edge whose ``weight`` is ``p1 * ||enc_i - enc_j|| + p2 * J``, where ``J``
    is the Jaccard index of the whitespace-separated tokens of both labels.

    Parameters
    ---------------
    frequents_label
        Label of the sequences (strings, split on whitespace)
    frequents_encodings
        Encodings of the sequences
    parameters
        Parameters of the algorithm:
            nc => numbers of clusters
            p1 => weight of the first term
            p2 => weight of the second term

    Returns
    ----------------
    communities
        Communities, as a list of the groups yielded by ``asyn_fluidc``
    """
    if parameters is None:
        parameters = {}

    nc = parameters.get(NC, DEFAULT_NC)
    p1 = parameters.get(P1, DEFAULT_P1)
    p2 = parameters.get(P2, DEFAULT_P2)

    count = len(frequents_encodings)
    G = nx.Graph()
    G.add_nodes_from(range(count))

    for first in range(count):
        for second in range(first + 1, count):
            # First term: Euclidean norm of the difference of both encodings.
            distance = np.linalg.norm(frequents_encodings[first] - frequents_encodings[second])
            # Second term: Jaccard index of the label tokens.
            # NOTE: raises ZeroDivisionError when both labels contain no token.
            tokens_first = set(frequents_label[first].split())
            tokens_second = set(frequents_label[second].split())
            overlap = len(tokens_first & tokens_second) / len(tokens_first | tokens_second)
            # NOTE(review): confirm that asyn_fluidc takes this edge weight into account.
            G.add_edge(first, second, weight=p1 * distance + p2 * overlap)

    return list(asyn_fluidc(G, nc))
def test_two_clique_communities():
    """Check that two triangles joined by one bridge edge are split apart."""
    graph = Graph()
    graph.add_edges_from(
        [
            # c1
            ("a", "b"),
            ("a", "c"),
            ("b", "c"),
            # connection
            ("c", "d"),
            # c2
            ("d", "e"),
            ("d", "f"),
            ("f", "e"),
        ]
    )

    # Ground truth: one community per triangle.
    expected = {frozenset(["a", "b", "c"]), frozenset(["d", "e", "f"])}

    found = {frozenset(group) for group in asyn_fluidc(graph, 2, seed=7)}
    assert found == expected
def test_two_clique_communities():
    """Check that a seeded run separates two bridged triangles."""
    first_clique = [('a', 'b'), ('a', 'c'), ('b', 'c')]
    bridge = [('c', 'd')]
    second_clique = [('d', 'e'), ('d', 'f'), ('f', 'e')]

    # Insert the edges group by group: c1, connection, c2.
    test = Graph()
    for edge_group in (first_clique, bridge, second_clique):
        test.add_edges_from(edge_group)

    # Ground truth: the node sets of the two triangles.
    ground_truth = {frozenset('abc'), frozenset('def')}

    detected = asyn_fluidc(test, 2, seed=7)
    assert {frozenset(members) for members in detected} == ground_truth
return mutual_info ########################################################### ########################################################### # Method: Fluid communities ########################################################### # Raw data if not nx.is_connected(G): #print('---Fluid community requires connected graph, skipping raw version---') scores['fluid-raw'] = 'failed' runtimes['fluid-raw'] = 'failed' else: time_s = time.time() comp = asyn_fluidc(G.to_undirected(), k=num_partitions) list_nodes = [frozenset(c) for c in comp] est_idx = np.zeros((num_nodes, )) for i in range(len(list_nodes)): for idx in list_nodes[i]: est_idx[idx] = i runtime = time.time() - time_s mutual_info = metrics.adjusted_mutual_info_score(database['labels'], est_idx) scores['fluid-raw'] = mutual_info runtimes['fluid-raw'] = runtime # Noisy data if not nx.is_connected(nG): print( '---Fluid community requires connected graph, skipping noisy version---'
def test_five_clique_ring():
    """Check that a ring of five 4-cliques is partitioned into its cliques.

    Nodes are named ``<n><letter>`` with n in 1..5 and letter in a..d.  The
    edge list is generated in the order c1, c2, c3, c4, c5, connections, so
    the graph is identical to one written out edge by edge; this matters
    because the expected partition is tied to ``seed=9``.
    """
    letters = ["a", "b", "c", "d"]
    numbers = ["1", "2", "3", "4", "5"]

    edges = []
    # c1 .. c5: all six pairs (a-b, a-c, a-d, b-c, b-d, c-d) of every clique.
    for number in numbers:
        for index, low in enumerate(letters):
            for high in letters[index + 1:]:
                edges.append((number + low, number + high))
    # connections: node "a" of a clique to node "c" of the next, wrapping around.
    for index, number in enumerate(numbers):
        successor = numbers[(index + 1) % len(numbers)]
        edges.append((number + "a", successor + "c"))

    test = Graph()
    test.add_edges_from(edges)

    # ground truth
    ground_truth = {
        frozenset(number + letter for letter in letters) for number in numbers
    }

    communities = asyn_fluidc(test, 5, seed=9)
    result = {frozenset(c) for c in communities}
    assert result == ground_truth
def fluid_community(self, k=2):
    """
    Detect communities in ``self.G`` with the Fluid Communities algorithm.

    The graph is converted to an undirected copy before the algorithm runs.

    :param k: number of communities to look for (default 2).
    :return: list of the communities produced by ``asyn_fluidc``.
    """
    communities = asyn_fluid.asyn_fluidc(self.G.to_undirected(), k)
    return list(communities)
def five_clique_ring():
    """Not auto-tested (not named test_...) due to cross-version seed issues;
    python3.4 in particular gives different results.

    Builds five 4-cliques joined in a ring and asserts that a seeded
    ``asyn_fluidc`` run returns exactly the five cliques.
    """
    test = Graph()
    clique_count = 5
    suffixes = ('a', 'b', 'c', 'd')

    def node(clique, suffix):
        # Node label such as '3b' for clique 3, member b.
        return str(clique) + suffix

    # c1 .. c5: connect every pair of members inside each clique.
    for clique in range(1, clique_count + 1):
        for pos, left in enumerate(suffixes):
            for right in suffixes[pos + 1:]:
                test.add_edge(node(clique, left), node(clique, right))

    # connections: 'a' of clique n to 'c' of clique n + 1, and 5 back to 1.
    for clique in range(1, clique_count + 1):
        test.add_edge(node(clique, 'a'), node(clique % clique_count + 1, 'c'))

    # ground truth
    ground_truth = {
        frozenset(node(clique, suffix) for suffix in suffixes)
        for clique in range(1, clique_count + 1)
    }

    communities = asyn_fluidc(test, 5, seed=9)
    result = {frozenset(c) for c in communities}
    assert result == ground_truth
# baseline 2: FastGreedy, Clauset-Newman-Moore greedy modularity maximization time_s = time.time() list_nodes = list(greedy_modularity_communities(G.to_undirected())) est_idx = np.zeros((num_nodes, )) for i in range(len(list_nodes)): for idx in list_nodes[i]: est_idx[idx] = i mutual_info[1, pn, tn] = metrics.adjusted_mutual_info_score(gt, est_idx) runtime[1, pn, tn] = time.time() - time_s print('-- {}: runtime={:.4f}sec, mutual information={:.4f}.'.format( methods[1], runtime[1, pn, tn], mutual_info[1, pn, tn])) # baseline 4: Fluid Communities algorithm. time_s = time.time() comp = asyn_fluidc(G.to_undirected(), k=est_number) list_nodes = [frozenset(c) for c in comp] est_idx = np.zeros((num_nodes, )) for i in range(len(list_nodes)): for idx in list_nodes[i]: est_idx[idx] = i mutual_info[3, pn, tn] = metrics.adjusted_mutual_info_score(gt, est_idx) runtime[3, pn, tn] = time.time() - time_s print('-- {}: runtime={:.4f}sec, mutual information={:.4f}.'.format( methods[3], runtime[3, pn, tn], mutual_info[3, pn, tn])) ot_dict = { 'loss_type': 'L2', # the key hyperparameters of GW distance 'ot_method': 'proximal', 'beta': 0.15,
def partition_featurize_graph_fpdwl(G,k=100,dims=64,wl_steps=1,
                                    distribution_offset=0,distribution_exponent=0):
    """
    Partition+Anchor a graph using Fluid communities+Pagerank and produce
    node features using Degree+WL (Hence fpdwl)

    -----------
    Parameters:
        G : NetworkX graph
        k : number of blocks in partition
        dims : dimension of feature space; a node whose degree is >= dims
               gets an all-zero degree encoding
        wl_steps : number of Weisfeiler-Lehman aggregations to carry out
        distribution_offset : value added to every node degree before
               exponentiation when building p
        distribution_exponent : exponent applied to (degree + offset) when
               building p; the default 0 gives a uniform distribution
    -------
    Returns (in this order):
        nodes : sorted list of node labels
        features : array of degree+WL based node features, one row per
                   entry of nodes
        p : array of probabilities on nodes, aligned with nodes
        partition : list of sets containing node labels
        node_subset : sorted list of anchor node labels
        node_subset_idx : indices of the anchors within nodes
    """
    pr = pagerank(G)

    # Partition graph via Fluid
    partition = list(asyn_fluidc(G,k))

    # Create anchors via PageRank: the best-ranked node of every block,
    # sorted to fix an ordering on anchors
    anchors = sorted(max(block, key=lambda node: pr[node]) for block in partition)

    # Featurize using degrees and Weisfeiler-Lehman:
    # start from a one-hot encoding of the degree of every node
    features = {}
    for node, deg in dict(nx.degree(G)).items():
        one_hot = np.zeros(dims)
        if deg < dims:
            one_hot[deg] += 1
        features[node] = one_hot

    # NOTE(review): assumes wl_label returns a dict keyed by node, like its input
    for _ in range(wl_steps):
        features = wl_label(G,features)

    # Obtain sorted node names and the matching feature rows.
    # Only the node labels are compared, never the feature arrays.
    nodes = sorted(features)
    features = np.array([features[node] for node in nodes])

    # Obtain probability vector
    p = np.array([(G.degree(n)+distribution_offset)**distribution_exponent for n in nodes])
    p = p/np.sum(p)

    # Indices of anchor nodes in node list, via one lookup table instead of
    # a linear scan per anchor
    node_subset = anchors
    position = {node: index for index, node in enumerate(nodes)}
    node_subset_idx = [position[v] for v in node_subset]

    return nodes, features, p, partition, node_subset, node_subset_idx