if line.endswith("\\\n"): # continuation line, buffer, goto next oldline = line.strip("\\\n") continue (headname, tails) = line.split(":") # head numfind = re.compile("^\d+") # re to find the number of this word head = numfind.findall(headname)[0] # get the number G.add_node(head) for tail in tails.split(): if head == tail: print("skipping self loop", head, tail, file=sys.stderr) G.add_edge(head, tail) return G if __name__ == '__main__': G = roget_graph() print("Loaded roget_dat.txt containing 1022 categories.") print("digraph has %d nodes with %d edges" % (nx.number_of_nodes(G), nx.number_of_edges(G))) UG = G.to_undirected() print(nx.number_connected_components(UG), "connected components") nx.draw_circular(UG) plt.show()
def _build_clusters_at_least_one_and_at_most_logn(self, graph):
    """Build the cluster hierarchy level by level and register it on the network.

    Level 0 receives one single-node cluster per graph node.  Every upper
    level ``L`` (1 .. height) works with the distance limit ``2 ** L``:

    1. For each peer ``0 .. self._peer_count - 1`` the weighted Dijkstra
       neighbourhood within the distance limit is proposed as a cluster.
       The proposal is kept only if ``self.at_most_logn`` reports no
       violation for the resulting membership counts and no cluster with
       the same members was already kept on this level.
    2. While ``self.at_least_one`` reports a peer that is in no cluster,
       a cluster is built around that peer (currently a single-node
       cluster, see the ``todo`` below).
    3. The kept clusters are turned into ``Cluster`` objects and added to
       ``self._network``; clusters on the top level are flagged as root.

    NOTE(review): ``self.at_most_logn`` / ``self.at_least_one`` are used
    here as returning a negative value when the constraint holds and the
    numeric id of an offending peer otherwise -- confirm against their
    definitions.  Graph nodes are assumed to be keyed by ``str(peer_id)``.

    :param graph: weighted networkx graph (edge attribute ``'weight'``).
    :raises ValueError: if the graph diameter is not positive, because no
        hierarchy height can be derived from it.
    :return: None; the result is stored on ``self._network``.
    """

    def next_cluster_id(base_id, id_counts):
        # Number clusters sharing the same base name: base_1, base_2, ...
        id_counts[base_id] += 1
        return base_id + "_" + str(id_counts[base_id])

    def is_duplicate(candidate, kept_clusters):
        # Two clusters are duplicates when their member multisets are equal.
        target = Counter(candidate)
        return any(Counter(members) == target for members in kept_clusters.values())

    # All-pairs shortest paths are expensive: compute them once and reuse the
    # result for both the debug dump and the eccentricity computation.
    all_pairs_lengths = dict(nx.shortest_path_length(graph, weight='weight'))
    for path in all_pairs_lengths.items():
        print(path)
    ecc = nx.eccentricity(graph, sp=all_pairs_lengths)
    diameter = nx.diameter(graph, e=ecc)
    print('The graph diameter is ', diameter)
    if diameter <= 0:
        raise ValueError(
            'graph diameter must be positive to derive the hierarchy height, got %r' % (diameter,))
    # math.log2 is exact for powers of two, whereas math.log(x, 2) can land
    # slightly above the true value and make ceil() add a spurious level.
    height_of_cluster = math.ceil(math.log2(diameter)) + 1
    self._network.set_height_of_clusters(height_of_cluster)
    print('height_of_the hierarchy is ', height_of_cluster)

    # lowest level clusters
    print('lowest level clusters')
    self._network.add_cluster_level(0)
    # level 0 clusters: one single-node cluster per graph node
    for n in graph.nodes():
        paths = nx.single_source_dijkstra_path_length(graph, n, 0, weight='weight')
        print(paths)
        cluster = Cluster('c' + str(n) + '_l' + '0', graph.subgraph([n]), 0, str(n))
        self._network.add_cluster(0, cluster)

    # form upper level clusters
    for i in range(int(height_of_cluster)):
        level = i + 1
        self._network.add_cluster_level(level)
        # one entry per (peer, cluster) membership on this level
        clustered_peers_list = []
        # for naming the clusters properly
        cluster_id_counts = Counter()
        print('AT LEVEL ----- ', level)
        distance = pow(2, level)
        print('THE DISTANCE LIMIT IS ', distance)
        # cluster id -> list of member peers kept on this level
        clusterize = {}

        # iterate over the peers once
        for n in range(self._peer_count):
            print('clustering peer ', n)
            paths_found = nx.single_source_dijkstra_path_length(
                graph, str(n), distance, weight='weight')
            print('paths found in the level ', paths_found)
            peers_to_cluster = list(paths_found)
            c_id = next_cluster_id('c' + str(n) + '_l' + str(level), cluster_id_counts)
            # Membership counts as they would be if this cluster were kept.
            candidate_membership = Counter(clustered_peers_list + peers_to_cluster)
            if self.at_most_logn(candidate_membership) < 0:
                # check if duplicate peers or not
                duplicate = is_duplicate(peers_to_cluster, clusterize)
                if duplicate:
                    print("FOUND DUPLICATE CLUSTER")
                print("DOESN'T VIOLATE AT MOST LOG N")
                if not duplicate:
                    clustered_peers_list.extend(peers_to_cluster)
                    clusterize[c_id] = peers_to_cluster

        print("CHECKING MEMBERSHIP")
        print(self.at_least_one(Counter(clustered_peers_list)) < 0)
        print(self.at_most_logn(Counter(clustered_peers_list)) < 0)
        assert self.at_most_logn(Counter(clustered_peers_list)) < 0

        # if not built yet build a cluster and remove peers who appear more than logn times
        missing_cluster_id = self.at_least_one(Counter(clustered_peers_list))
        while missing_cluster_id > -1:
            print("PEER ", missing_cluster_id, " IS MISSING, BUILDING A CLUSTER AROUND IT.")
            # The neighbourhood is only reported; see the todo below.
            paths_found = nx.single_source_dijkstra_path_length(
                graph, str(missing_cluster_id), distance, weight='weight')
            print('paths found in the level ', paths_found)
            # todo creating single node cluster
            tmp_peers_list_to_cluster = [str(missing_cluster_id)]
            c_id = next_cluster_id(
                'c' + str(missing_cluster_id) + '_l' + str(level), cluster_id_counts)
            candidate_membership = Counter(clustered_peers_list + tmp_peers_list_to_cluster)
            if self.at_most_logn(candidate_membership) < 0:
                # check if duplicate peers or not
                duplicate = is_duplicate(tmp_peers_list_to_cluster, clusterize)
                if duplicate:
                    print("FOUND DUPLICATE CLUSTER")
                print("DOESN'T VIOLATE AT MOST LOG N")
                if not duplicate:
                    print("WHY HERE")
                    clustered_peers_list.extend(tmp_peers_list_to_cluster)
                    clusterize[c_id] = tmp_peers_list_to_cluster
            else:
                # find the peer that appears more than logn
                excess_cluster_id = self.at_most_logn(candidate_membership)
                while excess_cluster_id != -1:
                    print(tmp_peers_list_to_cluster)
                    print(excess_cluster_id)
                    tmp_peers_list_to_cluster.remove(str(excess_cluster_id))
                    # removing
                    print(excess_cluster_id, " APPEARS IN MORE THAN LOGN CLUSTER, REMOVING IT")
                    excess_cluster_id = self.at_most_logn(
                        Counter(clustered_peers_list + tmp_peers_list_to_cluster))
                print("PREPARING THE CLUSTER FROM MODIFIED PEERS LIST ", tmp_peers_list_to_cluster)
                clustered_peers_list.extend(tmp_peers_list_to_cluster)
                clusterize[c_id] = tmp_peers_list_to_cluster
            assert self.at_most_logn(Counter(clustered_peers_list)) < 0
            missing_cluster_id = self.at_least_one(Counter(clustered_peers_list))

        print("ASSERTING")
        assert self.at_least_one(Counter(clustered_peers_list)) < 0
        assert self.at_most_logn(Counter(clustered_peers_list)) < 0
        print(clusterize)
        print(Counter(clustered_peers_list))

        print("FINALLY ADDING CLUSTERS")
        for key, members in clusterize.items():
            # Build the induced subgraph once and reuse it for the sanity
            # check and for the Cluster itself.
            cluster_graph = graph.subgraph([str(member) for member in members])
            print(len(members))
            print(nx.number_of_nodes(cluster_graph))
            # Every member must be an actual node of the graph.
            assert nx.number_of_nodes(cluster_graph) == len(members)
            cluster = Cluster(key, cluster_graph, level)
            if level == height_of_cluster:
                # Clusters on the top level act as the hierarchy root.
                cluster.root = True
                self._network._root_cluster = cluster
            self._network.add_cluster(level, cluster)
        print("CLUSTERIZE ", clusterize)
    return
numfind = re.compile("^\d+") # re to find the number of this word head = numfind.findall(headname)[0] # get the number G.add_node(head) for tail in tails.split(): if head == tail: print("skipping self loop", head, tail, file=sys.stderr) G.add_edge(head, tail) return G if __name__ == '__main__': G = roget_graph() print("Loaded roget_dat.txt containing 1022 categories.") print("digraph has %d nodes with %d edges" % (nx.number_of_nodes(G), nx.number_of_edges(G))) UG = G.to_undirected() print(nx.number_connected_components(UG), "connected components") options = { 'node_color': 'black', 'node_size': 1, 'line_color': 'grey', 'linewidths': 0, 'width': 0.1, } nx.draw_circular(UG, **options) plt.show()
def _build_clusters_exactly_logn_membership(self, graph):
    """Build a cluster hierarchy in which every peer is in log2(n) clusters per level.

    Level 0 receives one single-node cluster per graph node.  For every
    upper level ``L`` (1 .. height) with distance limit ``2 ** L`` the
    peers ``0 .. self._peer_count - 1`` are visited round-robin; each visit
    adds a cluster made of the peer's weighted Dijkstra neighbourhood.
    Once every peer appears in at least ``log2(self._peer_count)`` clusters
    of the level, surplus memberships are removed (starting from the most
    recently added clusters) and the level is finished.  Afterwards empty
    clusters are deleted and the membership counts are verified.

    NOTE(review): the final equality check can only hold when
    ``log2(self._peer_count)`` is an integer, i.e. the peer count is a
    power of two -- confirm that this is the intended precondition.
    NOTE(review): clusters are created on levels ``1 .. height`` but the
    clean-up loops cover levels ``0 .. height - 1`` and the verification
    covers ``1 .. height - 1``; the top level is therefore never cleaned
    or verified.  Left unchanged -- confirm whether this is deliberate.
    Graph nodes are assumed to be keyed by ``str(peer_id)``.

    :param graph: weighted networkx graph (edge attribute ``'weight'``).
    :raises ValueError: if the graph diameter is not positive, because no
        hierarchy height can be derived from it.
    :return: None; the result is stored on ``self._network``.
    """
    # All-pairs shortest paths are expensive: compute them once and reuse the
    # result for both the debug dump and the eccentricity computation.
    all_pairs_lengths = dict(nx.shortest_path_length(graph, weight='weight'))
    for path in all_pairs_lengths.items():
        print(path)
    ecc = nx.eccentricity(graph, sp=all_pairs_lengths)
    diameter = nx.diameter(graph, e=ecc)
    print('The graph diameter is ', diameter)
    if diameter <= 0:
        raise ValueError(
            'graph diameter must be positive to derive the hierarchy height, got %r' % (diameter,))
    # math.log2 is exact for powers of two, whereas math.log(x, 2) can land
    # slightly above the true value and make ceil() add a spurious level.
    height_of_cluster = math.ceil(math.log2(diameter)) + 1
    print('height_of_the hierarchy is ', height_of_cluster)

    # Required number of clusters per peer on every level.  Loop-invariant,
    # so it is computed once; it is never consulted when there are no peers.
    membership_target = math.log2(self._peer_count) if self._peer_count > 0 else 0.0

    # lowest level clusters
    print('lowest level clusters')
    self._network.add_cluster_level(0)
    for n in graph.nodes():
        paths = nx.single_source_dijkstra_path_length(graph, n, 0, weight='weight')
        print(paths)
        cluster = Cluster('c' + str(n) + '_l' + '0', graph.subgraph([n]), 0)
        self._network.add_cluster(0, cluster)

    for i in range(int(height_of_cluster)):
        level = i + 1
        self._network.add_cluster_level(level)
        # one entry per (peer, cluster) membership on this level
        clustered_peers_list = []
        # for naming the clusters properly
        cluster_id_counts = Counter()
        print('AT LEVEL ------- ', level)
        distance = pow(2, level)
        print('THE DISTANCE LIMIT IS ', distance)

        n = 0
        while n < self._peer_count:
            print('clustering peer ', n)
            paths_found = nx.single_source_dijkstra_path_length(
                graph, str(n), distance, weight='weight')
            print('paths found in the level ', paths_found)
            peers_to_cluster = []
            for peer in paths_found:
                peers_to_cluster.append(int(peer))
                clustered_peers_list.append(peer)
            cluster_graph = graph.subgraph([str(member) for member in peers_to_cluster])
            base_id = 'c' + str(n) + '_l' + str(level)
            cluster_id_counts[base_id] += 1
            c_id = base_id + "_" + str(cluster_id_counts[base_id])
            self._network.add_cluster(level, Cluster(c_id, cluster_graph, level))

            clustered_peers_count = Counter(clustered_peers_list)
            # The level is incomplete while any peer is in too few clusters.
            incomplete = any(
                clustered_peers_count[str(j)] < membership_target
                for j in range(self._peer_count))
            if not incomplete:
                # make peers fall in exactly log(n) clusters
                for peer in range(self._peer_count):
                    peer_key = str(peer)
                    # count of peers in all clusters
                    peer_count = clustered_peers_count[peer_key]
                    while peer_count > membership_target:
                        # Drop the peer from the most recently added cluster
                        # that still contains it, then rescan.
                        for cluster in reversed(self._network.clusters_by_level(level)):
                            if (clustered_peers_count[peer_key] > membership_target
                                    and cluster.graph.has_node(peer_key)):
                                cluster.graph.remove_node(peer_key)
                                peer_count = peer_count - 1
                                if peer_key in clustered_peers_list:
                                    clustered_peers_list.remove(peer_key)
                                clustered_peers_count = Counter(clustered_peers_list)
                                break
                # Level finished: leave the round-robin loop.
                break
            n += 1
            if n == self._peer_count:
                # Not complete after a full round: start another round.
                n = 0

    # delete empty clusters
    for i in range(int(height_of_cluster)):
        # Iterate over a snapshot so that removing a cluster cannot make the
        # loop skip its successor if the network returns its internal list.
        for cluster in list(self._network.clusters_by_level(i)):
            if nx.number_of_nodes(cluster.graph) == 0:
                self._network.remove_cluster_by_id(cluster.cluster_id, cluster.level)

    for i in range(int(height_of_cluster)):
        for cluster in self._network.clusters_by_level(i):
            # Value comparison; `is not 0` tested object identity.
            assert nx.number_of_nodes(cluster.graph) != 0

    # make sure that there are log(n) peers in every level
    for i in range(int(height_of_cluster) - 1):
        print("Verifying in level ", i + 1)
        clustered_peer_count = Counter(
            str(node)
            for cluster in self._network.clusters_by_level(i + 1)
            for node in cluster.graph.nodes)
        for cl in clustered_peer_count:
            assert clustered_peer_count[cl] == membership_target