def create_networks(self):
    """Build the word and citation networks and detect their communities.

    For each of the two networks returned by ``self.create_network`` that
    has at least one node, the entries kept in the graph are stored on the
    instance (``words_allowed`` / ``cited_allowed``) together with the
    Louvain partition of the undirected graph (``words_communities`` /
    ``citations_communities``).  Progress is reported on standard output.
    """
    banner = "-" * 10
    print("creating networks")

    # WORDS
    print(banner + " Words")
    words_graph = self.create_network(
        self.words, self.words_to_words, self.limit_words, 1)
    if words_graph.order() != 0:
        self.words_allowed = [self.words[int(w)]
                              for w in words_graph.nodes()]
        print("%d words_allowed" % len(self.words_allowed))
        self.words_communities = community.best_partition(
            words_graph.to_undirected())
        n_word_parts = len(set(self.words_communities.values()))
        print("%s %s" % ("Number of words partitions : ", n_word_parts))

    # CITATIONS
    print("")
    print(banner + " Citations")
    citations_graph = self.create_network(
        self.cited, self.citations, self.limit_citations, 0)
    if citations_graph.order() != 0:
        self.cited_allowed = [self.cited[int(w)]
                              for w in citations_graph.nodes()]
        print("%d cited_allowed" % len(self.cited_allowed))
        # Communities
        self.citations_communities = community.best_partition(
            citations_graph.to_undirected())
        n_cit_parts = len(set(self.citations_communities.values()))
        print("%s %s" % ("Number of citations partitions : ", n_cit_parts))
def louvain(graph):
    """Computes clusters using the Louvain algorithm

    Parameters
    ----------
    graph : A NetworkX Graph to cluster. This object will also be modified.
        Nodes will gain a new attribute called 'cluster' indicating which
        cluster it belongs to.

    Returns
    -------
    A dictionary of clusters to node lists
    """
    try:
        clust = community.best_partition(graph)  # attempt louvain method
    except Exception:
        # Best-effort fallback: if clustering fails, assign all nodes to the
        # same cluster.  ``Exception`` (instead of a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate.
        clust = dict.fromkeys(graph, 0)

    clustDict = {}
    for node, cluster_id in clust.items():
        graph.node[node]['cluster'] = cluster_id  # tag nodes by clusterID
        # rework dictionary: cluster id -> list of member nodes
        clustDict.setdefault(cluster_id, []).append(node)
    return clustDict
def groupGraph(G, userNodeId):
    """Tag the nodes of ``G`` with community groups and centrality markers.

    ``userNodeId`` gets ``group`` 0.  The remaining nodes are partitioned
    on a copy of the graph without that node: with Louvain when the copy
    has at least one edge, otherwise every node is put in partition 1.
    Each node's ``group`` attribute is its partition number plus one.

    For every community with more than three nodes, the three nodes
    returned by ``CentralityNoself`` for that community's subgraph get the
    ``central`` attribute 1, 3 and 4 respectively (2 is not assigned here).

    Returns
    -------
    (G, number_of_communities) -- ``G`` is modified in place.
    """
    G.node[userNodeId]['group'] = 0

    Gc = nx.Graph(G)
    Gc.remove_node(userNodeId)
    if len(Gc.edges()) < 1:
        partition = dict.fromkeys(Gc.nodes(), 1)
    else:
        partition = community.best_partition(Gc)

    # Single pass: set the group attribute and collect community members
    # (previously the whole partition was rescanned once per community).
    members = {}
    for node, com in partition.items():
        G.node[node]['group'] = com + 1
        members.setdefault(com, []).append(node)

    community_ids = set(partition.values())
    subgraphs = [G.subgraph(members[com]) for com in community_ids]
    for SG in subgraphs:
        if len(SG.nodes()) > 3:
            bm, cm, dm = CentralityNoself(SG)
            G.node[bm]['central'] = 1
            G.node[cm]['central'] = 3
            G.node[dm]['central'] = 4

    return G, len(community_ids)
def detect_communities(self):
    """Store each node's Louvain community as the 'community' attribute.

    Also appends ``"community"`` to ``self.l`` and returns ``self`` so the
    call can be chained.
    """
    partition = community.best_partition(self.G)
    # One call with the whole mapping instead of one call per node.
    # NOTE: uses the networkx 1.x argument order (graph, name, values),
    # as the original did.
    nx.set_node_attributes(self.G, 'community', partition)
    self.l.append("community")
    return self
def prepare_network(df):
    """Build a weighted co-occurrence network from ``df`` and save it.

    ``df`` is re-indexed in place on its 'yearID' column.  The product
    ``df . df.T`` with its diagonal zeroed is written to 'cooc.csv';
    entries of at least 3 are kept, scaled by their maximum and used as
    edge weights.  Nodes receive their Louvain community as the 'part'
    attribute and the graph is written to
    'results/player_network.graphml' (the directory is created if needed).
    """
    df.set_index('yearID', inplace=True)

    # Create co-occurrence matrix (the mask zeroes the diagonal)
    off_diagonal = 1 - np.eye(df.shape[0])
    cooc = df.dot(df.T) * off_diagonal
    cooc.to_csv('cooc.csv')

    slicing = 3
    kept = cooc[cooc >= slicing].stack()
    scaled = kept / kept.max()
    cd_network = {pair: float(weight)
                  for pair, weight in scaled.to_dict().items()}

    player_network = nx.Graph()
    player_network.add_edges_from(cd_network)
    nx.set_edge_attributes(player_network, 'weight', cd_network)

    partition = community.best_partition(player_network)
    nx.set_node_attributes(player_network, 'part', partition)

    if not os.path.isdir('results'):
        os.mkdir('results')
    with open('results/player_network.graphml', 'wb') as ofile:
        nx.write_graphml(player_network, ofile)
def get_topics_noun_phrases(num_news, draw=False, url='http://cnn.com'):
    """Extract topics from news texts via a noun-phrase graph and Louvain.

    Texts come from ``get_news(url, num_news)``.  The topics of the graph
    partitions are printed together with the partition modularity; when
    ``draw`` is true the graph is shown coloured by partition.

    Returns ``(G, topics)``.
    """
    texts = get_news(url, num_news)
    builder = NounPhraseGraphBuilder(
        text_processing.clean_punctuation_and_stopwords)
    builder.load_texts(texts)
    G = builder.create_graph()
    print("Graph built")

    partition = community.best_partition(G)
    words_by_part = get_words_by_partition(partition)
    print_topics_from_partitions(G, words_by_part, 10)

    mod = community.modularity(partition, G)
    print("modularity:", mod)

    if draw:
        node_colors = [partition.get(node) for node in G.nodes()]
        nx.draw_spring(G, cmap=plt.get_cmap('jet'), node_color=node_colors,
                       node_size=30, with_labels=False)
        plt.show()

    return G, get_topics_from_partitions(G, words_by_part, 10)
def add_metrics(g):
    """
    Adds centrality metrics and community number attributes to each node in
    the given graph.

    Every node receives "deg_cent", "close_cent" and "between_cent".  Nodes
    in a community listed by ``get_sorted_multimember_coms`` receive "com"
    equal to that community's 1-based position in the list; all other nodes
    receive ``False``.

    Returns the graph with new node attributes.
    """
    # Each function returns a dict keyed by node id with the computed metric
    deg_cent = nx.degree_centrality(g)
    close_cent = nx.closeness_centrality(g)
    between_cent = nx.betweenness_centrality(g)
    com = community.best_partition(g)

    # Only interested in communities with more than one member - get a list
    # of multimember communities, sorted by community number
    sorted_coms = get_sorted_multimember_coms(com)

    # Renumber communities by position so numbers start at 1, e.g. if the
    # first multimember community is number 3 it becomes 0 + 1.
    # setdefault keeps the first position, matching list.index().
    com_number = {}
    for position, com_id in enumerate(sorted_coms):
        com_number.setdefault(com_id, position + 1)

    # The original looped over ``self.graph`` here, which is undefined in
    # this module-level function; the graph to annotate is ``g``.
    for vertex in g.nodes():
        attrs = g.node[vertex]
        attrs["deg_cent"] = deg_cent[vertex]
        attrs["close_cent"] = close_cent[vertex]
        attrs["between_cent"] = between_cent[vertex]
        # Nodes outside a multimember community get False as com number
        attrs["com"] = com_number.get(com[vertex], False)

    return g
def _partitions(graph):
    """
    Internal use only. Finds partitions with louvain method and returns dict

    The result maps each partition number to
    ``{'name': <member with the highest weighted degree>,
       'categories': <list of member nodes>}``.
    """
    # Get Partitions with Louvain method
    part = community.best_partition(graph)

    # Group the nodes by partition number
    parts = {}
    for item, n in part.items():
        parts.setdefault(n, []).append(item)

    # Weighted degrees are computed once; the original recomputed them for
    # every single node.
    degrees = graph.degree(weight='weight')

    res = {}
    for key, members in parts.items():
        # Use degree to find name for category (first maximum wins on ties)
        name = max(members, key=lambda item: degrees[item])
        res[key] = {'name': name, 'categories': members}
    return res
def data(self, **kw):
    """Return the friend-graph statistics as a JSON string, using a cache.

    If 'cache.json' can be read its content is returned unchanged.
    Otherwise the graph is loaded from 'graph_cache.json' or, failing
    that, retrieved with ``retrieve_foaf(FBTOKEN)``; clusters, degree
    histogram, cluster counts and top-10 degrees are computed, serialised,
    written to 'cache.json' and returned.

    The cache file is opened for writing only after the payload has been
    built, so a failure while fetching or computing no longer leaves an
    empty 'cache.json' that later calls would serve.
    """
    try:
        with open('cache.json', 'r') as data_file:
            print('Reading from cache')
            return data_file.read()
    except IOError:
        print('Fetching data')

    try:
        with open('graph_cache.json', 'r') as graph_file:
            print('Reading from graph cache')
            foaf_graph = jg.load(graph_file)
    except IOError:
        foaf_graph = retrieve_foaf(FBTOKEN)

    clusters = community.best_partition(foaf_graph)
    degree_distribution = get_histograms(foaf_graph)
    cluster_counts = get_cluster_counts(clusters)
    top10 = get_top_degree(foaf_graph, 10)
    foaf_json_graph = json.loads(jg.dumps(foaf_graph))

    infos = {
        'graph': foaf_json_graph,
        'clusters': clusters,
        'cluster_counts': cluster_counts,
        'degree_distribution': degree_distribution,
        'degree': foaf_graph.degree(),
        'top10': top10,
    }
    foaf_data = json.dumps(infos)

    with open('cache.json', 'w') as data_file:
        data_file.write(foaf_data)
    return foaf_data
def detect_communities(self, graph, users, resolution, fraction):
    """Label each node of ``graph`` with its community and classification.

    The undirected version of ``graph`` is partitioned with Louvain at the
    given ``resolution``.  Communities holding more than ``fraction`` of
    all nodes are mapped to the most common
    ``users[n].get_classification()`` among their members; every other
    community stays ``CommunityUser.UNCLASSIFIED``.  Each node receives a
    "community" and a "classification" attribute.

    Returns the same ``graph`` object (modified in place).
    """
    partitions = community.best_partition(graph.to_undirected(),
                                          resolution=resolution)
    counter = Counter(partitions.values())
    number_of_nodes = sum(counter.values())
    self._logger.info("Counter %s", counter)

    size_threshold = fraction * number_of_nodes
    communities = [(com, size) for com, size in counter.items()
                   if size > size_threshold]
    self._logger.info("Number of nodes: %d", number_of_nodes)
    self._logger.info("Number of communities to map: %d", len(communities))
    self._logger.info("Communities: %s", communities)

    partitions_to_com = {com: CommunityUser.UNCLASSIFIED for com in counter}
    # Kept from the original: only a commented-out json.dump used it.
    output = {}
    for com, _ in communities:
        com_classes = Counter(
            users[n].get_classification()
            for n, node_com in partitions.items() if node_com == com)
        self._logger.info("%d: %s", com, com_classes)
        partitions_to_com[com] = com_classes.most_common(1)[0][0]
        output[com] = (partitions_to_com[com], com_classes)

    for node in graph.nodes():
        c = partitions[node]
        attrs = graph.node[node]
        attrs["community"] = c
        attrs["classification"] = partitions_to_com[c]
    return graph
def detect_communities(graph, verbose=False):
    """Load a graph with ``graph_from_csv`` and attach Louvain partitions.

    ``graph`` is passed to ``graph_from_csv``.  Each node of the loaded
    graph gets its partition number as the 'partition' attribute; with
    ``verbose`` the number of partitions is printed.

    Returns ``(graph, partition)``.
    """
    loaded = graph_from_csv(graph)
    partition = community.best_partition(loaded)
    if verbose:
        print("%i partitions" % len(set(partition.values())))
    nx.set_node_attributes(loaded, 'partition', partition)
    return loaded, partition
def detect_communities(graph, users, resolution=1.0, fraction=0.05):
    """Label nodes with community/classification and report mismatches.

    The undirected version of ``graph`` is partitioned with Louvain.
    Communities holding more than ``fraction`` of all nodes take the most
    common ``users[n].get_classification()`` of their members; the others
    stay ``CommunityUser.UNCLASSIFIED``.  Each node receives a "community"
    and a "classification" attribute.

    When both the community label and the user's own classification differ
    from "Unclassified" and from each other, the node is printed and a
    record is appended to ``wrongs``, a name this function expects to find
    in the enclosing module.

    Returns the same ``graph`` object (modified in place).
    """
    partitions = community.best_partition(graph.to_undirected(),
                                          resolution=resolution)
    counter = Counter(partitions.values())
    number_of_nodes = sum(counter.values())
    size_threshold = fraction * number_of_nodes
    communities = [(com, size) for com, size in counter.items()
                   if size > size_threshold]

    partitions_to_com = {com: CommunityUser.UNCLASSIFIED for com in counter}
    output = {}  # built as in the original; not used afterwards
    for com, _ in communities:
        com_classes = Counter(
            users[n].get_classification()
            for n, node_com in partitions.items() if node_com == com)
        partitions_to_com[com] = com_classes.most_common(1)[0][0]
        output[com] = (partitions_to_com[com], com_classes)

    for node in graph.nodes():
        c = partitions[node]
        label = partitions_to_com[c]
        graph.node[node]["community"] = c
        graph.node[node]["classification"] = label

        # users[node] is only consulted for classified communities.
        if label == "Unclassified":
            continue
        own_class = users[node].get_classification()
        if own_class != "Unclassified" and label != own_class:
            print("%s %s %s" % (node, label, own_class))
            wrongs.append({"user": node, "louvian": label,
                           "class": own_class})
    return graph
def cluster(self):
    """Return the Louvain community of every node as a list of 1-tuples.

    The list follows the iteration order of the partition dictionary
    computed for ``self.G``.
    """
    partition = community.best_partition(self.G)
    return [(community_id,) for community_id in partition.values()]
def community_structure(G, candidates):
    """Describe the Louvain community each candidate belongs to.

    Parameters
    ----------
    G : graph to partition with Louvain.
    candidates : iterable of node ids to look up.

    Returns
    -------
    dict mapping every candidate to
    ``{'community_size': int, 'pacs_in_community': int}``, where
    'pacs_in_community' counts the community members whose id starts with
    'C0' (presumably PAC ids -- confirm with the data source).  Candidates
    that are not in the partition get 0 for both values.

    Progress ('found' / 'didnt find') is printed to standard output.
    """
    partition = community.best_partition(G)
    to_return = {candidate: {} for candidate in candidates}

    # Group the nodes by community once instead of rescanning the whole
    # partition for every community.
    members = {}
    for node, com in partition.items():
        members.setdefault(com, []).append(node)

    candidates_found = 0
    for com in set(partition.values()):
        list_nodes = members[com]
        num_pacs = sum(1 for node in list_nodes if node[0:2] == 'C0')
        for node in list_nodes:
            if node in to_return:  # dict lookup instead of a list scan
                to_return[node]['community_size'] = len(list_nodes)
                to_return[node]['pacs_in_community'] = num_pacs
                candidates_found += 1
                print('%s %s' % ('found ', node))
        if candidates_found >= len(candidates):
            break  # every candidate has been located

    for candidate in candidates:
        info = to_return[candidate]
        if 'community_size' not in info:
            info['community_size'] = 0
            print('%s %s' % ('didnt find ', candidate))
        if 'pacs_in_community' not in info:
            info['pacs_in_community'] = 0
    return to_return
def _with_networkx(documents, threshold=1):
    """Build a keyword-overlap graph of ``documents`` for the frontend.

    Two documents are linked when they share more than ``threshold``
    keywords; the edge weight is the number of shared keywords.  Louvain
    is run on the graph without isolated documents, which are added back
    afterwards without a partition.

    Returns a dict with 'nodes' (each document's ``to_json()`` output
    parsed to a dict, plus 'partition' when its '_id' was clustered) and
    'edges' (the node-link 'links' list).
    """
    G = nx.Graph()
    G.add_nodes_from(documents)

    # list(...) and the degree checks below avoid relying on networkx 1.x
    # returning lists from nodes()/neighbors().
    nodes = list(G.nodes())
    # Build each keyword set once rather than once per pair.
    keyword_sets = [set(node.keywords) for node in nodes]
    for i, node in enumerate(nodes):
        for j in range(i + 1, len(nodes)):
            shared = len(keyword_sets[i] & keyword_sets[j])
            if shared > threshold:
                G.add_edge(node, nodes[j], weight=shared)

    # remove any isolated vertices before we perform community detection
    orphans = [node for node in nodes if G.degree(node) == 0]
    G.remove_nodes_from(orphans)

    partitions = {node.r_id: value
                  for node, value in community.best_partition(G).items()}
    G.add_nodes_from(orphans)

    as_json = json_graph.node_link_data(G)
    document_nodes = [entry['id'] for entry in as_json['nodes']]
    for node in document_nodes:
        if G.degree(node) > 0:
            node.partition = partitions[node.r_id]

    frontend_nodes = [json.loads(node.to_json()) for node in document_nodes]
    for node in frontend_nodes:
        if node['_id'] in partitions:
            node['partition'] = partitions[node['_id']]

    return {'nodes': frontend_nodes, 'edges': as_json['links']}
def louvain_method(G):
    """Run Louvain on ``G``, print a short summary and return the partition.

    The summary shows the node and edge counts, the number of partitions
    and the modularity of the partition on the undirected graph.
    """
    partition = community.best_partition(G)

    node_count = len(G.nodes())
    edge_count = len(G.edges())
    print("%s %s %s %s" % ("Graph nodes:", node_count, "egdes:", edge_count))

    partition_count = len(set(partition.values()))
    score = community.modularity(partition, G.to_undirected())
    print("%s %s %s %s" % ("Partitions:", partition_count,
                           "Modularity:", score))
    print("\n\n")
    return partition
def create_3comms_bipartite(n,m,p,No_isolates=True):
    """Sample a connected random bipartite graph with three communities.

    Random bipartite graphs ``bipartite_random_graph(n, m, p)`` are drawn
    (isolated nodes removed when ``No_isolates`` is true) until the graph
    is connected and the highest Louvain label is 2.  Each rejected
    attempt prints the attempt counter and that label.
    NOTE: there is no retry limit, so unsuitable parameters loop forever.

    Returns
    -------
    (G, layer1, layer2, layer3, slayer1, slayer2, edgeList, partition):
    ``layer1..3`` are the nodes with Louvain label 0, 1 and 2,
    ``slayer1``/``slayer2`` the two bipartite node sets as lists,
    ``edgeList`` the edges joining the two sets and ``partition`` the
    Louvain node -> label dict.
    """
    import community as comm
    from networkx.algorithms import bipartite as bip

    u = 0
    while True:
        G = nx.bipartite_random_graph(n, m, p)
        if No_isolates:
            # Materialise first: isolates() may be a lazy iterator over
            # the graph being modified.
            G.remove_nodes_from(list(nx.isolates(G)))
        partition = comm.best_partition(G)
        sel = max(partition.values())
        if sel == 2 and nx.is_connected(G):
            break
        u += 1
        print("%s %s" % (u, sel))

    side_a, side_b = bip.sets(G)
    slayer1 = list(side_a)
    slayer2 = list(side_b)

    layer1 = [i for i, v in partition.items() if v == 0]
    layer2 = [i for i, v in partition.items() if v == 1]
    layer3 = [i for i, v in partition.items() if v == 2]

    # Membership is tested against sets rather than the list copies.
    set_a = set(slayer1)
    set_b = set(slayer2)
    edgeList = [e for e in G.edges()
                if (e[0] in set_a and e[1] in set_b)
                or (e[0] in set_b and e[1] in set_a)]

    return G, layer1, layer2, layer3, slayer1, slayer2, edgeList, partition
def printStats(filename):
    '''
    Converts json adjacency list into networkx to calculate and print the
    graph's
        - average clustering coefficient
        - overall clustering coefficient (transitivity)
        - number of connected components
        - diameter of the largest connected component
        - average shortest path length of the largest connected component
        - number of partitions using community.best_partition
        - modularity of community.best_partition
    '''
    g = makeGraphFromJSON(filename)
    print("Average Clustering Coefficient: %f" % nx.average_clustering(g))
    print("Overall Clustering Coefficient: %f" % nx.transitivity(g))

    # Components are computed once and reused for the largest one.
    connected_subgraphs = list(nx.connected_component_subgraphs(g))
    largest = max(connected_subgraphs, key=len)
    print("# Connected Components: %d" % len(connected_subgraphs))
    print(" Maximal Diameter: %d" % nx.diameter(largest))
    print(" Average Diameter: %f" % nx.average_shortest_path_length(largest))

    # Find partition that maximizes modularity using Louvain's algorithm
    part = community.best_partition(g)
    print("# Paritions: %d" % (max(part.values()) + 1))
    print("Louvain Modularity: %f" % community.modularity(part, g))
def evaluate():
    """Score the topics computed from ``get_texts()`` against ``true_topics``.

    A word graph is built with ``SimpleGraphBuilder`` (punctuation and
    stop-words removed, no stemming), overlapping clusters are extracted
    with ``graph_cluster.get_overlap_clusters(G, 9, 1)`` and turned into
    topics; the result of ``compute_score`` is printed.
    """
    # Alternatives that were tried: SimpleGraphBuilder or
    # NounPhraseGraphBuilder with text_processing.only_non_dictionary_words,
    # and WindowGraphBuilder with clean_punctuation_and_stopwords.
    builder = words_graph.SimpleGraphBuilder(
        text_processing.clean_punctuation_and_stopwords, stem_words=False)
    builder.load_texts(get_texts())
    G = builder.create_graph()

    # The Louvain partition is still computed as in the original, but the
    # topics below come from the overlap clusters, not from it.
    _unused_partition = community.best_partition(G)

    words_by_part = graph_cluster.get_overlap_clusters(G, 9, 1)
    computed_topics = topics.get_topics_from_partitions(G, words_by_part)
    print(compute_score(computed_topics, true_topics))
def graphmltojson(graphfile, outfile):
    """
    Converts GraphML file to json while adding communities/modularity groups
    using python-louvain. JSON output is usable with D3 force layout.

    The graph is read from ``graphfile``, converted to an undirected
    ``nx.Graph`` and each node gets its Louvain community number as the
    'group' attribute before the node-link JSON is written to ``outfile``.

    Usage:
    >>> python convert.py -i mygraph.graphml -o outfile.json
    """
    G = nx.Graph(nx.read_graphml(graphfile))

    # finds best community using louvain
    partition = community.best_partition(G)

    # adds partition/community number as attribute named 'group'
    for n, d in G.nodes(data=True):
        d['group'] = partition[n]

    node_link = json_graph.node_link_data(G)
    # Named ``serialized`` so the ``json`` module name is not shadowed.
    serialized = json_graph.dumps(node_link)

    # Write to file; ``with`` closes the handle even if the write fails.
    with open(outfile, "w") as fo:
        fo.write(serialized)
def test_karate(self):
    """Test modularity on Zachary's karate club."""
    club = nx.karate_club_graph()
    base_partition = co.best_partition(club)
    base_modularity = co.modularity(base_partition, club)
    self.assertGreater(base_modularity, 0.41)

    # Unit weights must give the same modularity as the unweighted graph.
    for u, v in club.edges_iter():
        club[u][v]["test_weight"] = 1.
    weighted_partition = co.best_partition(club, weight="test_weight")
    self.assertAlmostEqual(
        co.modularity(base_partition, club),
        co.modularity(weighted_partition, club, "test_weight"))

    # A low resolution must yield more communities than the default.
    low_res_partition = co.best_partition(club, resolution=0.1)
    self.assertLess(len(set(base_partition.values())),
                    len(set(low_res_partition.values())))
def plot_modularity(G):
    """Scatter-plot the size of every Louvain community of ``G``.

    Horizontal lines mark the mean community size and one standard
    deviation above and below it.  The figure is handed to
    ``display_graph`` with the output path ``outDir + 'communities'``.
    """
    outFile = outDir + 'communities'

    mod = community.best_partition(G)
    # Community id -> number of member nodes.  Counter replaces the manual
    # index loop over dict.values().
    modularity = collections.Counter(mod.values())
    classes = list(modularity.keys())
    sizes = list(modularity.values())

    mean = np.mean(sizes)
    std_dev = np.std(sizes)
    start = min(classes, key=int)
    end = max(classes, key=int)

    fig, ax = pylab.subplots()
    ax.scatter(classes, sizes, color=colors[0])
    pylab.axhline(mean, color=colors[2], label="mean")
    pylab.axhline(mean + std_dev, color=colors[1],
                  label="standard deviation")
    pylab.axhline(mean - std_dev, color=colors[1])
    ax.set_ylabel('Number of LPs')
    ax.set_xlabel('Modularity Class')
    ax.ticklabel_format(useOffset=False)
    ya = ax.get_yaxis()
    ya.set_major_locator(pylab.MaxNLocator(integer=True))
    # change 10 to 1 (or smaller number) if # of communities is small
    pylab.xticks(np.arange(start, end + 1, 10))
    pylab.title('Communities in LP Communication Graph')
    pylab.legend(loc='best', shadow=True)
    display_graph(outFile)
def main():
    """Build the gene graph, partition it and report its components.

    Genes are read per movie from ``read_raw_csv(finalURL)``; genes listed
    in the JSON file at ``filterURL`` are skipped.  The remaining
    lower-cased gene names are numbered, the graph is built with
    ``fill_Graph`` and its Louvain partition is passed to
    ``get_components_of_G`` together with the adjacency matrix.
    """
    genedict = read_raw_csv(finalURL)
    with open(filterURL, 'r') as filter_file:  # closed after loading
        filterGenes = json.load(filter_file)

    # Dict keys act as the collection of unique gene names.
    # NOTE: the filter is applied to the raw name, before lower-casing.
    tempdict = {}
    for movie in genedict:
        for gene in genedict[movie]['Genes']:
            if gene in filterGenes:
                continue
            tempdict[str(gene).lower()] = 0

    gtoidict = {}  # gene name -> index
    itogdict = {}  # index -> gene name
    for index, gene in enumerate(tempdict):
        gtoidict[gene] = index
        itogdict[index] = gene
    count = len(tempdict)

    G = fill_Graph(genedict, count, gtoidict, itogdict, filterGenes)
    mat = NX.to_numpy_matrix(G)
    res = C.best_partition(G)
    get_components_of_G(mat, res, gtoidict, itogdict)
def testGraph(fileName, r=1):
    """Compare distances within and between Louvain communities.

    The GraphML file is partitioned at resolution ``r``.  One distance per
    community comes from ``get_inter_nodes_distance`` and one per pair of
    communities from ``get_intra_community_distance``; both tables are
    pretty-printed.  For every pair the ratio
    ``pair_distance ** 2 / (distance[c1] * distance[c2])`` is formed.

    Returns ``(mean, standard deviation)`` of those ratios.
    """
    G = nx.read_graphml(fileName)
    partitions = community.best_partition(G, resolution=r)

    # community id -> member nodes
    inv_map = {}
    for node, com in partitions.items():
        inv_map.setdefault(com, []).append(node)

    intra_community_distance = {}
    for com, nodes_in_com in inv_map.items():
        intra_community_distance[com] = get_inter_nodes_distance(
            G, nodes_in_com)

    inter_community_distance = {}
    for pair in combinations(inv_map.keys(), 2):
        inter_community_distance[pair] = get_intra_community_distance(
            G, inv_map[pair[0]], inv_map[pair[1]])

    inter_node_ratios = {}
    for (c1, c2), distance in inter_community_distance.items():
        denominator = (intra_community_distance[c1]
                       * intra_community_distance[c2])
        inter_node_ratios[(c1, c2)] = np.power(distance, 2) / denominator

    pprint.pprint(intra_community_distance)
    pprint.pprint(inter_community_distance)

    ratios = list(inter_node_ratios.values())
    return np.average(ratios), np.std(ratios)
def t_delta_partition(t_delta_matrix,sm,verbose=False):
    """Return the Louvain partition of the symmetrised ``t_delta_matrix``.

    The graph is built from ``M + M.T - diag(M)``.  With ``verbose`` the
    graph's adjacency matrix is shown as a colour plot first.

    ``sm`` is accepted but not used by this function.

    Returns the node -> community dict from ``community.best_partition``.
    """
    import community

    symmetric = (t_delta_matrix + t_delta_matrix.T
                 - np.diag(t_delta_matrix.diagonal()))
    g = nx.to_networkx_graph(symmetric, create_using=nx.Graph())

    if verbose:
        # The original wrote ``plt.figure`` without parentheses, so no new
        # figure was ever created.
        plt.figure()
        plt.pcolor(np.array(nx.to_numpy_matrix(g)))
        plt.colorbar()
        plt.show()

    return community.best_partition(g)
def assign_community(graph):
    """Return an undirected copy of ``graph`` tagged with Louvain partitions.

    Each node of the copy gets its partition number as the "partition"
    attribute; the number of partitions found is printed.
    """
    g = nx.Graph(graph)
    partition = community.best_partition(g)
    print("%s %s" % ("Partition found: ", len(set(partition.values()))))
    for n in g.nodes_iter():
        node_attrs = g.node[n]
        node_attrs["partition"] = partition[n]
    return g
def saveCluster(outPutDirectory, subGraphs, idToApps, edgeLimit, outputName ):
    """Write the app names of each sub-graph (and its big clusters) to files.

    For every graph in ``subGraphs`` with more than one node, a file
    ``subGraphNodesFiltered_<outputName>_<int(edgeLimit)>_<k>.txt`` is
    written with one ``idToApps[node]`` per line, where ``k`` counts the
    files written so far.  Graphs with more than 50 nodes are additionally
    split with Louvain and each community ``c`` is written to
    ``subGraphNodesFilteredBigCluster_<outputName>_<int(edgeLimit)>_<c>.txt``.

    NOTE(review): the big-cluster file name does not include the graph
    counter, so several large graphs overwrite each other's files.  Left
    unchanged to keep the file names callers may rely on.
    """
    def _path(stem, index):
        # Same concatenation as before, shared by both kinds of file.
        return (str(outPutDirectory) + "/" + stem + "_" + outputName + "_"
                + str(int(edgeLimit)) + "_" + str(index) + ".txt")

    counter = 0
    for graph in subGraphs:
        if graph.number_of_nodes() > 1:
            with open(_path("subGraphNodesFiltered", counter), 'w') as output:
                for node in graph.nodes():
                    output.write(str(idToApps[node]) + "\n")
            counter += 1

        if graph.number_of_nodes() > 50:
            partition = community.best_partition(graph)
            # Group members once instead of scanning the partition twice
            # for every community id.
            members = {}
            for node, com in partition.items():
                members.setdefault(com, []).append(node)

            # Ids are visited 0, 1, 2, ... until one is missing, exactly
            # like the original scan.
            counter2 = 0
            while counter2 in members:
                big_path = _path("subGraphNodesFilteredBigCluster", counter2)
                with open(big_path, 'w') as output2:
                    for node in members[counter2]:
                        output2.write(str(idToApps[node]) + "\n")
                counter2 += 1
def InitClusterAnalysis(graph):
    """Run Louvain on ``graph`` and fill the module-level community tables.

    Sets the global ``partition`` and updates, in place, the tables this
    function expects at module level: ``comSize`` (community -> size),
    ``comMem`` (community -> member ids), ``comsizeClean`` and
    ``comMemClean`` (the same, restricted to communities with more than
    one member) and ``comMemNames`` (community -> member names from
    ``utils.GetNodeName``).  The modularity is reported through
    ``LogPrint``.
    """
    global comMemClean
    global comMemNames
    global comsizeClean
    global partition

    print("starting best partition algorithm (will take a while)....")
    partition = community.best_partition(graph)
    modularity = community.modularity(partition, graph)
    LogPrint("the modularity is %f" % modularity)

    if partition is not None:
        for member, com in partition.items():
            if com in comSize:
                comSize[com] += 1
                comMem[com].append(member)
            else:
                comSize[com] = 1
                # The original started with an empty list here, so the
                # first member of every community was counted in comSize
                # but missing from comMem.
                comMem[com] = [member]

    for com, size in comSize.items():
        if size > 1:
            print("%s %s" % ("cSize[1]=", size))
            comsizeClean[com] = size
            if len(comMem[com]) == 1:
                # Diagnostic kept from the original; with the fix above it
                # can only fire on inconsistent pre-existing table content.
                print("%s %s" % ("way this value is only one member...",
                                 comMem[com]))
            comMemClean[com] = comMem[com]

    for com, members in comMemClean.items():
        comMemNames[com] = [utils.GetNodeName(member, graph)
                            for member in members]
def getRandomPageRanks(filename):
    """Return PageRank values of a random graph matched to a GraphML file.

    A G(n, p) random graph is generated with the same number of nodes as
    the graph in ``filename`` and ``p = 2 * edges / (n * (n - 1))``.

    The original ignored ``filename`` and always read ``sys.argv[1]``; the
    argument is now used, and ``sys.argv[1]`` remains the fallback when
    ``filename`` is None.  The component, clustering, path-length and
    modularity computations of the original were removed because their
    results were never used.

    Returns the dict produced by ``nx.pagerank_numpy`` for the random graph.
    """
    source = filename if filename is not None else sys.argv[1]
    Ga = nx.read_graphml(source)

    n = Ga.number_of_nodes()
    # Guard the n < 2 case, where the formula would divide by zero.
    p = 2.0 * Ga.number_of_edges() / (n * (n - 1)) if n > 1 else 0.0

    rg = nx.fast_gnp_random_graph(n, p)
    return nx.pagerank_numpy(rg)
def analyze_graph(G):
    """Annotate every node of ``G`` with centrality metrics and a community.

    Node attributes written: 'out_degree', 'in_degree', 'betweenness',
    'eigenvector', 'closeness', 'pagerank', 'avg-neigh-degree',
    'redundancy', 'load', 'hits_hub', 'hits_authority', 'vitality' and
    'community'.

    Two defects of the original are fixed:
    * ``nx.hits`` returns a ``(hubs, authorities)`` pair; indexing that
      pair by node name could not produce a per-node score, so the two
      scores are stored as 'hits_hub' and 'hits_authority' instead of a
      single 'hits' attribute.
    * Louvain needs an undirected graph while ``G`` is directed (it
      provides in/out degrees), so the partition is computed on
      ``G.to_undirected()``.

    Returns ``G`` (modified in place).
    """
    # centralities and node metrics, computed in the original order
    out_degrees = G.out_degree()
    in_degrees = G.in_degree()
    betweenness = nx.betweenness_centrality(G)
    eigenvector = nx.eigenvector_centrality_numpy(G)
    closeness = nx.closeness_centrality(G)
    pagerank = nx.pagerank(G)
    avg_neighbour_degree = nx.average_neighbor_degree(G)
    redundancy = bipartite.node_redundancy(G)
    load = nx.load_centrality(G)
    hubs, authorities = nx.hits(G)
    vitality = nx.closeness_vitality(G)

    metrics = [
        ('out_degree', out_degrees),
        ('in_degree', in_degrees),
        ('betweenness', betweenness),
        ('eigenvector', eigenvector),
        ('closeness', closeness),
        ('pagerank', pagerank),
        ('avg-neigh-degree', avg_neighbour_degree),
        ('redundancy', redundancy),
        ('load', load),
        ('hits_hub', hubs),
        ('hits_authority', authorities),
        ('vitality', vitality),
    ]
    for name in G.nodes():
        attrs = G.node[name]
        for key, values in metrics:
            attrs[key] = values[name]

    # communities
    partitions = community.best_partition(G.to_undirected())
    for member, c in partitions.items():
        G.node[member]['community'] = c
    return G
'eleinamazing', 'A_boy_and_his_boston', 'lebbe', 'GlobTrotters', 'Nichchk', 'hellobutno', 'Moskau50', 'Turd111', 'RogueSexToy', 'Blackhk', '22_hours_ago', 'humanity_is_doomed', 'ASketchyLlama', 'leftrighttopdown', 'IronKanabo', 'ZWF0cHVzc3k', 'simian_ninja', 'Eitoku_K', 'pomelopomelo' ] user_graph, weight_dict, id_user_dic = query_api("2019-07-01", 30, 10000, july_karma_list) weight_edges = [(x, y, val) for (x, y), val in weight_dict.items()] user_graph.add_weighted_edges_from(weight_edges) july_final_karma = fake_list + july_karma_list if action == 'attack': run_attack(user_graph, july_final_karma, weight_dict) elif action == 'partition': run_partition(user_graph, july_final_karma, weight_dict) elif action == 'pruning': run_pruning(user_graph, july_final_karma, weight_dict) elif action == 'practical': # practical implementation for ele in july_karma_list: user_graph.remove_node(ele) if ele in weight_dict: del weight_dict[ele] # Community detection attack_graph = nx.read_gml("m3_oct_attack.gml") part3 = community.best_partition(attack_graph) origin_graph = nx.read_gml("m2_aug.gml") part1 = community.best_partition(origin_graph) defense_graph = nx.read_gml("m3_oct_defense.gml") part2 = community.best_partition(defense_graph)
#python src\graph-construction\louvain.py -e data/normalized/points_delaunay_Chinese_edge_full.csv -d data/normalized/louvain_dict_edge.json -o data/normalized/louvain_edge.csv parser = argparse.ArgumentParser(description='Detect communities with Louvain') parser.add_argument('--edgefile', '-f', help='csv of edges', default='data/graph_calgary_knn_20.csv') parser.add_argument('--dictfile', '-o', help='outfile', default="data/louvain_calgary_dict_knn_20.json") args = parser.parse_args() edge = pd.read_csv(args.edgefile, ' ', header=0) graph = nx.from_pandas_edgelist(edge, source='r1', target='r2') partition = community.best_partition( graph) #dictionarity of node_id --> community # # assignments = {} #dict from community # --> list of nodes in community # for part, idx in partition.iteritems(): # if idx not in assignments: # assignments[idx] = [part] # else: # assignments[idx].append(part) # # print assignments # with open(args.outfile, "w+") as f: # for idx, assignment in tqdm(assignments.iteritems()): # print len(assignment) # # print assignment # f.write(", ".join(assignment)) # f.write("\n")
def calculateModularity(network):
    """Return the modularity of the Louvain partition of ``network``."""
    return community.modularity(community.best_partition(network), network)
# create separate file to save the degree of each node degrees = cur_graph.degree() with open( infilename.replace(".csv", "") + "-degree-dist-pval" + str(alpha) + ".csv", "w") as out_file: w = csv.DictWriter(out_file, degrees.keys()) w.writeheader() w.writerow(degrees) # calculate number of nodes w/ degree > 0 and total number of edges num_nodes = len(cur_graph.nodes()) num_edges = len(cur_graph.edges()) # calculate clustering coefficients avg_clustering_coeff = nx.average_clustering(cur_graph) # calculate shortest paths avg_shortest_path_length = nx.average_shortest_path_length(cur_graph) # calculate communities and modularity value partition = community.best_partition(cur_graph) num_communities = len(set(partition.values())) modularity_value = community.modularity(partition, cur_graph) network_outfile.write( str(alpha) + "," + str(num_nodes) + "," + str(num_edges) + "," + str(avg_clustering_coeff) + "," + str(avg_shortest_path_length) + "," + str(num_communities) + "," + str(modularity_value) + "\n")
# time = datetime.now().strftime('%H:%M:%S') # print "*** Starting ploting whole graph... @:" + time # igraph.plot(g, "complete-graph.pdf", **visual_style) # time = datetime.now().strftime('%H:%M:%S') # print "*** Graph ploting ENDED ... @:" + time # make the graph with the networkX library gx = nx.Graph() # this is undirected graph # gx = nx.DiGraph() # this is directed graph gx.add_nodes_from(nodes) gx.add_edges_from(edges) # find communities time = datetime.now().strftime('%H:%M:%S') print "*** Starting community calculation... @:" + time clusters = community.best_partition(gx) time = datetime.now().strftime('%H:%M:%S') print "*** Community calculation ENDED ... @:" + time # time = datetime.now().strftime('%H:%M:%S') # print "*** Starting community ploting... @:" + time # nx.draw_spring(gx, cmap=plt.get_cmap('jet'), node_color='#A0CBE2',edge_color='#BB0000', node_size=25, with_labels=False) # plt.savefig("communities-graph.pdf") # time = datetime.now().strftime('%H:%M:%S') # print "*** Community ploting ENDED ... @:" + time # Find Communities with iGraph (had problem) # clusters = g.community_multilevel(return_levels=True) # igraph.plot(clusters, "communities-graph.png", mark_groups=True, **visual_style) # an adjacency list is a collection of unordered lists used to represent a finite graph.
# Parse the edge CSV: the first row is skipped, the first two columns are
# the endpoints and the last column is the integer weight.
rows = edgecsv.read().split('\n')
edges = [r.split(',')[:2] for r in rows[1:]]
weights = [r.split(',')[-1] for r in rows[1:]]
edge_tuples = [(e[0], e[1], int(weights[i])) for i, e in enumerate(edges)]

# Only get edges for the select nodes in the node csv.
# NOTE(review): list(node_ids) is rebuilt for every membership test; a set
# built once would avoid the repeated scans.
edges = []
for e in edge_tuples:
    if all(x in list(node_ids) for x in e[:2]):
        edges.append(e)

# Initialize graph, add nodes and edges, calculate modularity and centrality.
G = nx.Graph()
G.add_nodes_from(list(node_ids))
G.add_weighted_edges_from(edges)
groups = community.best_partition(G)
degree = cn.degree_centrality(G)
betweenness = cn.betweenness_centrality(G, weight='weight')
eigenvector = cn.eigenvector_centrality(G, weight='weight')

# Add node attributes for name, modularity, and three types of centrality.
# (networkx 1.x argument order: graph, attribute name, values)
nx.set_node_attributes(G, 'name', node_dict)
nx.set_node_attributes(G, 'group', groups)
nx.set_node_attributes(G, 'degree', degree)
nx.set_node_attributes(G, 'betweenness', betweenness)
nx.set_node_attributes(G, 'eigenvector', eigenvector)

# Create json representation of the graph (for d3).
data = json_graph.node_link_data(G)
# You could create the needed json without NetworkX (but you would forfeit network metrics).
import community
import networkx as nx
import matplotlib.pyplot as plt

# Demo: draw a random graph with each Louvain community in its own shade
# of grey.
# Better results with the karate club graph from the networkx examples:
# Erdos-Renyi graphs do not have a true community structure.
G = nx.erdos_renyi_graph(30, 0.05)

# first compute the best partition
partition = community.best_partition(G)

# drawing
community_ids = set(partition.values())
size = float(len(community_ids))
pos = nx.spring_layout(G)
for count, com in enumerate(community_ids, 1):
    list_nodes = [node for node, node_com in partition.items()
                  if node_com == com]
    # A numeric string is read by matplotlib as a grey level.
    shade = str(count / size)
    nx.draw_networkx_nodes(G, pos, list_nodes, node_size=20,
                           node_color=shade)

nx.draw_networkx_edges(G, pos, alpha=0.5)
plt.show()
def clustering_scores(args, latent, labels, cells, dataset, suffix, tlabels, louvain_num=15, prediction_algorithm="knn", X_tf=None, ensemble=False, batch_indices=None, save_cluster=False, seed=42):
    """Cluster ``latent`` with Louvain, plot the result and score it.

    A k-nearest-neighbour distance graph (``louvain_num`` neighbours) is
    built from ``latent`` and partitioned with Louvain.  A 2-D embedding
    (t-SNE or UMAP, selected by ``args.plot``) is plotted with
    ``show_tsne`` and the predicted labels are written to
    'result/<dataset>/labels.txt' and to a per-cell CSV file.

    When ``labels`` is non-empty the function returns
    ``(0, nmi_score, ari_score, 0)``.

    NOTE(review): the nesting was reconstructed from a flattened source.
    It assumes that scoring and the ``return`` belong to the ``else``
    branch (labels available), which makes the function return None when
    ``labels`` is empty -- confirm against the original file.
    NOTE(review): ``prediction_algorithm``, ``X_tf`` and ``save_cluster``
    are not used in this body, and neither is the ``distance`` import.
    """
    from scipy.spatial import distance
    vec = latent
    mat = kneighbors_graph(latent, louvain_num, mode='distance', include_self=True).todense()
    print('mat', mat.shape)

    # NOTE(review): ``alg`` is hard-coded, so the 'leiden' branch below is
    # never taken.
    alg = 'louvain'
    if alg == 'louvain':
        labels_pred = []
        G = nx.from_numpy_matrix(mat)
        partition = community.best_partition(G, random_state=seed)
        # Graph nodes are the row indices 0..n-1 of ``mat``.
        for i in range(mat.shape[0]):
            labels_pred.append(partition[i])
    elif alg == 'leiden':
        vcount = max(mat.shape)
        sources, targets = mat.nonzero()
        edgelist = zip(sources.tolist(), targets.tolist())
        g = ig.Graph(vcount, edgelist)
        partition = leidenalg.find_partition(
            g, leidenalg.ModularityVertexPartition)
        print(partition.membership)
        labels_pred = partition.membership
    labels_pred = np.array(labels_pred)

    # NOTE(review): ``embedding`` stays undefined when args.plot is neither
    # 'tsne' nor 'umap'; the UMAP branch uses a fixed random_state of 42
    # rather than ``seed``.
    if args.plot == 'tsne':
        embedding = TSNE(random_state=seed, perplexity=50).fit_transform(vec)
    elif args.plot == 'umap':
        embedding = umap.UMAP(random_state=42).fit_transform(vec)
    print('pred labels is', labels_pred.shape, np.max(labels_pred), vec[0, :5], embedding[:5], labels_pred[:100])
    print('labels is', np.array(labels).shape)
    show_tsne(
        embedding, labels_pred, 'result/%s/%s-GMVAE-%s-%s-pred.png' % (dataset, suffix, 'alpha-gan', ensemble))
    np.savetxt('result/%s/labels.txt' % (dataset), labels_pred)
    #if labels is not None:
    result_filename = 'result/%s-%d-%d-%d-cluster_result.csv' % (
        dataset, args.n_hidden, args.n_latent, louvain_num)
    if len(labels) == 0:
        # No ground truth: write predictions and embedding coordinates only.
        with open(result_filename, 'w') as f:
            f.write('cell,predicted label,tsne-1,tsne-2\n')
            for cell, pred, t in zip(cells, labels_pred, embedding):
                f.write('%s,%d,%f,%f\n' % (cell, pred, t[0], t[1]))
        if batch_indices is not None:
            print('batch', batch_indices)
            show_tsne(embedding, batch_indices, 'result/%s/%s-%s-batch.png' % (dataset, suffix, 'alpha-gan'), tlabels=batch_indices)
    else:
        show_tsne(embedding, labels,
                  'result/%s/%s-GMVAE-%s-%s-true.png' % (dataset, suffix, 'alpha-gan', ensemble), tlabels=tlabels)
        if batch_indices is None:
            with open(result_filename, 'w') as f:
                f.write('cell,tlabel id,label,predicted label,tsne-1,tsne-2\n')
                for cell, label, tlabel, pred, t in zip(
                        cells, labels, tlabels, labels_pred, embedding):
                    f.write('%s,%d,%s,%d,%f,%f\n' % (cell, label, tlabel, pred, t[0], t[1]))
        else:
            with open(result_filename, 'w') as f:
                f.write(
                    'cell,tlabel id,label,predicted label,tsne-1,tsne-2,batch\n'
                )
                for cell, label, tlabel, pred, t, batch in zip(
                        cells, labels, tlabels, labels_pred, embedding, batch_indices):
                    f.write('%s,%d,%s,%d,%f,%f,%d\n' % (cell, label, tlabel, pred, t[0], t[1], batch))
        #print(labels, labels_pred, latent)
        #asw_score = silhouette_score(latent, labels)
        # The silhouette and UCA scores are disabled and reported as 0.
        asw_score = 0
        nmi_score = NMI(labels, labels_pred)
        ari_score = ARI(labels, labels_pred)
        homo_score = homogeneity_score(labels, labels_pred)
        #uca_score = unsupervised_clustering_accuracy(labels, labels_pred)
        print(
            "Clustering Scores:\nHOMO: %.4f\nNMI: %.4f\nARI: %.4f\nUCA: %.4f"
            % (homo_score, nmi_score, ari_score, 0))
        if batch_indices is not None:
            show_tsne(embedding, batch_indices, 'result/%s/%s-%s-batch.png' % (dataset, suffix, 'alpha-gan'), tlabels=batch_indices)
        return asw_score, nmi_score, ari_score, 0
print c_g

# Degree sequences of the original graph (A) and the new graph (B_matrix).
print "##########now is the degree sequence########## "
d_g = graph_utils.get_degrees(A)
d_b = graph_utils.get_degrees(B_matrix)
print "this is the degree of the original Graph G and new graph B"
print d_b
print d_g
# Correlation of the two degree sequences: covariance divided by the
# product of the standard deviations (both with ddof=0).
# presumably ``nm`` is the numpy module -- verify against the imports
cov = nm.cov(d_b, d_g, ddof=0)[0][1]
standard_d_b = nm.std(d_b, ddof=0)
standard_d_g = nm.std(d_g, ddof=0)
ppcc = cov / (standard_d_b * standard_d_g)
print(str(ppcc))
d_pcc.append(ppcc)
print(str(cov))

# Louvain partitions of both graphs.
print "##########now is the partition##########"
p_g = community.best_partition(G)
p_b = community.best_partition(B)
c_a_g = graph_utils.average_clustering(A)
c_a_b = graph_utils.average_clustering(B_matrix)
print "this is the partition of the original Graph G and new graph B"
print p_b
print p_g
# Fraction of positions at which the two label lists agree.
# NOTE(review): the labels are compared by position in dict.values(), so
# this assumes both partitions list the nodes in the same order and use
# comparable label numbers -- confirm this is intended.
list1 = list(p_b.values())
list2 = list(p_g.values())
temp = 0
for i in range(len(list1)):
    if list1[i] == list2[i]:
        temp += 1
print float(temp) / len(list1)
partiton_ratio.append(float(temp) / len(list1))
print "this is the avg clustering of the original Graph G and new graph B"
def rodando_louvain(self, porcentagem_do_sample):
    """Run Louvain community detection on ``self.G`` and store the result.

    The similarity matrix is rebuilt first through
    ``criando_matriz_de_similaridade`` with the requested sample share
    (presumably that call is what (re)creates ``self.G`` -- confirm against
    its definition). The resulting node -> community mapping is stored in
    ``self.clusters``; nothing is returned.

    Parameters
    ----------
    porcentagem_do_sample :
        Forwarded unchanged as the ``porcentagem_do_sample`` keyword.
    """
    self.criando_matriz_de_similaridade(
        porcentagem_do_sample=porcentagem_do_sample)
    # Edge attribute 'weight' drives the modularity; node order is randomized.
    louvain_kwargs = {'weight': 'weight', 'randomize': True}
    self.clusters = community.best_partition(self.G, **louvain_kwargs)
def get_data_v3(cuda=True, n_test=1084):
    """Load the pickled graph data and derive train/validation/test node ids.

    The graph is rebuilt from ``graph.pkl`` (an edge index obtained from
    pytorch-geometric, which avoids the extra shuffling done in Kipf's code).
    Louvain communities are detected and, for every community, the node with
    the highest clustering coefficient inside the sub-graph formed by that
    community's internal edges becomes a training node. Validation nodes (as
    many as there are training nodes) and ``n_test`` test nodes are then
    sampled without replacement from the remaining nodes.

    Side effects: reads ``graph.pkl``, ``feature.pkl`` and ``label.pkl`` from
    the working directory, writes the pickled test ids to
    ``test_labels_infomap.txt`` and the training ids to
    ``training_labels_infomap.txt``, and prints progress information.

    NOTE: unpickling executes arbitrary code; only load trusted files.

    Parameters
    ----------
    cuda : bool
        When true, ``.cuda()`` is called on the feature, adjacency and label
        tensors. Those tensors are built but not returned (the return that
        exposed them is disabled in this version).
    n_test : int
        Number of test nodes to sample. Defaults to 1084, the value that was
        previously hard-coded.

    Returns
    -------
    (idx_train, idx_test, idx_val) : tuple of numpy.ndarray
        Node ids of each split. Note the order: test comes before validation.

    Raises
    ------
    ValueError
        From ``numpy.random.choice`` when fewer nodes remain than requested,
        or from ``max`` when a community has no internal edge.
    """
    with open("graph.pkl", "rb") as fh:
        edge_index = pk.load(fh)
    row, col = edge_index
    edges = [(int(u), int(v)) for u, v in zip(row.tolist(), col.tolist())]
    g = nx.Graph()
    g.add_edges_from(edges)
    print("Graph Read ")

    nnodes = nx.number_of_nodes(g)
    nodes = nx.nodes(g)
    core_values = set(nx.core_number(g).values())
    print("core numbers of original graph", len(core_values))
    print("number of nodes--", nnodes)
    cut = int(0.1 * nnodes)
    print("cut value--", cut)

    # Adjacency with self-loops added. `np.float` was only an alias of the
    # builtin `float` and no longer exists in recent NumPy releases.
    adj = nx.to_numpy_array(g, dtype=float)
    adj = adj + np.eye(adj.shape[0])
    adj = sp.sparse.coo_matrix(adj)
    print("Adjacency Made")
    adj = torch.FloatTensor(adj.todense())

    with open("feature.pkl", "rb") as fh:
        features = pk.load(fh)
    features = torch.FloatTensor(normalize_features(features.numpy()))
    print("Features Normalized ")

    with open("label.pkl", "rb") as fh:
        labels = pk.load(fh)

    # Community detection -- Infomap.
    # NOTE(review): the Infomap modules are not consumed by any live code in
    # this function (only by experiments that were commented out); the run is
    # kept so the sequence of library calls is unchanged. Drop it if unneeded.
    info = infomap.Infomap("--two-level --silent -s 8")
    for e in g.edges():
        info.addLink(*e)
    info.run()

    # Community detection -- Louvain: node -> community id.
    partition = community.best_partition(g)
    com = defaultdict(list)  # community id -> member nodes
    for node, com_id in partition.items():
        com[com_id].append(node)
    print("number of communities detected")
    print(len(com))

    # Meta graph: one node per community, one edge per pair of distinct
    # communities joined by at least one original edge.
    meta_nodes = list(com)
    meta_edge = {(partition[u], partition[v])
                 for u, v in edges
                 if partition[u] != partition[v]}
    meta_net = nx.Graph()
    meta_net.add_nodes_from(meta_nodes)
    meta_net.add_edges_from(meta_edge)
    print("meta graph formed")
    print("number of meta nodes", nx.number_of_nodes(meta_net))
    print("number of meta edges", meta_net.number_of_edges())

    # One training node per community: the member with the highest clustering
    # coefficient among the community's internal edges. The set intersection
    # is kept in its original form on purpose: it fixes the edge insertion
    # order of `in_net`, which decides ties in `max`.
    train_ids = []
    edge_set = set(edges)
    for m in meta_nodes:
        perm = set(permutations(com[m], 2))
        in_edges = edge_set.intersection(perm)
        in_net = nx.Graph()
        in_net.add_edges_from(in_edges)
        in_clus = nx.clustering(in_net)
        train_ids.append(max(in_clus.items(), key=operator.itemgetter(1))[0])

    # Remaining nodes, in graph order, feed the validation and test samples.
    train_lookup = set(train_ids)
    rm_ids = [n for n in nodes if n not in train_lookup]
    val_ids = np.random.choice(rm_ids, len(train_ids), replace=False)
    val_lookup = set(val_ids.tolist())
    r_ids = [n for n in rm_ids if n not in val_lookup]
    test_ids = np.random.choice(r_ids, n_test, replace=False)

    with open("test_labels_infomap.txt", 'wb') as fp:
        pk.dump(test_ids, fp)
    with open("training_labels_infomap.txt", "wb") as fp:
        pk.dump(train_ids, fp)

    idx_train = np.array(train_ids)
    idx_val = np.array(val_ids)
    idx_test = np.array(test_ids)
    print("Train Validation Test ", len(idx_train), len(idx_val),
          len(idx_test))

    if cuda:
        features = features.cuda()
        adj = adj.cuda()
        labels = labels.cuda()
    return idx_train, idx_test, idx_val
g = nx.read_graphml(sys.argv[1])

# Self-loops are dropped before any statistic is computed.
g.remove_edges_from(g.selfloop_edges())

# Guard: nothing can be measured on a graph without nodes.
if g.order() == 0:
    print('The graph contains no nodes')
    sys.exit(1)

try:
    # Louvain modularity score of the entire graph.
    if g.size() > 0:
        part = com.best_partition(g)
        mod = com.modularity(part, g)
    else:
        # No edges: the modularity is taken to be 1 by convention here.
        mod = 1.0
except Exception as e:
    print('An exception occurred during modularity analysis')
    exc_type, exc_obj, exc_tb = sys.exc_info()
    # Report the exception type plus the file name and line of this handler's
    # frame, then abort.
    fname = os.path.basename(exc_tb.tb_frame.f_code.co_filename)
    print('%s %s %s' % (exc_type, fname, exc_tb.tb_lineno))
    sys.exit(1)

# count isolates
n_iso = len(nx.isolates(g))
# mean degree
mean_deg = np.mean(nx.degree(g).values())
# median degree
def plot_graph(in_dir, item):
    '''Plot the communities graph of one item type and return the graph.

    Nodes and edges are extracted from the json file coocnetworks.json
    (looked up in `in_dir`), built by the function describe_corpus:
        {
        "nodes":[
                 {"type":"AU","name":0,"item":"Abanades S","size":8},
                 ...................................................
                ],
        "links":[
                 {"type":"AU","source":0,"target":8,"Ncooc":5},
                 ..............................................
                ]
        }
    where type = "AU", "S", "I", "CU", "S2", "IK", "AK", "TK", "R", "RJ"

    Only the nodes and links whose "type" equals `item` are used. The Louvain
    partition of the graph is computed, the graph is drawn with one colour per
    community, and the members of every community are printed.

    Args:
        in_dir: folder (str or Path) containing coocnetworks.json.
        item: type label to plot; must be one of VALID_LABEL_GRAPH.

    Returns:
        The networkx graph G. Nodes listed in the json carry the attributes
        'item' and 'size'; every node carries 'community_id'.

    Raises:
        AssertionError: if `item` is not in VALID_LABEL_GRAPH.
    '''

    # Standard library imports
    import json
    import pprint
    from pathlib import Path

    # 3rd party imports
    import community as community_louvain
    import matplotlib.pyplot as plt
    import networkx as nx
    import numpy as np
    import pandas as pd

    # Local imports
    from .BiblioSpecificGlobals import LABEL_MEANING
    from .BiblioSpecificGlobals import VALID_LABEL_GRAPH

    # The message used to reference an undefined name, which turned an invalid
    # `item` into a NameError instead of this explanation.
    assert (item in VALID_LABEL_GRAPH),\
        f'unknown type {item}: should be {", ".join(VALID_LABEL_GRAPH)}'

    # Extract the nodes and edges of type `item` from coocnetworks.json
    # -----------------------------------------------------------
    file_coocnetworks = Path(in_dir) / 'coocnetworks.json'
    with open(file_coocnetworks, 'r') as read_file:
        cooc = json.load(read_file)

    links = pd.DataFrame(cooc['links']).query('type==@item')
    G = nx.from_pandas_edgelist(links, source='source', target='target')

    nodes = pd.DataFrame(cooc['nodes']).query('type==@item')
    G.add_nodes_from(nodes['name'])
    for _, row in nodes.iterrows():
        row_dict = row.to_dict()
        G.nodes[row['name']].update(
            {key: row_dict[key] for key in ('item', 'size')})

    # compute the best partition
    partition = community_louvain.best_partition(G)
    nx.set_node_attributes(G, partition, 'community_id')

    # draw the graph
    pos = nx.spring_layout(G)
    node_size = np.array(list(nx.get_node_attributes(G,
                                                     'size').values())) * 70
    # pyplot.get_cmap is used because matplotlib.cm.get_cmap is no longer
    # available in recent Matplotlib releases.
    cmap = plt.get_cmap('viridis', max(partition.values()) + 1)
    plt.figure(figsize=(15, 15))
    nx.draw_networkx_nodes(G,
                           pos,
                           partition.keys(),
                           node_size=node_size,
                           cmap=cmap,
                           node_color=list(partition.values()))
    nx.draw_networkx_edges(
        G,
        pos,
        alpha=0.9,
        width=1.5,
        edge_color='k',
        style='solid',
    )
    nx.draw_networkx_labels(G, pos=pos, font_size=8, font_color='w')
    plt.title(
        f'Graph partition using the {LABEL_MEANING[item]} and the Louvain algorithm'
    )
    plt.show()

    # Print the node id -> item mapping, then the items of every community.
    node = nx.get_node_attributes(G, 'item')
    pprint.pprint(node)
    members = pd.DataFrame({
        node_id: [num_partition, node[node_id]]
        for node_id, num_partition in partition.items()
    }).T
    # Group on the scalar column label: grouping on the one-element list [0]
    # yields tuple keys such as (0,) on recent pandas.
    for num_partition, group in members.groupby(0):
        print(f'N° partition:{num_partition}, items: {group[1].to_list()}')

    # No explicit `del` of the locals: they are released on return, and the
    # former `del` failed when the node loop above never ran.
    return G
def louvain(adata, resolution=None, random_state=0, restrict_to=None,
            key_added=None, adjacency=None, flavor='vtraag', directed=True,
            copy=False):
    """Cluster cells into subgroups [Blondel08]_ [Levine15]_ [Traag17]_.

    Cluster cells using the Louvain algorithm [Blondel08]_ in the
    implementation of [Traag17]_. The Louvain algorithm has been proposed for
    single-cell analysis by [Levine15]_.

    This requires to run :func:`~scanpy.api.pp.neighbors`, first (unless an
    `adjacency` is passed explicitly).

    Parameters
    ----------
    adata : :class:`~scanpy.api.AnnData`
        The annotated data matrix.
    resolution : `float` or `None`, optional (default: 1)
        For the default flavor ('vtraag'), you can provide a resolution
        (higher resolution means finding more and smaller clusters), which
        defaults to 1.0. It is not passed on for the other flavors; for
        'igraph' a warning is logged when it is given.
    random_state : `int`, optional (default: 0)
        Change the initialization of the optimization. Only used to seed the
        `louvain` package, i.e. for flavor 'vtraag'.
    restrict_to : `tuple`, optional (default: None)
        Restrict the clustering to the categories within the key for sample
        annotation, tuple needs to contain (obs key, list of categories).
        The categories need to be strings.
    key_added : `str`, optional (default: 'louvain')
        Key under which to add the cluster labels. When `restrict_to` is
        given, the default is the obs key of `restrict_to` followed by '_R'.
    adjacency : sparse matrix or `None`, optional (default: `None`)
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']`.
    flavor : {'vtraag', 'igraph', 'taynaud'}
        Choose between packages for computing the clustering. 'vtraag' is
        much more powerful. 'taynaud' is deprecated.
    directed : `bool` (default: `True`)
        Passed on when building the igraph graph from the adjacency. It is
        forced to `False` for flavor 'igraph' and not used for 'taynaud'.
    copy : `bool` (default: `False`)
        Copy adata or modify it inplace.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following fields.

    louvain : `pd.Series` (``adata.obs``, dtype `category`)
        Array of dim (number of samples) that stores the subgroup id ('0',
        '1', ...) for each cell. Stored under `key_added`.

    In addition, ``adata.uns['louvain']['params']`` records the `resolution`
    and `random_state` that were used.

    Raises
    ------
    ValueError
        If no neighborhood graph is available and no `adjacency` is given, if
        the first category of `restrict_to` is not a string, if a category of
        `restrict_to` is not a category of the given obs key, or if `flavor`
        is not one of the accepted values.
    """
    logg.info('running Louvain clustering', r=True)
    adata = adata.copy() if copy else adata
    if adjacency is None and 'neighbors' not in adata.uns:
        raise ValueError(
            'You need to run `pp.neighbors` first to compute a neighborhood graph.'
        )
    if adjacency is None:
        adjacency = adata.uns['neighbors']['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        # Only the first category is type-checked.
        if not isinstance(restrict_categories[0], str):
            raise ValueError('You need to use strings to label categories, '
                             'e.g. \'1\' instead of 1.')
        for c in restrict_categories:
            if c not in adata.obs[restrict_key].cat.categories:
                raise ValueError(
                    '\'{}\' is not a valid category for \'{}\''.format(
                        c, restrict_key))
        # Keep only the rows and columns of the selected observations.
        restrict_indices = adata.obs[restrict_key].isin(
            restrict_categories).values
        adjacency = adjacency[restrict_indices, :]
        adjacency = adjacency[:, restrict_indices]
    if flavor in {'vtraag', 'igraph'}:
        if flavor == 'igraph' and resolution is not None:
            logg.warn(
                '`resolution` parameter has no effect for flavor "igraph"')
        # The 'igraph' flavor always runs on the undirected graph.
        if directed and flavor == 'igraph':
            directed = False
        if not directed:
            logg.m(' using the undirected graph', v=4)
        g = utils.get_igraph_from_adjacency(adjacency, directed=directed)
        if flavor == 'vtraag':
            import louvain
            if resolution is None:
                resolution = 1
            try:
                logg.info(' using the "louvain" package of Traag (2017)')
                louvain.set_rng_seed(random_state)
                part = louvain.find_partition(
                    g, louvain.RBConfigurationVertexPartition,
                    resolution_parameter=resolution)
                # adata.uns['louvain_quality'] = part.quality()
            except AttributeError:
                # An AttributeError from the calls above is treated as an
                # older `louvain` package; fall back to the call style used
                # below, without seeding.
                logg.warn('Did not find package louvain>=0.6, '
                          'the clustering result will therefore not '
                          'be 100% reproducible, '
                          'but still meaningful. '
                          'If you want 100% reproducible results, '
                          'update via "pip install louvain --upgrade".')
                part = louvain.find_partition(g, method='RBConfiguration',
                                              resolution_parameter=resolution)
        elif flavor == 'igraph':
            part = g.community_multilevel()
        groups = np.array(part.membership)
    elif flavor == 'taynaud':
        # this is deprecated
        import networkx as nx
        import community
        g = nx.Graph(adjacency)
        partition = community.best_partition(g)
        # The partition maps node index -> community id; turn it into an
        # array indexed by node.
        groups = np.zeros(len(partition), dtype=int)
        for k, v in partition.items():
            groups[k] = v
    else:
        raise ValueError(
            '`flavor` needs to be "vtraag" or "igraph" or "taynaud".')
    unique_groups = np.unique(groups)
    n_clusters = len(unique_groups)
    if restrict_to is None:
        groups = groups.astype('U')
        key_added = 'louvain' if key_added is None else key_added
        adata.obs[key_added] = pd.Categorical(values=groups,
                                              categories=natsorted(
                                                  unique_groups.astype('U')))
    else:
        key_added = restrict_key + '_R' if key_added is None else key_added
        # Start from the existing annotation and overwrite only the restricted
        # observations with '<joined categories>,<cluster id>' labels.
        all_groups = adata.obs[restrict_key].astype('U')
        prefix = '-'.join(restrict_categories) + ','
        new_groups = [prefix + g for g in groups.astype('U')]
        all_groups.iloc[restrict_indices] = new_groups
        adata.obs[key_added] = pd.Categorical(values=all_groups,
                                              categories=natsorted(
                                                  all_groups.unique()))
    adata.uns['louvain'] = {}
    adata.uns['louvain']['params'] = {
        'resolution': resolution,
        'random_state': random_state
    }
    logg.info(' finished', time=True,
              end=' ' if settings.verbosity > 2 else '\n')
    logg.hint('found {} clusters and added\n'
              ' \'{}\', the cluster labels (adata.obs, categorical)'.format(
                  n_clusters, key_added))
    return adata if copy else None
for i in grp: for j in grp[c + 1:]: weights = df[df['nodes'] == i][j].values[0] all_graphs[lab].add_edge(i, j, weight=weights) c += 1 return # Creates edges between nodes and assignes weights to edges for i in all_graphs.keys(): add_edges_with_weights(all_grps[i], all_dfs[i], i, all_graphs) #%% # Using python-louvain package allows to find best communities(partition) for nodes for i in all_graphs.keys(): partition = community_louvain.best_partition(all_graphs[i]) all_dfs[i]['partition'] = all_dfs[i].nodes.apply( lambda node: partition[node]) #%% def get_nodes_by_partition(df): """ Pulls indices of nodes within the same community(partition) Arguments: df: all_dfs['label'] -> 'label' is written as grp0 to grp43 Returns: Dict of nodes organized by community(partition) """ temp_dict = {}
def train(d1, d2):
    """Build a file/domain download graph and extract per-file features.

    For every data file number in ``range(d1, d2)`` the tab-separated file
    ``data/Obf_oneInTenWeek1_d<i>.tsv`` is read. A weighted graph is built with
    an edge between a file sha1 and the domain it was downloaded from; the
    weight is the number of distinct machines seen for that pair. Louvain
    communities of the graph are computed, several histograms are shown and
    saved as images, and one feature row per retained file is produced.

    The helpers ``sort_dic``, ``sort_dic_rev`` and ``graph`` are defined
    outside this block; their exact behaviour is not visible here.

    Parameters
    ----------
    d1, d2 : int
        First (inclusive) and last (exclusive) data file number.

    Returns
    -------
    train_X : list
        Feature rows (every entry of a ``final_dic`` row except the last).
    train_y : list
        Labels: 0 for rows built from ``clean_dict``, 1 for rows built from
        ``malicious_dict``.
    final_dic : dict
        sha1 -> full row: [sha1, size, number of machines, share of files in
        the cluster, number of same-cluster domains, cluster, dirty percent of
        the cluster, community size, label].
    """
    file_to_machines_dic = {}
    clean_dict = {}
    unknown_set = set()
    file_sha1_to_size = {}
    fileAndDomain_to_machines_dic = {}
    data_name = 'Obf_oneInTenWeek1_d'
    suffix = '.tsv'
    G = nx.Graph()

    def add_edge(u, v, w):
        # Accumulate the weight when the edge already exists.
        if G.has_edge(u, v):
            G[u][v]['weight'] += w
        else:
            G.add_edge(u, v, weight=w)

    for i in range(d1, d2):
        print('Running data number - {}'.format(i))
        # NOTE(review): `error_bad_lines` is not accepted by pandas >= 2.0;
        # the replacement there is on_bad_lines='skip'.
        data = pd.read_csv(Path().joinpath('data',
                                           data_name + str(i) + suffix),
                           sep='\t',
                           error_bad_lines=False,
                           index_col=False,
                           dtype='unicode')
        data = data.sort_values(by=data.columns[0])
        print('num of rows in data', len(data))
        # instead of using names we will use sha1
        # Number of distinct machines file was downloaded to from this domain. this will be the weight of and edge
        # name = data.columns[0]
        start = time.time()  # just to know how much time it runs.
        # fileAndDomain_to_machines_dic key:val -> (key) file&domain : (val) num of machines
        # Columns are addressed by position.
        sha1 = data.columns[3]
        domain = data.columns[17]
        threat = data.columns[20]
        size = data.columns[24]
        machine = data.columns[13]
        fileAndDomain_to_machines_dic = {}
        # (file, domain) -> list of machines that downloaded it.
        for index, row in data.iterrows():
            file_sha1 = row[sha1]
            file_domain = row[domain]
            machine_guid = row[machine]
            fileAndDomain_to_machines_dic[(
                file_sha1, file_domain)] = fileAndDomain_to_machines_dic.get(
                    (file_sha1, file_domain), []) + [machine_guid]
        # Rows whose threat value is a string go to file_to_machines_dic, all
        # other rows to clean_dict.
        for index, row in data.iterrows():
            file_sha1 = row[sha1]
            machine_guid = row[machine]
            file_threat = row[threat]
            if isinstance(file_threat, str):
                file_to_machines_dic[file_sha1] = file_to_machines_dic.get(
                    file_sha1, []) + [machine_guid]
            else:
                clean_dict[file_sha1] = clean_dict.get(file_sha1,
                                                       []) + [machine_guid]
        for index, row in data.iterrows():
            file_sha1 = row[sha1]
            file_size = row[size]
            file_sha1_to_size[file_sha1] = file_size
        # Replace every machine list by its number of distinct machines and
        # add the edges of this data file to the graph.
        for key, val in fileAndDomain_to_machines_dic.items():
            fileAndDomain_to_machines_dic[key] = len(list(set(val)))
        fileAndDomain_to_machines_dic = sort_dic(fileAndDomain_to_machines_dic)
        for (file_sha1,
             file_domain), weight in fileAndDomain_to_machines_dic.items():
            add_edge(file_sha1, file_domain, weight)

    # Machine lists -> number of distinct machines, over all data files.
    for key, val in file_to_machines_dic.items():
        file_to_machines_dic[key] = len(list(set(val)))
    for key, val in clean_dict.items():
        clean_dict[key] = len(list(set(val)))
    # Keep files seen on more than 4 (malicious) / 8 (clean) machines.
    malicious_dict = {k: v for k, v in file_to_machines_dic.items() if v > 4}
    clean_dict = {k: v for k, v in clean_dict.items() if v > 8}
    print('number of malicious files:', len(malicious_dict))
    print('number clean files', len(clean_dict))
    # A file present in both dicts is treated as malicious only.
    counter = 0
    for key, val in malicious_dict.items():
        if key in clean_dict.keys():
            del clean_dict[key]
            counter += 1
    print(counter)
    # NOTE(review): nothing in this function ever adds to `unknown_set`, so
    # `sha1_set` is empty here. Every `in sha1_set` test below is then False:
    # all nodes are counted as domains, `files_per_cluster` stays empty and
    # the `cluster_per_file[...]` lookups at the end raise KeyError as soon as
    # `clean_dict` or `malicious_dict` is non-empty. Presumably every file
    # sha1 was meant to be added while reading the rows -- confirm.
    sha1_set = unknown_set.copy()
    print('unknown_set before cleaning %d' % len(unknown_set))
    for file_sha1 in unknown_set.copy():
        if file_sha1 in clean_dict or file_sha1 in malicious_dict:
            unknown_set.remove(file_sha1)
    print('unknown_set after cleaning %d' % len(unknown_set))
    print("Num of nodes in G {}".format(len(G)))
    print('Number of edges in G %s' % (G.number_of_edges()))
    lst = list(G.degree)
    avg_degree = 0
    max_degree = 0
    for (item, deg) in lst:
        if deg > max_degree:
            max_degree = deg
        avg_degree += deg
    print('avg degree:', round(avg_degree / len(G), 2))
    print('max deg:', max_degree)
    # print('Average degree G %s' %(np.mean(nx.degree_histogram(G))))
    # now we have a graph G which has a edges between files and the domain it was downloaded from, with weight
    # which is the number of unique machines which downloaded the file from this domain.
    # this is just a print out of the weight of each edge.
    edge_to_weights_dic = nx.get_edge_attributes(G, 'weight')
    edge_to_weights_dic = sort_dic(edge_to_weights_dic)
    # for key, value in attr.items():
    #     print(key, ' : ', value)
    weight_array = np.array(
        [edge_to_weights_dic[k] for k in edge_to_weights_dic])
    print('average weight: ', weight_array.mean())
    print('max weight :', np.amax(weight_array))
    # print("len is ", len(attr))
    degree_sequence = sorted([d for n, d in G.degree()],
                             reverse=True)  # degree sequence
    file_sha1_to_degree_dict = {}
    domain_to_degree_dict = {}
    graph(degree_sequence, "Degree Histogram")
    # Split the degrees into file nodes and domain nodes.
    for n, d in G.degree():
        if n in sha1_set:
            file_sha1_to_degree_dict[n] = d
        else:
            domain_to_degree_dict[n] = d
    # file graph for degree
    file_degree_lst = sorted(list(file_sha1_to_degree_dict.values()))
    domain_degree_list = sorted(list(domain_to_degree_dict.values()))
    graph(file_degree_lst, "File Degree Histogram")
    graph(domain_degree_list, "Domain Degree Histogram")

    # Louvain communities on the weighted graph: node -> cluster id.
    partition = community.best_partition(G, weight='weight')
    partition = sort_dic(partition)
    domain_per_cluster = {}
    files_per_cluster = {}
    for key, val in partition.items():
        if key in sha1_set:
            files_per_cluster[val] = files_per_cluster.get(val, []) + [key]
        else:
            domain_per_cluster[val] = domain_per_cluster.get(val, []) + [key]
    # NOTE(review): the last value of the sorted partition is used as the
    # number of communities; presumably it is the largest cluster id, which
    # would be one less than the count if ids start at 0 -- confirm against
    # sort_dic.
    print('total communities :', list(partition.values())[-1])
    print('average community size:', len(G) / list(partition.values())[-1])
    # cluster id -> number of nodes in that cluster.
    max_community_size_dict = {}
    for community_index in partition.values():
        max_community_size_dict[community_index] = max_community_size_dict.get(
            community_index, 0) + 1
    max_community_size_dict = sort_dic_rev(max_community_size_dict)
    print('Max community size:', list(max_community_size_dict.values())[0])
    # Percentage of malicious files per cluster, in files_per_cluster order.
    dirty_precent_per_cluster_lst = []
    for file_list in files_per_cluster.values():
        file_list_len = len(file_list)
        counter = 0
        for file in file_list:
            if file in malicious_dict.keys():
                counter += 1
        dirty_precent_per_cluster_lst.append(
            int(round((counter / file_list_len), 2) * 100))
    print(sorted(dirty_precent_per_cluster_lst, reverse=True))
    machines_per_cluster = {}
    file_to_list_of_domains_per_cluster_dic = {}
    # NOTE(review): the two dicts are zipped by insertion order, not by
    # cluster id, so a files list and a domains list are only guaranteed to
    # belong to the same cluster if both dicts hold the same ids in the same
    # order. `index` is the position in that zip, not the cluster id.
    for index, (files_list, domains_list) in enumerate(
            zip(files_per_cluster.values(), domain_per_cluster.values())):
        for file in files_list:
            for domain in domains_list:
                if G.has_edge(file, domain):
                    file_to_list_of_domains_per_cluster_dic[
                        file] = file_to_list_of_domains_per_cluster_dic.get(
                            file, []) + [domain]
                    machines_per_cluster[index] = machines_per_cluster.get(
                        index, 0) + G[file][domain]['weight']
    machines_per_cluster = sort_dic(machines_per_cluster)
    # print(*machines_per_cluster.items(), sep='\n')
    domain_to_dirty_precent = {}
    cluster_to_file_precent_in_cluster = {}
    for index, (files_list, domains_list) in enumerate(
            zip(files_per_cluster.values(), domain_per_cluster.values())):
        cluster_to_file_precent_in_cluster[index] = len(files_list) / (
            len(files_list) + len(domains_list))
        for domain in domains_list:
            domain_total_files_counter = 0
            domain_dirty_files_counter = 0
            for file in files_list:
                if G.has_edge(domain, file):
                    domain_total_files_counter += 1
                    if file in malicious_dict.keys():
                        domain_dirty_files_counter += 1
            # print('%s / %s' % (domain_dirty_files_counter, domain_total_files_counter))
            # NOTE(review): raises ZeroDivisionError when the domain has no
            # edge to any file of the paired files list.
            domain_to_dirty_precent[domain] = int(
                round(
                    (domain_dirty_files_counter / domain_total_files_counter),
                    2) * 100)
    domain_to_dirty_precent = sort_dic(domain_to_dirty_precent)
    # print(*domain_to_dirty_precent.items(), sep='\n')
    # dirty percentage -> number of domains with that percentage.
    dirty_precent_domains = {}
    for domain, percent in domain_to_dirty_precent.items():
        dirty_precent_domains[percent] = dirty_precent_domains.get(percent,
                                                                   0) + 1
    percent, counter = zip(
        *dirty_precent_domains.items())  # creating 2 arrays of keys , values
    fig, ax = plt.subplots(figsize=(8, 8))
    plt.bar(percent, counter, color='blue')
    plt.yscale('log')
    plt.title("Amount of domains with the number of dirty files percentage")
    plt.ylabel("Amount of domains")
    plt.xlabel("dirty file percentage")
    # ticks = np.arange(0, 105, 5)
    # ax.set_xticks(ticks)
    # ax.set_xticklabels(ticks)
    plt.savefig('graph_dirty_percent_domains.png')
    plt.show()
    # the amount of clusters with different amount of machines
    values_to_machines = {}
    for key, val in machines_per_cluster.items():
        values_to_machines[val] = values_to_machines.get(val, 0) + 1
    print(*values_to_machines.items(), sep='\n')
    fig, ax = plt.subplots(figsize=(9, 9))
    # values_to_machines = sort_dic(values_to_machines)
    t = np.arange(0., len(values_to_machines), 1)
    y = [val for val in values_to_machines.values()]
    plt.plot(t, y, 'r')
    plt.xlabel('Values')
    plt.yscale('symlog')
    plt.ylabel('Amount of clusters for value X')
    ticks = np.arange(1, 223, 20)
    ax.set_xticks(ticks)
    ax.set_xticklabels(ticks)
    plt.savefig('values_to_machines')
    plt.show()
    # dirty percentage -> number of clusters, sorted by percentage.
    dirty_per_percent_dict = {}
    for percent in dirty_precent_per_cluster_lst:
        dirty_per_percent_dict[percent] = dirty_per_percent_dict.get(
            percent, 0) + 1
    dirty_per_percent_dict = {
        k: dirty_per_percent_dict[k]
        for k in sorted(dirty_per_percent_dict)
    }
    percent, cnt = zip(*dirty_per_percent_dict.items())
    fig, ax = plt.subplots(figsize=(8, 8))
    plt.bar(percent, cnt, color='green')
    plt.yscale('symlog')
    plt.title("Clusters dirty percentage Histogram")
    plt.ylabel("Amount of clusters with 'x' dirty files percentage")
    plt.xlabel("percetage")
    plt.savefig('graph_dirty_percent_clusters.png')
    plt.show()
    # feature_ extraction
    # NOTE(review): this dict and cluster_to_file_precent_in_cluster are keyed
    # by enumerate position, while cluster_per_file holds partition cluster
    # ids; the lookups below mix the two -- confirm they coincide.
    cluster_number_to_malicious_percent_dic = {
        index: percent
        for index, percent in enumerate(dirty_precent_per_cluster_lst)
    }
    cluster_per_file = {}
    for cluster_index, files_list in files_per_cluster.items():
        for file_sha1 in files_list:
            cluster_per_file[file_sha1] = cluster_index
    final_dic = {}
    # Rows for clean files (label 0).
    for file_sha1, num_of_guid in clean_dict.items():
        final_dic[file_sha1] = [
            file_sha1,
            int(file_sha1_to_size[file_sha1]), num_of_guid,
            cluster_to_file_precent_in_cluster[cluster_per_file[file_sha1]],
            len(file_to_list_of_domains_per_cluster_dic[file_sha1]),
            cluster_per_file[file_sha1],
            cluster_number_to_malicious_percent_dic[
                cluster_per_file[file_sha1]],
            max_community_size_dict[cluster_per_file[file_sha1]], 0
        ]
    # Rows for malicious files (label 1).
    for file_sha1, num_of_guid in malicious_dict.items():
        final_dic[file_sha1] = [
            file_sha1,
            int(file_sha1_to_size[file_sha1]), num_of_guid,
            cluster_to_file_precent_in_cluster[cluster_per_file[file_sha1]],
            len(file_to_list_of_domains_per_cluster_dic[file_sha1]),
            cluster_per_file[file_sha1],
            cluster_number_to_malicious_percent_dic[
                cluster_per_file[file_sha1]],
            max_community_size_dict[cluster_per_file[file_sha1]], 1
        ]
    train_X = []
    train_y = []
    # Split every row into features (all but the last entry) and label.
    for file_sha1, features in final_dic.items():
        train_X.append(features[:-1])
        train_y.append(final_dic[file_sha1][-1])
    return train_X, train_y, final_dic
# global variables (command-line arguments, in positional order)
inputf = sys.argv[1]
similarity = sys.argv[2]
weighting = sys.argv[3]
tokenization = sys.argv[4]
alpha = float(sys.argv[5])
outputf = sys.argv[6]

# read documents: one document per input line, line terminators stripped.
# The file is closed as soon as it has been read.
with open(inputf, "r") as f:
    documents = [line.rstrip('\r\n') for line in f]

# generate graph
G = getsimmatrix()

print('calling community detection algorithm')
import community
# NOTE(review): `alpha` is set as an attribute on the community module;
# presumably a locally modified version of the module reads it -- confirm.
community.alpha = alpha
clustering = community.best_partition(G)

# Write the community id of every document, one per line, in input order.
# Graph nodes are addressed by document index.
with open(outputf, 'w') as f:
    for i in range(len(documents)):
        f.write(str(clustering[i]) + '\n')
def generate_community_corpus(self, method=None):
    """Assign a community/topic label to every node occurrence in the corpus.

    The result is stored in ``self.topic_corpus``: one list of string
    labels per walk in ``self.corpus``.

    Parameters
    ----------
    method : str
        One of ``"lda"`` (external GibbsLDA++ binary), ``"hmm"``
        (variational HMM, decoded with hmmlearn), ``"bigclam"``
        (external BigClam binary) or ``"louvain"``.

    Returns
    -------
    For ``"lda"``, ``"bigclam"`` and ``"louvain"``: the ``phi`` matrix of
    shape (number of communities, number of nodes). For ``"hmm"``: a random
    draw from the emission node ``Q['E']``.

    Raises
    ------
    ValueError
        If ``self.params`` has no ``'number_of_communities'`` entry, if the
        external binary required by the method is missing, or if ``method``
        is not one of the supported values.
    """
    if 'number_of_communities' not in self.params.keys():
        raise ValueError("the number of topics parameter is missing!")

    self.number_of_communities = self.params['number_of_communities']

    if method == "lda":
        # Run GibbsLDA++
        if not os.path.exists(GIBBSLDA_PATH):
            raise ValueError("Invalid path of GibbsLDA++!")

        temp_lda_folder = os.path.join(self.temp_folder, "lda_temp")
        if not os.path.exists(temp_lda_folder):
            os.makedirs(temp_lda_folder)

        temp_dfile_path = os.path.join(temp_lda_folder, "gibblda_temp.dfile")
        # Save the walks into the dfile
        self.save_corpus(corpus_file=temp_dfile_path, with_title=True, corpus=self.corpus)

        initial_time = time.time()
        cmd = "{} -est ".format(GIBBSLDA_PATH)
        cmd += "-alpha {} ".format(self.params['lda_alpha'])
        cmd += "-beta {} ".format(self.params['lda_beta'])
        cmd += "-ntopics {} ".format(self.params['number_of_communities'])
        cmd += "-niters {} ".format(self.params['lda_number_of_iters'])
        # savestep > niters, so no intermediate model snapshots are requested
        cmd += "-savestep {} ".format(self.params['lda_number_of_iters'] + 1)
        cmd += "-dfile {} ".format(temp_dfile_path)
        os.system(cmd)
        print("-> The LDA algorithm run in {:.2f} secs".format(time.time() - initial_time))

        # Read wordmap file: each line is "<node> <lda word id>"
        id2node = {}
        temp_wordmap_path = os.path.join(temp_lda_folder, "wordmap.txt")
        with open(temp_wordmap_path, 'r') as f:
            f.readline()  # skip the first line
            for line in f.readlines():
                tokens = line.strip().split()
                id2node[int(tokens[1])] = tokens[0]

        # Read phi file
        # NOTE: builtin float replaces np.float, an alias removed in NumPy 1.24.
        phi = np.zeros(shape=(self.number_of_communities, self.number_of_nodes), dtype=float)
        temp_phi_path = os.path.join(temp_lda_folder, "model-final.phi")
        with open(temp_phi_path, 'r') as f:
            for comm, line in enumerate(f.readlines()):
                for word_id, value in enumerate(line.strip().split()):
                    phi[comm, int(id2node[word_id])] = value

        # Read the tassign file, generate topic corpus
        temp_tassing_path = os.path.join(temp_lda_folder, "model-final.tassign")
        self.topic_corpus = []
        with smart_open(temp_tassing_path, 'r') as f:
            for line in f:
                tokens = line.strip().split()
                # tokens look like "<word>:<topic>"; keep the topic part
                self.topic_corpus.append([token.split(':')[1] for token in tokens])

        return phi

    elif method == "hmm":
        # Convert every walk to a sequence of integer node ids
        y = []
        for walk in self.corpus:
            seq = []
            for w in walk:
                seq.append(int(w))
            y.append(seq)

        E = self.number_of_nodes
        K = self.number_of_communities
        L = self.params['walk_length']
        hmm_number_of_iters = self.params['hmm_number_of_iters']
        hmm_subset_size = self.params['hmm_subset_size']
        N = len(y)
        plates_multiplier = N / hmm_subset_size

        p0 = self.params['hmm_p0']  # a vector of size K
        t0 = self.params['hmm_t0']  # a vector of size K
        e0 = self.params['hmm_e0']

        p_param = p0*np.ones(K, dtype=float)
        p = bayes.Dirichlet(p_param, name='p')

        t_param = t0*np.ones(K, dtype=float)
        T = bayes.Dirichlet(t_param, plates=(K, ), name='T')

        e_param = e0*np.ones(E, dtype=float)
        # From here on E is the emission node, no longer the node count
        E = bayes.Dirichlet(e_param, plates=(K, ), name='E')

        z = bayes.CategoricalMarkovChain(p, T, states=L, plates=(hmm_subset_size,), plates_multiplier=(plates_multiplier,), name='Z')
        x = bayes.Mixture(z, bayes.Categorical, E, name='X')

        p.initialize_from_random()
        T.initialize_from_random()
        E.initialize_from_random()

        Q = VB(x, z, E, T, p)
        # Disabled full-batch alternative: x.observe(y); Q.update(repeat=1000)

        Q.ignore_bound_checks = True
        delay = 1
        forgetting_rate = 0.5
        for iteration in range(hmm_number_of_iters):
            # Observe a random mini-batch
            subset = np.random.choice(a=N, size=hmm_subset_size)
            Q['X'].observe([y[inx] for inx in subset])
            # Learn intermediate variables
            Q.update('Z')
            # Set step length
            step = (iteration + delay) ** (-forgetting_rate)
            # Stochastic gradient for the global variables
            Q.gradient_step('p', 'T', 'E', scale=step)

        # NOTE(review): assumed to run once after the loop - confirm against the original layout
        likelihood = Q['E'].random()

        qp = p.random()
        qT = T.random()
        qE = E.random()

        self.topic_corpus = []

        model = hmm.MultinomialHMM(n_components=self.number_of_communities, tol=0.001, n_iter=5000)
        model.startprob_ = qp
        model.emissionprob_ = qE
        model.transmat_ = qT

        initial_time = time.time()
        seq_for_hmmlearn = np.concatenate([np.asarray(seq).reshape(-1, 1).tolist() for seq in y])
        seq_lens = [self.params['walk_length'] for _ in range(N)]
        comm_conc_seq = model.predict(seq_for_hmmlearn, seq_lens)
        print("The hidden states are predicted in {} secs.".format(time.time() - initial_time))

        # Cut the concatenated state sequence back into walks of length L
        self.topic_corpus = []
        for i in range(N):
            self.topic_corpus.append([str(w) for w in comm_conc_seq[i*L:(i+1)*L]])

        return likelihood

    elif method == "bigclam":
        # Run AGM
        if not os.path.exists(BIGCLAM_PATH):
            raise ValueError("Invalid path of BigClam!")

        # If the temp folder for BigClam does not exits
        temp_bigclam_folder = os.path.join(self.temp_folder, "bigclam_temp")
        if not os.path.exists(temp_bigclam_folder):
            os.makedirs(temp_bigclam_folder)

        # NOTE(review): this throwaway graph is only printed; it looks like leftover debug output
        g = nx.Graph()
        g.add_edge(2, 3)
        print("graph {}".format([g.copy()]))

        # Get all connected components
        cc_list = np.asarray(list(nx.connected_component_subgraphs(self.graph)))
        num_of_cc = cc_list.shape[0]
        print("graph {}".format([self.graph.copy()]))
        if num_of_cc == 1:
            cc_list = [self.graph.copy()]
        print(cc_list)
        print("Number of connected components: {}".format(num_of_cc))

        cc_sizes = [cc.number_of_nodes() for cc in cc_list]
        # Sort the connected components
        cc_sizes_inx = np.argsort(cc_sizes)[::-1]
        cc_sizes = [cc_sizes[inx] for inx in cc_sizes_inx]
        cc_list = [cc_list[inx] for inx in cc_sizes_inx]

        # Find how many communities will be assigned for each connected component
        cum_sum_cc_sizes = np.cumsum(cc_sizes)

        # Find the community assignments of the set of the largest 'cc_inx_limit' connected components
        # in which the ratio of sizes of the smallest connected component and the size of the set is greater than
        # 1.5 times the number of communities which is desired to be assigned
        cc_inx_limit = 0
        for limit in range(num_of_cc):
            if cc_sizes[cc_inx_limit] / float(cum_sum_cc_sizes[cc_inx_limit]) >= (1.5 / self.number_of_communities):
                cc_inx_limit += 1

        comm2node = []
        temp_bigclam_output = [[] for _ in range(cc_inx_limit)]
        temp_bigclam_edgelist = [[] for _ in range(cc_inx_limit)]
        temp_bigclam_labels = [[] for _ in range(cc_inx_limit)]
        # NOTE: builtin int replaces np.int, an alias removed in NumPy 1.24.
        assignment_sizes = np.zeros(shape=cc_inx_limit, dtype=int)
        correction_sizes = np.zeros(shape=cc_inx_limit, dtype=int)

        for cc_index in range(num_of_cc):
            current_ccg = cc_list[cc_index]

            if cc_index >= cc_inx_limit:
                # Small components become one community each
                comm2node.append([v for v in current_ccg.nodes()])
            else:
                assignment_sizes[cc_index] = int((float(cc_sizes[cc_index]) / cum_sum_cc_sizes[cc_inx_limit-1]) * self.params['number_of_communities'])

                temp_bigclam_output[cc_index] = os.path.join(temp_bigclam_folder, "output{}".format(cc_index))
                temp_bigclam_edgelist[cc_index] = os.path.join(temp_bigclam_folder, "temp{}.edgelist".format(cc_index))
                temp_bigclam_labels[cc_index] = os.path.join(temp_bigclam_folder, "temp{}.labels".format(cc_index))

                cc_graph_nodes = sorted([int(node) for node in current_ccg.nodes()])
                with open(temp_bigclam_edgelist[cc_index], 'w') as f:
                    for node in cc_graph_nodes:
                        for nb in sorted([int(val) for val in nx.neighbors(current_ccg, str(node))]):
                            # write every undirected edge once
                            if int(node) < int(nb):
                                f.write("{}\t{}\n".format(str(node), str(nb)))

                with open(temp_bigclam_labels[cc_index], 'w') as f:
                    for node in cc_graph_nodes:
                        f.write("{}\t{}\n".format(str(node), str(node)))

                cmd = "{} ".format(BIGCLAM_PATH)
                cmd += "-o:{} ".format(temp_bigclam_output[cc_index])
                cmd += "-i:{} ".format(temp_bigclam_edgelist[cc_index])
                cmd += "-l:{} ".format(temp_bigclam_labels[cc_index])
                cmd += "-nt:{} ".format(8)
                cmd += "-c:{} ".format(assignment_sizes[cc_index])
                os.system(cmd)

                # Read the output file: one community (list of nodes) per line
                with open(temp_bigclam_output[cc_index], 'r') as f:
                    for line in f.readlines():
                        comm2node.append(line.strip().split())
                        correction_sizes[cc_index] += 1

        total_num_of_assigned_communities = len(comm2node)

        phi = np.zeros(shape=(total_num_of_assigned_communities, self.number_of_nodes), dtype=float)
        self.number_of_communities = total_num_of_assigned_communities

        # Generate the phi matrix
        for k in range(total_num_of_assigned_communities):
            for node in comm2node[k]:
                phi[k, int(node)] = 1.0

        # Be sure that every node is assigned to at least one community
        for node in range(self.number_of_nodes):
            # if a node is not assigned to any community
            if np.sum(phi[:, node]) == 0.0:
                # Check the assignments of neighbors of the node
                nb_comm_assign_counts = np.zeros(total_num_of_assigned_communities, dtype=float)
                for nb in nx.neighbors(self.graph, str(node)):
                    nb_comm_assign_counts += phi[:, int(nb)]
                # If the neighbors of the node is assigned to a community, assign it to the most frequent community
                if nb_comm_assign_counts.sum() != 0.0:
                    assigned_comm_id = nb_comm_assign_counts.argmax()
                # Otherwise assign it to a random community
                else:
                    assigned_comm_id = np.random.choice(a=total_num_of_assigned_communities)

                phi[assigned_comm_id, node] = 1.0

        # Normalize
        phi = np.divide(phi.T, np.sum(phi, 1)).T

        # Generate the topic corpus
        self.topic_corpus = []
        for walk in self.corpus:
            community_walk = []
            for w in walk:
                # If the vertex has only one community assignment
                if np.sum(phi[:, int(w)] > 0.0) == 1:
                    community_walk.append(str(np.where(phi[:, int(w)] > 0)[0][0]))
                # otherwise, ...
                else:
                    # if it is possible, assign it to the community which the previous node is assigned to
                    if len(community_walk) > 0 and phi[int(community_walk[-1]), int(w)] > 0.0:
                        community_walk.append(str(community_walk[-1]))
                    # if not, randomly choose one of the node's communities
                    else:
                        chosen_comm = np.random.choice(a=phi.shape[0], p=phi[:, int(w)]/np.sum(phi[:, int(w)]))
                        community_walk.append(str(chosen_comm))

            self.topic_corpus.append(community_walk)

        print("---< Summary >---")
        print("+ The graph consists of {} connected component(s)".format(num_of_cc))
        for i in range(cc_inx_limit):
            print("+ The component of size {} is assigned to {}/{} communities".format(cc_sizes[i], correction_sizes[i], assignment_sizes[i]))
        print("+ Each of the remaining {} components is assigned to a unique label".format(num_of_cc-cc_inx_limit))
        print("+ The 'phi' matrix contains {} communities".format(phi.shape[0]))
        print("----o----")

        return phi

    elif method == "louvain":
        c = louvain.best_partition(self.graph)
        self.number_of_communities = len(set(c.values()))
        print("The number of detected communities: {}".format(self.number_of_communities))

        # One-hot community membership per node
        phi = np.zeros(shape=(self.number_of_communities, self.number_of_nodes), dtype=float)
        for node in self.graph.nodes():
            phi[int(c[node]), int(node)] = 1.0

        self.topic_corpus = []
        for walk in self.corpus:
            seq = [str(c[str(w)]) for w in walk]
            self.topic_corpus.append(seq)

        # Normalize
        phi = (phi.T / np.sum(phi, 1)).T

        return phi

    else:
        raise ValueError("Invalid community/topic detection method")
def create_communities(GRAPHS, num_graphs, graph_header, input_mode, output_filename_prefix):
    """Detect, name and colour Louvain communities for each graph.

    For each of the first ``num_graphs`` graphs, runs
    ``community.best_partition``. The result is written onto every node as
    ``modularity_class``, ``community_name``, ``color_name`` and
    ``color_value``. One row per community is appended to
    ``./output/<output_filename_prefix>_community_map.csv``.
    Modularity classes are offset so that they stay unique across graphs.

    Parameters
    ----------
    GRAPHS : list of graphs; each needs ``graph['name']`` and a ``'label'``
        attribute on every node.
    num_graphs : number of leading entries of ``GRAPHS`` to process.
    graph_header : header text for the graph-name column of the CSV.
    input_mode : ``"twitter"`` or ``"docu"``; selects the input helper module.
        Any other value prints a message and exits the process.
    output_filename_prefix : prefix of the community map file name.

    Returns
    -------
    The same ``GRAPHS`` list, with node attributes updated in place.
    """
    # Read the community name proposals, skipping the header row.
    # The context manager closes the file, which was previously left open.
    with open(proposal_filename, encoding='utf-8') as proposal_file:
        proposal_reader = csv.reader(proposal_file, delimiter=',')
        next(proposal_reader)
        proposal_list = list(proposal_reader)

    # create community map output file; closing it guarantees the rows are flushed to disk
    map_filename = "./output/" + output_filename_prefix + "_community_map.csv"
    with open(map_filename, mode='w', newline='\n', encoding='utf-8') as map_file:
        map_writer = csv.writer(map_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        map_writer.writerow(["Modularity Class", graph_header, "Color Name", "Color Value", "Community Name", "Possible Community Names", "Distinct Words"])

        if input_mode == "twitter":
            import tweet_input as proj_input
        elif input_mode == "docu":
            import docu_input as proj_input
        else:
            print("Invalid input mode. Exiting...")
            exit()

        color_list = proj_input.load_color_list()

        cur_highest = -1  # first offset should be zero

        # for each graph
        for i in range(num_graphs):
            G = GRAPHS[i]
            community_info = []

            # compute for the communities
            community_dictionary = community.best_partition(G, partition=None, weight='weight', resolution=1.0, randomize=True, random_state=None)
            print("Done with best partition algorithm in graph \"" + G.graph['name'] + "\".")

            # get community information from the dictionary returned by community.best_partition()
            num_communities = 0
            # get the community number and degree for each node
            for node in G.nodes():
                # community number, node label, and degree (for top 5)
                community_info.append([community_dictionary[node], G.node[node]['label'], G.degree(node)])
                # take note of the last community number
                if community_dictionary[node] > num_communities:
                    num_communities = community_dictionary[node]
            num_communities += 1  # total number of communities
            print("There are " + str(num_communities) + " communities in graph \"" + G.graph['name'] + "\".")

            # determine the community name (either by proposal or top 5)
            # community_map should contain [0]community name, [1]possible community names (top 5), [2]number of distinct words
            community_map = name_communities(community_info, proposal_list)
            # the entries read below show color_map adds [3]color name and [4]color value
            community_map = proj_input.color_map(G.graph['name'], community_map, color_list)

            # store community number, community name, and color to the graph
            # adjust community number based on previous graph
            class_offset = cur_highest + 1
            # for each node in the graph
            for node in G.nodes():
                map_entry = community_map[community_dictionary[node]]
                # adjust and add community number to the node as an attribute
                temp = community_dictionary[node] + class_offset
                G.node[node]['modularity_class'] = temp
                # add community name
                G.node[node]['community_name'] = map_entry[0]
                # color the node
                G.node[node]['color_name'] = map_entry[3]
                G.node[node]['color_value'] = map_entry[4]
                # if it is the highest community number, take note (to be used on the next graph)
                if temp > cur_highest:
                    cur_highest = temp

            # save community map
            map_size = len(community_map)
            for j in range(map_size):
                # Modularity Class, Graph Name, Color Name, Color Value, Community Name, Possible Community Names, Distinct Words
                map_writer.writerow([class_offset + j, G.graph['name'], community_map[j][3], community_map[j][4], community_map[j][0], community_map[j][1], community_map[j][2]])

            # store the graph back to the list
            GRAPHS[i] = G

    return GRAPHS
def modularity_maximization(self, partition=None):
    """Perform louvain's method for modularity maximization contained in
    'community' library

    Parameters
    ----------
    partition: dict, optional
        the algorithm will start using this partition of the nodes. It's a
        dictionary where keys are their nodes and values the communities
        (doc taken from 'community' documentation
        http://perso.crans.org/aynaud/communities/api.html#community.best_partition)

    Returns
    -------
    comm_list: list
        community lists, one entry per id, ordered as: all users in
        sorted order by user id, then all tags in sorted order by tag id,
        then all links in sorted order by link id
    matching: dict of dicts
        keys 'U', 'T' and 'L'; each maps an integer id of that type to the
        list of community labels of the hyperedges that contain it
    node_tags: list of str
        order of partites, as found in hyperedges
    q: float
        modularity

    Raises
    ------
    NetworkXError: If the graph is not Eulerian.
        (doc taken from 'community' documentation
        http://perso.crans.org/aynaud/communities/api.html#community.best_partition)
    """
    # temporary class change for function to work; restored even if the
    # library call raises, so the object is never left as a plain nx.Graph
    self.__class__ = nx.Graph
    try:
        d = community.best_partition(self, partition=partition)
        q = community.modularity(d, self)
    finally:
        self.__class__ = Tripal_nx

    # Collect, per user / tag / link id, the set of community labels of the
    # hyperedge nodes it takes part in. Node names carry the three integer
    # ids separated by the letters U, T and L.
    matching = {'U': {}, 'T': {}, 'L': {}}
    for h_e, label in d.items():
        u, t, l = map(int, re.split('U|T|L', h_e)[0:3])
        matching['U'].setdefault(u, set()).add(label)
        matching['T'].setdefault(t, set()).add(label)
        matching['L'].setdefault(l, set()).add(label)

    comm_list = []
    for kind in matching:
        # sets are converted to lists once per type
        for k in matching[kind]:
            matching[kind][k] = list(matching[kind][k])
        comm_list.extend(
            [matching[kind][n] for n in sorted(matching[kind].keys())])

    return comm_list, matching, self.node_tags, q
if edgeIn[0] == '*Vertices' or edgeIn[0] == '*Edges': continue else: edge = (int(edgeIn[0]), int(edgeIn[1])) edges.append(edge) graph = nx.Graph() graph.add_edges_from(edges) print(f'INFO: Completed in {time.perf_counter() - tStart:.4f} secs') print('INFO: Total edges', len(edges)) partition = {} if computePartition: print('INFO: Computing best community partition...', partitionFile) tStart = time.perf_counter() partition = community_louvain.best_partition(graph) print(f'INFO: Completed in {time.perf_counter() - tStart:.4f} secs') if outClusterInfoFile is not None: with open(outClusterInfoFile, 'w') as fp: for k in partition.keys(): print(k - vertexIdOffset, partition[k], file=fp) else: print('INFO: Reading cluster partition from', mtxFile) # read partition files for i in range(1, nodes + 1): # start from index 1 if nodes == 1: parFile = partitionFile else:
def get_node_list(corrcoef, threshold=4):
    r"""Calculate community structures from interaction network.

    The interaction network is built using the correlation coeffient matrix, in which the edges are
    the Pearson correlation of the two connecting nodes. The community structures of this network is
    calculated using the Louvain algorithm [1]_, which find high modularity network partitions. The
    modularity is defined as [2]_:

    .. math::
        Q=\frac{1}{2 m} \sum_{i, j}\left[A_{i j}-\frac{k_{i} k_{j}}{2 m}\right] \delta\left(c_{i}, c_{j}\right)

    where :math:`A_{i j}` is the weight of the edge between node i and node j; :math:`k_{i}` is the sum of
    weights of the edges attached to the node i, i.e. the degree of the node; :math:`c_{i}` is the community
    to which node i assigned; :math:`\delta\left(c_{i}, c_{j}\right)` is 1 if :math:`c_{i}=c_{j}` and 0 otherwise; and
    :math:`m=\frac{1}{2} \sum_{i j} A_{i j}` is the sum of all edge weights.

    In the modularity optimization, the Louvain algorithm orders the nodes in the network, and then, one by one,
    removes and inserts each node in a different community c_i until no significant increase in modularity. After
    modularity optimization, all the nodes that belong to the same community are merged into a single node, of which
    the edge weights are the sum of the weights of the comprising nodes. This optimization-aggregation loop is
    iterated until all nodes are collapsed into one.

    By default, this method returns communities containing at least 4 nodes. This setting can be changed by
    using the parameter ``threshold``.

    Parameters
    -----------
    corrcoef : ndarray(n, n)
        The Pearson correlation matrix. It is not modified.
    threshold : int, default=4
        Minimum size of communities. Only communities with at least ``threshold`` nodes will be returned.

    Returns
    --------
    node_list : list of lists
        A list of community nodes, ordered by community label.
    modularity : float or None
        The modularity of network partition. It measure the quality of network partition. The value is between
        1 and -1. The bigger the modularity, the better the partition. ``None`` when no community reaches
        ``threshold``.

    References
    ----------
    .. [1] Blondel, V. D.; Guillaume, J.-L.; Lambiotte, R.; Lefebvre, E., Fast unfolding of communities in large
           networks. Journal of Statistical Mechanics: Theory and Experiment 2008, 2008 (10), P10008
    .. [2] Newman, M. E. J., Analysis of weighted networks. Physical Review E 2004, 70 (5), 056131.

    """
    # TODO: check negative values in corrcoef_matrix. Come up with better solutions.
    # Network edges can't take negative values, so negative correlations are
    # clamped to zero, which forces those residues into different binding sites.
    # Work on a copy so the caller's matrix is left untouched.
    corrcoef = corrcoef.copy()
    corrcoef[corrcoef < 0.0] = 0.0
    graph = nx.Graph(corrcoef)
    partition = community.best_partition(graph, weight='weight')

    # Group nodes by community label. Every label is visited, including the
    # highest one, which the former ``range(max(values))`` loop skipped.
    members = {}
    for node, label in partition.items():
        members.setdefault(label, []).append(node)
    node_list = [members[label] for label in sorted(members)
                 if len(members[label]) >= threshold]

    if len(node_list) > 0:
        modularity = community.modularity(partition, graph)
    else:
        modularity = None
    return node_list, modularity
G_task_noreg = nx.read_adjlist(fNet_task_noreg, nodetype=int) # rest (absence of task) fNet_rest = 'DataTaskNetwork/fMRI_covertverb_r_bp_reg_Rt2_K200_deg20_rest.adjlist' G_rest = nx.read_adjlist(fNet_rest, nodetype=int) # consolidating all into a list G_list = [G_task_reg, G_task_noreg, G_rest] listLabel = [ 'During task\n(task regressed out)', 'During task\n(task NOT regressed out)', 'Rest\n(absence of task)' ] ####### Community detection # Community detection with the Louvain method partition_list = [] for iG in G_list: partition = community.best_partition(iG) partition_list.append(partition) ###### visualizing the modular organization # dictionary of xy-coordinates pos = {} for iROI in range(len(nodes)): pos[nodes[iROI]] = xyz[iROI, :2] # loop over networks for visualization plt.figure(figsize=[10, 4]) for i, iG in enumerate(G_list): plt.subplot(1, 3, i + 1) nComm = max([comm for comm in partition_list[i].values()]) + 1 node_color_list = get_cmap(nComm + 1, 'rainbow')
def clusterGraphData(G):
    """Return the Louvain cluster label of the first 60000 nodes of ``G``.

    The partition is computed with ``resolution=2.0`` and a fixed
    ``random_state`` of 8. Labels follow the item order of the dictionary
    returned by ``best_partition``.
    """
    node_to_cluster = community_louvain.best_partition(G, resolution=2.0, random_state=8)
    # Rows are (node, cluster) pairs; column 1 is the cluster label.
    node_cluster_pairs = np.array(list(node_to_cluster.items()))
    return node_cluster_pairs[0:60000, 1]
def python_louvain(df, resolution, randomize=None, random_state=None):
    """Run Louvain clustering on a weighted edge list.

    ``df`` must provide the columns ``'from'``, ``'to'`` and ``'weight'``.
    Returns a DataFrame indexed by node that holds each node's community.
    """
    edge_graph = nx.from_pandas_edgelist(df=df,
                                         source='from',
                                         target='to',
                                         edge_attr='weight')
    louvain_options = {
        'resolution': resolution,
        'weight': 'weight',
        'randomize': randomize,
        'random_state': random_state,
    }
    node_to_community = community.best_partition(graph=edge_graph, **louvain_options)
    return pd.DataFrame.from_dict(data=node_to_community, orient='index')
def louvain_algorithm(sub_graph):
    """Partition ``sub_graph`` with Louvain and return the communities.

    Returns a list of sets of nodes, one set per detected community.
    """
    members_by_label = defaultdict(set)
    for node, label in community.best_partition(sub_graph).items():
        members_by_label[label].add(node)
    return [members for members in members_by_label.values()]
def ChangeCommunityColorAndInstantiateHierarchy(self, level=-1):
    """Recompute the communities, recolour the nodes and rebuild the community views.

    Parameters
    ----------
    level : int, default=-1
        ``-1`` uses the partition returned by ``cm.best_partition``. Any
        other value selects that level of the dendrogram produced by
        ``cm.generate_dendrogram``.

    Side effects visible in this method: updates ``self.g``,
    ``self.partition``, ``self.induced_graph`` and ``self.Matrix``;
    replaces the widgets in ``self.Graphwidget.hbox`` with a community
    widget and a dendrogram widget; emits ``CommunityColorAndDict``.
    """
    self.g = self.Graphwidget.Graph_data().DrawHighlightedGraph(
        self.Graphwidget.EdgeSliderValue)
    self.ColorNodesBasedOnCorrelation = False
    self.partition = cm.best_partition(self.g)
    self.induced_graph = cm.induced_graph(self.partition, self.g)
    if not (level == -1):
        # A specific hierarchy level was requested: override the best
        # partition with the dendrogram cut at that level.
        dendo = cm.generate_dendrogram(self.g)
        g = cm.partition_at_level(dendo, level)
        self.induced_graph1 = cm.induced_graph(g, self.g)
        self.partition = g
        self.induced_graph = self.induced_graph1
    # Induced graph is the data structure responsible for the adjacency matrix of the community
    # Matrix Before calculating the correlation strength
    # finding out the lower half values of the matrix, can discard other values as computationally intensive
    # self.Find_InterModular_Edge_correlativity()
    self.Matrix = nx.to_numpy_matrix(self.induced_graph)
    # Triggering a new window with the same color
    # If the Gray out option is clicked then gray out the nodes without the colors
    self.ColorForCommunities(len(set(self.partition.values())))
    self.ColorForVisit(self.partition)
    nodes1 = [
        item for item in self.Graphwidget.scene().items()
        if isinstance(item, Node)
    ]
    count = 0
    for community in set(self.partition.values()):
        # Ensure each community gets its own entry of the colour table
        list_nodes = [
            nodes for nodes in self.partition.keys()
            if self.partition[nodes] == community
        ]
        for node in nodes1:
            # node.counter is compared after subtracting 1 - presumably a
            # 1-based counter against 0-based partition keys; TODO confirm
            if node.counter - 1 in list_nodes:
                node.PutColor(self.clut[count])
        count = count + 1
    # NOTE(review): the original indentation is lost; this layout assumes
    # the break belongs to this loop, so allnodesupdate() is called on the
    # first node only - confirm against the original file.
    for node in nodes1:
        node.allnodesupdate()
        break
    # Snapshot the values handed to the community widget below
    clut = self.clut
    Max = self.Graphwidget.Max
    Graph = self.Graphwidget
    Matrix = self.Matrix
    # Mask zero entries so that the minimum is taken over non-zero values only
    ma = np.ma.masked_equal(Matrix, 0.0)
    Min1 = ma.min()
    Max1 = Matrix.max()
    Pos = self.Find_Initial_Positions()
    """ Generates a new window so that you can access the views related to community analysis """
    def newwindow():
        # Close every widget currently held by the layout, last to first
        for i in reversed(range(self.Graphwidget.hbox.count())):
            self.Graphwidget.hbox.itemAt(i).widget().close()
        community = CommunityWidget(
            self.Graphwidget, self.induced_graph,
            self.Graphwidget.correlationTableObject, clut, Max, Matrix, ma,
            Min1, Max1, Pos)
        Dendogram = dendogram(self.Graphwidget, self.g, clut)
        self.Graphwidget.hbox.setContentsMargins(0, 0, 0, 0)
        self.Graphwidget.hbox.addWidget(community)
        self.Graphwidget.hbox.setContentsMargins(0, 0, 0, 0)
        self.Graphwidget.hbox.addWidget(Dendogram)
        self.Graphwidget.hbox.setContentsMargins(0, 0, 0, 0)
        # Keep references to the new widgets on the instance
        self.communityObject = community
        self.dendogramObject = Dendogram
        self.Graphwidget.hbox.setContentsMargins(0, 0, 0, 0)
        self.Graphwidget.wid.setContentsMargins(0, 0, 0, 0)
        self.Graphwidget.wid.setLayout(self.Graphwidget.hbox)
    newwindow()
    # Publish the new colours and partition to listeners of this signal
    self.Graphwidget.CommunityColorAndDict.emit(self.ColorToBeSentToVisit,
                                                self.partition)
def fit(self, ds: loompy.LoomConnection) -> None:
    """Run the configured analysis steps on ``ds`` and store the results in it.

    The steps, in order:

    1. validate ``config.params`` (factorization, features, nn_space);
    2. detect the species and recompute the ``Valid`` gene mask;
    3. optional Poisson pooling (``"poisson_pooling"`` in ``config.steps``);
    4. feature selection, either by enrichment on preliminary Louvain
       clusters or by variance;
    5. factorization by PCA and/or HPF;
    6. KNN / mutual-KNN / radius graphs (``"nn"`` or ``"clustering"``);
    7. tSNE and UMAP embeddings (``"embeddings"`` or ``"clustering"``);
    8. polished Louvain and Surprise clustering (``"clustering"``);
    9. cell cycle annotation for human and mouse datasets.

    Args:
        ds: The open loom file to analyse; column/row attributes, graphs
            and global attributes are written to it.

    Raises:
        ValueError: If ``params.factorization``, ``params.features`` or
            ``params.nn_space`` has an unsupported value, or if
            ``nn_space`` is incompatible with ``factorization``.
    """
    logging.info(f"Running cytograph on {ds.shape[1]} cells")
    if self.config.params.factorization not in ["PCA", "HPF", "both"]:
        raise ValueError("params.factorization must be either 'PCA' or 'HPF' or 'both'")
    if self.config.params.features not in ["enrichment", "variance"]:
        raise ValueError("params.features must be either 'enrichment' or 'variance'")
    if self.config.params.nn_space not in ["PCA", "HPF", "auto"]:
        raise ValueError("params.nn_space must be either 'PCA' or 'HPF' or 'auto'")
    if not ((self.config.params.nn_space in ["PCA", "auto"] and self.config.params.factorization in ["PCA", "both"])
            or (self.config.params.nn_space in ["HPF", "auto"] and self.config.params.factorization in ["HPF", "both"])):
        raise ValueError(
            f"config.params.nn_space = '{self.config.params.nn_space}' is incompatible with config.params.factorization = '{self.config.params.factorization}'"
        )

    species = Species.detect(ds)
    logging.info(f"Species is '{species.name}'")

    logging.info("Recomputing the list of valid genes")
    nnz = ds.map([np.count_nonzero], axis=0)[0]
    # Keep genes detected in more than 10 cells but fewer than 60% of all cells
    valid_genes = (nnz > 10) & (nnz < ds.shape[1] * 0.6)
    ds.ra.Valid = valid_genes.astype('int')

    # Perform Poisson pooling if requested
    main_layer = ""
    if "poisson_pooling" in self.config.steps:
        logging.info(f"Poisson pooling with k_pooling == {self.config.params.k_pooling}")
        main_layer = "pooled"  # if not in config.steps, use the main layer
        pp = PoissonPooling(self.config.params.k_pooling,
                            self.config.params.n_genes,
                            compute_velocity=False,
                            n_threads=self.config.execution.n_cpus,
                            factorization=self.config.params.factorization,
                            batch_keys=self.config.params.batch_keys)
        pp.fit_transform(ds)

    # Select features
    if self.config.params.features == "enrichment":
        logging.info(f"Feature selection by enrichment on preliminary clusters")
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=NumbaPerformanceWarning)  # Suppress warnings about numba not being able to parallelize code
            warnings.simplefilter("ignore", category=NumbaPendingDeprecationWarning)  # Suppress warnings about future deprecations
            warnings.simplefilter("ignore", category=SparseEfficiencyWarning)  # Suppress warnings about setting the diagonal to 1
            logging.info(f" Gene selection for PCA")
            genes = FeatureSelectionByVariance(
                self.config.params.n_genes,
                mask=Species.mask(ds, self.config.params.mask)).fit(ds)

            logging.info(f" Factorization by PCA")
            normalizer = Normalizer(False)
            normalizer.fit(ds)
            logging.info(" PCA projection to %d components", self.config.params.n_factors)
            pca = PCA(genes,
                      max_n_components=self.config.params.n_factors,
                      layer=main_layer,
                      test_significance=False,
                      batch_keys=self.config.params.batch_keys)
            transformed = pca.fit_transform(ds, normalizer)

            logging.info(f" Computing KNN (k={self.config.params.k}) in PCA space")
            nn = NNDescent(data=transformed, metric="euclidean")
            indices, distances = nn.query(transformed, k=self.config.params.k)
            # Drop the first neighbour column (presumably each point itself - TODO confirm)
            indices = indices[:, 1:]
            distances = distances[:, 1:]
            # Assemble a CSR matrix straight from the flattened neighbour lists
            knn = sparse.csr_matrix(
                (np.ravel(distances), np.ravel(indices),
                 np.arange(0, distances.shape[0] * distances.shape[1] + 1, distances.shape[1])),
                (transformed.shape[0], transformed.shape[0]))

            # Preliminary Louvain clusters, used only to drive the enrichment selection
            g = nx.from_scipy_sparse_matrix(knn)
            partitions = community.best_partition(g, resolution=1, randomize=False)
            ds.ca.Clusters = np.array([partitions[key] for key in range(knn.shape[0])])
            n_labels = ds.ca.Clusters.max() + 1

            genes = FeatureSelectionByEnrichment(
                int(self.config.params.n_genes // n_labels),
                Species.mask(ds, self.config.params.mask),
                findq=False).select(ds)
    elif self.config.params.features == "variance":
        logging.info(f"Feature selection by variance")
        genes = FeatureSelectionByVariance(
            self.config.params.n_genes, main_layer,
            Species.mask(ds, self.config.params.mask)).select(ds)
    logging.info(f"Selected {genes.sum()} genes")

    if self.config.params.factorization in ['PCA', 'both']:
        logging.info(f"Factorization by PCA")
        normalizer = Normalizer(False)
        normalizer.fit(ds)
        n_components = min(self.config.params.n_factors, ds.shape[1])
        logging.info(" PCA projection to %d components", n_components)
        pca = PCA(genes,
                  max_n_components=n_components,
                  layer=main_layer,
                  test_significance=False,
                  batch_keys=self.config.params.batch_keys)
        ds.ca.PCA = pca.fit_transform(ds, normalizer)

    if self.config.params.factorization in ['HPF', 'both']:
        logging.info(f"Factorization by HPF")
        # Load the data for the selected genes
        data = ds[main_layer].sparse(rows=genes).T
        logging.debug(f" Data shape is {data.shape}")

        # HPF factorization
        hpf = HPF(k=self.config.params.n_factors,
                  validation_fraction=0.05,
                  min_iter=10,
                  max_iter=200,
                  compute_X_ppv=False,
                  n_threads=self.config.execution.n_cpus)
        hpf.fit(data)
        beta_all = np.zeros((ds.shape[0], hpf.beta.shape[1]))
        beta_all[genes] = hpf.beta
        # Save the unnormalized factors
        ds.ra.HPF_beta = beta_all
        ds.ca.HPF_theta = hpf.theta
        # Here we normalize so the sums over components are one, because JSD requires it
        # and because otherwise the components will be exactly proportional to cell size
        theta = (hpf.theta.T / hpf.theta.sum(axis=1)).T
        beta = (hpf.beta.T / hpf.beta.sum(axis=1)).T
        beta_all[genes] = beta
        # Save the normalized factors
        ds.ra.HPF = beta_all
        ds.ca.HPF = theta

    if "nn" in self.config.steps or "clustering" in self.config.steps:
        if self.config.params.nn_space in ["PCA", "auto"] and "PCA" in ds.ca:
            transformed = ds.ca.PCA
            metric = "euclidean"
        elif self.config.params.nn_space in ["HPF", "auto"] and "HPF" in ds.ca:
            transformed = ds.ca.HPF
            metric = "js"
        logging.info(
            f"Computing balanced KNN (k = {self.config.params.k}) in {self.config.params.nn_space} space using the '{metric}' metric"
        )
        bnn = BalancedKNN(k=self.config.params.k,
                          metric=metric,
                          maxl=2 * self.config.params.k,
                          sight_k=2 * self.config.params.k,
                          n_jobs=-1)
        bnn.fit(transformed)
        knn = bnn.kneighbors_graph(mode='distance')
        knn.eliminate_zeros()
        mknn = knn.minimum(knn.transpose())
        # Convert distances to similarities
        max_d = knn.data.max()
        knn.data = (max_d - knn.data) / max_d
        mknn.data = (max_d - mknn.data) / max_d
        ds.col_graphs.KNN = knn
        ds.col_graphs.MKNN = mknn
        mknn = mknn.tocoo()
        mknn.setdiag(0)
        # Compute the effective resolution
        d = 1 - knn.data
        radius = np.percentile(d, 90)
        logging.info(f" 90th percentile radius: {radius:.02}")
        ds.attrs.radius = radius
        # Keep only mutual neighbours that lie within the radius
        inside = mknn.data > 1 - radius
        rnn = sparse.coo_matrix(
            (mknn.data[inside], (mknn.row[inside], mknn.col[inside])),
            shape=mknn.shape)
        ds.col_graphs.RNN = rnn

    if "embeddings" in self.config.steps or "clustering" in self.config.steps:
        # NOTE(review): 'transformed' and 'metric' are only bound by the
        # "nn"/"clustering" block above (or by the enrichment branch), so
        # running "embeddings" without "nn" may fail - confirm intended usage.
        logging.info(f"Computing 2D and 3D embeddings from latent space")
        metric_f = (jensen_shannon_distance if metric == "js" else metric)  # Replace js with the actual function, since OpenTSNE doesn't understand js
        logging.info(f" Art of tSNE with {metric} distance metric")
        ds.ca.TSNE = np.array(art_of_tsne(transformed, metric=metric_f))  # art_of_tsne returns a TSNEEmbedding, which can be cast to an ndarray (its actually just a subclass)
        logging.info(f" UMAP with {metric} distance metric")
        ds.ca.UMAP = UMAP(n_components=2,
                          metric=metric_f,
                          n_neighbors=self.config.params.k // 2,
                          learning_rate=0.3,
                          min_dist=0.25).fit_transform(transformed)
        ds.ca.UMAP3D = UMAP(n_components=3,
                            metric=metric_f,
                            n_neighbors=self.config.params.k // 2,
                            learning_rate=0.3,
                            min_dist=0.25).fit_transform(transformed)

    if "clustering" in self.config.steps:
        logging.info("Clustering by polished Louvain")
        pl = PolishedLouvain(outliers=False, graph="RNN", embedding="TSNE")
        labels = pl.fit_predict(ds)
        ds.ca.ClustersModularity = labels + min(labels)
        ds.ca.OutliersModularity = (labels == -1).astype('int')

        logging.info("Clustering by polished Surprise")
        ps = PolishedSurprise(graph="RNN", embedding="TSNE")
        labels = ps.fit_predict(ds)
        ds.ca.ClustersSurprise = labels + min(labels)
        ds.ca.OutliersSurprise = (labels == -1).astype('int')

        if self.config.params.clusterer == "louvain":
            ds.ca.Clusters = ds.ca.ClustersModularity
            ds.ca.Outliers = ds.ca.OutliersModularity
        else:
            ds.ca.Clusters = ds.ca.ClustersSurprise
            ds.ca.Outliers = ds.ca.OutliersSurprise
        logging.info(f"Found {ds.ca.Clusters.max() + 1} clusters")

    # The species literal was garbled to "H**o sapiens", which can never
    # match a detected species name; restored so human data is annotated.
    if species.name in ["Homo sapiens", "Mus musculus"]:
        logging.info(f"Inferring cell cycle")
        CellCycleAnnotator(species).annotate(ds)
def communityLayoutCalculation(self, Layout, g):
    """Partition ``g`` into communities and compute node positions for ``Layout``.

    Stores ``g`` on ``self.g``. Unless
    ``self.Graphwidget.ColorNodesBasedOnCorrelation`` is truthy, a Louvain
    partition of the graph is computed and the number of communities is
    passed to ``self.ColorForCommunities``; after the layout has been
    computed, ``self.ChangeCommunityColorAndInstantiateHierarchy`` is
    called as well.

    Parameters
    ----------
    Layout : str
        Layout name. "circular", "shell", "random", "spring", "spectral"
        and "fruchterman_reingold_layout" are computed with NetworkX
        layout functions; any other value is handed to
        ``nx.nx_pydot.graphviz_layout`` as the ``prog`` argument.
    g : graph
        Graph to lay out (presumably a NetworkX graph with 'weight' edge
        attributes -- confirm against callers).

    Returns
    -------
    tuple
        ``(pos, Factor)``: the positions returned by the layout function
        and the numeric factor associated with the chosen layout
        (presumably a scaling factor applied by the caller -- confirm).
        ``pos`` is also stored on ``self.pos``.
    """
    self.g = g

    if not self.Graphwidget.ColorNodesBasedOnCorrelation:
        if self.Graphwidget.level == -1:
            partition = cm.best_partition(self.g)
        else:
            # A specific hierarchy level was requested: build the
            # dendrogram once instead of also running best_partition and
            # discarding its result.
            dendrogram = cm.generate_dendrogram(self.g)
            partition = cm.partition_at_level(dendrogram, self.Graphwidget.level)
        self.ColorForCommunities(len(set(partition.values())))

    networkx_layouts = ("circular", "shell", "random",
                        "fruchterman_reingold_layout", "spring", "spectral")
    if Layout in networkx_layouts:
        if Layout == "spring":
            if self.Graphwidget.First:
                # First spring layout: start from scratch.
                self.Graphwidget.First = False
                pos = nx.spring_layout(self.g, weight='weight', k=0.55,
                                       iterations=20, scale=500)
            else:
                # Later calls are seeded with the previously stored positions.
                pos = nx.spring_layout(self.g, pos=self.pos, weight='weight',
                                       scale=500)
            Factor = 1
        else:
            # Look the layout function up by attribute instead of eval().
            # "fruchterman_reingold_layout" already carries the suffix, so
            # it must not be appended a second time (the doubled name does
            # not exist in networkx and used to raise AttributeError).
            if Layout.endswith("_layout"):
                function_name = Layout
            else:
                function_name = Layout + "_layout"
            layout_function = getattr(nx, function_name)
            if Layout == "random" or Layout == "shell":
                pos = layout_function(self.g)
                Factor = 2000
            else:
                pos = layout_function(self.g, scale=1000)
                Factor = 1
    else:
        if Layout != "circo":
            pos = nx.nx_pydot.graphviz_layout(
                self.g, prog=Layout, args='-Gsep=.25,-GK=20-Eweight=2')
            Factor = 0.7 + self.counter / 100
            if Layout == 'sfdp':
                Factor = 1
        else:
            pos = nx.nx_pydot.graphviz_layout(self.g, prog=Layout)
            Factor = 0.7

    # Community recolouring is the same for every layout, so it is done
    # once here, still before self.pos is overwritten as in the original.
    if not self.Graphwidget.ColorNodesBasedOnCorrelation:
        # Normalise whatever falsy value the flag held to a real False.
        self.Graphwidget.ColorNodesBasedOnCorrelation = False
        if self.Graphwidget.level != -1:
            self.ChangeCommunityColorAndInstantiateHierarchy(
                self.Graphwidget.level - 1)
        else:
            self.ChangeCommunityColorAndInstantiateHierarchy()

    self.pos = pos
    return pos, Factor
alpha=0.8) nx.draw_networkx_nodes( g, pos, nodelist=[node for node in g.nodes() if found[int(node)] == 0], node_color='b', node_size=100, alpha=0.8) nx.draw_networkx_edges(g, pos, width=1.0, alpha=0.5) ''' plt.show() ''' gt_node2comm = nx.get_node_attributes(g, 'community') correct_labels = [gt_node2comm[str(node)] for node in range(N)] pred_labels = [found[node] for node in range(N)] nmi = normalized_mutual_info_score(correct_labels, pred_labels) print("NMI: {}".format(nmi)) found2 = community.best_partition(graph=g) pred_labels = [found2[str(node)] for node in range(N)] nmi = normalized_mutual_info_score(correct_labels, pred_labels) print("Louvain NMI: {}".format(nmi))