def test_karate_club():
    nodes, edges = PyLouvain.from_file("data/karate.txt")
    pyl = PyLouvain(nodes, edges)
    partition, q = pyl.apply_method(gamma=1.0)
    odds = bayes_model_selection(nodes, edges, partition)
    print(partition, q, odds)
def test(graphname, gnc=None):
    nodes, edges = PyLouvain.from_file("data/%s.txt" % graphname)
    pyl = PyLouvain(nodes, edges)
    name_pickle = 'fig/save_%s_%d.p' % (graphname, len(nodes))
    if not os.path.isfile(name_pickle):
        print("pickle file", name_pickle, "is missing. Recompute.")
        start = time.time()
        partition, q = pyl.apply_method()
        print("Modularity Time", time.time() - start)
        start = time.time()
        partition2 = multiscale(nodes, edges, 0.5)
        print("Multiscale Time", time.time() - start)
        results = {"LV": partition, "MS": partition2}
        pickle.dump(results, open(name_pickle, 'wb'))
        print("Pickle save", name_pickle)
    else:
        print("pickle file", name_pickle, "is found.")
        results = pickle.load(open(name_pickle, "rb"))
        # restore the partitions so the NMI block below can use them
        partition, partition2 = results["LV"], results["MS"]
    sizes_distri = {
        "Modularity": [len(p) for p in partition],
        "MultiScale": [len(p) for p in partition2]
    }
    if gnc:
        gnc_fp = open(gnc, "r")
        gnc_map = {}
        sizes_distri["Ground Truth"] = []
        for i, line in enumerate(gnc_fp):
            x = line.split()
            sizes_distri["Ground Truth"].append(len(x))
            for j in x:
                gnc_map[int(j)] = i
        gnc_list = [gnc_map[k] for k in nodes]
        lv_map = {v: i for i, c in enumerate(partition) for v in c}
        lv_list = [lv_map[k] for k in nodes]
        ms_map = {v: i for i, c in enumerate(partition2) for v in c}
        ms_list = [ms_map[k] for k in nodes]
        print("Louvain NMI=", normalized_mutual_info_score(lv_list, gnc_list))
        print("Multi-scale NMI=", normalized_mutual_info_score(ms_list, gnc_list))
    hist(sizes_distri, graphname)
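# `hist` is called above but not defined in this excerpt. A minimal sketch of
# what it might be, assuming it overlays the community-size distributions of
# each method (the body below is hypothetical, including the output path):
def hist(sizes_distri, graphname):
    for label, sizes in sizes_distri.items():
        plt.hist(sizes, bins=20, alpha=0.5, label=label)
    plt.xlabel("community size")
    plt.ylabel("count")
    plt.legend()
    plt.savefig("fig/hist_%s.png" % graphname)
    plt.close()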
def test_small_networks(nodes, edges, gamma0):
    pyl = PyLouvain(nodes, edges)
    partition0, q0 = pyl.apply_method(gamma0)
    c0 = cmap(nodes, partition0)
    NMI = []
    gamma_list = np.linspace(0.2, 3.5, num=200)
    for gamma in gamma_list:
        partition, q = PyLouvain(nodes, edges).apply_method(gamma)
        c = cmap(nodes, partition)
        NMI.append(metrics.normalized_mutual_info_score(c0, c))
    plt.plot(gamma_list, NMI, 'b-*', markersize=10)
    plt.show()
def test_karate_club(self):
    pyl = PyLouvain.from_file("data/karate.txt")
    partition, q = pyl.apply_method()
    q_ = q * 10000
    self.assertEqual(4, len(partition))
    self.assertEqual(4298, math.floor(q_))
    self.assertEqual(4299, math.ceil(q_))
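# The floor/ceil pair above pins q to the interval (0.4298, 0.4299); unittest's
# built-in tolerance assertion expresses essentially the same check in one
# line. A sketch with the same expected modularity (method name hypothetical):
def test_karate_club_q(self):
    pyl = PyLouvain.from_file("data/karate.txt")
    partition, q = pyl.apply_method()
    self.assertAlmostEqual(0.42985, q, delta=5e-5)  # same bracket as the floor/ceil pair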
def findGZCommunity():
    dbm = dbManager2('sina11', host='127.0.0.1', passwd='root')
    pyl = PyLouvain.from_db(dbm, "select uid,fid from afrelation11 limit 0,10000")
    partition, q = pyl.apply_method()
    print(partition)
    f = open("output.txt", "w")
    f.write(str(partition))
    f.close()
def test_football():
    # load GNC ground truth from txt file (communities defined by conference)
    fconf = open("data/football.gnc.txt", "r")
    gnc = {str(i): int(line.strip()) for i, line in enumerate(fconf)}
    order_ = {i: stri for i, stri in enumerate(sorted(gnc.keys()))}
    x, y, z, r = [], [], [], []
    for gamma in np.linspace(0.5, 8.5, num=35):
        nodes, edges = PyLouvain.from_file("data/football.txt")
        pyl = PyLouvain(nodes, edges)
        partition, q = pyl.apply_method(gamma)
        odds = bayes_model_selection(nodes, edges, partition)
        print(len(partition), odds)
        x.append(gamma)
        y.append(odds)
        z.append(len(partition))
        comm = {n: i for i, ns in enumerate(partition) for n in ns}
        a = [comm[i] for i in nodes]
        b = [gnc[order_[i]] for i in nodes]
        #print("NMI=", metrics.adjusted_mutual_info_score(a, b))
        r.append(metrics.adjusted_mutual_info_score(a, b))
        #r.append(metrics.adjusted_rand_score(a, b))
    plt.plot(x, y, 'r-*', markersize=10)
    ax1 = plt.gca()
    ax1.tick_params(axis='x', labelsize=18)
    ax1.tick_params(axis='y', labelcolor='r', labelsize=15)
    ax2 = ax1.twinx()
    ax2.plot(x, z, 'm-^', markersize=10)
    ax2.tick_params(axis='y', labelcolor='m', labelsize=15)
    plt.tight_layout()
    plt.savefig("fig/football2.png")
def do(path, source_data):
    start_time = time.time()
    print('start time: ', start_time)
    print('Louvain community detection started...')
    pyl = PyLouvain.from_file(path, source_data)
    partition, q = pyl.apply_method()
    # print(partition)
    out_file = open(path + "community_result.txt", 'w')
    # read the node-info file
    nodes_file = open(path + 'nodes_tmp.txt', 'r')
    nodes_lines = nodes_file.readlines()
    nodes_file.close()
    nodes = {}  # maps node index -> node name
    for line in nodes_lines:
        n = line.split()
        if not n:
            break
        nodes[n[1]] = n[0]
    # community info, format: label <tab> member count
    print('Collecting community info, writing community status file')
    community_status = open(path + 'community_status.txt', 'w')
    i = 1
    label = {}  # maps node name -> community label
    for community in partition:
        community_status.write(str(i) + '\t' + str(len(community)) + '\n')  # label, member count
        # tag every member with its community label
        for per in community:
            label[nodes[str(per)]] = str(i)
        i += 1
    community_status.close()
    # join the community labels with the user-interaction data
    print('Joining user-interaction data, writing result file')
    relationship_file = open(path + source_data, 'r')
    relationship_lines = relationship_file.readlines()
    relationship_file.close()
    for rela in relationship_lines:
        r = rela.split()
        if not r:
            break
        out_file.write('-\t' + r[0] + '\t' + label[r[0]] + '\t' + r[1] + '\t' + label[r[1]] + '\t' + r[2] + '\n')
    out_file.close()
    print('end time: ', time.time())
    print('elapsed: ', (time.time() - start_time) / 60, ' min')
def test_football2():
    for gamma in np.linspace(0.4, 0.9, num=10):
        print()
        print("gamma=", gamma)
        nodes, edges = PyLouvain.from_file("data/football.txt")
        partition = multiscale(nodes, edges, gamma)
        # load GNC ground truth from txt file (communities defined by conference)
        fconf = open("data/football.gnc.txt", "r")
        gnc = {str(i): int(line.strip()) for i, line in enumerate(fconf)}
        order_ = {i: stri for i, stri in enumerate(sorted(gnc.keys()))}
        comm = {n: i for i, ns in enumerate(partition) for n in ns}
        a = [comm[i] for i in nodes]
        b = [gnc[order_[i]] for i in nodes]
        print("NMI=", metrics.adjusted_mutual_info_score(a, b))

#test_football2()
def test_citations(self):
    pyl = PyLouvain.from_file("data/hep-th-citations")
    partition, q = pyl.apply_method()
def test_lesmis(self):
    pyl = PyLouvain.from_gml_file("data/lesmis.gml")
    partition, q = pyl.apply_method()
def test_arxiv(self):
    pyl = PyLouvain.from_file("data/facebook_combined.txt", 0.5)
    partition, q = pyl.apply_method()
    print(len(partition), q)
def test_arxiv(self):
    pyl = PyLouvain.from_file("data/arxiv.txt")
    partition, q = pyl.apply_method()
def test_citations():
    nodes, edges = PyLouvain.from_file("data/hep-th-citations")
    pyl = PyLouvain(nodes, edges)
    partition, q = pyl.apply_method()
    print(partition, q)
def main():
    if not has_pyimpfuzzy:
        sys.exit("[!] pyimpfuzzy must be installed for this script.")
    if not has_py2neo:
        sys.exit("[!] py2neo must be installed for this script.")
    if not has_pylouvain and not args.nocluster:
        sys.exit("[!] Please download pylouvain from https://github.com/patapizza/pylouvain.")
    try:
        # host assumed to be localhost (cf. the usage hint printed at the end)
        graph_http = "http://" + NEO4J_USER + ":" + NEO4J_PASSWORD + "@localhost:" + NEO4J_PORT + "/db/data/"
        GRAPH = Graph(graph_http)
    except:
        sys.exit("[!] Can't connect to the Neo4j database.")
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit()

    i = 0
    hashlist = []
    hashlist_new = []
    nodes = []
    edges = []
    relationships = []

    # impfuzzy similarity threshold
    if args.threshold:
        ss_threshold = args.threshold
    else:
        ss_threshold = 30
    print("[*] Impfuzzy threshold is %i." % ss_threshold)

    # Delete database data
    if args.delete:
        GRAPH.delete_all()
        print("[*] Deleted all nodes and relationships from this Neo4j database.")

    # Load database data
    database = GRAPH.data("MATCH (m:Malware) RETURN m.id, m.name, m.impfuzzy, m.md5, m.sha1, m.sha256")
    if database:
        print("[*] Database nodes %d." % len(database))
        for d in database:
            hashlist.append([d["m.id"], d["m.name"], d["m.impfuzzy"], d["m.md5"], d["m.sha1"], d["m.sha256"]])
    nodes_count = len(database)

    # Load relationships
    relation_data = GRAPH.data("MATCH (m1:Malware)-[s:same]-(m2:Malware) RETURN m1.id,m2.id,s.value")
    if relation_data:
        print("[*] Database relationships %d." % len(relation_data))
        for r in relation_data:
            relationships.append([r["m1.id"], r["m2.id"], r["s.value"]])

    for x in range(nodes_count):
        nodes.append(x)

    print("[*] Creating graph data.")
    # Import data from EXE or DLL
    if args.file:
        if os.path.isfile(args.file):
            i = nodes_count
            impfuzzy, md5, sha1, sha256 = get_digest(args.file)
            query = "MATCH (m:Malware) WHERE m.sha256=\"%s\" RETURN m" % sha256
            if impfuzzy:
                if not GRAPH.data(query):
                    nodes.append(i)
                    hashlist_new.append([i, args.file, impfuzzy, md5, sha1, sha256])
                else:
                    print("[!] This malware is already registered. sha256: %s" % sha256)
            else:
                print("[!] Can't calculate the impfuzzy hash. sha256: %s" % sha256)
        else:
            sys.exit("[!] Can't open file {0}.".format(args.file))

    # Import data from directory
    if args.directory:
        try:
            files = os.listdir(args.directory)
        except OSError:
            sys.exit("[!] Can't open directory {0}.".format(args.directory))
        outf = args.directory + "_hash.csv"
        fl = open(outf, "w")
        i = nodes_count
        for file in files:
            filename = args.directory + "/" + file
            impfuzzy, md5, sha1, sha256 = get_digest(filename)
            fl.write("%s,%s,%s,%s,%s\n" % (file, impfuzzy, md5, sha1, sha256))
            query = "MATCH (m:Malware) WHERE m.sha256=\"%s\" RETURN m" % sha256
            if impfuzzy:
                if not GRAPH.data(query) and sha256 not in [x[5] for x in hashlist_new]:
                    nodes.append(i)
                    hashlist_new.append([i, file, impfuzzy, md5, sha1, sha256])
                    i += 1
                else:
                    print("[!] This malware is already registered. sha256: %s" % sha256)
            else:
                print("[!] Can't calculate the impfuzzy hash. sha256: %s" % sha256)
        print("[*] Created hash list %s." % outf)
        fl.close()

    # Import data from csv file
    if args.listname:
        print("[*] Parse file %s." % args.listname)
        try:
            csvfile = csv.reader(open(args.listname), delimiter=",")
        except IOError:
            sys.exit("[!] Can't open file {0}.".format(args.listname))
        i = nodes_count
        for array in csvfile:
            query = "MATCH (m:Malware) WHERE m.sha256=\"%s\" RETURN m" % array[4]
            if array[1]:
                if not GRAPH.data(query):
                    nodes.append(i)
                    array.insert(0, i)
                    hashlist_new.append(array)
                    i += 1
                else:
                    print("[!] This malware is already registered. sha256: %s" % array[4])
            else:
                print("[!] Impfuzzy hash is blank. sha256: %s" % array[4])

    # Compare impfuzzy
    print("[*] The total number of malware is %i." % i)
    result_list = impfuzzy_comp(hashlist, hashlist_new)

    if len(database) != len(nodes):
        # Clustering
        if not args.nocluster:
            for edge in result_list + relationships:
                if edge[2] > ss_threshold:
                    edges.append([[edge[0], edge[1]], edge[2]])
                else:
                    edges.append([[edge[0], edge[1]], 0])
            pyl = PyLouvain(nodes, edges)
            partition, modularity = pyl.apply_method()
            print("[*] The number of clusters is %i." % (len(partition) - 1))
        else:
            print("[*] No clustering option.")

        # Create nodes
        tx = GRAPH.begin()
        if args.nocluster:
            for hash in hashlist_new:
                tx.append(statement_c, {"id": hash[0], "name": hash[1], "impfuzzy": hash[2],
                                        "md5": hash[3], "sha1": hash[4], "sha256": hash[5],
                                        "cluster": "NULL"})
        else:
            for hash in hashlist_new + hashlist:
                i = 0
                for a in partition:
                    i = i + 1
                    if hash[0] in a:
                        tx.append(statement_c, {"id": hash[0], "name": hash[1], "impfuzzy": hash[2],
                                                "md5": hash[3], "sha1": hash[4], "sha256": hash[5],
                                                "cluster": i})
        # Create relationships
        for result in result_list:
            if result[2] > ss_threshold:
                tx.append(statement_r, {"id1": result[0], "id2": result[1], "value": result[2]})
        tx.process()
        tx.commit()
        print("[*] Created graph data.\n")
    else:
        print("[*] No new malware found.\n")

    print(" Access http://localhost:7474 via a Web browser.")
    print(" Use Cypher queries to see the graph.\n")
    print(" == Cypher Query Examples ==")
    print(" [Visualizing all clusters]")
    print(" $ MATCH (m:Malware) RETURN m\n")
    print(" [Visualizing the clusters that match an MD5 hash]")
    print(" $ MATCH (m1:Malware)-[s]-() WHERE m1.md5 = \"[MD5]\"")
    print("   MATCH (m2:Malware) WHERE m2.cluster = m1.cluster")
    print("   RETURN m2\n")
    print(" [Visualizing the clusters whose similarity exceeds 90]")
    print(" $ MATCH (m:Malware)-[s:same]-() WHERE s.value > 90 RETURN m,s")
    print(" ===========================\n")
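# `get_digest` and `impfuzzy_comp` are referenced above but not shown. A
# minimal sketch of what they might look like, assuming pyimpfuzzy's public
# API (get_impfuzzy, hash_compare) and stdlib hashlib; the real helpers may
# differ:
import hashlib

import pyimpfuzzy


def get_digest(filename):
    # impfuzzy hash of the PE import table, plus standard digests of the file body
    impfuzzy = pyimpfuzzy.get_impfuzzy(filename)
    with open(filename, "rb") as f:
        data = f.read()
    md5 = hashlib.md5(data).hexdigest()
    sha1 = hashlib.sha1(data).hexdigest()
    sha256 = hashlib.sha256(data).hexdigest()
    return impfuzzy, md5, sha1, sha256


def impfuzzy_comp(hashlist, hashlist_new):
    # compare every new sample against all earlier samples (including each
    # other); each result row is [id1, id2, similarity 0-100]
    results = []
    known = hashlist + hashlist_new
    for new in hashlist_new:
        for other in known:
            if other[0] >= new[0]:
                continue  # skip self-comparison and duplicate pairs
            score = pyimpfuzzy.hash_compare(other[2], new[2])
            results.append([other[0], new[0], score])
    return results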
from pylouvain import PyLouvain
import math
from matplotlib import pyplot as plt
import networkx as nx

filepath = 'out.txt'

# run the community detection
pyl = PyLouvain.from_file(filepath)
node_dict = pyl.node_dict  # key is a label like "253916-2", value is the numeric index
reverse_node_dict = dict(zip(node_dict.values(), node_dict.keys()))  # numeric index -> label
partition, q = pyl.apply_method()
print(partition)
print("Modularity:", q)

# assign a color to each community's nodes
community_num = len(partition)
print('community_num:', community_num)
color_board = ['red', 'green', 'blue', 'pink', 'orange', 'purple', 'brown']
color = {}
for index in range(community_num):
    print("Community " + str(index + 1) + ": " + str(len(partition[index])))
    for node_id in partition[index]:
        # color maps numeric node index -> color of its community;
        # wrap around if there are more communities than colors
        color[node_id] = color_board[index % len(color_board)]
# sort the color dict by node index, yielding a list of (node, color) pairs
new_color_dict = sorted(color.items(), key=lambda d: d[0], reverse=False)
node_list = [reverse_node_dict[item[0]] for item in new_color_dict]  # labels like "253916-2", ordered by index
color_list = [item[1] for item in new_color_dict]  # colors matching node_list
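# The script stops after building node_list and color_list, which presumably
# feed a networkx drawing step. A minimal sketch of that step, assuming
# out.txt is an edge list of whitespace-separated label pairs (the drawing
# code is not shown in the original, so this is hypothetical):
G = nx.read_edgelist(filepath, nodetype=str)
pos = nx.spring_layout(G)
nx.draw(G, pos, nodelist=node_list, node_color=color_list, node_size=50, with_labels=False)
plt.show()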
def test_test(self):
    pyl = PyLouvain.from_file("data/year1990.txt")
    partition, q = pyl.apply_method()
    return partition
import matplotlib
from matplotlib import pyplot as plt
from sklearn import metrics
from pylouvain import PyLouvain
import numpy as np

nodes, edges = PyLouvain.from_file("data/karate.txt")
gamma0 = 0.78
#nodes, edges = PyLouvain.from_gml_file("data/lesmis.gml")
#nodes, edges = PyLouvain.from_gml_file("data/polbooks.gml")

def cmap(nodes, partition):
    m = {n: i for i, _ in enumerate(partition) for n in _}
    return [m[i] for i in nodes]

def test_small_networks(nodes, edges, gamma0):
    pyl = PyLouvain(nodes, edges)
    partition0, q0 = pyl.apply_method(gamma0)
    c0 = cmap(nodes, partition0)
    NMI = []
    gamma_list = np.linspace(0.2, 3.5, num=200)
    for gamma in gamma_list:
        partition, q = PyLouvain(nodes, edges).apply_method(gamma)
        c = cmap(nodes, partition)
        NMI.append(metrics.normalized_mutual_info_score(c0, c))
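# A call like the following would exercise the setup above; the fuller version
# of test_small_networks shown earlier then plots the NMI-vs-gamma curve:
test_small_networks(nodes, edges, gamma0)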
def test_polbooks():
    nodes, edges = PyLouvain.from_gml_file("data/polbooks.gml")
    pyl = PyLouvain(nodes, edges)
    partition, q = pyl.apply_method()
    print(partition, q)
import matplotlib
matplotlib.use("Agg")
from matplotlib import pyplot as plt
import pickle
import time
import numpy as np
from sklearn import metrics
from pylouvain import PyLouvain
from run import multiscale, bayes_model_selection

x, y, z = [], [], []
for gamma in np.linspace(0.2, 0.9, num=20):
    print("gamma=", gamma)
    pyl = PyLouvain.from_file("data/football.txt")
    partition = multiscale(pyl.nodes, pyl.edges, gamma)
    # load GNC ground truth from txt file (communities defined by conference)
    fconf = open("data/football.gnc.txt", "r")
    gnc = {str(i): int(line.strip()) for i, line in enumerate(fconf)}
    order_ = {i: stri for i, stri in enumerate(sorted(gnc.keys()))}
    comm = {n: i for i, ns in enumerate(partition) for n in ns}
    a = [comm[i] for i in pyl.nodes]
    b = [gnc[order_[i]] for i in pyl.nodes]
    x.append(gamma)
    y.append(len(partition))
    z.append(metrics.adjusted_mutual_info_score(a, b))
    print("#comm=", len(partition), "NMI=", metrics.adjusted_mutual_info_score(a, b))
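# The loop above accumulates gamma, community count, and AMI in x/y/z, but the
# plotting step is not shown. A minimal continuation in the spirit of the
# dual-axis football plot earlier (output filename hypothetical):
plt.plot(x, y, 'm-^', markersize=10)
ax1 = plt.gca()
ax1.set_xlabel("gamma")
ax1.set_ylabel("#communities")
ax2 = ax1.twinx()
ax2.plot(x, z, 'b-*', markersize=10)
ax2.set_ylabel("AMI")
plt.tight_layout()
plt.savefig("fig/football_multiscale.png")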
def process(self):
    hashlist = []
    hashlist_new = []
    nodes = []
    edges = []
    relationships = []

    # recover all existing data
    database = self.graph.run(
        "MATCH (m:Malware) RETURN m.id, m.name, m.impfuzzy, m.scout_result, m.scout_confidence, m.md5, m.sha1, m.sha256, m.tag"
    ).data()
    if database:
        for d in database:
            hashlist.append([
                d["m.id"], d["m.name"], d["m.impfuzzy"], d["m.scout_result"],
                d["m.scout_confidence"], d["m.md5"], d["m.sha1"], d["m.sha256"],
                d["m.tag"]
            ])
    nodes_count = len(database)
    i = nodes_count

    relation_data = self.graph.run(
        "MATCH (m1:Malware)-[s:same]-(m2:Malware) RETURN m1.id, m2.id, s.value"
    ).data()
    if relation_data:
        for r in relation_data:
            relationships.append([r["m1.id"], r["m2.id"], r["s.value"]])

    for x in range(nodes_count):
        nodes.append(x)

    # batch mode: check each file in the folder
    if self.folder_path:
        for item in self.files:
            scout_result, impfuzzy, md5, sha1, sha256, scout_confidence = self.get_digest(item[0])
            if scout_result in ("", 'A171', None):
                continue
            query = "MATCH (m:Malware) WHERE m.sha256=\"%s\" RETURN m" % sha256
            objs = self.graph.run(query).data()
            if not objs and sha256 not in [x[5] for x in hashlist_new]:
                nodes.append(i)
                hashlist_new.append([
                    i, item[0].split("/")[-1], impfuzzy, scout_result,
                    scout_confidence, md5, sha1, sha256, item[1]
                ])
                i += 1
            else:
                continue
    else:
        # single-file mode: we are in the reporting module; the file must
        # yield a valid apiscout vector to be processed
        if self.check_file(self.filepath):
            scout_result, impfuzzy, md5, sha1, sha256, scout_confidence = self.get_digest(self.filepath)
            if scout_result in ("", 'A171', None):
                return {}
        else:
            return {}
        query = "MATCH (m:Malware) WHERE m.sha256=\"%s\" RETURN m" % sha256
        objs = self.graph.run(query).data()
        if not objs:
            nodes.append(nodes_count)
            hashlist_new.append([
                nodes_count, self.filename, impfuzzy, scout_result,
                scout_confidence, md5, sha1, sha256, None
            ])
        else:
            return self.search_hash(sha256)

    # calculate the apiscout correlation
    result_list = self.scout_comp(hashlist, hashlist_new)

    if len(database) != len(nodes):
        for edge in result_list + relationships:
            if edge[2] > self.threshold:
                edges.append([[edge[0], edge[1]], edge[2]])
            else:
                edges.append([[edge[0], edge[1]], 0])
        pyl = PyLouvain(nodes, edges)
        partition, modularity = pyl.apply_method()

        # create nodes
        tx = self.graph.begin()
        for hash in hashlist_new + hashlist:
            i = 0
            for a in partition:
                i += 1
                if hash[0] in a:
                    tx.append(statement_c, {
                        "id": hash[0], "name": hash[1], "impfuzzy": hash[2],
                        "scout_result": hash[3], "scout_confidence": hash[4],
                        "md5": hash[5], "sha1": hash[6], "sha256": hash[7],
                        "tag": hash[8], "cluster": i
                    })
        # create relationships
        for result in result_list:
            if result[2] > self.threshold:
                tx.append(statement_r, {
                    "id1": result[0], "id2": result[1], "value_scout": result[2]
                })
        tx.process()
        tx.commit()

    # recover the report info for the single-file case
    if self.filename:
        return self.search_hash(sha256)
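# statement_c and statement_r are module-level Cypher templates not shown in
# this excerpt. A plausible sketch, inferred from the parameter dictionaries
# passed to tx.append above and assuming py2neo-v2-style {param} placeholders;
# the actual statements may differ:
statement_c = (
    "MERGE (m:Malware {id: {id}}) "
    "SET m.name = {name}, m.impfuzzy = {impfuzzy}, m.scout_result = {scout_result}, "
    "m.scout_confidence = {scout_confidence}, m.md5 = {md5}, m.sha1 = {sha1}, "
    "m.sha256 = {sha256}, m.tag = {tag}, m.cluster = {cluster}"
)

statement_r = (
    "MATCH (m1:Malware {id: {id1}}), (m2:Malware {id: {id2}}) "
    "CREATE (m1)-[s:same {value: {value_scout}}]->(m2)"
)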
def test_polbooks(self):
    pyl = PyLouvain.from_gml_file("data/polbooks.gml")
    partition, q = pyl.apply_method()
def multiscale(nodes, edges, gamma, deg=None, depth=1, verbose=False, max_depth=4):
    '''
    Multi-scale community detection.

    Recursively split the sub-graph by maximizing generalized modularity;
    each branch of the recursion terminates via Bayes model selection.

    Args:
        nodes: a list of nodes
        edges: a list of edges ((src, dst), weight)
        gamma: the resolution parameter of the generalized modularity

    Returns:
        a list of lists, each containing the nodes of one community
    '''
    if depth >= max_depth or len(nodes) < 2:
        return [nodes]
    verbose and print(" " * depth, "***", "depth=", depth, "N=", len(nodes))

    # relabel nodes to a contiguous 0..N-1 range, remembering the reverse map
    nodes.sort()
    d = {n: i for i, n in enumerate(nodes)}
    rd = {i: n for n, i in d.items()}
    nodes = list(range(len(d)))
    edges = [((d[e[0][0]], d[e[0][1]]), e[1]) for e in edges]

    if deg is None:
        deg = {i: 0 for i in nodes}
        for e in edges:
            deg[e[0][0]] += e[1]
            deg[e[0][1]] += e[1]

    # execution
    pyl = PyLouvain(nodes, edges, deg)
    partition, q = pyl.apply_method(gamma)
    verbose and print(" " * depth, "gamma=", gamma, "comm=", len(partition))
    if len(partition) < 2:
        return [list(map(rd.get, nodes))]

    # keep the split only if the Bayes odds favor it
    odds = bayes_model_selection(pyl.nodes, pyl.edges, partition)
    verbose and print(" " * depth, "odds=", odds)
    if odds <= 1. or math.isnan(odds):
        return [list(map(rd.get, nodes))]

    # recurse into each community, carrying only its internal edges
    comm = {n: i for i, ns in enumerate(partition) for n in ns}
    edge_list = [[] for _ in range(len(partition))]
    for e in edges:
        u, v = e[0][0], e[0][1]
        if comm[u] == comm[v]:
            edge_list[comm[u]].append(e)

    R = []
    for nodes_, edges_ in zip(partition, edge_list):
        groups = multiscale(nodes_, edges_, gamma, deg, depth + 1, verbose, max_depth)
        for grp in groups:
            R.append([rd[n] for n in grp])
    return R
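# Usage mirrors the call in the test harness above: the entry point only needs
# nodes, edges, and a resolution (here the karate graph at gamma = 0.5, as in
# the earlier test() function):
nodes, edges = PyLouvain.from_file("data/karate.txt")
communities = multiscale(nodes, edges, 0.5, verbose=True)
print(len(communities), "communities:", sorted(len(c) for c in communities))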
end = time.time()
commsFG_sizes = sorted([len(commsFG[i]) for i in range(len(commsFG))])
verbose and print(commsFG_sizes)
map_comm = {v: i for i, c in enumerate(commsFG) for v in c}
a = [map_comm[k] for k in G.nodes()]
print("FastGreedy Algorithm ARI=", adjusted_rand_score(a, gnc_list),
      "NMI=", normalized_mutual_info_score(a, gnc_list))
print("which takes", end - start, "seconds")
'''

#=========== Benchmark Louvain ===============#
print("Start Louvain community detection")
start = time.time()
pyl = PyLouvain(nodes, edges)
commsLV, q = pyl.apply_method(1.0)
end = time.time()

commsLV_sizes = sorted([len(commsLV[i]) for i in range(len(commsLV))])
verbose and print(commsLV_sizes)
verbose and print(len(commsLV_sizes))
map_comm = {v: i for i, c in enumerate(commsLV) for v in c}
LV_list = [map_comm[k] for k in G.nodes()]
print("Louvain Algorithm ARI=", adjusted_rand_score(LV_list, gnc_list),
      "NMI=", normalized_mutual_info_score(LV_list, gnc_list))
print("which takes", end - start, "seconds")
print("Size range = ", min(commsLV_sizes), max(commsLV_sizes))
print()