def augmentNodes(g):
    r1 = nx.eigenvector_centrality_numpy(g)
    r2 = nx.degree_centrality(g)  # DP MY
    r3 = nx.betweenness_centrality(g)
    r5 = nx.load_centrality(g, weight='weight')  # DY, WY-writename
    # Scientific collaboration networks: II. Shortest paths, weighted networks,
    # and centrality, M. E. J. Newman, Phys. Rev. E 64, 016132 (2001).
    r6 = nx.pagerank(g, alpha=0.85, personalization=None, max_iter=100,
                     tol=1e-08, nstart=None, weight='weight')
    if nx.is_directed(g) == True:
        r8 = nx.in_degree_centrality(g)
        r9 = nx.out_degree_centrality(g)
        # r10 = nx.hits(g, max_iter=100, tol=1e-08, nstart=None)
    else:
        r4 = nx.communicability_centrality(g)
        r7 = nx.clustering(g, weight='weight')
    for x in g.nodes():
        g.node[x]['eigenvector_centrality_numpy'] = r1[x]
        g.node[x]['degree_centrality'] = r2[x]
        g.node[x]['betweenness_centrality'] = r3[x]
        g.node[x]['load_centrality'] = r5[x]
        g.node[x]['pagerank'] = r6[x]
        if nx.is_directed(g) == True:
            g.node[x]['in_degree_centrality'] = r8[x]
            g.node[x]['out_degree_centrality'] = r9[x]
            # g.node[x]['hits'] = r10[x]
        else:
            g.node[x]['communicability_centrality'] = r4[x]
            g.node[x]['clustering'] = r7[x]
    return g
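Most of these snippets target NetworkX 1.x (nx.communicability_centrality, g.node[x]). A minimal compatibility sketch, assuming only that some recent NetworkX release is installed, keeps them callable on 2.x/3.x, where the function was renamed subgraph_centrality:

import networkx as nx

# Hedged compatibility shim (not part of any snippet below): NetworkX >= 2.0
# renamed communicability_centrality to subgraph_centrality, so fall back to
# the new name when the old attribute is missing. Snippets that write
# g.node[x][...] additionally need g.nodes[x] on NetworkX >= 2.0.
if not hasattr(nx, 'communicability_centrality'):
    nx.communicability_centrality = nx.subgraph_centrality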
def centrality(net):
    values = {}
    close = nx.closeness_centrality(net, normalized=True)
    eigen = nx.eigenvector_centrality_numpy(net)
    page = nx.pagerank(net)
    bet = nx.betweenness_centrality(net, normalized=True)
    flow_c = nx.current_flow_closeness_centrality(net, normalized=True)
    flow_b = nx.current_flow_betweenness_centrality(net, normalized=True)
    load = nx.load_centrality(net, normalized=True)
    com_c = nx.communicability_centrality(net)
    com_b = nx.communicability_betweenness_centrality(net, normalized=True)
    degree = net.degree()
    file3 = open("bl.csv", 'w')
    for xt in [bet, load, degree, page, flow_b, com_c, com_b, eigen, close, flow_c]:  #[impo,bet,flow_b,load,com_c,com_b] :
        for yt in [bet, load, degree, page, flow_b, com_c, com_b, eigen, close, flow_c]:  #[impo,bet,flow_b,load,com_c,com_b] :
            corr(xt.values(), yt.values(), file3)
        print
        file3.write("\n")
    file3.close()
    #plt.plot(x,y, 'o')
    #plt.plot(x, m*x + c, 'r', label='Fitted line')
    #plt.show()
    #for key,item in close.iteritems() :
    #    values[key] = [impo.get(key),bet.get(key),flow_b.get(key), load.get(key),com_c.get(key),com_b.get(key)]
    return values
def features_matrix(graph, anchors, use_dist=True, use_pgrs=True, use_pgr=True,
                    use_comm=False, use_comm_centr=False):
    node_feats = []
    n = len(graph)
    if use_dist:
        dists = nx.all_pairs_shortest_path_length(graph)
    if use_pgr:
        pageranks = nx.pagerank_numpy(graph)
    if use_pgrs:
        pgr_anchor = [anchored_pagerank(graph, anchor) for anchor in anchors]
    if use_comm_centr:
        communicability_centrality = nx.communicability_centrality(graph)
    if use_comm:
        communicability = nx.communicability(graph)
    for node in graph.nodes():
        assert node == len(node_feats)
        feats = []
        if use_dist:
            feats += [dists[node][anchor] for anchor in anchors]
        if use_pgrs:
            feats += [pgr[node]*n for pgr in pgr_anchor]
        if use_pgr:
            feats.append(pageranks[node]*n)
        if use_comm_centr:
            feats.append(communicability_centrality[node])
        if use_comm:
            feats += [communicability[node][anchor] for anchor in anchors]
        node_feats.append(np.array(feats))
    return node_feats
def subgraph_centrality(self, graph):
    x = nx.communicability_centrality(graph)
    z = 0
    y = len(x)
    for key, value in x.iteritems():
        z += value
    return (z / y)
def run(self, graph, slope):
    communicability_data = nx.communicability_centrality(graph)
    weights = {}
    max_comm_for_normaliz = max(communicability_data.values())
    for node, commu in communicability_data.items():
        weights[node] = slope[node] * commu / max_comm_for_normaliz
    return weights
def communicability_centrality_sum(self):
    if (self.communicability_centrality_dict == None):
        self.communicability_centrality_dict = nx.communicability_centrality(self.graph)
        time.sleep(1)
    return self.communicability_centrality_dict[self.node_1] + \
        self.communicability_centrality_dict[self.node_2]
def node_communicability_centrality(X):
    """ based on networkx function: communicability_centrality """
    XX = np.zeros((X.shape[0], np.sqrt(X.shape[1])))
    for i, value in enumerate(X):
        adj_mat = value.reshape((np.sqrt(len(value)), -1))
        adj_mat = (adj_mat - np.min(adj_mat)) / (np.max(adj_mat) - np.min(adj_mat))
        adj_mat = 1 - adj_mat
        # th = np.mean(adj_mat) - 0.1
        # adj_mat = np.where(adj_mat < th, adj_mat, 0.)
        percent, th, adj_mat, triu = percentage_removed(adj_mat, 0.76)  #96
        print("percent = {0}, threshold position = {1}, threshold = {2}\n".format(percent, th, triu[th]))
        g = nx.from_numpy_matrix(adj_mat)
        print "Graph Nodes = {0}, Graph Edges = {1} ".format(g.number_of_nodes(), g.number_of_edges())
        print "\nEdge kept ratio, {0}".format(float(g.number_of_edges()) / ((g.number_of_nodes() * (g.number_of_nodes() - 1)) / 2))
        deg_cent = nx.communicability_centrality(g)
        node_cent = np.zeros(g.number_of_nodes())
        for k in deg_cent:
            node_cent[k] = deg_cent[k]
        XX[i] = node_cent
        print "graph {0} => mean {1}, min {2}, max {3}".format(i, np.mean(XX[i]), np.min(XX[i]), np.max(XX[i]))
    # XX = XX*100
    ss = StandardScaler()
    XX = ss.fit_transform(XX.T).T
    return XX
def process(self):
    for date in self.dates:
        print "Year of analysis " + str(date)
        rst = []
        with open(self.srcdir + str(date) + '.csv', 'rU') as f:
            rows = csv.reader(f, dialect='excel', delimiter=';')
            next(rows, None)
            for row in rows:
                self.G.add_node(row[0])
                self.G.add_node(row[1])
                self.G.add_edge(row[0], row[1], weight=float(row[2]))
                # self.G.add_edge(row[0], row[1])
                # self.G.add_edge(row[0], row[1], weight=abs(float(float(row[3])/float(row[2]))))
        rst.append(nx.eigenvector_centrality_numpy(self.G))
        rst.append(nx.betweenness_centrality(self.G))
        rst.append(nx.closeness_centrality(self.G))
        rst.append(nx.degree_centrality(self.G))
        rst.append(nx.communicability_centrality(self.G))
        self._save_result(date, rst)
        self._save_graph_gexf(date)
def features_dict(graph, anchors, use_dist=True, use_pgrs=True, use_pgr=True,
                  use_comm=False, use_comm_centr=False):
    node_feats = {}
    n = len(graph)
    if use_dist:
        # dists = nx.all_pairs_shortest_path_length(graph)
        dists = dists_to_anchors(graph, anchors)
    if use_pgr:
        pageranks = nx.pagerank_numpy(graph)
    if use_pgrs:
        # pgr_anchor = [anchored_pagerank(graph, anchor) for anchor in anchors]
        pgr_anchor = pageranks_to_anchors(graph, anchors)
    if use_comm_centr:
        communicability_centrality = nx.communicability_centrality(graph)
    if use_comm:
        communicability = nx.communicability(graph)
    for node in graph.nodes():
        feats = []
        if use_dist:
            feats += [dists[node][anchor] for anchor in anchors]
        if use_pgrs:
            feats += [pgr_anchor[anchor][node]*n for anchor in range(len(anchors))]
            # feats += [pgr[node]*n for pgr in pgr_anchor]
        if use_pgr:
            feats.append(pageranks[node]*n)
        if use_comm_centr:
            feats.append(communicability_centrality[node])
        if use_comm:
            feats += [communicability[node][anchor] for anchor in anchors]
        node_feats[node] = np.array(feats)
    return node_feats
def communicability_centrality(gnx, f, ft):
    start = timer.start(ft, 'load_centrality')
    communicability_centrality_dict = nx.communicability_centrality(gnx)
    timer.stop(ft, start)
    for k in communicability_centrality_dict:
        f.writelines(str(k) + ',' + str(communicability_centrality_dict[k]) + '\n')
    return communicability_centrality_dict
def communicability_centrality(self):
    """Return communicability centrality for each node in G.

    If the graph is directed it will be converted to undirected.

    Returns
    -------
    nodes : dictionary
        Dictionary of nodes with communicability centrality as the value.

    Examples
    --------
    >>>
    """
    if self.is_directed:
        return nx.communicability_centrality(self._graph.to_undirected())
    else:
        return nx.communicability_centrality(self._graph)
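For reference, the directed branch above boils down to the standalone call in this small sketch (assuming NetworkX 2.x naming; the class attributes self._graph and self.is_directed belong to the surrounding snippet):

import networkx as nx

D = nx.DiGraph([(0, 1), (1, 2), (2, 0)])
# Directed graphs are flattened to undirected before scoring, as in the wrapper
# above; subgraph_centrality is the 2.x name of communicability_centrality.
scores = nx.subgraph_centrality(D.to_undirected())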
def calculate(network):
    try:
        n = nx.communicability_centrality(network)
    except:
        return 0

    if len(n.values()) == 0:
        return 0
    else:
        return round(sum(n.values()) / len(n.values()), 7)
def forUndirected(G):
    myList = [nx.eigenvector_centrality_numpy(G),
              nx.degree_centrality(G),
              nx.betweenness_centrality(G),
              nx.communicability_centrality(G),
              nx.load_centrality(G),
              nx.pagerank(G, alpha=0.85, personalization=None, max_iter=100,
                          tol=1e-08, nstart=None, weight='weight'),
              nx.clustering(G, weight='weight')]
    return myList
def process_data(denom=100000, round=0): f = csv.reader(open("../applab_new_6.csv", 'rb'), delimiter=',') db = nx.DiGraph() full_users = set() i = 0 uniquect = 0 for line in f: if i % 100000 == 0 : print "processed", i, "lines" if i == 1000: break sender, receiver, date, time, duration, cost, location, region = map(lambda x: x.strip(), line) if sender not in full_users: uniquect += 1 full_users.add(sender) if uniquect <= 2: #% denom - round == 0: db.add_node(sender) if db.has_node(receiver) == False: db.add_node(receiver) else: if db.has_node(receiver) == False: db.add_node(receiver) if db.has_edge(sender, receiver): db[sender][receiver]['weight'] += int(duration) else: db.add_edge(sender, receiver, weight=int(duration)) i+=1 #pickle.dump(db, open("users_networkx.p" % str(round), "wb")) #print "degree assortativity coeff:", nx.degree_assortativity_coefficient(db) #print "average degree connectivity:", nx.average_degree_connectivity(db) # print "k nearest neighbors:", nx.k_nearest_neighbors(db) print "calculating deg cent" deg_cent = nx.degree_centrality(db) #sorted(nx.degree_centrality(db).items(), key=lambda x: x[1]) print "calculating in deg cent" in_deg_cent = nx.in_degree_centrality(db) #sorted(nx.in_degree_centrality(db).items(), key=lambda x: x[1]) print "calculating out deg cent" out_deg_cent = nx.out_degree_centrality(db) #sorted(nx.out_degree_centrality(db).items(), key=lambda x: x[1]) print "closeness cent" closeness_cent = nx.closeness_centrality(db) #sorted(nx.closeness_centrality(db).items(), key=lambda x: x[1]) #print "betweenness cent" #btwn_cent = nx.betweenness_centrality(db) #sorted(nx.betweenness_centrality(db).items(), key=lambda x: x[1]) print "done" w = open("../output/user_network_stats.csv", 'w') w.write("uid,deg_cent,in_deg_cent,out_deg_cent,closeness_cent,btwn_cent\n") for user in deg_cent.keys(): try: w.write("%s,%s,%s,%s,%s\n" % (user, deg_cent[user], in_deg_cent[user], out_deg_cent[user], closeness_cent[user])) except: pass w.close() print "drawing..." nx.draw(db) plt.savefig("path.pdf") print "done!" print "edge betweenness centrality:", nx.edge_betweenness_centrality(db) print "communicability:", nx.communicability(db) print "communicability centrality:", nx.communicability_centrality(db)
def centrality(self):
    """ calculates several measures of node centrality and stores them in the general node table """
    speciesnodes = set(n for n, d in self.graph.nodes(data=True)
                       if d['graphics']['type'] == 'roundrectangle')
    g2 = nx.Graph(self.graph)
    self.nodes['degree'] = pandas.Series(nx.degree_centrality(self.graph))
    self.nodes['closeness'] = pandas.Series(nx.closeness_centrality(self.graph))
    self.nodes['betweenness'] = pandas.Series(nx.betweenness_centrality(self.graph))
    self.nodes['communicability'] = pandas.Series(nx.communicability_centrality(g2))
def calculatecommunicabilitycentrality(network):
    '''
    Communicability centrality, also called subgraph centrality, of a node n
    is the sum of closed walks of all lengths starting and ending at node n.
    '''
    try:
        n = nx.communicability_centrality(network)
    except:
        return 0

    if len(n.values()) == 0:
        return 0
    else:
        return round(sum(n.values()) / len(n.values()), 7)
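A small worked check of the definition quoted in the docstring above (a sketch, assuming SciPy is available and NetworkX 2.x naming): the per-node value equals the corresponding diagonal entry of the matrix exponential of the adjacency matrix.

import networkx as nx
import numpy as np
from scipy.linalg import expm

G = nx.cycle_graph(3)                     # a triangle as a toy graph
A = nx.to_numpy_array(G)
sc = nx.subgraph_centrality(G)            # communicability_centrality in NetworkX 1.x
# Each closed walk of length k is weighted 1/k!, which sums to expm(A) on the diagonal.
assert np.allclose([sc[n] for n in G], np.diag(expm(A)))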
def calculate_centrality(G):
    # dc_dumps = json.dumps(nx.degree_centrality(G).items(),sort_keys=True,indent=4)
    # dc_loads = json.loads(dc_dumps)
    dc_sorted = sorted(nx.degree_centrality(G).items(), key=itemgetter(0), reverse=True)
    bc_sorted = sorted(nx.betweenness_centrality(G).items(), key=itemgetter(0), reverse=True)
    clc_sorted = sorted(nx.closeness_centrality(G).items(), key=itemgetter(0), reverse=True)
    coc_sorted = sorted(nx.communicability_centrality(G).items(), key=itemgetter(0), reverse=True)
    lc_sorted = sorted(nx.load_centrality(G).items(), key=itemgetter(0), reverse=True)
    cfbc_sorted = sorted(nx.current_flow_betweenness_centrality(G).items(), key=itemgetter(0), reverse=True)
    cfcc_sorted = sorted(nx.current_flow_closeness_centrality(G).items(), key=itemgetter(0), reverse=True)
    # print ec_sorted[0]

    developer_centrality = []

    developer_file = file("public/wordpress/developer.json")
    developers = json.load(developer_file)
    for developer in developers:
        degree = 0
        betweenness = 0
        closeness = 0
        communicability = 0
        load = 0
        current_flow_betweenness = 0
        current_flow_closeness = 0
        for i in range(0, len(dc_sorted)):
            # if ( not dc_sorted[i][0] == bc_sorted[i][0] == clc_sorted[i][0] == coc_sorted[i][0] == lc_sorted[i][0] == cfbc_sorted[i][0]):
            #     print 'false'
            if (developer['developer'] == dc_sorted[i][0]):
                degree = dc_sorted[i][1]
                betweenness = bc_sorted[i][1]
                closeness = clc_sorted[i][1]
                communicability = coc_sorted[i][1]
                load = lc_sorted[i][1]
                current_flow_betweenness = cfbc_sorted[i][1]
                current_flow_closeness = cfcc_sorted[i][1]
        developer_centrality.append({
            'name': developer['developer'],
            'degree': degree,
            'betweenness': betweenness,
            'closeness': closeness,
            'communicability': communicability,
            'load': load,
            'current_flow_betweenness': current_flow_betweenness,
            'current_flow_closeness': current_flow_closeness,
        })

    return developer_centrality
def set_capacities_communicability_gravity(topology, capacities,
                                           capacity_unit='Mbps'):
    """
    Set link capacities proportionally to the product of the communicability
    centralities of the two end-points of the link

    Parameters
    ----------
    topology : Topology
        The topology to which link capacities will be set
    capacities : list
        A list of all possible capacity values
    capacity_unit : str, optional
        The unit in which capacity value is expressed (e.g. Mbps, Gbps etc..)
    """
    centrality = nx.communicability_centrality(topology)
    _set_capacities_gravity(topology, capacities, centrality, capacity_unit)
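A hypothetical illustration of the gravity step the docstring describes; this is not the library's actual _set_capacities_gravity, just one plausible way to map the product of endpoint centralities onto the candidate capacity values:

import networkx as nx
import numpy as np

def set_capacities_gravity_sketch(topology, capacities):
    # Hypothetical helper: weight each edge by the product of its endpoints'
    # centralities, then bucket those weights by quantile onto the sorted
    # list of candidate capacities.
    centrality = nx.subgraph_centrality(topology)   # communicability_centrality in nx 1.x
    gravity = {(u, v): centrality[u] * centrality[v] for u, v in topology.edges()}
    levels = sorted(capacities)
    bounds = np.quantile(list(gravity.values()), np.linspace(0, 1, len(levels) + 1)[1:])
    for edge, g in gravity.items():
        idx = int(np.searchsorted(bounds, g))
        topology.edges[edge]['capacity'] = levels[min(idx, len(levels) - 1)]

The real helper presumably also records the capacity_unit on the topology; that bookkeeping is omitted here.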
def cal_communicability_centrality(fn1, fn2):
    edges = prep.read_edges(fn2)
    sth = {edge: 0. for edge in edges}
    G = nx.Graph()
    edges_all = prep.read_edges(fn1)
    G.add_edges_from(edges_all)
    communicability = nx.communicability_centrality(G)
    for x in sth:
        n1 = communicability.get(x[0])
        n2 = communicability.get(x[1])
        print n1, n2
        n3 = max(n1, n2)
        n4 = min(n1, n2)
        sth[x] = float(n4) / (n3 + 1)
        # sth[x] = n1*n2
        # sth[x] = n1+n2
    return sth
def node_communicability_centrality(X):
    """ based on networkx function: communicability_centrality """
    XX = np.zeros((X.shape[0], np.sqrt(X.shape[1])))
    for i, value in enumerate(X):
        adj_mat = value.reshape((np.sqrt(len(value)), -1))
        adj_mat = (adj_mat - np.min(adj_mat)) / (np.max(adj_mat) - np.min(adj_mat))
        adj_mat = 1 - adj_mat
        # th = np.mean(adj_mat) - 0.1
        # adj_mat = np.where(adj_mat < th, adj_mat, 0.)
        print("\n========== Node communicability centrality ==========\n")
        percent, th, adj_mat, triu = percentage_removed(adj_mat, 0.76)  #96
        print("percent = {0}, threshold position = {1}, threshold = {2}\n".format(percent, th, triu[th]))
        g = nx.from_numpy_matrix(adj_mat)
        print "Graph Nodes = {0}, Graph Edges = {1} ".format(g.number_of_nodes(), g.number_of_edges())
        print "\nEdge kept ratio, {0}".format(float(g.number_of_edges()) / ((g.number_of_nodes() * (g.number_of_nodes() - 1)) / 2))
        deg_cent = nx.communicability_centrality(g)
        node_cent = np.zeros(g.number_of_nodes())
        for k in deg_cent:
            node_cent[k] = deg_cent[k]
        XX[i] = node_cent
        print "graph {0} => mean {1}, min {2}, max {3}".format(i, np.mean(XX[i]), np.min(XX[i]), np.max(XX[i]))
    # XX = XX*100
    ss = StandardScaler()
    XX = ss.fit_transform(XX.T).T
    return XX
def communicativity(G, k):
    G.remove_edges_from(G.selfloop_edges())
    com = nx.communicability_centrality(G)
    counter = 0
    bet_dict = {}
    for edg in G.edges():
        ini = edg[0]
        fin = edg[1]
        value = com[ini] + com[fin]
        bet_dict.update({edg: value})
    infected = classify_edges(G)
    final_dict = {}
    for val in infected:
        final_dict.update({val: bet_dict[val]})
    sorted_dict = sorted(final_dict.items(), key=operator.itemgetter(1))
    counter = 0
    iter = 1
    print len(sorted_dict)
    while counter <= k:
        val = sorted_dict[-iter]
        edg = val[0]
        ini = edg[0]
        fin = edg[1]
        if G.has_edge(ini, fin):
            G.remove_edge(ini, fin)
            counter = counter + 1
        iter = iter + 1
    return G
def make_net(centrality_name, in_path, out_path):
    #sample code
    #import _2_time_based_data_network_feature
    #make_net_in_path = "../3.time_based_data/1.cite_relation_devide/"
    #make_net_out_path = "../3.time_based_data/2.centrality_data/"
    #_2_time_based_data.make_net( "in_degree", make_net_in_path, make_net_out_path)
    # Build the network, compute the centrality, and save it.
    import networkx as nx
    global Dump
    Dump = {}
    make_net_initialize(in_path)
    start_time = time.time()
    temp_start_time = time.time()
    print "============= make_net start:" + centrality_name + " =============="
    print "============= from 1951 to 2015 =============="
    for year in range(1951, 2016):
        print year
        f_in = open(in_path + str(year) + "_cite.csv", "r")
        lines = f_in.readlines()
        f_in.close()
        edge_list = []
        for line in lines:
            data = line.split(",")
            data_tuple = (data[0].strip(), data[1].strip())
            edge_list.append(data_tuple)
        Net = nx.DiGraph(edge_list)
        Cen_in = {}
        if (centrality_name == "in_degree"):
            Cen_in = nx.in_degree_centrality(Net)
        elif (centrality_name == "degree"):
            Cen_in = nx.degree_centrality(Net)
        elif (centrality_name == "eigenvector"):
            Cen_in = nx.eigenvector_centrality_numpy(Net)
        elif (centrality_name == "katz"):
            Cen_in = nx.katz_centrality(Net)
        elif (centrality_name == "pagerank"):
            Cen_in = nx.pagerank(Net)
        elif (centrality_name == "communicability"):
            Net = nx.Graph(edge_list)
            Cen_in = nx.communicability_centrality(Net)
        elif (centrality_name == "load"):
            Cen_in = nx.load_centrality(Net)
        for j in Cen_in:
            key = j
            val = Cen_in[j]
            Dump[key][year] = val
    # Code that saves the result.
    f_out = open(out_path + centrality_name + "_centrality.csv", "w")
    for key in Dump:
        line = str(key)
        for year in range(1951, 2016):
            data = Dump[key].get(year, 0)
            line = line + "," + str(data)
        line = line + "\n"
        f_out.write(line)
    f_out.close()
    print "============= make_net end =============="
    print(centrality_name + "takes %s seconds" % (time.time() - temp_start_time))
    temp_start_time = time.time()
def makeJSON(self): global info info = "" info += "{\n \"info\": {\n\"nodes\": [\n" i = 0 num = len(s_partition) sorted_betweeness = [] sorted_degree = [] sorted_eigenvector = [] sorted_closeness = [] sorted_harmonic = [] sorted_communicability = [] sorted_core = [] sorted_degree1 = [] sorted_partition = s_partition unadjusted_betweeness = [] unadjusted_degree = [] unadjusted_eigenvctor = [] unadjusted_closeness = [] unadjusted_harmonic = [] unadjusted_communicability = [] G.remove_edges_from(G.selfloop_edges()) for key, value in nx.betweenness_centrality(G).items(): value1 = 1 + (value * 100) temp1 = [key, value] temp = [key, value1] sorted_betweeness.append(temp1) unadjusted_betweeness.append(temp1) sorted_partition = sorted(sorted_partition) sorted_betweeness = sorted(sorted_betweeness) unadjusted_betweeness = sorted(unadjusted_betweeness) for key, value in nx.degree_centrality(G).items(): value1 = 1 + (value * 100) temp1 = [key, value] temp = [key, value1] sorted_degree.append(temp1) unadjusted_degree.append(temp1) sorted_degree = sorted(sorted_degree) unadjusted_degree = sorted(unadjusted_degree) for key, value in nx.eigenvector_centrality(G).items(): value1 = value * 1000 temp1 = [key, value] temp = [key, value1] sorted_eigenvector.append(temp1) unadjusted_eigenvctor.append(temp1) sorted_eigenvector = sorted(sorted_eigenvector) unadjusted_eigenvector = sorted(unadjusted_eigenvctor) for key, value in nx.closeness_centrality(G).items(): value1 = (value * 10) temp1 = [key, value] temp = [key, value1] sorted_eigenvector.append(temp1) unadjusted_eigenvctor.append(temp1) sorted_eigenvector = sorted(sorted_eigenvector) for key, value in nx.harmonic_centrality(G).items(): temp1 = [key, value] sorted_harmonic.append(temp1) unadjusted_eigenvctor.append(temp1) sorted_harmonic = sorted(sorted_harmonic) unadjusted_harmonic = sorted(unadjusted_eigenvctor) for key, value in nx.communicability_centrality(G).items(): temp1 = [key, value] sorted_communicability.append(temp1) unadjusted_eigenvctor.append(temp1) sorted_communicability = sorted(sorted_communicability) unadjusted_communicability = sorted(unadjusted_eigenvctor) for key, value in nx.core_number(G).items(): #list temp = [key, value] sorted_core.append(temp) sorted_core = sorted(sorted_core) for key, value in nx.degree(G).items(): #list temp = [key, value] sorted_degree1.append(temp) sorted_degree1 = sorted(sorted_degree1) central_dict = {} unadjusted_dict = {} global importance importance = {} for key, value in sorted_betweeness: central_dict[key] = [] importance[key] = 0 central_dict[key].append(value) for key, value in unadjusted_betweeness: unadjusted_dict[key] = [] unadjusted_dict[key].append(value) for key, value in sorted_degree: central_dict[key].append(value) importance[key] += value for key, value in unadjusted_degree: unadjusted_dict[key].append(value) for key, value in sorted_eigenvector: central_dict[key].append(value) importance[key] + value for key, value in unadjusted_eigenvector: unadjusted_dict[key].append(value) for key, value in sorted_closeness: central_dict[key].append(value) importance[key] += value for key, value in unadjusted_closeness: unadjusted_dict[key].append(value) for key, value in sorted_harmonic: central_dict[key].append(value) importance[key] += value for key, value in unadjusted_harmonic: unadjusted_dict[key].append(value) for key, value in sorted_communicability: central_dict[key].append(value) importance[key] += value for key, value in unadjusted_communicability: unadjusted_dict[key].append(value) for 
key, value in sorted_core: importance[key] += value for key, value in sorted_degree1: importance[key] += value averages = {} acc = {} totals = {} groups = [] for key, value in sorted_partition: val1 = booleans[key] if val1 not in groups: groups.append(val1) acc[val1] = 0 totals[val1] = 0 for key, value in importance.items(): val1 = booleans[key] for item in groups: if (val1 == item): acc[item] += value totals[item] += 1 for key, value in acc.items(): averages[key] = acc[key] / totals[key] for key, value in sorted_partition: i += 1 val1 = booleans[key] info += "{\"id\": \"" + str(key) + "\", \"group\": " + str( value) + ", \"question\": \"" + str(val1) + "\", " val = unadjusted_dict[key] info += "\"Centrality\": { \"Betweeness\": " + str( val[0]) + ", \"Degree\": " + str( val[1]) + ", \"Eigenvector\": " + str( val[2]) + ", \"Closeness\": " + str( val[3]) + ", \"Harmonic\": " + str( val[4]) + ", \"Communicability\": " + str( val[5]) + " } " if num == i: info += "} \n" else: info += "}, \n" info += "],\n \"links\":[\n" num = len(network) i = 0 partition_dict = dict(sorted_partition) for key, value in sorted(network.items()): i += 1 term = str(key).split(",") group = partition_dict[term[0]] if num == i: info += "{\"source\": \"" + term[ 0] + "\", \"target\": \"" + term[ 1] + "\", \"value\": \"" + str( value) + "\", \"group\": " + str(group) + "}\n" else: info += "{\"source\": \"" + term[ 0] + "\", \"target\": \"" + term[ 1] + "\", \"value\": \"" + str( value) + "\", \"group\": " + str(group) + "},\n" info += "],\n \"Unadjusted_centrality\":[\n" i = 0 num = len(unadjusted_dict) for key, value in unadjusted_dict.items(): i += 1 val = value info += "{\"id\": \"" + str( key) + "\",\"Centrality\": { \"Betweeness\": " + str( val[0]) + ", \"Degree\": " + str( val[1]) + ", \"Eigenvector\": " + str( val[2]) + ", \"Closeness\": " + str( val[3]) + ", \"Harmonic\": " + str( val[4]) + " } " if num == i: info += "} \n" else: info += "}, \n" num = len(importance) i = 0 info += "],\n \"importance\":[\n" for key, value in importance.items(): i += 1 val1 = booleans[key] info += "{\"id\": \"" + str(key) + "\", \"importance\": " + str( value) + ", \"question\": \"" + str(val1) + "\"" if num == i: info += "} \n" else: info += "}, \n" num = len(acc) i = 0 info += "],\n \"group_stats\":[\n" for key, value in acc.items(): i += 1 val2 = averages[key] val3 = totals[key] info += "{\"group\": \"" + str( key) + "\", \"total_importance\": " + str( value) + ", \"average_importance\": " + str( val2) + ", \"total_mentions\": " + str(val3) if num == i: info += "} \n" else: info += "}, \n" num = len(pageRank) i = 0 info += "],\n \"pagerank\":[\n" for key, value in pageRank.items(): i += 1 info += "{\"id\": \"" + str(key) + "\", \"pagerank\": " + str( value) if num == i: info += "} \n" else: info += "}, \n" num = len(booleans) i = 0 question = headers[2] info += "],\n \"question\":[\n" for key, value in booleans.items(): i += 1 info += "{\"id\": \"" + str(key) + "\", \"question\": \"" + str( value) + "\"" if num == i: info += "} \n" else: info += "}, \n" info += "]\n},"
def createGraphFeatures(num_documents, clean_train_documents, unique_words, sliding_window, b, idf_par, centrality_par, centrality_col_par, normalized_centrality): features = np.zeros((num_documents, len(unique_words))) term_num_docs = {} print "Creating the graph of words for collection..." if centrality_col_par == "pagerank_centrality" or centrality_col_par == "out_degree_centrality" or centrality_col_par == "in_degree_centrality" or centrality_col_par == "betweenness_centrality_directed" or centrality_col_par == "closeness_centrality_directed": dGcol = nx.DiGraph() else: dGcol = nx.Graph() totalLen = 0 for i in range(0, num_documents): #dG = nx.Graph() found_unique_words = [] wordList1 = clean_train_documents[i].split(None) wordList2 = [string.rstrip(x.lower(), ',.!?;') for x in wordList1] docLen = len(wordList2) totalLen += docLen # print clean_train_documents[i] for k, word in enumerate(wordList2): if word not in found_unique_words: found_unique_words.append(word) if word not in term_num_docs: term_num_docs[word] = 1 else: term_num_docs[word] += 1 for j in xrange(1, sliding_window): try: next_word = wordList2[k + j] if not dGcol.has_node(word): dGcol.add_node(word) dGcol.node[word]['count'] = 1 else: dGcol.node[word]['count'] += 1 if not dGcol.has_node(next_word): dGcol.add_node(next_word) dGcol.node[next_word]['count'] = 0 if not dGcol.has_edge(word, next_word): dGcol.add_edge(word, next_word, weight=1) else: dGcol.edge[word][next_word]['weight'] += 1 except IndexError: if not dGcol.has_node(word): dGcol.add_node(word) dGcol.node[word]['count'] = 1 else: dGcol.node[word]['count'] += 1 except: raise avgLen = float(totalLen) / num_documents print "Number of nodes in collection graph:" + str(dGcol.number_of_nodes()) print "Number of edges in collection graph:" + str(dGcol.number_of_edges()) print "Average document length:" + str(avgLen) print "Number of self-loops for collection graph:" + str( dGcol.number_of_selfloops()) if idf_par == "icw": icw_col = {} dGcol.remove_edges_from(dGcol.selfloop_edges()) nx.write_edgelist(dGcol, "test.edgelist") if centrality_col_par == "degree_centrality": centrality_col = nx.degree_centrality(dGcol) elif centrality_col_par == "pagerank_centrality": centrality_col = pg.pagerank(dGcol) # centrality_col = nx.pagerank(dGcol) elif centrality_col_par == "eigenvector_centrality": centrality_col = nx.eigenvector_centrality(dGcol, max_iter=10000, weight="weight") elif centrality_col_par == "katz_centrality": centrality_col = nx.katz_centrality(dGcol) elif centrality_col_par == "betweenness_centrality" or centrality_col_par == "betweenness_centrality_directed": centrality_col = nx.betweenness_centrality(dGcol) elif centrality_col_par == "triangles": centrality_col = nx.triangles(dGcol) elif centrality_col_par == "clustering_coefficient": centrality_col = nx.clustering(dGcol) elif centrality_col_par == "in_degree_centrality": centrality_col = nx.in_degree_centrality(dGcol) elif centrality_col_par == "out_degree_centrality": centrality_col = nx.out_degree_centrality(dGcol) elif centrality_col_par == "core_number": centrality_col = nx.core_number(dGcol) elif centrality_col_par == "closeness_centrality" or centrality_col_par == "closeness_centrality_directed": centrality_col = nx.closeness_centrality(dGcol, normalized=False) elif centrality_col_par == "communicability_centrality": centrality_col = nx.communicability_centrality(dGcol) centr_sum = sum(centrality_col.values()) for k, g in enumerate(dGcol.nodes()): if centrality_col[g] > 0: icw_col[g] = math.log10( 
(float(centr_sum)) / (centrality_col[g])) else: icw_col[g] = 0 idf_col = {} for x in term_num_docs: idf_col[x] = math.log10( (float(num_documents) + 1.0) / (term_num_docs[x])) print "Creating the graph of words for each document..." totalNodes = 0 totalEdges = 0 for i in range(0, num_documents): if centrality_par == "pagerank_centrality" or centrality_par == "out_degree_centrality" or centrality_par == "in_degree_centrality" or centrality_par == "betweenness_centrality_directed" or centrality_par == "closeness_centrality_directed": dG = nx.DiGraph() else: dG = nx.Graph() wordList1 = clean_train_documents[i].split(None) wordList2 = [string.rstrip(x.lower(), ',.!?;') for x in wordList1] docLen = len(wordList2) if docLen == 2: print wordList2 if docLen > 1 and wordList2[0] != wordList2[1]: # print clean_train_documents[i] for k, word in enumerate(wordList2): for j in xrange(1, sliding_window): try: next_word = wordList2[k + j] if not dG.has_node(word): dG.add_node(word) dG.node[word]['count'] = 1 else: dG.node[word]['count'] += 1 if not dG.has_node(next_word): dG.add_node(next_word) dG.node[next_word]['count'] = 0 if not dG.has_edge(word, next_word): dG.add_edge(word, next_word, weight=1) else: dG.edge[word][next_word]['weight'] += 1 except IndexError: if not dG.has_node(word): dG.add_node(word) dG.node[word]['count'] = 1 else: dG.node[word]['count'] += 1 except: raise dG.remove_edges_from(dG.selfloop_edges()) if centrality_par == "degree_centrality": if normalized_centrality == True: centrality = nx.degree_centrality(dG) elif normalized_centrality == False: centrality = degree_centrality(dG) elif centrality_par == "clustering_coefficient": centrality = nx.clustering(dG) elif centrality_par == "pagerank_centrality": # centrality = pg.pagerank(dG,max_iter=10000) centrality = nx.pagerank(dG) elif centrality_par == "eigenvector_centrality": centrality = nx.eigenvector_centrality(dG, max_iter=10000) elif centrality_par == "katz_centrality": centrality = nx.katz_centrality(dG, normalized=False) elif centrality_par == "betweenness_centrality" or centrality_par == "betweenness_centrality_directed": centrality = nx.betweenness_centrality(dG, normalized=False) elif centrality_par == "triangles": centrality = nx.triangles(dG) elif centrality_par == "in_degree_centrality": if normalized_centrality == True: centrality = nx.in_degree_centrality(dG) elif normalized_centrality == False: centrality = in_degree_centrality(dG) elif centrality_par == "out_degree_centrality": if normalized_centrality == True: centrality = nx.out_degree_centrality(dG) elif normalized_centrality == False: centrality = out_degree_centrality(dG) elif centrality_par == "core_number": centrality = nx.core_number(dG) elif centrality_par == "weighted_centrality": if normalized_centrality == True: centrality = weighted_centrality_normalized(dG) elif normalized_centrality == False: centrality = weighted_centrality(dG) elif centrality_par == "closeness_centrality" or centrality_par == "closeness_centrality_directed": centrality = nx.closeness_centrality(dG, normalized=False) elif centrality_par == "communicability_centrality": centrality = nx.communicability_centrality(dG) totalNodes += dG.number_of_nodes() totalEdges += dG.number_of_edges() #print "Number of self-loops:"+str(dG.number_of_selfloops()) #centrality = nx.out_degree_centrality(dG) #centrality = nx.katz_centrality(dG,max_iter=10000) for k, g in enumerate(dG.nodes()): # Degree centrality (local feature) if g in unique_words: #features[i,unique_words.index(g)] = 
dG.degree(nbunch=g,weight='weight') * idf_col[g] if idf_par == "no" or idf_par == "yes": features[i, unique_words.index(g)] = centrality[ g] #centrality[g]/(1-b+(b*(float(docLen)/avgLen)))dG.node[g]['count'] elif idf_par == "idf": features[i, unique_words.index( g )] = centrality[g] * idf_col[ g] #(centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * idf_col[g] elif idf_par == "icw": features[i, unique_words.index(g)] = ( centrality[g] / (1 - b + (b * (float(docLen) / avgLen)))) * icw_col[g] print "Average number of nodes:" + str(float(totalNodes) / num_documents) print "Average number of edges:" + str(float(totalEdges) / num_documents) return features
import networkx as nx
import numpy
import sys
import scipy.linalg

G = nx.read_adjlist("sfba-inipy.txt", delimiter=" ", nodetype=int)
#G = nx.read_adjlist("test.gpfc", delimiter=" ", nodetype=int)

sc = nx.communicability_centrality(G)
print sc

sys.exit(0)

# alternative implementation that calculates the matrix exponential
nodelist = G.nodes()  # ordering of nodes in matrix
A = nx.to_numpy_matrix(G, nodelist)  # convert to 0-1 matrix
A[A != 0.0] = 1
print "A matrix"
print A
print

expA = scipy.linalg.expm(A)
print expA
print

# convert diagonal to dictionary keyed by node
sc = dict(zip(nodelist, map(float, expA.diagonal())))
print sc
def compute_network_features(graph, network_name): # this function is used to calculate network features and returns result # as a dictionary: Dict<feature_name,feature_value> # -------------------------------------------------------------------------------- network_features = dict() node_features_list = list() netclass = network_name.split('___') if len(netclass) > 1: network_features['group'] = netclass[0] network_features['Network Name'] = netclass[1] else: network_features['group'] = '_unknown_' network_features['Network Name'] = network_name if graph.is_directed(): network_features['Is Directed?'] = True else: network_features['Is Directed?'] = False if graph.is_multigraph(): network_features['Is MultiGraph?'] = True else: network_features['Is MultiGraph?'] = False # Global Attributes # -------------------------------------------------------------------------------- # number of nodes # -------------------------------------------------------------------------------- if _nn: try: nodes_count = nx.number_of_nodes(graph) network_features['Number of Nodes'] = nodes_count except: network_features['Number of Nodes'] = 'NA' # number of edges # -------------------------------------------------------------------------------- if _ne: try: edges_count = nx.number_of_edges(graph) network_features['Number of Edges'] = edges_count except: network_features['Number of Edges'] = 'NA' # network density # -------------------------------------------------------------------------------- if _dens: try: density = nx.density(graph) network_features['Density'] = density except: network_features['Density'] = 'NA' # graph degree assortativity # -------------------------------------------------------------------------------- if _dac: try: graph_degree_assortativity = nx.degree_assortativity_coefficient( graph) network_features[ 'Graph Degree Assortativity'] = graph_degree_assortativity except: network_features['Graph Degree Assortativity'] = 'NA' # avg. closeness centrality # -------------------------------------------------------------------------------- if _acc: try: ccn = nx.closeness_centrality(graph) mccn = np.mean(ccn.values()) network_features['Avg. Closeness Centrality'] = mccn except: network_features['Avg. Closeness Centrality'] = 'NA' # avg. betweenness centrality # -------------------------------------------------------------------------------- if _abc: try: bcn = nx.betweenness_centrality(graph) mbcn = np.mean(bcn.values()) network_features['Avg. Betweenness Centrality'] = mbcn except: network_features['Avg. Betweenness Centrality'] = 'NA' # avg. degree centrality # -------------------------------------------------------------------------------- if _adc: try: dcn = nx.degree_centrality(graph) mdcn = np.mean(dcn.values()) network_features['Avg. Degree Centrality'] = mdcn except: network_features['Avg. Degree Centrality'] = 'NA' # avg. degree connectivity # -------------------------------------------------------------------------------- if _adcon: try: dc = nx.average_degree_connectivity(graph) adc = np.mean(dc.values()) network_features['Avg. Degree Connectivity'] = adc except: network_features['Avg. Degree Connectivity'] = 'NA' # avg. load centrality # -------------------------------------------------------------------------------- if _alc: try: lc = nx.load_centrality(graph) mlc = np.mean(lc.values()) network_features['Avg. Load Centrality'] = mlc except: network_features['Avg. Load Centrality'] = 'NA' # avg. 
edge betweenness centrality # -------------------------------------------------------------------------------- # try: # ebc = nx.edge_betweenness_centrality(graph) # mebc = np.mean(ebc.values()) # network_features['Avg. Edge Betweenness centrality'] = mebc # except: # network_features['Avg. Edge Betweenness centrality'] = 'NA' # edge connectivity # -------------------------------------------------------------------------------- # try: # ec = nx.edge_connectivity(graph) # network_features['Edge Connectivity'] = ec # except: # network_features['Edge Connectivity'] = 'NA' # diameter # -------------------------------------------------------------------------------- if _nd: try: diameter = nx.diameter(graph) network_features['Diameter'] = diameter except: network_features['Diameter'] = 'NA' # eccentricity # -------------------------------------------------------------------------------- if _ae: try: eccentricity = nx.eccentricity(graph) network_features['Avg. Eccentricity'] = np.mean( eccentricity.values()) except: network_features['Eccentricity'] = 'NA' # radius # -------------------------------------------------------------------------------- if _rad: try: radius = nx.radius(graph) network_features['Radius'] = radius except: network_features['Radius'] = 'NA' # Non MultiGraph Features # -------------------------------------------------------------------------------- if not graph.is_multigraph(): # transitivity # ---------------------------------------------------------------------------- if _trans: try: transitivity = nx.transitivity(graph) network_features['Transitivity'] = transitivity except: network_features['Transitivity'] = 'NA' # Katz centrality # ---------------------------------------------------------------------------- if _akc: try: katz = nx.katz_centrality(graph) mean_katz = np.mean(katz.values()) network_features['Avg. Katz Centrality'] = mean_katz except: network_features['Avg. Katz Centrality'] = 'NA' # PageRank # ---------------------------------------------------------------------------- if _ap: try: pagerank = nx.pagerank(graph) mean_pagerank = np.mean(pagerank.values()) network_features['Avg. PageRank'] = mean_pagerank except: network_features['Avg. PageRank'] = 'NA' # Undirected Graphs # -------------------------------------------------------------------------------- if not nx.is_directed(graph): # Degree # ---------------------------------------------------------------------------- # # try: # all_degrees = nx.degree(graph) # mean_degrees = np.mean(all_degrees.values()) # network_features['Avg. Degree'] = mean_degrees # except: # network_features['Avg. Degree'] = 'NA' # connected components # ---------------------------------------------------------------------------- if _nocc: try: cc_number = nx.number_connected_components(graph) network_features['Number of Connected Components'] = cc_number except: network_features['Number of Connected Components'] = 'NA' # lcc size fraction && avg. cc size # ---------------------------------------------------------------------------- if _accs or _lcc_size: try: cc_list = list(nx.connected_components(graph)) cc_sizes = [] for cc in cc_list: cc_sizes.append(len(cc)) lcc_size = np.max(cc_sizes) if _accs: network_features['lcc_size_fraction'] = lcc_size / float( nodes_count) if _lcc_size: mean_cc_sizes = np.mean(cc_sizes) network_features[ 'Avg. Connected Component Size'] = mean_cc_sizes except: if _accs: network_features['lcc_size_fraction'] = 'NA' if _lcc_size: network_features['Avg. 
Connected Component Size'] = 'NA' # communicability centrality for Undirected networks # ---------------------------------------------------------------------------- if not graph.is_multigraph(): if _acoc: try: cc = nx.communicability_centrality(graph) mcc = np.mean(cc.values()) network_features['Avg. Communicability Centrality'] = mcc except: network_features['Avg. Communicability Centrality'] = 'NA' # clustering coefficient # ------------------------------------------------------------------------- if _ncc: try: clustering_coefficient = nx.average_clustering(graph) network_features[ 'Network Clustering Coefficient'] = clustering_coefficient except: network_features['Network Clustering Coefficient'] = 'NA' # clique analysis for Undirected networks # ---------------------------------------------------------------------------- if _max_cs: try: cliques_obj = nx.find_cliques(graph) cliques = [clq for clq in cliques_obj] clique_sizes = [] for c in cliques: clique_sizes.append(len(c)) # user_clique_size = 5 if len(clique_sizes) > 0: # network_features['No of Cliques with size ' + str(user_clique_size)] \ # = clique_sizes.count(user_clique_size) network_features['Avg. Clique Size'] = np.mean( clique_sizes) network_features['Max Clique Size'] = np.max(clique_sizes) else: # network_features['No of Cliques with size ' + str(user_clique_size)] = 0 network_features['Avg. Clique Size'] = 0 network_features['Max Clique Size'] = 0 except: # network_features['No of Cliques with size ' + str(user_clique_size)] = 'NA' network_features['Avg. Clique Size'] = 'NA' network_features['Max Clique Size'] = 'NA' # else: # try: # all_in_degrees = nx.DiGraph.in_degree(graph) # all_out_degrees = nx.DiGraph.out_degree(graph) # # mean_in_degrees = np.mean(all_in_degrees.values()) # mean_out_degrees = np.mean(all_out_degrees.values()) # # network_features['Avg. In Degree'] = mean_in_degrees # network_features['Ave. Out Degree'] = mean_out_degrees # except: # network_features['Avg. In Degree'] = 'NA' # network_features['Ave. 
Out Degree'] = 'NA' # Nodes Features Calculation for node in graph.nodes(): node_features = dict() try: node_features['group'] = network_name except: node_features['group'] = 'NA' if _abc: try: node_features['Betweenness Centrality'] = bcn[node] except: node_features['Betweenness Centrality'] = 'NA' if _acc: try: node_features['Closeness Centrality'] = ccn[node] except: node_features['Closeness Centrality'] = 'NA' if _adc: try: node_features['Degree Centrality'] = dcn[node] except: node_features['Degree Centrality'] = 'NA' if _alc: try: node_features['Load Centrality'] = lc[node] except: node_features['Load Centrality'] = 'NA' if _ae: try: node_features['Eccentricity'] = eccentricity[node] except: node_features['Eccentricity'] = 'NA' if not graph.is_multigraph(): if _akc: try: node_features['Katz Centrality'] = katz[node] except: node_features['Katz Centrality'] = 'NA' if _ap: try: node_features['PageRank'] = pagerank[node] except: node_features['PageRank'] = 'NA' if not nx.is_directed(graph): # try: # node_features['Degree'] = all_degrees[node] # except: # node_features['Degree'] = 'NA' if not graph.is_multigraph(): if _acoc: try: node_features['Communicability Centrality'] = cc[node] except: node_features['Communicability Centrality'] = 'NA' # else: # try: # node_features['In Degree'] = all_in_degrees[node] # except: # node_features['In Degree'] = 'NA' # # try: # node_features['Out Degree'] = all_out_degrees[node] # except: # node_features['Out Degree'] = 'NA' node_features_list.append(node_features) return network_features, node_features_list
def centrality(self): result = {} result['degree_centrality'] = nx.degree_centrality(self.graph) if self.directed == 'directed': result['in_degree_centrality'] = nx.in_degree_centrality( self.graph) result['out_degree_centrality'] = nx.out_degree_centrality( self.graph) result['closeness_centrality'] = nx.closeness_centrality(self.graph) result['betweenness_centrality'] = nx.betweenness_centrality( self.graph) # fix the tuple cant decode into json problem stringify_temp = {} temp = nx.edge_betweenness_centrality(self.graph) for key in temp.keys(): stringify_temp[str(key)] = temp[key] result['edge_betweenness_centrality'] = stringify_temp if self.directed == 'undirected': result[ 'current_flow_closeness_centrality'] = nx.current_flow_closeness_centrality( self.graph) result[ 'current_flow_betweenness_centrality'] = nx.current_flow_betweenness_centrality( self.graph) stringify_temp = {} temp = nx.edge_current_flow_betweenness_centrality(self.graph) for key in temp.keys(): stringify_temp[str(key)] = temp[key] result['edge_current_flow_betweenness_centrality'] = stringify_temp result[ 'approximate_current_flow_betweenness_centrality'] = nx.approximate_current_flow_betweenness_centrality( self.graph) result['eigenvector_centrality'] = nx.eigenvector_centrality( self.graph) result[ 'eigenvector_centrality_numpy'] = nx.eigenvector_centrality_numpy( self.graph) result['katz_centrality'] = nx.katz_centrality(self.graph) result['katz_centrality_numpy'] = nx.katz_centrality_numpy( self.graph) result['communicability'] = nx.communicability(self.graph) result['communicability_exp'] = nx.communicability_exp(self.graph) result[ 'communicability_centrality'] = nx.communicability_centrality( self.graph) result[ 'communicability_centrality_exp'] = nx.communicability_centrality_exp( self.graph) result[ 'communicability_betweenness_centrality'] = nx.communicability_betweenness_centrality( self.graph) result['estrada_index'] = nx.estrada_index(self.graph) result['load_centrality'] = nx.load_centrality(self.graph) stringify_temp = {} temp = nx.edge_load(self.graph) for key in temp.keys(): stringify_temp[str(key)] = temp[key] result['edge_load'] = stringify_temp result['dispersion'] = nx.dispersion(self.graph) fname_centra = self.DIR + '/centrality.json' with open(fname_centra, "w") as f: json.dump(result, f, cls=SetEncoder, indent=2) print(fname_centra)
def communicability_centrality_exp(self):
    self.communicability_centrality_exp_dict = nx.communicability_centrality(self.G)
plt.plot(fraction_of_nodes, size_max_component, label='Grado', linewidth=2)

# --------------- Removal by subgraph centrality -------------- #
size_max_component = np.zeros(nodes_to_remove, dtype=float)
graph_aux = deepcopy(graph)
graph_aux_nx = deepcopy(graph_nx)
for j in range(nodes_to_remove):
    # Compute and store the size of the largest component
    graph_aux2 = graph_aux.clusters()
    size_max_component[j] += float(max(graph_aux2.sizes())) \
        / size_of_large_connected_component
    criteria = nx.communicability_centrality(graph_aux_nx).items()
    criteria.sort(reverse=True, key=lambda item: item[1])
    # Take the first element, the one with the highest subgraph centrality
    vertex_ind = criteria[0][0]
    # Remove the vertex
    graph_aux.delete_vertices(vertex_ind)
    graph_aux_nx.remove_node(vertex_ind)

plt.figure(1)
plt.plot(fraction_of_nodes, size_max_component, label='SubGraph (iter)', linewidth=2)

# --------- Now do it all in one go ----------- #
g5.add_edge(fid, partner, form=int(form), aggform=int(aggform), consolform=int(consolform))

d = nx.degree(mg)
nx.set_node_attributes(mg, 'd', d)
dc = nx.degree_centrality(mg)
nx.set_node_attributes(mg, 'dc', dc)
ec = nx.eigenvector_centrality(g, 10000)
nx.set_node_attributes(g, 'ec', ec)
bc = nx.betweenness_centrality(mg)
nx.set_node_attributes(mg, 'bc', bc)
cc = nx.closeness_centrality(mg)
nx.set_node_attributes(mg, 'cc', cc)
cl = nx.clustering(g)
nx.set_node_attributes(g, 'cl', cl)
co = nx.communicability_centrality(g)
nx.set_node_attributes(g, 'co', co)

d = nx.degree(g1)
nx.set_node_attributes(g1, 'd', d)
d = nx.degree(g2)
nx.set_node_attributes(g2, 'd', d)
d = nx.degree(g3)
nx.set_node_attributes(g3, 'd', d)
d = nx.degree(g4)
nx.set_node_attributes(g4, 'd', d)
d = nx.degree(g5)
nx.set_node_attributes(g5, 'd', d)

#projected eigenvector centrality
bio_nodes = set(n for n in g.nodes() if n < 1000 and n > 0)
def createGraphFeatures(num_documents,clean_train_documents,unique_words,bigrams,sliding_window,b,idf_par,centrality_par,centrality_col_par,train_par,idf_learned,icw_learned,kcore_par,dGcol_nodes,max_core_col,kcore_par_int,max_core_feat,feature_reduction,avgLen): features = np.zeros((num_documents,len(unique_words))) unique_words_len = len(unique_words) term_num_docs = {} print "sliding_window:"+str(sliding_window) if train_par: print "Training set..." idfs = {} dGcol_nodes = {} icws = {} max_core_feat = [] print "Creating the graph of words for collection..." if centrality_col_par=="pagerank_centrality" or centrality_col_par=="in_degree_centrality" or centrality_col_par=="out_degree_centrality" or centrality_col_par=="closeness_centrality_directed" or centrality_col_par=="betweenness_centrality_directed": dGcol = nx.DiGraph() else: dGcol = nx.Graph() totalLen = 0 for i in range( 0,num_documents ): #dG = nx.Graph() found_unique_words = [] wordList1 = clean_train_documents[i].split(None) wordList2 = [string.rstrip(x.lower(), ',.!?;') for x in wordList1] docLen = len(wordList2) totalLen += docLen # print clean_train_documents[i] if len(wordList2)>1: for k, word in enumerate(wordList2): if word not in found_unique_words: found_unique_words.append(word) if word not in term_num_docs: term_num_docs[word] = 1 else: term_num_docs[word] += 1 for j in xrange(1,sliding_window): try: next_word = wordList2[k + j] # print word+"\t"+next_word # time.sleep(2) if not dGcol.has_node(word): dGcol.add_node(word) dGcol.node[word]['count'] = 1 else: dGcol.node[word]['count'] += 1 if not dGcol.has_node(next_word): dGcol.add_node(next_word) dGcol.node[next_word]['count'] = 0 if not dGcol.has_edge(word, next_word): dGcol.add_edge(word, next_word, weight = 1) else: dGcol.edge[word][next_word]['weight'] += 1 except IndexError: if not dGcol.has_node(word): dGcol.add_node(word) dGcol.node[word]['count'] = 1 else: dGcol.node[word]['count'] += 1 except: raise print "Number of self-loops for collection graph:"+str(dGcol.number_of_selfloops()) dGcol.remove_edges_from(dGcol.selfloop_edges()) collection_count_nodes = dGcol.number_of_nodes() collection_count_edges = dGcol.number_of_edges() print "Number of nodes in collection graph:"+str(collection_count_nodes) print "Number of edges in collection graph:"+str(collection_count_edges) avgLen = float(totalLen)/num_documents print "Average document length:"+str(avgLen) if idf_par=="icw" or idf_par=="icw+idf" or idf_par=="tf-icw": icw_col = {} if(kcore_par=="A1" or kcore_par=="A2"): collection_core = nx.core_number(dGcol) max_core = max(collection_core.values()) print "Max core of collection:"+str(max_core) # core_Size_Distribution(collection_core) for k,g in enumerate(dGcol.nodes()): if kcore_par=="A1": # A1 method: remove features and then rank for x in range(0,kcore_par_int): if collection_core[g]==max_core-x: dGcol.remove_node(g) else: # A2 method: rank first and then remove features for x in range(0,kcore_par_int): if collection_core[g]==max_core-x: max_core_col.append(g) if centrality_col_par == "degree_centrality": centrality_col = nx.degree_centrality(dGcol) elif centrality_col_par=="in_degree_centrality": centrality_col = nx.in_degree_centrality(dGcol) elif centrality_col_par=="out_degree_centrality": centrality_col = nx.out_degree_centrality(dGcol) elif centrality_col_par == "pagerank_centrality": # centrality_col = pg.pagerank(dGcol,max_iter=1000) centrality_col = nx.pagerank(dGcol) elif centrality_col_par == "eigenvector_centrality": centrality_col = 
nx.eigenvector_centrality(dGcol,max_iter=1000) elif centrality_col_par == "betweenness_centrality" or centrality_col_par=="betweenness_centrality_directed": centrality_col = nx.betweenness_centrality(dGcol) elif centrality_col_par == "triangles": centrality_col = nx.triangles(dGcol) elif centrality_col_par == "clustering_coefficient": centrality_col = nx.clustering(dGcol) elif centrality_col_par == "core_number": centrality_col = nx.core_number(dGcol) elif centrality_col_par == "closeness_centrality" or centrality_col_par=="closeness_centrality_directed": centrality_col = nx.closeness_centrality(dGcol) elif centrality_col_par == "closeness_centrality_weighted": centrality_col = nx.closeness_centrality(dGcol) elif centrality_col_par == "communicability_centrality": centrality_col = nx.communicability_centrality(dGcol) centr_sum = sum(centrality_col.values()) for k,g in enumerate(dGcol.nodes()): if centrality_col[g]!=0: if idf_par=="icw" or idf_par=="tf-icw" or idf_par=="icw+idf": icw_col[g] = math.log10(float(centr_sum)/centrality_col[g]) else: icw_col[g] = 0 # elif idf_par=="idf": idf_col = {} for x in term_num_docs: if idf_par=="idf": idf_col[x] = math.log10((float(num_documents)+1.0) / term_num_docs[x]) elif idf_par=="icw+idf": idf_col[x] = math.log10((float(num_documents)+1.0) / term_num_docs[x]) dGcol_nodes = dGcol.nodes() # for the testing set else: if idf_par=="idf": idf_col = idf_learned elif idf_par=="icw" or idf_par=="tf-icw": icw_col = icw_learned elif idf_par=="icw+idf": idf_col = idf_learned icw_col = icw_learned collection_count_nodes = 0 collection_count_edges = 0 # nx.write_edgelist(dGcol,"test.edgelist",data=True,delimiter="\t") print "Creating the graph of words for each document..." totalNodes = 0 totalEdges = 0 corrs_per_category = [[] for i in range(4)] for i in range( 0,num_documents ): if centrality_par=="pagerank_centrality" or centrality_par=="in_degree_centrality" or centrality_par=="out_degree_centrality" or centrality_par=="closeness_centrality_directed" or centrality_par=="betweenness_centrality_directed": dG = nx.DiGraph() else: dG = nx.Graph() wordList1 = clean_train_documents[i].split(None) wordList2 = [string.rstrip(x.lower(), ',.!?;') for x in wordList1] docLen = len(wordList2) if len(wordList2)>1: for k, word in enumerate(wordList2): for j in xrange(1,sliding_window): try: next_word = wordList2[k + j] if not dG.has_node(word): dG.add_node(word) dG.node[word]['count'] = 1 else: dG.node[word]['count'] += 1 if not dG.has_node(next_word): dG.add_node(next_word) dG.node[next_word]['count'] = 1 if not dG.has_edge(word, next_word): dG.add_edge(word, next_word, weight = 1) else: dG.edge[word][next_word]['weight'] += 1 except IndexError: if not dG.has_node(word): dG.add_node(word) dG.node[word]['count'] = 1 else: dG.node[word]['count'] += 1 except: raise dG.remove_edges_from(dG.selfloop_edges()) for node1, node2 in dG.edges_iter(): dG.edge[node1][node2]['inv_weight'] = 1.0 / dG.edge[node1][node2]['weight'] if train_par: if(kcore_par=="B1" or kcore_par=="B2"): max_core_doc = [] document_core = nx.core_number(dG) max_core = max(document_core.values()) # print "Max core of document:"+str(max_core) # core_Size_Distribution(document_core) for k,g in enumerate(dG.nodes()): if kcore_par=="B1": # B1 method: remove features and then rank for x in range(0,kcore_par_int): if document_core[g]==max_core-x: dG.remove_node(g) else: # B2 method: rank first and then remove features for x in range(0,kcore_par_int): if document_core[g]==max_core-x: max_core_doc.append(g) if g not in 
max_core_feat: max_core_feat.append(g) # centrality = nx.degree_centrality(dG) #centrality = nx.core_number(dG) if centrality_par == "degree_centrality": centrality = nx.degree_centrality(dG) elif centrality_par == "in_degree_centrality": centrality = nx.in_degree_centrality(dG) elif centrality_par == "out_degree_centrality": centrality = nx.out_degree_centrality(dG) elif centrality_par == "pagerank_centrality": # centrality = pg.pagerank(dG,max_iter=1000) centrality = nx.pagerank(dG) elif centrality_par =="betweenness_centrality" or centrality_par=="betweenness_centrality_directed": centrality = nx.betweenness_centrality(dG,weight="weight") elif centrality_par =="triangles": centrality = nx.triangles(dG) elif centrality_par =="eigenvector_centrality": centrality = nx.eigenvector_centrality_numpy(dG) elif centrality_par =="core_number": centrality = nx.core_number(dG) elif centrality_par =="clustering_coefficient": centrality = nx.clustering(dG) elif centrality_par == "closeness_centrality" or centrality_par=="closeness_centrality_directed": centrality = nx.closeness_centrality(dG) elif centrality_par == "closeness_centrality_weighted": centrality = nx.closeness_centrality(dG,distance='weight') elif centrality_par == "communicability_centrality": centrality = nx.communicability_centrality(dG) elif centrality_par == "closeness_centrality_not_normalized": centrality = nx.closeness_centrality(dG,normalized=False) elif centrality_par == "degree_centrality_weighted": centrality = weighted_degree_centrality(dG) #print "Number of self-loops:"+str(dG.number_of_selfloops()) #centrality = nx.out_degree_centrality(dG) #centrality = pg.pagerank(dG,max_iter=1000) #centrality = nx.katz_centrality(dG,max_iter=10000) totalNodes += dG.number_of_nodes() totalEdges += dG.number_of_edges() tfs = [] centralities = [] centr_sum_doc = sum(centrality.values()) for k, g in enumerate(dG.nodes()): if g in dGcol_nodes: if kcore_par=="B2": if g in max_core_feat: # Degree centrality (local feature) if g in unique_words: #features[i,unique_words.index(g)] = dG.degree(nbunch=g,weight='weight') * idf_col[g] if idf_par=="no": features[i,unique_words.index(g)] = centrality[g]/(1-b+(b*(float(docLen)/avgLen))) elif idf_par=="idf": features[i,unique_words.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * idf_col[g] # features[i,unique_words.index(g)] = centrality[g] * idf_col[g] elif idf_par=="icw": features[i,unique_words.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * icw_col[g] # features[i,unique_words.index(g)] = centrality[g] * icw_col[g] elif idf_par=="icw+idf": features[i,unique_words.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * icw_col[g] * idf_col[g] # features[i,unique_words.index(g)] = centrality[g] * math.log10(icw_col[g] * idf_col[g]) elif g in bigrams: #features[i,unique_words.index(g)] = dG.degree(nbunch=g,weight='weight') * idf_col[g] if idf_par=="no": features[i,unique_words_len+bigrams.index(g)] = centrality[g]/(1-b+(b*(float(docLen)/avgLen))) elif idf_par=="idf": features[i,unique_words_len+bigrams.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * idf_col[g] # features[i,unique_words.index(g)] = centrality[g] * idf_col[g] elif idf_par=="icw": features[i,unique_words_len+bigrams.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * icw_col[g] # features[i,unique_words.index(g)] = centrality[g] * icw_col[g] elif idf_par=="icw+idf": features[i,unique_words_len+bigrams.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * icw_col[g] * 
idf_col[g] # features[i,unique_words.index(g)] = centrality[g] * math.log10(icw_col[g] * idf_col[g]) else: if g in unique_words: #features[i,unique_words.index(g)] = dG.degree(nbunch=g,weight='weight') * idf_col[g] if idf_par=="no": features[i,unique_words.index(g)] = centrality[g]/(1-b+(b*(float(docLen)/avgLen))) tfs.append(wordList2.count(g)) centralities.append(centrality[g]) elif idf_par=="tf-icw": tf_g = 1+math.log(1+math.log(wordList2.count(g))) features[i,unique_words.index(g)] = (tf_g/(1-b+(b*(float(docLen)/avgLen)))) * icw_col[g] elif idf_par=="idf": features[i,unique_words.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * idf_col[g] # features[i,unique_words.index(g)] = centrality[g] * idf_col[g] elif idf_par=="icw": features[i,unique_words.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * icw_col[g] # features[i,unique_words.index(g)] = centrality[g] * icw_col[g] elif idf_par=="icw+idf": features[i,unique_words.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * icw_col[g] * idf_col[g] # features[i,unique_words.index(g)] = centrality[g] * math.log10(icw_col[g] * idf_col[g]) elif g in bigrams: #features[i,unique_words.index(g)] = dG.degree(nbunch=g,weight='weight') * idf_col[g] if idf_par=="no": features[i,unique_words_len+bigrams.index(g)] = centrality[g]/(1-b+(b*(float(docLen)/avgLen))) elif idf_par=="idf": features[i,unique_words_len+bigrams.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * idf_col[g] # features[i,unique_words.index(g)] = centrality[g] * idf_col[g] elif idf_par=="icw": features[i,unique_words_len+bigrams.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * icw_col[g] # features[i,unique_words.index(g)] = centrality[g] * icw_col[g] elif idf_par=="icw+idf": features[i,unique_words_len+bigrams.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * icw_col[g] * idf_col[g] # features[i,unique_words.index(g)] = centrality[g] * math.log10(icw_col[g] * idf_col[g]) # if train_par: # # pears = pearsonr(tfs,centralities) # ind_tfs = sorted(range(len(tfs)), key=lambda k: tfs[k])[-20:] # ind_centr = sorted(range(len(centralities)), key=lambda k: centralities[k])[-20:] # tau, p_value = kendalltau([unique_words[k] for k in ind_tfs],[unique_words[k] for k in ind_centr]) # corrs_per_category[int(y[i])-1].append(tau) # if train_par: # text_file = open("kendal_tfs_tws_output_tw_idf_"+idf_par+"_centr_"+centrality_par+"_sliding_"+str(sliding_window)+"_kcore_"+kcore_par+".txt", "w") # text_file.write(str(corrs_per_category)) # text_file.close() # fig = plt.figure() # ax = fig.add_subplot(111) # ax.boxplot(corrs_per_category[:]) # plt.show() if idf_par=="no": idfs = {} icws = {} if idf_par=="idf": idfs = idf_col icws = {} elif idf_par=="icw" or idf_par=="tf-icw": idfs = {} icws = icw_col elif idf_par=="icw+idf": idfs = idf_col icws = icw_col if train_par: if kcore_par=="B2": feature_reduction = float(len(max_core_feat))/len(dGcol_nodes) print "Percentage of features kept:"+str(feature_reduction) print "Average number of nodes:"+str(float(totalNodes)/num_documents) print "Average number of edges:"+str(float(totalEdges)/num_documents) return features, idfs,icws,collection_count_nodes, collection_count_edges, dGcol_nodes,max_core_col,feature_reduction, max_core_feat,avgLen
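
# A minimal sketch of the length-normalised term weight used above, assuming a
# TW-IDF style formula tw(t, d) / (1 - b + b * |d| / avgdl), optionally multiplied
# by an idf/icw collection weight. The helper name and the sample values below are
# illustrative only and are not taken from the original code.
def tw_weight(tw, doc_len, avg_len, b, idf=1.0):
    # tw : graph-based term weight (a centrality score of the term node)
    # the denominator is the pivoted length normalisation 1 - b + b * |d| / avgdl
    return (tw / (1 - b + b * (float(doc_len) / avg_len))) * idf

# illustrative call with made-up numbers
print(tw_weight(tw=0.25, doc_len=120, avg_len=80.0, b=0.003, idf=2.1))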
def createGraphFeatures(num_documents, clean_train_documents, unique_words, bigrams, sliding_window, b, idf_par, centrality_par, centrality_col_par, train_par, idf_learned, icw_learned, dGcol_nodes, avgLen, path, y_train): #features = np.zeros((num_documents,len(unique_words))) features = lil_matrix((num_documents,len(unique_words))) unique_words_len = len(unique_words) term_num_docs = {} if train_par: print("Training set...") if centrality_col_par=="weighted_degree_centrality" or centrality_col_par=="weighted_pagerank_centrality": tf_par = "word2vec" getOnlyDataWord2VecModel(clean_train_documents) else: tf_par = "word2ve" print("sliding_window:"+str(sliding_window)) idfs = {} dGcol_nodes = {} icws = {} max_core_feat = [] ## this is for the label graphs dGlabels = [] totalLen = 0 totalDiam = 0 for label in list(set(y_train)): dGlabels.append(nx.Graph()) # ## IDW # print("Creating the graph of documents (IDW).." # # getOnlyDataWord2VecModel(clean_train_documents) # # all_doc_nodes = [] # for i in range( 0,num_documents ): # all_doc_nodes.append(i) # # edges = combinations(all_doc_nodes, 2) # dGdocs = nx.Graph() # # vectorizer = TfidfVectorizer(min_df=1) # tf_idf_matrix = vectorizer.fit_transform(clean_train_documents) # for e in edges: # # dGdocs.add_edge(e,weight=metrics.pairwise.cosine_similarity(w2v.wv.wmdistance(clean_train_documents[e[0]],clean_train_documents[e[1]]))) # vect = TfidfVectorizer(min_df=1) # tfidf = vect.fit_transform([clean_train_documents[e[0]],clean_train_documents[e[1]]]) # dGdocs.add_edge(e[0],e[1],weight=tfidf[0,1]) # t1 = time.time() # matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), len(clean_train_documents), 0) # t = time.time()-t1 # print("SELFTIMED:"+str(t) # # # matches_df = get_matches_df(matches, clean_train_documents) # for e in edges: # dGdocs.add_edge(e[0],e[1],weight=matches[e[0],e[1]]) # del matches if not os.path.exists(path+"_collection_"+str(sliding_window)+"_"+str(tf_par)+"_graph.edgelist"): print("Creating the graph of words for collection...") if centrality_col_par=="pagerank_centrality" or centrality_col_par=="in_degree_centrality" or \ centrality_col_par=="out_degree_centrality" or centrality_col_par=="closeness_centrality_directed" or \ centrality_col_par=="betweenness_centrality_directed" or centrality_col_par=="weighted_pagerank_centrality": dGcol = nx.DiGraph() else: dGcol = nx.Graph() totalLen = 0 totalDiam = 0 for i in range(num_documents ): # dG = nx.Graph() graphVizualizeFlag=False if graphVizualizeFlag: if i is not 0 and (i%25==0): print(i) print("dGcol.number_of_nodes()", dGcol.number_of_nodes()) st = time.time() save_graph(dGcol, "graph_"+str(i)+".pdf") fi = time.time() - st print('time:', fi) print('=================================') lg = int(y_train[i]) found_unique_words = [] wordList1 = clean_train_documents[i].split(None) wordList2 = [x.rstrip(',.!?;') for x in wordList1] docLen = len(wordList2) # print(clean_train_documents[i] #if len(wordList2)>1: totalLen += docLen for k, word in enumerate(wordList2): if word not in found_unique_words: found_unique_words.append(word) if word not in term_num_docs: term_num_docs[word] = 1 else: term_num_docs[word] += 1 for j in range(1, sliding_window): try: next_word = wordList2[k + j] # print(word+"\t"+next_word # time.sleep(2) if not dGcol.has_node(word): dGcol.add_node(word) dGcol.node[word]['count'] = 1 else: dGcol.node[word]['count'] += 1 if not dGcol.has_node(next_word): dGcol.add_node(next_word) dGcol.node[next_word]['count'] = 1 else: 
dGcol.node[next_word]['count'] +=1 if not dGcol.has_edge(word, next_word): dGcol.add_edge(word, next_word, weight = 1) # dGcol.edge[word][next_word]['w2vec'] = 0.01 if tf_par=="word2vec": if word in model.wv.vocab and next_word in model.wv.vocab: dGcol.edge[word][next_word]['w2vec'] = model.wv.similarity(word,next_word) # dGcol.edge[word][next_word]['w2vec'] = np.linalg.norm(model[word]-model[next_word]) else: dGcol.edge[word][next_word]['weight'] += 1 ## this is for label graphs if not dGlabels[lg].has_node(word): dGlabels[lg].add_node(word) dGlabels[lg].node[word]['count'] = 1 else: dGlabels[lg].node[word]['count'] += 1 if not dGlabels[lg].has_node(next_word): dGlabels[lg].add_node(next_word) dGlabels[lg].node[next_word]['count'] = 1 else: dGlabels[lg].node[next_word]['count'] +=1 if not dGlabels[lg].has_edge(word, next_word): dGlabels[lg].add_edge(word, next_word, weight = 1) # dGcol.edge[word][next_word]['w2vec'] = 0.01 if tf_par=="word2vec": if word in model.wv.vocab and next_word in model.wv.vocab: dGlabels[lg].edge[word][next_word]['w2vec'] = model.wv.similarity(word,next_word) else: dGlabels[lg].edge[word][next_word]['weight'] += 1 # # # again for average,5,6,7,8,9 # if not dG.has_edge(word, next_word): # dG.add_edge(word, next_word, weight = 1) # else: # dG.edge[word][next_word]['weight'] += 1 except IndexError: if not dGcol.has_node(word): dGcol.add_node(word) dGcol.node[word]['count'] = 1 else: dGcol.node[word]['count'] += 1 if not dGlabels[lg].has_node(word): dGlabels[lg].add_node(word) dGlabels[lg].node[word]['count'] = 1 else: dGlabels[lg].node[word]['count'] += 1 except: raise # nx.draw(dG,pos=nx.spring_layout(dG)) # plt.show() # nx.write_edgelist(dG,path+"_YO_"+str(sliding_window)+"_"+str(tf_par)+"_graph.edgelist") # raw_input("enter") # totalDiam += nx.diameter(dG) # nx.write_edgelist(dGcol,path+"_collection_"+str(sliding_window)+"_"+str(tf_par)+"_graph.edgelist") # json.dump(term_num_docs,open(path+"_term_num_docs"+str(sliding_window)+".txt","w")) else: print("Parsing the graph of words for collection...") # term_num_docs = json.load(open(path+"_term_num_docs"+str(sliding_window)+".txt","r")) # dGcol = nx.read_edgelist(path+"_collection_"+str(sliding_window)+"_"+str(tf_par)+"_graph.edgelist") print("Number of self-loops for collection graph:"+str(dGcol.number_of_selfloops())) dGcol.remove_edges_from(dGcol.selfloop_edges()) collection_count_nodes = dGcol.number_of_nodes() collection_count_edges = dGcol.number_of_edges() print("Number of nodes in collection graph:"+str(collection_count_nodes)) print("Number of edges in collection graph:"+str(collection_count_edges)) # plot_degree_histogram(dGcol) # raw_input("enter") # avgLen = float(totalLen)/num_documents avgLen = 0 # colDiam = nx.diameter(dGcol) # avgDiam = float(totalDiam)/num_documents print("Average document length:"+str(avgLen)) if idf_par=="icw" or idf_par=="icw+idf" or idf_par=="tf-icw" or idf_par=="icw-lw": icw_col = {} if tf_par=="word2vec": for u,v,d in dGcol.edges(data=True): if 'w2vec' in d: ## my w2v similarity dGcol.edge[u][v]['w2vec'] = np.arccos(d['w2vec'])/math.pi dGcol.edge[u][v]['w2vec'] = 1-dGcol.edge[u][v]['w2vec'] dGcol.edge[u][v]['weight'] = dGcol.edge[u][v]['w2vec'] ## attraction score # f_u_v = float(dGcol.node[u]['count']*dGcol.node[v]['count'])/(d['w2vec']**2) # dice = float(2*d['weight'])/(dGcol.node[u]['count']+dGcol.node[v]['count']) # dGcol.edge[u][v]['weight'] = f_u_v * dice #dGcol.edge[u][v]['weight'] = d['weight']*dGcol.edge[u][v]['w2vec'] #dGcol.edge[u][v]['weight'] = 
float(d['weight'])/(dGcol.edge[u][v]['w2vec']**2) else: # dGcol.edge[u][v]['weight'] = np.arccos(0.0001)/math.pi dGcol.edge[u][v]['weight'] = 0.0001 if centrality_col_par == "degree_centrality": centrality_col = nx.degree_centrality(dGcol) elif centrality_col_par == "weighted_degree_centrality": # centrality_col = nx.degree_centrality(dGcol,weight='weight') centrality_col = dGcol.degree(weight='weight') elif centrality_col_par=="in_degree_centrality": centrality_col = nx.in_degree_centrality(dGcol) elif centrality_col_par=="out_degree_centrality": centrality_col = nx.out_degree_centrality(dGcol) elif centrality_col_par == "pagerank_centrality": centrality_col = nx.pagerank(dGcol) elif centrality_col_par == "weighted_pagerank_centrality": centrality_col = nx.pagerank(dGcol,weight="weight") elif centrality_col_par == "eigenvector_centrality": centrality_col = nx.eigenvector_centrality(dGcol,max_iter=1000) elif centrality_col_par == "betweenness_centrality" or centrality_col_par=="betweenness_centrality_directed": centrality_col = nx.betweenness_centrality(dGcol) elif centrality_col_par == "triangles": centrality_col = nx.triangles(dGcol) elif centrality_col_par == "clustering_coefficient": centrality_col = nx.clustering(dGcol) elif centrality_col_par == "core_number": centrality_col = nx.core_number(dGcol) elif centrality_col_par == "closeness_centrality" or centrality_col_par=="closeness_centrality_directed": centrality_col = nx.closeness_centrality(dGcol) elif centrality_col_par == "closeness_centrality_weighted": centrality_col = nx.closeness_centrality(dGcol) elif centrality_col_par == "communicability_centrality": centrality_col = nx.communicability_centrality(dGcol) centrality_labels = [] # partition = community.best_partition(dGcol) # # all_nodes = [] # partitions = [] # count = 0 # for com in set(partition.values()): # count = count + 1 # list_nodes = [nodes for nodes in partition.keys() if partition[nodes] == com] # partitions.append(list_nodes) # # # print("Clusters:"+str(len(partitions)) # # # lens = [len(partition) for partition in partitions] # print(lens # t = lens.index(max(lens)) # # print("len of biggest cluster:"+str(len(partitions[t])) # raw_input("enter") for i, dGlabel in enumerate(dGlabels): # centrality_labels.append(nx.pagerank(dGlabel)) # centrality_labels.append(nx.degree_centrality(dGlabel)) # print("before:"+str(dGlabel.number_of_nodes()) ## this is for clustering # partition = community.best_partition(dGlabel) # all_nodes = [] # count = 0 # for com in set(partition.values()): # count = count + 1 # list_nodes = [nodes for nodes in partition.keys() if partition[nodes] == com] # all_nodes.append(list_nodes) # # partitions.append(all_nodes) # # print(str(i)+": "+str(count)+" clusters" ) # G = dGlabel.copy() # setA = set(dGlabel.nodes()) # setB = set(partitions[t]) # dGlabel.remove_nodes_from(list(setB)) # setB = set(partitions[1]) # dGlabel.remove_nodes_from(list(setB)) # setB = set(partitions[1]) # dGlabel.remove_nodes_from(list(setB)) print("after:"+str(dGlabel.number_of_nodes())) centrality_labels.append(nx.degree_centrality(dGlabel)) # raw_input("en") centr_sum = sum(centrality_col.values()) # centr_sum = max(centrality_col.values()) # print(centr_sum) minc = [min(d.values()) for d in centrality_labels] minc = min(minc) for k,g in enumerate(dGcol.nodes()): if centrality_col[g]!=0: if idf_par=="icw" or idf_par=="tf-icw" or idf_par=="icw+idf": #print(centrality_col[g]) # icw_col[g] = math.log10(float(centr_sum*num_documents)/(centrality_col[g]*term_num_docs[g])) # 
print(g) seq = [x.get(g, 0) for x in centrality_labels] centr_max_c = max(seq) ind_max = seq.index(centr_max_c) # print(g) # topics = [] # for i, partition in enumerate(partitions): # for w in partitions[ind_max]: # if g in w: # topics = w # print(str(topics)) # raw_input("enter") # all_words = centrality_labels[ind_max].keys - # sum_all_topics = sum([centrality_labels[ind_max].get(word, 0) for word in topics]) # sum_all_topics = sum([centrality_col.get(word, 0) for word in topics]) # G = dGlabels[ind_max].copy() # # print("before:"+str(G.number_of_nodes()) # G = dGcol.copy() # setA = set(G.nodes()) # setB = set(partitions[t]) # G.remove_nodes_from(list(setB)) # # G.remove_nodes_from(list(setB)) # # # print("after:"+str(G.number_of_nodes()) # if G.degree(g): # centr_max_c = G.degree(g) # # print(sum_all_topics) # # raw_input("enter") centr_sum_c = sum(seq) n_el = sum(s>0 for s in seq) # dGlab = seq.index(centr_max_c) del seq[ind_max] # centr_sum_lab = sum(seq) # print(seq) # raw_input("enter") term_graphs = [] for j,doc in enumerate(dGdocs.nodes()): if g in clean_train_documents[j].split(): term_graphs.append(dGdocs.degree(j,weight='weight')) avg_term = np.mean(term_graphs) # print(avg_term) max_term = sum(term_graphs) #. # icw_col[g] = math.log10((float(centr_sum)/centrality_col[g]) * (float(max_term)/avg_term)) # icw_col[g] = math.log10(float(max_term)/avg_term) icw_col[g] = math.log10((float(centr_sum)/centrality_col[g]) * (float(centr_max_c)/max(np.mean(seq),minc))) # a = np.mean(seq) # crc = 2 + ((centr_max_c/max(a,minc)*(float(len(centrality_labels))/n_el))) # icw_col[g] = math.log(crc,2) # icw_col[g] = math.log10((float(centr_sum)/centrality_col[g])) * math.log(crc,2) elif idf_par=="icw-lw": icw_col[g] = math.log10((float(centr_sum)/centrality_col[g])) else: icw_col[g] = 0 # elif idf_par=="idf": idf_col = {} if idf_par=="idf" or idf_par=="icw+idf": for x in term_num_docs: idf_col[x] = math.log10(float(num_documents) / term_num_docs[x]) dGcol_nodes = dGcol.nodes() #save_graph(dGcol, "graph_split.pdf") dGcol.clear() # for the testing set else: if idf_par=="idf": idf_col = idf_learned elif idf_par=="icw" or idf_par=="tf-icw": icw_col = icw_learned elif idf_par=="icw+idf": idf_col = idf_learned icw_col = icw_learned collection_count_nodes = 0 collection_count_edges = 0 totalNodes = 0 totalEdges = 0 corrs_per_category = [[] for i in range(4)] counter_word2vec = [] # print("number of word2vec words in docs:"+str(len(counter_word2vec)) if idf_par=="no": idfs = {} icws = {} if idf_par=="idf": idfs = idf_col icws = {} elif idf_par=="icw" or idf_par=="tf-icw" or idf_par=="icw-lw": idfs = {} icws = icw_col elif idf_par=="icw+idf": idfs = idf_col icws = icw_col processes = cpu_count() # processes=1 all_pairs,idx = chunkIt(clean_train_documents,processes) y_final = [] pool = Pool(processes) print("Number of processes:"+str(processes)) results = [pool.apply_async( splitGraphFeatures, (t, idx[k], idf_par, centrality_par, dGcol_nodes, idfs, icws, sliding_window, unique_words, train_par,path)) for k, t in enumerate(all_pairs)] count_rows = 0 for i,result in enumerate(results): r,y = result.get() for y_ind,row in enumerate(r): features[count_rows,:] = row[:] #y_final.append(y_train[y_ind]) count_rows += 1 pool.close() # if train_par: # print("Average number of nodes:"+str(float(totalNodes)/num_documents) # print("Average number of edges:"+str(float(totalEdges)/num_documents) # all_pairs,idx = chunkIt(clean_train_documents,1) # r,y = splitGraphFeatures(all_pairs[0],idx[0], idf_par,centrality_par, 
dGcol_nodes,idfs,icws, sliding_window,unique_words,train_par,path) # # count_rows = 0 # for y_ind,row in enumerate(r): # features[count_rows,:] = row[:] # count_rows += 1 return features, idfs, icws, collection_count_nodes, collection_count_edges, dGcol_nodes, avgLen
def splitGraphFeatures(documents, idx, idf_par, centrality_par, dGcol_nodes, idf_col, icw_col, sliding_window, unique_words, train_par, path):
    features = np.zeros((len(documents), len(unique_words)))
    # features = csr_matrix((len(documents),len(unique_words)))
    # features = lil_matrix((len(documents),len(unique_words)))

    if centrality_par == "weighted_degree_centrality" or centrality_par == "weighted_pagerank_centrality":
        tf_par = "word2vec"
        global model
    else:
        tf_par = "word2ve"

    if not train_par:
        path = path + "test_"

    for i, doc in enumerate(documents):
        ind = idx[i]
        if not os.path.exists(path + str(ind) + "_sliding_" + str(sliding_window) + "_" + str(tf_par) + "_graph.edgelist"):
            # print("Creating the graph of words for documents...")
            if centrality_par == "pagerank_centrality" or centrality_par == "in_degree_centrality" or \
               centrality_par == "out_degree_centrality" or centrality_par == "closeness_centrality_directed" or \
               centrality_par == "betweenness_centrality_directed" or centrality_par == "weighted_pagerank_centrality":
                dG = nx.DiGraph()
            else:
                dG = nx.Graph()

            wordList1 = doc.split(None)
            wordList2 = [x.rstrip(',.!?;') for x in wordList1]
            docLen = len(wordList2)

            # if len(wordList2)>1:
            for k, word in enumerate(wordList2):
                for j in range(1, sliding_window):
                    try:
                        next_word = wordList2[k + j]
                        if not dG.has_node(word):
                            dG.add_node(word)
                            dG.node[word]['count'] = 1
                        else:
                            dG.node[word]['count'] += 1
                        if not dG.has_node(next_word):
                            dG.add_node(next_word)
                            dG.node[next_word]['count'] = 1
                        else:
                            dG.node[next_word]['count'] += 1
                        if not dG.has_edge(word, next_word):
                            dG.add_edge(word, next_word, weight=1)
                            # dG.edge[word][next_word]['w2vec'] = 0.0001
                            if tf_par == "word2vec":
                                if word in model.wv.vocab and next_word in model.wv.vocab:
                                    dG.edge[word][next_word]['w2vec'] = model.wv.similarity(word, next_word)
                                    # dG.edge[word][next_word]['w2vec'] = np.linalg.norm(model[word]-model[next_word])
                        else:
                            dG.edge[word][next_word]['weight'] += 1
                    except IndexError:
                        if not dG.has_node(word):
                            dG.add_node(word)
                            dG.node[word]['count'] = 1
                        else:
                            dG.node[word]['count'] += 1
                    except:
                        raise

            dG.remove_edges_from(dG.selfloop_edges())
            # for node1, node2 in dG.edges_iter():
            #     dG.edge[node1][node2]['inv_weight'] = 1.0 / dG.edge[node1][node2]['weight']
            ## best until now
            # d['weight'] = d['weight']*((d['w2vec'])**2)
            # d['weight'] = dice*f
            # nx.write_edgelist(dG,path+str(ind)+"_sliding_"+str(sliding_window)+"_"+str(tf_par)+"_graph.edgelist",data=True)
        else:
            print("Parsing the graph of words for documents...")
            # dG = nx.read_edgelist(path+str(ind)+"_sliding_"+str(sliding_window)+"_"+str(tf_par)+"_graph.edgelist")

        if tf_par == "word2vec":
            for u, v, d in dG.edges(data=True):
                if 'w2vec' in d:
                    # dice = (2*d['weight'])/(dG.node[u]['count']+dG.node[v]['count'])
                    # dG.edge[u][v]['weight'] = dice * (dG.node[u]['count']*dG.node[v]['count'])/((d['w2vec'])**2)
                    # d['weight'] = (dG.node[u]['count']*dG.node[v]['count'])/((1-d['w2vec']))
                    ## angular
                    # dice = (2*d['weight'])/(dG.node[u]['count']+dG.node[v]['count'])
                    # f = (dG.node[u]['count']*dG.node[v]['count'])/(d['w2vec']**2)
                    # print(d['w2vec'])
                    # d['weight'] = d['weight']/(d['w2vec'])
                    # if u not in counter_word2vec:
                    #     counter_word2vec.append(u)
                    # if v not in counter_word2vec:
                    #     counter_word2vec.append(v)

                    ## my_w2v_similarity
                    dG.edge[u][v]['w2vec'] = np.arccos(d['w2vec']) / math.pi
                    dG.edge[u][v]['w2vec'] = 1 - dG.edge[u][v]['w2vec']
                    dG.edge[u][v]['weight'] = dG.edge[u][v]['w2vec']

                    ## attraction score
                    # d['w2vec'] = np.arccos(d['w2vec'])/math.pi
                    # f_u_v = float(dG.node[u]['count']*dG.node[v]['count'])/(d['w2vec']**2)
                    # dice = float(2*d['weight'])/(dG.node[u]['count']+dG.node[v]['count'])
                    # dG.edge[u][v]['weight'] = f_u_v * dice
                else:
                    dG.edge[u][v]['weight'] = 0.0001
                # dG.edge[u][v]['weight'] = 1-dG.edge[u][v]['weight']

        # if len(dG)>1:
        if centrality_par == "degree_centrality":
            centrality = nx.degree_centrality(dG)
        elif centrality_par == "weighted_degree_centrality":
            centrality = dG.degree(weight="weight")
            # centrality = weighted_degree_centrality(dG)
        elif centrality_par == "in_degree_centrality":
            centrality = nx.in_degree_centrality(dG)
        elif centrality_par == "out_degree_centrality":
            centrality = nx.out_degree_centrality(dG)
        elif centrality_par == "pagerank_centrality":
            centrality = nx.pagerank(dG)
        elif centrality_par == "weighted_pagerank_centrality":
            centrality = nx.pagerank(dG, weight="weight")
        elif centrality_par == "betweenness_centrality" or centrality_par == "betweenness_centrality_directed":
            centrality = nx.betweenness_centrality(dG, weight="weight")
        elif centrality_par == "triangles":
            centrality = nx.triangles(dG)
        elif centrality_par == "eigenvector_centrality":
            centrality = nx.eigenvector_centrality_numpy(dG)
        elif centrality_par == "core_number":
            centrality = nx.core_number(dG)
        elif centrality_par == "clustering_coefficient":
            centrality = nx.clustering(dG)
        elif centrality_par == "closeness_centrality" or centrality_par == "closeness_centrality_directed":
            centrality = nx.closeness_centrality(dG)
        elif centrality_par == "closeness_centrality_weighted":
            centrality = nx.closeness_centrality(dG, distance='weight')
        elif centrality_par == "communicability_centrality":
            centrality = nx.communicability_centrality(dG)
        elif centrality_par == "closeness_centrality_not_normalized":
            centrality = nx.closeness_centrality(dG, normalized=False)
        # print("Number of self-loops:"+str(dG.number_of_selfloops()))
        # centrality = nx.out_degree_centrality(dG)
        # centrality = pg.pagerank(dG,max_iter=1000)
        # centrality = nx.katz_centrality(dG,max_iter=10000)

        # totalNodes += dG.number_of_nodes()
        # totalEdges += dG.number_of_edges()

        # if len(dG)>1:
        for k, g in enumerate(dG.nodes()):
            if g in dGcol_nodes:
                if idf_par == "no":
                    features[i, unique_words.index(g)] = centrality[g]
                    # tfs.append(wordList2.count(g))
                    # centralities.append(centrality[g])
                elif idf_par == "tf-icw":
                    # tf_g = 1+math.log(1+math.log(wordList2.count(g)))
                    tf_g = wordList2.count(g)
                    # features[i,unique_words.index(g)] = (tf_g/(1-b+(b*(float(docLen)/avgLen)))) * icw_col[g]
                    features[i, unique_words.index(g)] = tf_g * icw_col[g]
                elif idf_par == "idf":
                    features[i, unique_words.index(g)] = centrality[g] * idf_col[g]
                elif idf_par == "icw" or idf_par == "icw-lw":
                    features[i, unique_words.index(g)] = centrality[g] * icw_col[g]
                    # features[i,unique_words.index(g)] = centrality[g]/(1-b+(b*(float(docDiam)/avgDiam))) * icw_col[g]
                elif idf_par == "icw+idf":
                    tf_g = wordList2.count(g)
                    # features[i,unique_words.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * icw_col[g] * idf_col[g]
                    features[i, unique_words.index(g)] = centrality[g] * icw_col[g] * idf_col[g]

        # save_graph(dG, "my_graph.pdf")
        dG.clear()

    return features, idx
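
# A minimal standalone sketch of the sliding-window graph-of-words construction
# that splitGraphFeatures above relies on: consecutive terms within a window are
# linked and edge weights count co-occurrences, so a term's centrality can serve
# as its weight. The sentence and window size are arbitrary example inputs.
import networkx as nx

def graph_of_words(text, window=3):
    words = [w.rstrip(',.!?;') for w in text.split()]
    g = nx.Graph()
    for k, word in enumerate(words):
        for j in range(1, window):
            if k + j < len(words):
                nxt = words[k + j]
                if g.has_edge(word, nxt):
                    g[word][nxt]['weight'] += 1
                else:
                    g.add_edge(word, nxt, weight=1)
    return g

g = graph_of_words("graph based term weighting links nearby terms of a document")
print(nx.degree_centrality(g))  # per-term weights for this toy document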
def calculate_centrality(G):
    # dc_dumps = json.dumps(nx.degree_centrality(G).items(),sort_keys=True,indent=4)
    # dc_loads = json.loads(dc_dumps)
    dc_sorted = sorted(nx.degree_centrality(G).items(), key=itemgetter(0), reverse=True)
    bc_sorted = sorted(nx.betweenness_centrality(G).items(), key=itemgetter(0), reverse=True)
    clc_sorted = sorted(nx.closeness_centrality(G).items(), key=itemgetter(0), reverse=True)
    coc_sorted = sorted(nx.communicability_centrality(G).items(), key=itemgetter(0), reverse=True)
    lc_sorted = sorted(nx.load_centrality(G).items(), key=itemgetter(0), reverse=True)
    cfbc_sorted = sorted(nx.current_flow_betweenness_centrality(G).items(), key=itemgetter(0), reverse=True)
    cfcc_sorted = sorted(nx.current_flow_closeness_centrality(G).items(), key=itemgetter(0), reverse=True)
    # print ec_sorted[0]

    developer_centrality = []

    developer_file = file("public/wordpress/developer.json")
    developers = json.load(developer_file)

    for developer in developers:
        degree = 0
        betweenness = 0
        closeness = 0
        communicability = 0
        load = 0
        current_flow_betweenness = 0
        current_flow_closeness = 0
        for i in range(0, len(dc_sorted)):
            # if (not dc_sorted[i][0] == bc_sorted[i][0] == clc_sorted[i][0] == coc_sorted[i][0] == lc_sorted[i][0] == cfbc_sorted[i][0]):
            #     print 'false'
            if developer['developer'] == dc_sorted[i][0]:
                degree = dc_sorted[i][1]
                betweenness = bc_sorted[i][1]
                closeness = clc_sorted[i][1]
                communicability = coc_sorted[i][1]
                load = lc_sorted[i][1]
                current_flow_betweenness = cfbc_sorted[i][1]
                current_flow_closeness = cfcc_sorted[i][1]

        developer_centrality.append({
            'name': developer['developer'],
            'degree': degree,
            'betweenness': betweenness,
            'closeness': closeness,
            'communicability': communicability,
            'load': load,
            'current_flow_betweenness': current_flow_betweenness,
            'current_flow_closeness': current_flow_closeness,
        })

    return developer_centrality
details = f.read().split('\n')
for line in details:
    if len(line) > 0 and line[0] != '#':
        node0 = line.split(' ')[0]
        node1 = line.split(' ')[1].split('\t')[0]
        weight = line.split('\t')[1]
        collab_graph.add_edge(int(node0), int(node1), weight=int(weight))

distances = distance_from_erdos(collab_graph, 1095)
output_centralities(distances, 'Distance_From_Erdos')

# Calculate and output the centralities
degree_centrality = nx.degree_centrality(collab_graph)
output_centralities(degree_centrality, 'Degree_Centrality')

degree_centrality_weighted = weighted_degree_centrality(collab_graph)
output_centralities(degree_centrality_weighted, 'Weighted_Degree_Centrality')

normalised_closeness_centrality = nx.closeness_centrality(collab_graph)
output_centralities(normalised_closeness_centrality, "Closeness_Centrality")

betweenness_centrality = nx.betweenness_centrality(collab_graph, endpoints=True)
output_centralities(betweenness_centrality, "Betweenness_Centrality")

katz_centrality = nx.katz_centrality(collab_graph, alpha=0.005)
output_centralities(katz_centrality, "Katz_Centrality")

communicability_centrality = nx.communicability_centrality(collab_graph)
output_centralities(communicability_centrality, "Communicability_Centrality")
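
# A hedged illustration of the edge-list line layout the parser above expects:
# "source target<TAB>weight". The sample line is made up; only the split logic
# is taken from the code.
example_line = "13 1095\t4"
src = example_line.split(' ')[0]                 # "13"
dst = example_line.split(' ')[1].split('\t')[0]  # "1095"
w = example_line.split('\t')[1]                  # "4"
print(src, dst, w)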
def avg_communicability_centrality(self):
    """
    Communicability centrality, also called subgraph centrality, of a node n
    is the sum of closed walks of all lengths starting and ending at node n.
    """
    return sum(nx.communicability_centrality(self.graph).values()) / self.n
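
# A minimal standalone sketch of the same average, assuming networkx 1.x where
# communicability_centrality is available (later releases expose the same
# quantity as subgraph_centrality). The graph below is an arbitrary example.
import networkx as nx

G = nx.karate_club_graph()
scores = nx.communicability_centrality(G)
avg_comm = sum(scores.values()) / float(G.number_of_nodes())
print(avg_comm)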
def g2o(input_graph, degree_threshold, step_size, heuristic="degree"):
    ## heuristic selection
    if heuristic == "degree":
        heuristic_hash = input_graph.degree()
    elif heuristic == "pagerank":
        heuristic_hash = nx.pagerank_numpy(input_graph, alpha=0.9)
    elif heuristic == "pagerank_scipy":
        heuristic_hash = nx.pagerank_scipy(input_graph, alpha=0.9)
    elif heuristic == "eigenvector":
        heuristic_hash = nx.eigenvector_centrality_numpy(input_graph)
    elif heuristic == "communicability":
        heuristic_hash = nx.communicability_centrality(input_graph)
    elif heuristic == "flow_betweenness":
        heuristic_hash = nx.current_flow_betweenness_centrality(input_graph)
    elif heuristic == "closeness":
        heuristic_hash = nx.closeness_centrality(input_graph)
    elif heuristic == "betweenness":
        heuristic_hash = nx.betweenness_centrality(input_graph)
    else:
        raise ValueError("Please select a valid heuristic..")

    ## first identify the triplets
    G = input_graph
    result_triplets = []
    crossed = set()
    for node in G:
        crossed.add(node)
        done_count = set()
        neighbours = set(G[node])
        for neigh in neighbours:
            if neigh in crossed:
                continue
            done_count.add(neigh)
            for both in neighbours.intersection(G[neigh]):
                if both in crossed or both in done_count:
                    continue
                result_triplets.append((node, neigh, both))

    ## remove triplets in some manner
    for triplet in result_triplets:
        ## get the node degrees
        triplet_degrees = {heuristic_hash[node]: node for node in triplet}
        sorted_keys = sorted(list(triplet_degrees.keys()))
        if len(sorted_keys) == 3:
            try:
                input_graph.remove_edge(triplet_degrees[sorted_keys[0]],
                                        triplet_degrees[sorted_keys[1]])
            except:
                ## not all keys exist
                pass

    outgraph = nx.DiGraph()
    degree_list = [heuristic_hash[deg] for deg in heuristic_hash]
    threshold_degree = np.percentile(degree_list, degree_threshold)
    candidate_hotspots = [
        node for node, value in heuristic_hash.items() if value > threshold_degree
    ]
    print("Nodes to begin the iteration: ", len(candidate_hotspots))

    ## a queue of nodes to be processed..
    to_process = []
    ## a list of already processed nodes..
    already_processed = []

    ## initiate the nodes
    for node in candidate_hotspots:
        to_process.insert(0, node)

    while len(to_process) != 0:
        for step in range(0, int(step_size)):
            ## go to a specific depth
            if len(to_process) != 0:
                start_node = to_process.pop()
            else:
                break
            if start_node not in already_processed:
                already_processed.append(start_node)
                for neigh in set(input_graph[start_node]):
                    if neigh not in already_processed and neigh not in candidate_hotspots:
                        ## Querying
                        if step > 0:
                            to_process.append(neigh)
                        else:
                            to_process.insert(0, neigh)
                        ## Edge construction step
                        if heuristic_hash[neigh] < heuristic_hash[start_node]:
                            outgraph.add_edge(start_node, neigh)
                        else:
                            outgraph.add_edge(neigh, start_node)

    print(nx.info(outgraph))
    if nx.is_directed_acyclic_graph(outgraph):
        return outgraph
    else:
        raise ValueError('Graph could not be converted to a DAG.')
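
# A hedged usage sketch for g2o, assuming it and its dependencies (networkx 1.x,
# numpy) are importable from this module. The graph, percentile threshold and
# step size are arbitrary; g2o raises ValueError when the result is not a DAG,
# so the call is wrapped.
import networkx as nx

toy = nx.fast_gnp_random_graph(50, 0.1, seed=1)
try:
    dag = g2o(toy, degree_threshold=80, step_size=3, heuristic="degree")
    print(nx.is_directed_acyclic_graph(dag))
except ValueError as e:
    print("conversion failed:", e)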
import networkx as nx
import plot_multigraph
import matplotlib.pylab as plt

n = 80
p = 10. / n
G = nx.fast_gnp_random_graph(n, p, seed=42)


def to_list(dict_):
    return [dict_[k] for k in G.nodes()]


graph_colors = [
    ("degree", to_list(nx.degree_centrality(G))),
    ("betweenness", to_list(nx.betweenness_centrality(G))),
    ("load", to_list(nx.load_centrality(G))),
    ("eigenvector", to_list(nx.eigenvector_centrality_numpy(G))),
    ("closeness_centrality", to_list(nx.closeness_centrality(G))),
    ("current_flow_closeness", to_list(nx.current_flow_closeness_centrality(G))),
    ("current_flow_betweenness", to_list(nx.current_flow_betweenness_centrality(G))),
    ("katz", to_list(nx.katz_centrality_numpy(G))),
    ("communicability", to_list(nx.communicability_centrality(G))),
]

fig = plot_multigraph.plot_color_multigraph(G, graph_colors, 3, 3, node_size=50)
plt.savefig('graphs/centrality.png', facecolor=fig.get_facecolor())
def com_center(net):
    return distriCentra(nx.communicability_centrality(net).values(),
                        nx.communicability_centrality(star(net)).values(),
                        'communicability')
def make_net(centrality_name, in_path, out_path):
    # sample code
    # import _2_time_based_data_network_feature
    # make_net_in_path = "../3.time_based_data/1.cite_relation_devide/"
    # make_net_out_path = "../3.time_based_data/2.centrality_data/"
    # _2_time_based_data.make_net("in_degree", make_net_in_path, make_net_out_path)
    # Build the citation network per year, compute the chosen centrality and save it.
    import networkx as nx

    global Dump
    Dump = {}
    make_net_initialize(in_path)
    start_time = time.time()
    temp_start_time = time.time()
    print "============= make_net start:" + centrality_name + " =============="
    print "============= from 1951 to 2015 =============="
    for year in range(1951, 2016):
        print year
        f_in = open(in_path + str(year) + "_cite.csv", "r")
        lines = f_in.readlines()
        f_in.close()
        edge_list = []
        for line in lines:
            data = line.split(",")
            data_tuple = (data[0].strip(), data[1].strip())
            edge_list.append(data_tuple)
        Net = nx.DiGraph(edge_list)
        Cen_in = {}
        if centrality_name == "in_degree":
            Cen_in = nx.in_degree_centrality(Net)
        elif centrality_name == "degree":
            Cen_in = nx.degree_centrality(Net)
        elif centrality_name == "eigenvector":
            Cen_in = nx.eigenvector_centrality_numpy(Net)
        elif centrality_name == "katz":
            Cen_in = nx.katz_centrality(Net)
        elif centrality_name == "pagerank":
            Cen_in = nx.pagerank(Net)
        elif centrality_name == "communicability":
            Net = nx.Graph(edge_list)
            Cen_in = nx.communicability_centrality(Net)
        elif centrality_name == "load":
            Cen_in = nx.load_centrality(Net)
        for j in Cen_in:
            key = j
            val = Cen_in[j]
            Dump[key][year] = val

    # write the per-year centrality values to disk
    f_out = open(out_path + centrality_name + "_centrality.csv", "w")
    for key in Dump:
        line = str(key)
        for year in range(1951, 2016):
            data = Dump[key].get(year, 0)
            line = line + "," + str(data)
        line = line + "\n"
        f_out.write(line)
    f_out.close()
    print "============= make_net end =============="
    print(centrality_name + " takes %s seconds" % (time.time() - temp_start_time))
    temp_start_time = time.time()
sys.stdout.write(" done\n")

sys.stdout.write("calculating PageRank Centrality . . .")
PageRankDict = nx.pagerank(G)
sys.stdout.write(" done\n")

sys.stdout.write("calculating Closeness Centrality . . .")
ClosenessDict = nx.closeness_centrality(G)
sys.stdout.write(" done\n")

sys.stdout.write("calculating Betweenness Centrality . . .")
BetweennessDict = nx.betweenness_centrality(G)
sys.stdout.write(" done\n")

sys.stdout.write("calculating Communicability Centrality . . .")
CommunicabilityDict = nx.communicability_centrality(G)
sys.stdout.write(" done\n")

print "=" * 100

import re

csvRegion = list(csv.reader(open("Regions.csv")))
regions = {}
for i in range(1, len(csvRegion)):
    regions[csvRegion[i][2].strip()] = [re.sub(r'(,)', '/', csvRegion[i][3]),
                                        re.sub(r'(,)', '/', csvRegion[i][4])]

import matplotlib.pyplot as plt

nx.draw(G)  # networkx draw()
plt.show()

sys.stdout.write("Clustering by depth 3 . . .")
def communicability_centrality_sum(self):
    if self.communicability_centrality_dict is None:
        self.communicability_centrality_dict = nx.communicability_centrality(self.graph)
    return (self.communicability_centrality_dict[self.node_1] +
            self.communicability_centrality_dict[self.node_2])
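
# A minimal standalone sketch of the same edge feature, assuming networkx 1.x.
# The graph and endpoint nodes are arbitrary examples, not taken from the
# original class; the centrality dict is computed once and reused per pair.
import networkx as nx

G = nx.path_graph(5)
comm = nx.communicability_centrality(G)
u, v = 1, 2
edge_feature = comm[u] + comm[v]
print(edge_feature)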