def augmentNodes(g):
    r1 = nx.eigenvector_centrality_numpy(g)
    r2 = nx.degree_centrality(g) # DP MY
    r3 = nx.betweenness_centrality(g)
    r5 = nx.load_centrality(g,weight='weight') # DY, WY-writename # Scientific collaboration networks: II. Shortest paths, weighted networks, and centrality, M. E. J. Newman, Phys. Rev. E 64, 016132 (2001).
    r6 = nx.pagerank(g, alpha=0.85, personalization=None, max_iter=100, tol=1e-08, nstart=None, weight='weight')
    
    if nx.is_directed(g) == True:
        r8 = nx.in_degree_centrality(g)
        r9 = nx.out_degree_centrality(g)
#        r10 = nx.hits(g, max_iter=100, tol=1e-08, nstart=None)
    else:
        r4 = nx.communicability_centrality(g)
        r7 = nx.clustering(g, weight='weight')
        
    for x in g.nodes():
        g.node[x]['eigenvector_centrality_numpy'] = r1[x]
        g.node[x]['degree_centrality'] = r2[x]  
        g.node[x]['betweenness_centrality'] = r3[x]
        g.node[x]['load_centrality'] = r5[x]  
        g.node[x]['pagerank'] = r6[x]

        if nx.is_directed(g) == True:
            g.node[x]['in_degree_centrality'] = r8[x]
            g.node[x]['out_degree_centrality'] = r9[x]
#            g.node[x]['hits'] = r10[x]
        else:
            g.node[x]['communicability_centrality'] = r4[x]
            g.node[x]['clustering'] = r7[x]
    return g        
def centrality(net):
    values ={}
    close = nx.closeness_centrality(net, normalized= True)
    eigen = nx.eigenvector_centrality_numpy(net)
    page = nx.pagerank(net)
    bet = nx.betweenness_centrality(net,normalized= True)
    flow_c = nx.current_flow_closeness_centrality(net,normalized= True)
    flow_b = nx.current_flow_betweenness_centrality(net,normalized= True)
    load = nx.load_centrality(net, normalized = True)
    com_c = nx.communicability_centrality(net)
    com_b = nx.communicability_betweenness_centrality(net, normalized= True)
    degree = net.degree()
    
    file3 = open("bl.csv",'w')
    for xt in [bet,load,degree,page,flow_b,com_c,com_b,eigen,close,flow_c]:#[impo,bet,flow_b,load,com_c,com_b] :
        for yt in [bet,load,degree,page,flow_b,com_c,com_b,eigen,close,flow_c]:#[impo,bet,flow_b,load,com_c,com_b] :
            corr(xt.values(),yt.values(),file3)
        print
        file3.write("\n")
    file3.close()
    #plt.plot(x,y, 'o')
    #plt.plot(x, m*x + c, 'r', label='Fitted line')
    #plt.show()
    #for key,item in close.iteritems() :
        #values[key] = [impo.get(key),bet.get(key),flow_b.get(key), load.get(key),com_c.get(key),com_b.get(key)]
        
    return values
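
The `corr` helper called above is defined elsewhere in the module; a minimal sketch of what it presumably does, assuming it writes one Pearson correlation coefficient per pair of centrality rankings into the open CSV handle, could look like this (hypothetical, not the original implementation):

from scipy.stats import pearsonr

def corr(xs, ys, fh):
    # Hypothetical helper: Pearson correlation between two centrality value
    # sequences, appended to the open CSV handle as one cell of the matrix.
    r, _p = pearsonr(list(xs), list(ys))
    fh.write(str(r) + ",")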
Example no. 4
def features_matrix(graph, anchors, use_dist=True, use_pgrs=True,
                    use_pgr=True, use_comm=False, use_comm_centr=False):
    node_feats = []
    n = len(graph)
    if use_dist:
        dists = nx.all_pairs_shortest_path_length(graph)
    if use_pgr:
        pageranks = nx.pagerank_numpy(graph)
    if use_pgrs:
        pgr_anchor = [anchored_pagerank(graph, anchor) for anchor in anchors]
    if use_comm_centr:
        communicability_centrality = nx.communicability_centrality(graph)
    if use_comm:
        communicability = nx.communicability(graph)

    for node in graph.nodes():
        assert node == len(node_feats)
        feats = []
        if use_dist:
            feats += [dists[node][anchor] for anchor in anchors]
        if use_pgrs:
            feats += [pgr[node]*n for pgr in pgr_anchor]
        if use_pgr:
            feats.append(pageranks[node]*n)
        if use_comm_centr:
            feats.append(communicability_centrality[node])
        if use_comm:
            feats += [communicability[node][anchor] for anchor in anchors]


        node_feats.append(np.array(feats))
    return node_feats
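
The helper `anchored_pagerank` is not shown in this snippet; a plausible sketch, assuming it is simply PageRank personalized on a single anchor node, would be:

import networkx as nx

def anchored_pagerank(graph, anchor, alpha=0.85):
    # Hypothetical sketch: random walks restart at `anchor`, so the resulting
    # scores measure the proximity of every node to that anchor.
    personalization = {n: 0.0 for n in graph}
    personalization[anchor] = 1.0
    return nx.pagerank(graph, alpha=alpha, personalization=personalization)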
Example no. 5
    def subgraph_centrality(self, graph):
        x = nx.communicability_centrality(graph)
        z = 0
        y = len(x)
        for key, value in x.iteritems():
            z += value
        return (z / y)
Example no. 6
    def run(self, graph, slope):
        communicability_data = nx.communicability_centrality(graph)
        weights = {}
        max_comm_for_normaliz = max(communicability_data.values())
        for node, commu in communicability_data.items():
            weights[node] = slope[node] * commu / max_comm_for_normaliz
        return weights
Example no. 7
    def communicability_centrality_sum(self):
        if self.communicability_centrality_dict is None:
            self.communicability_centrality_dict = nx.communicability_centrality(
                self.graph)
            time.sleep(1)
        return self.communicability_centrality_dict[
            self.node_1] + self.communicability_centrality_dict[self.node_2]
Example no. 8
def node_communicability_centrality(X):
    """
    based on networkx function: communicability_centrality
    """
    XX = np.zeros((X.shape[0], int(np.sqrt(X.shape[1]))))
    for i, value in enumerate(X):
        adj_mat = value.reshape((int(np.sqrt(len(value))), -1))
        adj_mat = (adj_mat - np.min(adj_mat)) / (np.max(adj_mat) - np.min(adj_mat))
        adj_mat = 1 - adj_mat

#        th = np.mean(adj_mat) - 0.1
#        adj_mat = np.where(adj_mat < th, adj_mat, 0.)

        percent, th, adj_mat, triu = percentage_removed(adj_mat, 0.76) #96
        print("percent = {0}, threshold position = {1}, threshold = {2}\n".format(percent, th, triu[th]))

        g = nx.from_numpy_matrix(adj_mat)
        print "Graph Nodes = {0}, Graph Edges = {1} ".format(g.number_of_nodes(), g.number_of_edges())
        print "\nEdge kept ratio, {0}".format(float(g.number_of_edges())/((g.number_of_nodes()*(g.number_of_nodes()-1))/2))

        deg_cent = nx.communicability_centrality(g)
        node_cent = np.zeros(g.number_of_nodes())

        for k in deg_cent:
            node_cent[k] = deg_cent[k]
        XX[i] = node_cent
        print "graph {0} => mean {1}, min {2}, max {3}".format(i, np.mean(XX[i]), np.min(XX[i]), np.max(XX[i]))
#    XX = XX*100
    ss = StandardScaler()
    XX = ss.fit_transform(XX.T).T

    return XX
Example no. 9
    def process(self):

        for date in self.dates:

            print "Year of analysis " + str(date)

            rst = []

            with open(self.srcdir + str(date) + '.csv', 'rU') as f:
                rows = csv.reader(f, dialect='excel', delimiter=';')
                next(rows, None)
                for row in rows:

                    self.G.add_node(row[0])
                    self.G.add_node(row[1])
                    self.G.add_edge(row[0], row[1], weight=float(row[2]))
                    # self.G.add_edge(row[0], row[1])
                    # self.G.add_edge(row[0], row[1], weight=abs(float(float(row[3])/float(row[2]))))

            rst.append(nx.eigenvector_centrality_numpy(self.G))
            rst.append(nx.betweenness_centrality(self.G))
            rst.append(nx.closeness_centrality(self.G))
            rst.append(nx.degree_centrality(self.G))
            rst.append(nx.communicability_centrality(self.G))

            self._save_result(date, rst)
            self._save_graph_gexf(date)
Example no. 10
def features_dict(graph, anchors, use_dist=True, use_pgrs=True,
                    use_pgr=True, use_comm=False, use_comm_centr=False):
    node_feats = {}
    n = len(graph)
    if use_dist:
        # dists = nx.all_pairs_shortest_path_length(graph)
        dists = dists_to_anchors(graph, anchors)
    if use_pgr:
        pageranks = nx.pagerank_numpy(graph)
    if use_pgrs:
        # pgr_anchor = [anchored_pagerank(graph, anchor) for anchor in anchors]
        pgr_anchor = pageranks_to_anchors(graph, anchors)
    if use_comm_centr:
        communicability_centrality = nx.communicability_centrality(graph)
    if use_comm:
        communicability = nx.communicability(graph)

    for node in graph.nodes():
        feats = []
        if use_dist:
            feats += [dists[node][anchor] for anchor in anchors]
        if use_pgrs:
            feats += [pgr_anchor[anchor][node]*n
                      for anchor in range(len(anchors))]
            # feats += [pgr[node]*n for pgr in pgr_anchor]
        if use_pgr:
            feats.append(pageranks[node]*n)
        if use_comm_centr:
            feats.append(communicability_centrality[node])
        if use_comm:
            feats += [communicability[node][anchor] for anchor in anchors]


        node_feats[node] = np.array(feats)
    return node_feats
def communicability_centrality(gnx, f, ft):
    start = timer.start(ft, 'communicability_centrality')
    communicability_centrality_dict = nx.communicability_centrality(gnx)
    timer.stop(ft, start)
    for k in communicability_centrality_dict:
        f.writelines(
            str(k) + ',' + str(communicability_centrality_dict[k]) + '\n')
    return communicability_centrality_dict
Example no. 12
    def communicability_centrality(self):
        """Return communicability centrality for each node in G.

        If is the graph is directed it will be converted to undirected.

        Returns
        -------
        nodes: dictionary
            Dictionary of nodes with communicability centrality as the value.

        Examples
        --------
        >>>
        """
        if self.is_directed:
            return nx.communicability_centrality(self._graph.to_undirected())
        else:
            return nx.communicability_centrality(self._graph)
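
The Examples section of the docstring above is left empty; a minimal usage sketch of the underlying NetworkX call (values are illustrative) might be:

import networkx as nx

g = nx.path_graph(4)
print nx.communicability_centrality(g)
# a directed graph is converted to undirected first, matching the wrapper above
print nx.communicability_centrality(nx.DiGraph([(0, 1), (1, 2)]).to_undirected())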
def calculate(network):
    try:
        n = nx.communicability_centrality(network)
    except:
        return 0

    if len(n.values()) == 0:
        return 0
    else:
        return round(sum(n.values()) / len(n.values()), 7)
    def forUndirected(G):

        myList = [nx.eigenvector_centrality_numpy(G), 
                  nx.degree_centrality(G),
                  nx.betweenness_centrality(G),
                  nx.communicability_centrality(G), 
                  nx.load_centrality(G),   
                  nx.pagerank(G, alpha=0.85, personalization=None, max_iter=100, tol=1e-08, nstart=None, weight='weight'),
                  nx.clustering(G, weight='weight')]
        return myList
def process_data(denom=100000, round=0):
	f = csv.reader(open("../applab_new_6.csv", 'rb'), delimiter=',')
	db = nx.DiGraph()
	full_users = set()
	i = 0
	uniquect = 0
	for line in f:
		if i % 100000 == 0 : print "processed", i, "lines"
		if i == 1000: break
		sender, receiver, date, time, duration, cost, location, region = map(lambda x: x.strip(), line)
		if sender not in full_users:
			uniquect += 1
			full_users.add(sender)
			if uniquect <= 2: #% denom - round == 0:
				db.add_node(sender)
				if db.has_node(receiver) == False:
					db.add_node(receiver)
		else:
			if db.has_node(receiver) == False:
				db.add_node(receiver)

		if db.has_edge(sender, receiver):
			db[sender][receiver]['weight'] += int(duration)
		else:
			db.add_edge(sender, receiver, weight=int(duration))
		i+=1
	#pickle.dump(db, open("users_networkx.p" % str(round), "wb"))
	#print "degree assortativity coeff:", nx.degree_assortativity_coefficient(db)
	#print "average degree connectivity:", nx.average_degree_connectivity(db)
	#	print "k nearest neighbors:", nx.k_nearest_neighbors(db)
	print "calculating deg cent"
	deg_cent = nx.degree_centrality(db) #sorted(nx.degree_centrality(db).items(), key=lambda x: x[1])
	print "calculating in deg cent"
	in_deg_cent = nx.in_degree_centrality(db) #sorted(nx.in_degree_centrality(db).items(), key=lambda x: x[1])
	print "calculating out deg cent"
	out_deg_cent = nx.out_degree_centrality(db) #sorted(nx.out_degree_centrality(db).items(), key=lambda x: x[1])
	print "closeness cent"
	closeness_cent = nx.closeness_centrality(db) #sorted(nx.closeness_centrality(db).items(), key=lambda x: x[1])
	#print "betweenness cent"
	#btwn_cent = nx.betweenness_centrality(db) #sorted(nx.betweenness_centrality(db).items(), key=lambda x: x[1])
	print "done"
	w = open("../output/user_network_stats.csv", 'w')
	w.write("uid,deg_cent,in_deg_cent,out_deg_cent,closeness_cent,btwn_cent\n")
	for user in deg_cent.keys():
		try:
			w.write("%s,%s,%s,%s,%s\n" % (user, deg_cent[user], in_deg_cent[user], out_deg_cent[user], closeness_cent[user]))
		except: pass
	w.close()
	print "drawing..."
	nx.draw(db)
	plt.savefig("path.pdf")
	print "done!"
	print "edge betweenness centrality:", nx.edge_betweenness_centrality(db)
	print "communicability:", nx.communicability(db)
	print "communicability centrality:", nx.communicability_centrality(db)
Example no. 16
    def centrality(self):
        """
        calculates several measures of node centrality and stores them in the general node table
        """
        speciesnodes =  set(n for n, d in self.graph.nodes(data=True) if d['graphics']['type']=='roundrectangle')

        g2 = nx.Graph(self.graph)
        self.nodes['degree'] = pandas.Series(nx.degree_centrality(self.graph))
        self.nodes['closeness'] = pandas.Series(nx.closeness_centrality(self.graph))
        self.nodes['betweenness'] = pandas.Series(nx.betweenness_centrality(self.graph))
        self.nodes['communicability'] = pandas.Series(nx.communicability_centrality(g2))
Example no. 17
    def centrality(self):
        """
        calculates several measures of node centrality and stores them in the general node table
        """
        speciesnodes = set(n for n, d in self.graph.nodes(data=True)
                           if d['graphics']['type'] == 'roundrectangle')

        g2 = nx.Graph(self.graph)
        self.nodes['degree'] = pandas.Series(nx.degree_centrality(self.graph))
        self.nodes['closeness'] = pandas.Series(
            nx.closeness_centrality(self.graph))
        self.nodes['betweenness'] = pandas.Series(
            nx.betweenness_centrality(self.graph))
        self.nodes['communicability'] = pandas.Series(
            nx.communicability_centrality(g2))
Example no. 18
def calculatecommunicabilitycentrality(network):
    '''
    Communicability centrality, also called subgraph centrality, of a node n is the sum of closed walks of all lengths starting and ending at node n.


    '''
    try:
        n = nx.communicability_centrality(network)
    except:
        return 0
 
    if len(n.values()) == 0: 
        return 0
    else:
        return round(sum(n.values())/len(n.values()), 7)
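
As the docstring notes, the per-node quantity being averaged here is the sum of closed walks of all lengths, which equals the n-th diagonal entry of exp(A); a short sketch checking this against the matrix exponential (using scipy, on an example graph):

import networkx as nx
import numpy as np
import scipy.linalg

# Sketch: communicability (subgraph) centrality of node n is the n-th diagonal
# entry of expm(A), i.e. the 1/k!-weighted count of closed walks of every length.
G = nx.karate_club_graph()
A = nx.to_numpy_matrix(G)
expA = scipy.linalg.expm(A)
sc_manual = [float(expA[i, i]) for i in range(G.number_of_nodes())]
sc_nx = nx.communicability_centrality(G)
print np.allclose(sc_manual, [sc_nx[n] for n in sorted(sc_nx)])  # expected: True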
def calculate_centrality(G):
	# dc_dumps = json.dumps(nx.degree_centrality(G).items(),sort_keys=True,indent=4)
	# dc_loads = json.loads(dc_dumps)
	dc_sorted = sorted(nx.degree_centrality(G).items(), key=itemgetter(0), reverse=True)
	bc_sorted = sorted(nx.betweenness_centrality(G).items(), key=itemgetter(0), reverse=True)
	clc_sorted = sorted(nx.closeness_centrality(G).items(), key=itemgetter(0), reverse=True)
	coc_sorted = sorted(nx.communicability_centrality(G).items(), key=itemgetter(0), reverse=True)
	lc_sorted = sorted(nx.load_centrality(G).items(), key=itemgetter(0), reverse=True)
	cfbc_sorted = sorted(nx.current_flow_betweenness_centrality(G).items(), key=itemgetter(0), reverse=True)
	cfcc_sorted = sorted(nx.current_flow_closeness_centrality(G).items(), key=itemgetter(0), reverse=True)
	# print ec_sorted[0]
	
	developer_centrality = []

	developer_file = file("public/wordpress/developer.json")
	developers = json.load(developer_file)
	for developer in developers:
		degree = 0
		betweenness = 0
		closeness = 0
		communicability = 0
		load = 0
		current_flow_betweenness = 0
		current_flow_closeness = 0
		for i in range (0, len(dc_sorted)):
			# if ( not dc_sorted[i][0] == bc_sorted[i][0] == clc_sorted[i][0] == coc_sorted[i][0] == lc_sorted[i][0] == cfbc_sorted[i][0]):
			# 	print 'false'
			if( developer['developer'] == dc_sorted[i][0]):
				degree = dc_sorted[i][1]
				betweenness = bc_sorted[i][1]
				closeness = clc_sorted[i][1]
				communicability = coc_sorted[i][1]
				load = lc_sorted[i][1]
				current_flow_betweenness = cfbc_sorted[i][1]
				current_flow_closeness = cfcc_sorted[i][1]

		developer_centrality.append({
			'name': developer['developer'],
		 	'degree': degree,
			'betweenness': betweenness,
			'closeness': closeness,
			'communicability': communicability,
			'load': load,
			'current_flow_betweenness': current_flow_betweenness,
			'current_flow_closeness':current_flow_closeness,
		 })

	return developer_centrality
Example no. 20
def set_capacities_communicability_gravity(topology, capacities, 
                                           capacity_unit='Mbps'):
    """
    Set link capacities proportionally to the product of the communicability
    centralities of the two end-points of the link
    
    Parameters
    ----------
    topology : Topology
        The topology to which link capacities will be set
    capacities : list
        A list of all possible capacity values
    capacity_unit : str, optional
        The unit in which capacity value is expressed (e.g. Mbps, Gbps etc..)
    """
    centrality = nx.communicability_centrality(topology)
    _set_capacities_gravity(topology, capacities, centrality, capacity_unit)
Example no. 21
def set_capacities_communicability_gravity(topology, capacities,
                                           capacity_unit='Mbps'):
    """
    Set link capacities proportionally to the product of the communicability
    centralities of the two end-points of the link

    Parameters
    ----------
    topology : Topology
        The topology to which link capacities will be set
    capacities : list
        A list of all possible capacity values
    capacity_unit : str, optional
        The unit in which capacity value is expressed (e.g. Mbps, Gbps etc..)
    """
    centrality = nx.communicability_centrality(topology)
    _set_capacities_gravity(topology, capacities, centrality, capacity_unit)
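
The `_set_capacities_gravity` helper is internal to the surrounding library and not shown here; the gravity idea described in the docstring can be sketched roughly as follows (an illustration only, not the library's actual implementation):

import networkx as nx

def _gravity_capacity_sketch(topology, capacities):
    # Rough illustration: weight each link by the product of its endpoints'
    # communicability centralities, then map the weights proportionally onto
    # the sorted list of allowed capacity values.
    centrality = nx.communicability_centrality(topology)
    caps = sorted(capacities)
    weights = dict(((u, v), centrality[u] * centrality[v]) for u, v in topology.edges())
    w_min, w_max = min(weights.values()), max(weights.values())
    for (u, v), w in weights.items():
        idx = 0 if w_max == w_min else int(round((w - w_min) / (w_max - w_min) * (len(caps) - 1)))
        topology[u][v]['capacity'] = caps[idx]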
Example no. 22
def cal_communicability_centrality(fn1,fn2):
    edges=prep.read_edges(fn2)
    sth={edge:0. for edge in edges}
   
    G=nx.Graph()
    edges_all=prep.read_edges(fn1)
    G.add_edges_from(edges_all)
    communicability=nx.communicability_centrality(G)

    for x in sth:
        n1= communicability.get(x[0])
        n2= communicability.get(x[1])
        print n1,n2
        n3=max(n1,n2)
        n4=min(n1,n2)
        sth[x]=float(n4)/(n3+1)
#        sth[x]=n1*n2
#        sth[x]=n1+n2
    return sth                                                         
Example no. 23
def node_communicability_centrality(X):
    """
    based on networkx function: communicability_centrality
    """
    XX = np.zeros((X.shape[0], int(np.sqrt(X.shape[1]))))
    for i, value in enumerate(X):
        adj_mat = value.reshape((int(np.sqrt(len(value))), -1))
        adj_mat = (adj_mat - np.min(adj_mat)) / (np.max(adj_mat) -
                                                 np.min(adj_mat))
        adj_mat = 1 - adj_mat

        #        th = np.mean(adj_mat) - 0.1
        #        adj_mat = np.where(adj_mat < th, adj_mat, 0.)
        print("\n========== Node communicability centrality ==========\n")
        percent, th, adj_mat, triu = percentage_removed(adj_mat, 0.76)  #96
        print("percent = {0}, threshold position = {1}, threshold = {2}\n".
              format(percent, th, triu[th]))

        g = nx.from_numpy_matrix(adj_mat)
        print "Graph Nodes = {0}, Graph Edges = {1} ".format(
            g.number_of_nodes(), g.number_of_edges())
        print "\nEdge kept ratio, {0}".format(
            float(g.number_of_edges()) / ((g.number_of_nodes() *
                                           (g.number_of_nodes() - 1)) / 2))

        deg_cent = nx.communicability_centrality(g)
        node_cent = np.zeros(g.number_of_nodes())

        for k in deg_cent:
            node_cent[k] = deg_cent[k]
        XX[i] = node_cent
        print "graph {0} => mean {1}, min {2}, max {3}".format(
            i, np.mean(XX[i]), np.min(XX[i]), np.max(XX[i]))


#    XX = XX*100
    ss = StandardScaler()
    XX = ss.fit_transform(XX.T).T

    return XX
Example no. 24
def communicativity(G, k):

    G.remove_edges_from(G.selfloop_edges())
    com = nx.communicability_centrality(G)
    counter = 0

    bet_dict = {}
    for edg in G.edges():
        ini = edg[0]
        fin = edg[1]
        value = com[ini] + com[fin]
        bet_dict.update({edg: value})

    infected = classify_edges(G)

    final_dict = {}

    for val in infected:
        final_dict.update({val: bet_dict[val]})

    sorted_dict = sorted(final_dict.items(), key=operator.itemgetter(1))
    counter = 0
    iter = 1
    print len(sorted_dict)
    while counter <= k:
        val = sorted_dict[-iter]
        edg = val[0]
        ini = edg[0]
        fin = edg[1]

        if G.has_edge(ini, fin):
            G.remove_edge(ini, fin)
            counter = counter + 1

        iter = iter + 1

    return G
def make_net(centrality_name, in_path, out_path):
    #sample code
    #import _2_time_based_data_network_feature
    #make_net_in_path = "../3.time_based_data/1.cite_relation_devide/"
    #make_net_out_path = "../3.time_based_data/2.centrality_data/"
    #_2_time_based_data.make_net( "in_degree", make_net_in_path, make_net_out_path)

    # Build the citation network, compute the chosen centrality, and save the results.
    import networkx as nx
    global Dump
    Dump = {}
    make_net_initialize(in_path)
    start_time = time.time()
    temp_start_time = time.time()

    print "=============		make_net start:" + centrality_name + "		=============="
    print "=============		from 1951 to 2015		=============="

    for year in range(1951, 2016):
        print year
        f_in = open(in_path + str(year) + "_cite.csv", "r")
        lines = f_in.readlines()
        f_in.close()
        edge_list = []

        for line in lines:
            data = line.split(",")
            data_tuple = (data[0].strip(), data[1].strip())
            edge_list.append(data_tuple)

        Net = nx.DiGraph(edge_list)
        Cen_in = {}
        if (centrality_name == "in_degree"):
            Cen_in = nx.in_degree_centrality(Net)
        elif (centrality_name == "degree"):
            Cen_in = nx.degree_centrality(Net)
        elif (centrality_name == "eigenvector"):
            Cen_in = nx.eigenvector_centrality_numpy(Net)
        elif (centrality_name == "katz"):
            Cen_in = nx.katz_centrality(Net)
        elif (centrality_name == "pagerank"):
            Cen_in = nx.pagerank(Net)
        elif (centrality_name == "communicability"):
            Net = nx.Graph(edge_list)
            Cen_in = nx.communicability_centrality(Net)
        elif (centrality_name == "load"):
            Cen_in = nx.load_centrality(Net)

        for j in Cen_in:
            key = j
            val = Cen_in[j]
            Dump[key][year] = val

    # code that saves the results
    f_out = open(out_path + centrality_name + "_centrality.csv", "w")
    for key in Dump:
        line = str(key)
        for year in range(1951, 2016):
            data = Dump[key].get(year, 0)
            line = line + "," + str(data)
        line = line + "\n"
        f_out.write(line)
    f_out.close()

    print "=============		make_net end			=============="
    print(centrality_name + "takes %s seconds" %
          (time.time() - temp_start_time))
    temp_start_time = time.time()
Example no. 26
    def makeJSON(self):
        global info
        info = ""
        info += "{\n \"info\": {\n\"nodes\": [\n"
        i = 0
        num = len(s_partition)
        sorted_betweeness = []
        sorted_degree = []
        sorted_eigenvector = []
        sorted_closeness = []
        sorted_harmonic = []
        sorted_communicability = []
        sorted_core = []
        sorted_degree1 = []
        sorted_partition = s_partition
        unadjusted_betweeness = []
        unadjusted_degree = []
        unadjusted_eigenvctor = []
        unadjusted_closeness = []
        unadjusted_harmonic = []
        unadjusted_communicability = []

        G.remove_edges_from(G.selfloop_edges())
        for key, value in nx.betweenness_centrality(G).items():
            value1 = 1 + (value * 100)
            temp1 = [key, value]
            temp = [key, value1]
            sorted_betweeness.append(temp1)
            unadjusted_betweeness.append(temp1)
        sorted_partition = sorted(sorted_partition)
        sorted_betweeness = sorted(sorted_betweeness)
        unadjusted_betweeness = sorted(unadjusted_betweeness)

        for key, value in nx.degree_centrality(G).items():
            value1 = 1 + (value * 100)
            temp1 = [key, value]
            temp = [key, value1]
            sorted_degree.append(temp1)
            unadjusted_degree.append(temp1)
        sorted_degree = sorted(sorted_degree)
        unadjusted_degree = sorted(unadjusted_degree)

        for key, value in nx.eigenvector_centrality(G).items():
            value1 = value * 1000
            temp1 = [key, value]
            temp = [key, value1]
            sorted_eigenvector.append(temp1)
            unadjusted_eigenvctor.append(temp1)
        sorted_eigenvector = sorted(sorted_eigenvector)
        unadjusted_eigenvector = sorted(unadjusted_eigenvctor)

        for key, value in nx.closeness_centrality(G).items():
            value1 = (value * 10)
            temp1 = [key, value]
            temp = [key, value1]
            sorted_closeness.append(temp1)
            unadjusted_closeness.append(temp1)
        sorted_closeness = sorted(sorted_closeness)
        unadjusted_closeness = sorted(unadjusted_closeness)

        for key, value in nx.harmonic_centrality(G).items():
            temp1 = [key, value]

            sorted_harmonic.append(temp1)
            unadjusted_harmonic.append(temp1)
        sorted_harmonic = sorted(sorted_harmonic)
        unadjusted_harmonic = sorted(unadjusted_harmonic)

        for key, value in nx.communicability_centrality(G).items():
            temp1 = [key, value]
            sorted_communicability.append(temp1)
            unadjusted_communicability.append(temp1)
        sorted_communicability = sorted(sorted_communicability)
        unadjusted_communicability = sorted(unadjusted_communicability)

        for key, value in nx.core_number(G).items():  #list

            temp = [key, value]
            sorted_core.append(temp)
        sorted_core = sorted(sorted_core)

        for key, value in nx.degree(G).items():  #list

            temp = [key, value]
            sorted_degree1.append(temp)
        sorted_degree1 = sorted(sorted_degree1)

        central_dict = {}
        unadjusted_dict = {}
        global importance
        importance = {}

        for key, value in sorted_betweeness:
            central_dict[key] = []
            importance[key] = 0
            central_dict[key].append(value)

        for key, value in unadjusted_betweeness:
            unadjusted_dict[key] = []
            unadjusted_dict[key].append(value)

        for key, value in sorted_degree:
            central_dict[key].append(value)
            importance[key] += value

        for key, value in unadjusted_degree:
            unadjusted_dict[key].append(value)

        for key, value in sorted_eigenvector:
            central_dict[key].append(value)
            importance[key] += value

        for key, value in unadjusted_eigenvector:
            unadjusted_dict[key].append(value)

        for key, value in sorted_closeness:
            central_dict[key].append(value)
            importance[key] += value

        for key, value in unadjusted_closeness:
            unadjusted_dict[key].append(value)

        for key, value in sorted_harmonic:
            central_dict[key].append(value)
            importance[key] += value

        for key, value in unadjusted_harmonic:
            unadjusted_dict[key].append(value)

        for key, value in sorted_communicability:
            central_dict[key].append(value)
            importance[key] += value

        for key, value in unadjusted_communicability:
            unadjusted_dict[key].append(value)

        for key, value in sorted_core:
            importance[key] += value

        for key, value in sorted_degree1:
            importance[key] += value

        averages = {}
        acc = {}
        totals = {}
        groups = []
        for key, value in sorted_partition:
            val1 = booleans[key]
            if val1 not in groups:
                groups.append(val1)
                acc[val1] = 0
                totals[val1] = 0

        for key, value in importance.items():
            val1 = booleans[key]
            for item in groups:
                if (val1 == item):
                    acc[item] += value
                    totals[item] += 1

        for key, value in acc.items():
            averages[key] = acc[key] / totals[key]

        for key, value in sorted_partition:
            i += 1
            val1 = booleans[key]
            info += "{\"id\": \"" + str(key) + "\", \"group\": " + str(
                value) + ", \"question\": \"" + str(val1) + "\", "
            val = unadjusted_dict[key]

            info += "\"Centrality\": { \"Betweeness\": " + str(
                val[0]) + ", \"Degree\": " + str(
                    val[1]) + ", \"Eigenvector\": " + str(
                        val[2]) + ", \"Closeness\": " + str(
                            val[3]) + ", \"Harmonic\": " + str(
                                val[4]) + ", \"Communicability\": " + str(
                                    val[5]) + " } "
            if num == i:
                info += "} \n"
            else:
                info += "}, \n"
        info += "],\n \"links\":[\n"
        num = len(network)
        i = 0
        partition_dict = dict(sorted_partition)
        for key, value in sorted(network.items()):
            i += 1
            term = str(key).split(",")
            group = partition_dict[term[0]]
            if num == i:
                info += "{\"source\": \"" + term[
                    0] + "\", \"target\": \"" + term[
                        1] + "\", \"value\": \"" + str(
                            value) + "\", \"group\": " + str(group) + "}\n"
            else:
                info += "{\"source\": \"" + term[
                    0] + "\", \"target\": \"" + term[
                        1] + "\", \"value\": \"" + str(
                            value) + "\", \"group\": " + str(group) + "},\n"
        info += "],\n \"Unadjusted_centrality\":[\n"
        i = 0
        num = len(unadjusted_dict)
        for key, value in unadjusted_dict.items():
            i += 1
            val = value
            info += "{\"id\": \"" + str(
                key) + "\",\"Centrality\": { \"Betweeness\": " + str(
                    val[0]) + ", \"Degree\": " + str(
                        val[1]) + ", \"Eigenvector\": " + str(
                            val[2]) + ", \"Closeness\": " + str(
                                val[3]) + ", \"Harmonic\": " + str(
                                    val[4]) + " } "
            if num == i:
                info += "} \n"
            else:
                info += "}, \n"
        num = len(importance)
        i = 0
        info += "],\n \"importance\":[\n"
        for key, value in importance.items():
            i += 1
            val1 = booleans[key]
            info += "{\"id\": \"" + str(key) + "\", \"importance\": " + str(
                value) + ", \"question\": \"" + str(val1) + "\""
            if num == i:
                info += "} \n"
            else:
                info += "}, \n"
        num = len(acc)
        i = 0
        info += "],\n \"group_stats\":[\n"
        for key, value in acc.items():
            i += 1
            val2 = averages[key]
            val3 = totals[key]
            info += "{\"group\": \"" + str(
                key) + "\", \"total_importance\": " + str(
                    value) + ", \"average_importance\": " + str(
                        val2) + ", \"total_mentions\": " + str(val3)
            if num == i:
                info += "} \n"
            else:
                info += "}, \n"
        num = len(pageRank)
        i = 0
        info += "],\n \"pagerank\":[\n"
        for key, value in pageRank.items():
            i += 1
            info += "{\"id\": \"" + str(key) + "\", \"pagerank\": " + str(
                value)
            if num == i:
                info += "} \n"
            else:
                info += "}, \n"
                num = len(booleans)
        i = 0
        question = headers[2]
        info += "],\n \"question\":[\n"
        for key, value in booleans.items():
            i += 1
            info += "{\"id\": \"" + str(key) + "\", \"question\": \"" + str(
                value) + "\""
            if num == i:
                info += "} \n"
            else:
                info += "}, \n"
        info += "]\n},"
def createGraphFeatures(num_documents, clean_train_documents, unique_words,
                        sliding_window, b, idf_par, centrality_par,
                        centrality_col_par, normalized_centrality):
    features = np.zeros((num_documents, len(unique_words)))
    term_num_docs = {}

    print "Creating the graph of words for collection..."

    if centrality_col_par in ("pagerank_centrality", "out_degree_centrality",
                              "in_degree_centrality", "betweenness_centrality_directed",
                              "closeness_centrality_directed"):
        dGcol = nx.DiGraph()
    else:
        dGcol = nx.Graph()

    totalLen = 0
    for i in range(0, num_documents):
        #dG = nx.Graph()
        found_unique_words = []
        wordList1 = clean_train_documents[i].split(None)
        wordList2 = [string.rstrip(x.lower(), ',.!?;') for x in wordList1]

        docLen = len(wordList2)
        totalLen += docLen

        # print clean_train_documents[i]

        for k, word in enumerate(wordList2):

            if word not in found_unique_words:
                found_unique_words.append(word)
                if word not in term_num_docs:
                    term_num_docs[word] = 1
                else:
                    term_num_docs[word] += 1

            for j in xrange(1, sliding_window):
                try:
                    next_word = wordList2[k + j]

                    if not dGcol.has_node(word):
                        dGcol.add_node(word)
                        dGcol.node[word]['count'] = 1

                    else:
                        dGcol.node[word]['count'] += 1

                    if not dGcol.has_node(next_word):
                        dGcol.add_node(next_word)
                        dGcol.node[next_word]['count'] = 0

                    if not dGcol.has_edge(word, next_word):
                        dGcol.add_edge(word, next_word, weight=1)
                    else:
                        dGcol.edge[word][next_word]['weight'] += 1
                except IndexError:
                    if not dGcol.has_node(word):
                        dGcol.add_node(word)
                        dGcol.node[word]['count'] = 1
                    else:
                        dGcol.node[word]['count'] += 1
                except:
                    raise

    avgLen = float(totalLen) / num_documents
    print "Number of nodes in collection graph:" + str(dGcol.number_of_nodes())
    print "Number of edges in collection graph:" + str(dGcol.number_of_edges())

    print "Average document length:" + str(avgLen)
    print "Number of self-loops for collection graph:" + str(
        dGcol.number_of_selfloops())

    if idf_par == "icw":
        icw_col = {}
        dGcol.remove_edges_from(dGcol.selfloop_edges())

        nx.write_edgelist(dGcol, "test.edgelist")

        if centrality_col_par == "degree_centrality":
            centrality_col = nx.degree_centrality(dGcol)
        elif centrality_col_par == "pagerank_centrality":
            centrality_col = pg.pagerank(dGcol)
            # centrality_col = nx.pagerank(dGcol)
        elif centrality_col_par == "eigenvector_centrality":
            centrality_col = nx.eigenvector_centrality(dGcol,
                                                       max_iter=10000,
                                                       weight="weight")
        elif centrality_col_par == "katz_centrality":
            centrality_col = nx.katz_centrality(dGcol)
        elif centrality_col_par == "betweenness_centrality" or centrality_col_par == "betweenness_centrality_directed":
            centrality_col = nx.betweenness_centrality(dGcol)
        elif centrality_col_par == "triangles":
            centrality_col = nx.triangles(dGcol)
        elif centrality_col_par == "clustering_coefficient":
            centrality_col = nx.clustering(dGcol)
        elif centrality_col_par == "in_degree_centrality":
            centrality_col = nx.in_degree_centrality(dGcol)
        elif centrality_col_par == "out_degree_centrality":
            centrality_col = nx.out_degree_centrality(dGcol)
        elif centrality_col_par == "core_number":
            centrality_col = nx.core_number(dGcol)
        elif centrality_col_par == "closeness_centrality" or centrality_col_par == "closeness_centrality_directed":
            centrality_col = nx.closeness_centrality(dGcol, normalized=False)
        elif centrality_col_par == "communicability_centrality":
            centrality_col = nx.communicability_centrality(dGcol)

        centr_sum = sum(centrality_col.values())
        for k, g in enumerate(dGcol.nodes()):
            if centrality_col[g] > 0:
                icw_col[g] = math.log10(
                    (float(centr_sum)) / (centrality_col[g]))
            else:
                icw_col[g] = 0

    idf_col = {}
    for x in term_num_docs:
        idf_col[x] = math.log10(
            (float(num_documents) + 1.0) / (term_num_docs[x]))

    print "Creating the graph of words for each document..."
    totalNodes = 0
    totalEdges = 0
    for i in range(0, num_documents):

        if centrality_par in ("pagerank_centrality", "out_degree_centrality",
                              "in_degree_centrality", "betweenness_centrality_directed",
                              "closeness_centrality_directed"):
            dG = nx.DiGraph()
        else:
            dG = nx.Graph()

        wordList1 = clean_train_documents[i].split(None)
        wordList2 = [string.rstrip(x.lower(), ',.!?;') for x in wordList1]
        docLen = len(wordList2)

        if docLen == 2:
            print wordList2
        if docLen > 1 and wordList2[0] != wordList2[1]:
            # print clean_train_documents[i]
            for k, word in enumerate(wordList2):
                for j in xrange(1, sliding_window):
                    try:
                        next_word = wordList2[k + j]
                        if not dG.has_node(word):
                            dG.add_node(word)
                            dG.node[word]['count'] = 1
                        else:
                            dG.node[word]['count'] += 1
                        if not dG.has_node(next_word):
                            dG.add_node(next_word)
                            dG.node[next_word]['count'] = 0

                        if not dG.has_edge(word, next_word):
                            dG.add_edge(word, next_word, weight=1)
                        else:
                            dG.edge[word][next_word]['weight'] += 1
                    except IndexError:
                        if not dG.has_node(word):
                            dG.add_node(word)
                            dG.node[word]['count'] = 1
                        else:
                            dG.node[word]['count'] += 1
                    except:
                        raise

            dG.remove_edges_from(dG.selfloop_edges())

            if centrality_par == "degree_centrality":
                if normalized_centrality == True:
                    centrality = nx.degree_centrality(dG)
                elif normalized_centrality == False:
                    centrality = degree_centrality(dG)
            elif centrality_par == "clustering_coefficient":
                centrality = nx.clustering(dG)
            elif centrality_par == "pagerank_centrality":
                # centrality = pg.pagerank(dG,max_iter=10000)
                centrality = nx.pagerank(dG)
            elif centrality_par == "eigenvector_centrality":
                centrality = nx.eigenvector_centrality(dG, max_iter=10000)
            elif centrality_par == "katz_centrality":
                centrality = nx.katz_centrality(dG, normalized=False)
            elif centrality_par == "betweenness_centrality" or centrality_par == "betweenness_centrality_directed":
                centrality = nx.betweenness_centrality(dG, normalized=False)
            elif centrality_par == "triangles":
                centrality = nx.triangles(dG)
            elif centrality_par == "in_degree_centrality":
                if normalized_centrality == True:
                    centrality = nx.in_degree_centrality(dG)
                elif normalized_centrality == False:
                    centrality = in_degree_centrality(dG)

            elif centrality_par == "out_degree_centrality":
                if normalized_centrality == True:
                    centrality = nx.out_degree_centrality(dG)
                elif normalized_centrality == False:
                    centrality = out_degree_centrality(dG)
            elif centrality_par == "core_number":
                centrality = nx.core_number(dG)
            elif centrality_par == "weighted_centrality":
                if normalized_centrality == True:
                    centrality = weighted_centrality_normalized(dG)
                elif normalized_centrality == False:
                    centrality = weighted_centrality(dG)
            elif centrality_par == "closeness_centrality" or centrality_par == "closeness_centrality_directed":
                centrality = nx.closeness_centrality(dG, normalized=False)
            elif centrality_par == "communicability_centrality":
                centrality = nx.communicability_centrality(dG)

            totalNodes += dG.number_of_nodes()
            totalEdges += dG.number_of_edges()
            #print "Number of self-loops:"+str(dG.number_of_selfloops())
            #centrality = nx.out_degree_centrality(dG)
            #centrality = nx.katz_centrality(dG,max_iter=10000)

            for k, g in enumerate(dG.nodes()):
                # Degree centrality (local feature)
                if g in unique_words:
                    #features[i,unique_words.index(g)] = dG.degree(nbunch=g,weight='weight') * idf_col[g]
                    if idf_par == "no" or idf_par == "yes":
                        #alternative: centrality[g]/(1-b+(b*(float(docLen)/avgLen))) or dG.node[g]['count']
                        features[i, unique_words.index(g)] = centrality[g]
                    elif idf_par == "idf":
                        #alternative: (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * idf_col[g]
                        features[i, unique_words.index(g)] = centrality[g] * idf_col[g]
                    elif idf_par == "icw":
                        features[i, unique_words.index(g)] = (
                            centrality[g] /
                            (1 - b + (b * (float(docLen) / avgLen)))) * icw_col[g]

    print "Average number of nodes:" + str(float(totalNodes) / num_documents)
    print "Average number of edges:" + str(float(totalEdges) / num_documents)
    return features
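
A hypothetical call (the document collection, parameter values, and centrality choices below are illustrative only) might look like:

docs = ["the quick brown fox jumps over the lazy dog", "a lazy dog sleeps"]
vocab = sorted(set(" ".join(docs).split()))
# window=3, b=0.003 and "idf" weighting with per-document degree centrality are illustrative
feats = createGraphFeatures(len(docs), docs, vocab, 3, 0.003, "idf",
                            "degree_centrality", "degree_centrality", True)
print feats.shape  # (2, len(vocab))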
import networkx as nx
import numpy
import sys

import scipy.linalg
G=nx.read_adjlist("sfba-inipy.txt",delimiter=" ",nodetype=int)
#G=nx.read_adjlist("test.gpfc",delimiter=" ",nodetype=int)
sc = nx.communicability_centrality(G)
print sc
sys.exit (0)

# alternative implementation that calculates the matrix exponential
    
nodelist = G.nodes() # ordering of nodes in matrix
A = nx.to_numpy_matrix(G,nodelist)
# convert to 0-1 matrix
A[A!=0.0] = 1

print "A matrix"
print A
print

expA = scipy.linalg.expm(A)

print expA
print 

# convert diagonal to dictionary keyed by node
sc = dict(zip(nodelist,map(float,expA.diagonal())))
print  sc
Example no. 29
def compute_network_features(graph, network_name):
    # this function is used to calculate network features and returns result
    # as a dictionary: Dict<feature_name,feature_value>
    # --------------------------------------------------------------------------------

    network_features = dict()
    node_features_list = list()

    netclass = network_name.split('___')

    if len(netclass) > 1:
        network_features['group'] = netclass[0]
        network_features['Network Name'] = netclass[1]
    else:
        network_features['group'] = '_unknown_'
        network_features['Network Name'] = network_name

    if graph.is_directed():
        network_features['Is Directed?'] = True
    else:
        network_features['Is Directed?'] = False

    if graph.is_multigraph():
        network_features['Is MultiGraph?'] = True
    else:
        network_features['Is MultiGraph?'] = False

    # Global Attributes
    # --------------------------------------------------------------------------------
    # number of nodes
    # --------------------------------------------------------------------------------
    if _nn:
        try:
            nodes_count = nx.number_of_nodes(graph)
            network_features['Number of Nodes'] = nodes_count
        except:
            network_features['Number of Nodes'] = 'NA'

    # number of edges
    # --------------------------------------------------------------------------------
    if _ne:
        try:
            edges_count = nx.number_of_edges(graph)
            network_features['Number of Edges'] = edges_count
        except:
            network_features['Number of Edges'] = 'NA'

    # network density
    # --------------------------------------------------------------------------------
    if _dens:
        try:
            density = nx.density(graph)
            network_features['Density'] = density
        except:
            network_features['Density'] = 'NA'

    # graph degree assortativity
    # --------------------------------------------------------------------------------
    if _dac:
        try:
            graph_degree_assortativity = nx.degree_assortativity_coefficient(
                graph)
            network_features[
                'Graph Degree Assortativity'] = graph_degree_assortativity
        except:
            network_features['Graph Degree Assortativity'] = 'NA'

    # avg. closeness centrality
    # --------------------------------------------------------------------------------
    if _acc:
        try:
            ccn = nx.closeness_centrality(graph)
            mccn = np.mean(ccn.values())
            network_features['Avg. Closeness Centrality'] = mccn
        except:
            network_features['Avg. Closeness Centrality'] = 'NA'

    # avg. betweenness centrality
    # --------------------------------------------------------------------------------
    if _abc:
        try:
            bcn = nx.betweenness_centrality(graph)
            mbcn = np.mean(bcn.values())
            network_features['Avg. Betweenness Centrality'] = mbcn
        except:
            network_features['Avg. Betweenness Centrality'] = 'NA'

    # avg. degree centrality
    # --------------------------------------------------------------------------------
    if _adc:
        try:
            dcn = nx.degree_centrality(graph)
            mdcn = np.mean(dcn.values())
            network_features['Avg. Degree Centrality'] = mdcn
        except:
            network_features['Avg. Degree Centrality'] = 'NA'

    # avg. degree connectivity
    # --------------------------------------------------------------------------------
    if _adcon:
        try:
            dc = nx.average_degree_connectivity(graph)
            adc = np.mean(dc.values())
            network_features['Avg. Degree Connectivity'] = adc
        except:
            network_features['Avg. Degree Connectivity'] = 'NA'

    # avg. load centrality
    # --------------------------------------------------------------------------------
    if _alc:
        try:
            lc = nx.load_centrality(graph)
            mlc = np.mean(lc.values())
            network_features['Avg. Load Centrality'] = mlc
        except:
            network_features['Avg. Load Centrality'] = 'NA'

    # avg. edge betweenness centrality
    # --------------------------------------------------------------------------------

    # try:
    #     ebc = nx.edge_betweenness_centrality(graph)
    #     mebc = np.mean(ebc.values())
    #     network_features['Avg. Edge Betweenness centrality'] = mebc
    # except:
    #     network_features['Avg. Edge Betweenness centrality'] = 'NA'

    # edge connectivity
    # --------------------------------------------------------------------------------
    # try:
    #     ec = nx.edge_connectivity(graph)
    #     network_features['Edge Connectivity'] = ec
    # except:
    #     network_features['Edge Connectivity'] = 'NA'

    # diameter
    # --------------------------------------------------------------------------------
    if _nd:
        try:
            diameter = nx.diameter(graph)
            network_features['Diameter'] = diameter
        except:
            network_features['Diameter'] = 'NA'

    # eccentricity
    # --------------------------------------------------------------------------------
    if _ae:
        try:
            eccentricity = nx.eccentricity(graph)
            network_features['Avg. Eccentricity'] = np.mean(
                eccentricity.values())
        except:
            network_features['Eccentricity'] = 'NA'

    # radius
    # --------------------------------------------------------------------------------
    if _rad:
        try:
            radius = nx.radius(graph)
            network_features['Radius'] = radius
        except:
            network_features['Radius'] = 'NA'

    # Non MultiGraph Features
    # --------------------------------------------------------------------------------
    if not graph.is_multigraph():

        # transitivity
        # ----------------------------------------------------------------------------
        if _trans:
            try:
                transitivity = nx.transitivity(graph)
                network_features['Transitivity'] = transitivity
            except:
                network_features['Transitivity'] = 'NA'

        # Katz centrality
        # ----------------------------------------------------------------------------
        if _akc:
            try:
                katz = nx.katz_centrality(graph)
                mean_katz = np.mean(katz.values())
                network_features['Avg. Katz Centrality'] = mean_katz
            except:
                network_features['Avg. Katz Centrality'] = 'NA'

        # PageRank
        # ----------------------------------------------------------------------------
        if _ap:
            try:
                pagerank = nx.pagerank(graph)
                mean_pagerank = np.mean(pagerank.values())
                network_features['Avg. PageRank'] = mean_pagerank
            except:
                network_features['Avg. PageRank'] = 'NA'

    # Undirected Graphs
    # --------------------------------------------------------------------------------

    if not nx.is_directed(graph):

        # Degree
        # ----------------------------------------------------------------------------
        #
        # try:
        #     all_degrees = nx.degree(graph)
        #     mean_degrees = np.mean(all_degrees.values())
        #     network_features['Avg. Degree'] = mean_degrees
        # except:
        #     network_features['Avg. Degree'] = 'NA'

        # connected components
        # ----------------------------------------------------------------------------
        if _nocc:
            try:
                cc_number = nx.number_connected_components(graph)
                network_features['Number of Connected Components'] = cc_number
            except:
                network_features['Number of Connected Components'] = 'NA'

        # lcc size fraction && avg. cc size
        # ----------------------------------------------------------------------------
        if _accs or _lcc_size:
            try:
                cc_list = list(nx.connected_components(graph))
                cc_sizes = []
                for cc in cc_list:
                    cc_sizes.append(len(cc))

                lcc_size = np.max(cc_sizes)
                if _accs:
                    network_features['lcc_size_fraction'] = lcc_size / float(
                        nodes_count)
                if _lcc_size:
                    mean_cc_sizes = np.mean(cc_sizes)
                    network_features[
                        'Avg. Connected Component Size'] = mean_cc_sizes
            except:
                if _accs:
                    network_features['lcc_size_fraction'] = 'NA'
                if _lcc_size:
                    network_features['Avg. Connected Component Size'] = 'NA'

        # communicability centrality for Undirected networks
        # ----------------------------------------------------------------------------
        if not graph.is_multigraph():
            if _acoc:
                try:
                    cc = nx.communicability_centrality(graph)
                    mcc = np.mean(cc.values())
                    network_features['Avg. Communicability Centrality'] = mcc
                except:
                    network_features['Avg. Communicability Centrality'] = 'NA'

            # clustering coefficient
            # -------------------------------------------------------------------------
            if _ncc:
                try:
                    clustering_coefficient = nx.average_clustering(graph)
                    network_features[
                        'Network Clustering Coefficient'] = clustering_coefficient
                except:
                    network_features['Network Clustering Coefficient'] = 'NA'

        # clique analysis for Undirected networks
        # ----------------------------------------------------------------------------
        if _max_cs:
            try:
                cliques_obj = nx.find_cliques(graph)
                cliques = [clq for clq in cliques_obj]

                clique_sizes = []
                for c in cliques:
                    clique_sizes.append(len(c))

                # user_clique_size = 5
                if len(clique_sizes) > 0:
                    # network_features['No of Cliques with size ' + str(user_clique_size)] \
                    # = clique_sizes.count(user_clique_size)
                    network_features['Avg. Clique Size'] = np.mean(
                        clique_sizes)
                    network_features['Max Clique Size'] = np.max(clique_sizes)
                else:
                    # network_features['No of Cliques with size ' + str(user_clique_size)] = 0
                    network_features['Avg. Clique Size'] = 0
                    network_features['Max Clique Size'] = 0
            except:
                # network_features['No of Cliques with size ' + str(user_clique_size)] = 'NA'
                network_features['Avg. Clique Size'] = 'NA'
                network_features['Max Clique Size'] = 'NA'

                # else:
                # try:
                #     all_in_degrees = nx.DiGraph.in_degree(graph)
                #     all_out_degrees = nx.DiGraph.out_degree(graph)
                #
                #     mean_in_degrees = np.mean(all_in_degrees.values())
                #     mean_out_degrees = np.mean(all_out_degrees.values())
                #
                #     network_features['Avg. In Degree'] = mean_in_degrees
                #     network_features['Ave. Out Degree'] = mean_out_degrees
                # except:
                #     network_features['Avg. In Degree'] = 'NA'
                #     network_features['Ave. Out Degree'] = 'NA'

    # Node Features Calculation

    for node in graph.nodes():

        node_features = dict()

        try:
            node_features['group'] = network_name
        except:
            node_features['group'] = 'NA'

        if _abc:
            try:
                node_features['Betweenness Centrality'] = bcn[node]
            except:
                node_features['Betweenness Centrality'] = 'NA'

        if _acc:
            try:
                node_features['Closeness Centrality'] = ccn[node]
            except:
                node_features['Closeness Centrality'] = 'NA'

        if _adc:
            try:
                node_features['Degree Centrality'] = dcn[node]
            except:
                node_features['Degree Centrality'] = 'NA'

        if _alc:
            try:
                node_features['Load Centrality'] = lc[node]
            except:
                node_features['Load Centrality'] = 'NA'

        if _ae:
            try:
                node_features['Eccentricity'] = eccentricity[node]
            except:
                node_features['Eccentricity'] = 'NA'

        if not graph.is_multigraph():
            if _akc:
                try:
                    node_features['Katz Centrality'] = katz[node]
                except:
                    node_features['Katz Centrality'] = 'NA'

            if _ap:
                try:
                    node_features['PageRank'] = pagerank[node]
                except:
                    node_features['PageRank'] = 'NA'

        if not nx.is_directed(graph):
            # try:
            #     node_features['Degree'] = all_degrees[node]
            # except:
            #     node_features['Degree'] = 'NA'

            if not graph.is_multigraph():
                if _acoc:
                    try:
                        node_features['Communicability Centrality'] = cc[node]
                    except:
                        node_features['Communicability Centrality'] = 'NA'
                        # else:
                        # try:
                        #     node_features['In Degree'] = all_in_degrees[node]
                        # except:
                        #     node_features['In Degree'] = 'NA'
                        #
                        # try:
                        #     node_features['Out Degree'] = all_out_degrees[node]
                        # except:
                        #     node_features['Out Degree'] = 'NA'

        node_features_list.append(node_features)

    return network_features, node_features_list
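
Note: these snippets target NetworkX 1.x, where communicability_centrality() still exists. In NetworkX 2.0 it and communicability_centrality_exp() were renamed to subgraph_centrality() and subgraph_centrality_exp(), so a small compatibility alias (a sketch assuming the rename is the only difference that matters here) keeps the examples runnable on current releases:

import networkx as nx

# Re-expose the old 1.x names on top of the 2.x functions when they are missing.
if not hasattr(nx, 'communicability_centrality'):
    nx.communicability_centrality = nx.subgraph_centrality
if not hasattr(nx, 'communicability_centrality_exp'):
    nx.communicability_centrality_exp = nx.subgraph_centrality_exp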
Example 30
    def centrality(self):
        result = {}
        result['degree_centrality'] = nx.degree_centrality(self.graph)

        if self.directed == 'directed':
            result['in_degree_centrality'] = nx.in_degree_centrality(
                self.graph)
            result['out_degree_centrality'] = nx.out_degree_centrality(
                self.graph)

        result['closeness_centrality'] = nx.closeness_centrality(self.graph)
        result['betweenness_centrality'] = nx.betweenness_centrality(
            self.graph)

        # fix: tuple keys can't be encoded to JSON, so stringify them
        stringify_temp = {}
        temp = nx.edge_betweenness_centrality(self.graph)
        for key in temp.keys():
            stringify_temp[str(key)] = temp[key]
        result['edge_betweenness_centrality'] = stringify_temp

        if self.directed == 'undirected':
            result[
                'current_flow_closeness_centrality'] = nx.current_flow_closeness_centrality(
                    self.graph)
            result[
                'current_flow_betweenness_centrality'] = nx.current_flow_betweenness_centrality(
                    self.graph)

            stringify_temp = {}
            temp = nx.edge_current_flow_betweenness_centrality(self.graph)
            for key in temp.keys():
                stringify_temp[str(key)] = temp[key]
            result['edge_current_flow_betweenness_centrality'] = stringify_temp

            result[
                'approximate_current_flow_betweenness_centrality'] = nx.approximate_current_flow_betweenness_centrality(
                    self.graph)
            result['eigenvector_centrality'] = nx.eigenvector_centrality(
                self.graph)
            result[
                'eigenvector_centrality_numpy'] = nx.eigenvector_centrality_numpy(
                    self.graph)
            result['katz_centrality'] = nx.katz_centrality(self.graph)
            result['katz_centrality_numpy'] = nx.katz_centrality_numpy(
                self.graph)
            result['communicability'] = nx.communicability(self.graph)
            result['communicability_exp'] = nx.communicability_exp(self.graph)
            result[
                'communicability_centrality'] = nx.communicability_centrality(
                    self.graph)
            result[
                'communicability_centrality_exp'] = nx.communicability_centrality_exp(
                    self.graph)
            result[
                'communicability_betweenness_centrality'] = nx.communicability_betweenness_centrality(
                    self.graph)
            result['estrada_index'] = nx.estrada_index(self.graph)

        result['load_centrality'] = nx.load_centrality(self.graph)

        stringify_temp = {}
        temp = nx.edge_load(self.graph)
        for key in temp.keys():
            stringify_temp[str(key)] = temp[key]
        result['edge_load'] = stringify_temp
        result['dispersion'] = nx.dispersion(self.graph)

        fname_centra = self.DIR + '/centrality.json'
        with open(fname_centra, "w") as f:
            json.dump(result, f, cls=SetEncoder, indent=2)
        print(fname_centra)
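
The json.dump() call above relies on a SetEncoder class defined elsewhere in that project; a minimal sketch of what it presumably does (an assumption, not the original definition) is a JSONEncoder that serializes sets as lists:

import json

class SetEncoder(json.JSONEncoder):
    # json cannot encode sets (e.g. values produced by some NetworkX calls),
    # so convert them to plain lists before encoding.
    def default(self, obj):
        if isinstance(obj, set):
            return list(obj)
        return json.JSONEncoder.default(self, obj)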
Example 31
    def communicability_centrality_exp(self):
        self.communicability_centrality_exp_dict = nx.communicability_centrality_exp(self.G)
Example 32
plt.plot(fraction_of_nodes, size_max_component, label='Degree', linewidth=2)

# --------------- Remove by subgraph centrality -------------- #

size_max_component = np.zeros(nodes_to_remove, dtype=float)
graph_aux = deepcopy(graph)
graph_aux_nx = deepcopy(graph_nx)

for j in range(nodes_to_remove):

    # Compute and store the size of the largest component
    graph_aux2 = graph_aux.clusters()
    size_max_component[j] +=  float(max(graph_aux2.sizes())) \
                                  / size_of_large_connected_component

    criteria = nx.communicability_centrality(graph_aux_nx).items()
    criteria.sort(reverse=True, key=lambda item: item[1])
    # Take the first element, the one with the highest subgraph centrality
    vertex_ind = criteria[0][0]

    # Remove the vertex
    graph_aux.delete_vertices(vertex_ind)
    graph_aux_nx.remove_node(vertex_ind)

plt.figure(1)
plt.plot(fraction_of_nodes,
         size_max_component,
         label='SubGraph (iter)',
         linewidth=2)
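
The loop above keeps an igraph copy (graph_aux) for component sizes and a NetworkX copy (graph_aux_nx) for the centrality. A sketch of the same targeted-removal experiment using NetworkX only (assuming an undirected graph; subgraph_centrality is the NetworkX 2.x name for communicability_centrality):

from copy import deepcopy
import networkx as nx

def removal_by_subgraph_centrality(graph_nx, nodes_to_remove):
    # Record the relative size of the largest connected component while
    # repeatedly deleting the node with the highest subgraph centrality.
    g = deepcopy(graph_nx)
    initial_lcc = max(len(c) for c in nx.connected_components(g))
    sizes = []
    for _ in range(nodes_to_remove):
        lcc = max(len(c) for c in nx.connected_components(g))
        sizes.append(lcc / float(initial_lcc))
        centrality = nx.subgraph_centrality(g)
        target = max(centrality, key=centrality.get)
        g.remove_node(target)
    return sizes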

# --------- Do it all at once ----------- #
Example 33
                     g5.add_edge(fid, partner, form=int(form), aggform=int(aggform), consolform=int(consolform))  
     
 
     d=nx.degree(mg)
     nx.set_node_attributes(mg,'d',d)
     dc=nx.degree_centrality(mg)
     nx.set_node_attributes(mg,'dc',dc)
     ec=nx.eigenvector_centrality(g, 10000)
     nx.set_node_attributes(g,'ec',ec)     
     bc=nx.betweenness_centrality(mg)
     nx.set_node_attributes(mg,'bc',bc)
     cc=nx.closeness_centrality(mg)
     nx.set_node_attributes(mg,'cc',cc)
     cl=nx.clustering(g)
     nx.set_node_attributes(g,'cl',cl)
     co=nx.communicability_centrality(g)
     nx.set_node_attributes(g,'co',co)
                
     d=nx.degree(g1)
     nx.set_node_attributes(g1,'d',d)
     d=nx.degree(g2)
     nx.set_node_attributes(g2,'d',d)
     d=nx.degree(g3)
     nx.set_node_attributes(g3,'d',d)
     d=nx.degree(g4)
     nx.set_node_attributes(g4,'d',d)
     d=nx.degree(g5)
     nx.set_node_attributes(g5,'d',d)
     
     #projected eigenvector centrality
     bio_nodes = set(n for n in g.nodes() if n < 1000 and n > 0)
Example 34
def createGraphFeatures(num_documents,clean_train_documents,unique_words,bigrams,sliding_window,b,idf_par,centrality_par,centrality_col_par,train_par,idf_learned,icw_learned,kcore_par,dGcol_nodes,max_core_col,kcore_par_int,max_core_feat,feature_reduction,avgLen):
    features = np.zeros((num_documents,len(unique_words)))
    unique_words_len = len(unique_words)
    term_num_docs = {}

    print "sliding_window:"+str(sliding_window)
    if train_par:
        print "Training set..."
        idfs = {}
        dGcol_nodes = {}
        icws = {}
        max_core_feat = []

        print "Creating the graph of words for collection..."

        if centrality_col_par=="pagerank_centrality" or centrality_col_par=="in_degree_centrality" or centrality_col_par=="out_degree_centrality" or centrality_col_par=="closeness_centrality_directed" or centrality_col_par=="betweenness_centrality_directed":
            dGcol = nx.DiGraph()
        else:
            dGcol = nx.Graph()
        
        totalLen = 0
        for i in range( 0,num_documents ):
            #dG = nx.Graph()
            found_unique_words = []
            wordList1 = clean_train_documents[i].split(None)
            wordList2 = [string.rstrip(x.lower(), ',.!?;') for x in wordList1]

            docLen = len(wordList2)
            totalLen += docLen

            # print clean_train_documents[i]
            if len(wordList2)>1:
                for k, word in enumerate(wordList2):

                    if word not in found_unique_words:
                        found_unique_words.append(word)
                        if word not in term_num_docs:
                            term_num_docs[word] = 1
                        else:
                            term_num_docs[word] += 1

                    for j in xrange(1,sliding_window):
                        try:
                            next_word = wordList2[k + j]
                            # print word+"\t"+next_word
                            # time.sleep(2)
                            if not dGcol.has_node(word):
                                dGcol.add_node(word)
                                dGcol.node[word]['count'] = 1
                                
                            else:
                                dGcol.node[word]['count'] += 1
                                
                            if not dGcol.has_node(next_word):
                                dGcol.add_node(next_word)
                                dGcol.node[next_word]['count'] = 0

                            if not dGcol.has_edge(word, next_word):
                                dGcol.add_edge(word, next_word, weight = 1)
                            else:
                                dGcol.edge[word][next_word]['weight'] += 1
                        except IndexError:
                            if not dGcol.has_node(word):
                                dGcol.add_node(word)
                                dGcol.node[word]['count'] = 1
                            else:
                                dGcol.node[word]['count'] += 1
                        except:
                            raise

        print "Number of self-loops for collection graph:"+str(dGcol.number_of_selfloops())
        dGcol.remove_edges_from(dGcol.selfloop_edges())
        collection_count_nodes = dGcol.number_of_nodes()
        collection_count_edges = dGcol.number_of_edges()
        print "Number of nodes in collection graph:"+str(collection_count_nodes)
        print "Number of edges in collection graph:"+str(collection_count_edges)
        avgLen = float(totalLen)/num_documents
        print "Average document length:"+str(avgLen)
        
   
        if idf_par=="icw" or idf_par=="icw+idf" or idf_par=="tf-icw":
            icw_col = {}

            if(kcore_par=="A1" or kcore_par=="A2"):
                collection_core = nx.core_number(dGcol)
                max_core = max(collection_core.values())
                print "Max core of collection:"+str(max_core)
                # core_Size_Distribution(collection_core)
                for k,g in enumerate(dGcol.nodes()):
                    if kcore_par=="A1":
                        # A1 method: remove features and then rank
                        for x in range(0,kcore_par_int):
                            if collection_core[g]==max_core-x:
                                dGcol.remove_node(g)
                    else:
                        # A2 method: rank first and then remove features
                        for x in range(0,kcore_par_int):
                            if collection_core[g]==max_core-x:
                                max_core_col.append(g)


            if centrality_col_par == "degree_centrality":
                centrality_col = nx.degree_centrality(dGcol)
            elif centrality_col_par=="in_degree_centrality":
                centrality_col = nx.in_degree_centrality(dGcol)
            elif centrality_col_par=="out_degree_centrality":
                centrality_col = nx.out_degree_centrality(dGcol)
            elif centrality_col_par == "pagerank_centrality":
                # centrality_col = pg.pagerank(dGcol,max_iter=1000)
                centrality_col = nx.pagerank(dGcol)
            elif centrality_col_par == "eigenvector_centrality":
                centrality_col = nx.eigenvector_centrality(dGcol,max_iter=1000)
            elif centrality_col_par == "betweenness_centrality" or centrality_col_par=="betweenness_centrality_directed":
                centrality_col = nx.betweenness_centrality(dGcol)
            elif centrality_col_par == "triangles":
                centrality_col = nx.triangles(dGcol)
            elif centrality_col_par == "clustering_coefficient":
                centrality_col = nx.clustering(dGcol)
            elif centrality_col_par == "core_number":
                centrality_col = nx.core_number(dGcol)
            elif centrality_col_par == "closeness_centrality" or centrality_col_par=="closeness_centrality_directed":
                centrality_col = nx.closeness_centrality(dGcol)
            elif centrality_col_par == "closeness_centrality_weighted":
                centrality_col = nx.closeness_centrality(dGcol)
            elif centrality_col_par == "communicability_centrality":
                centrality_col = nx.communicability_centrality(dGcol)

            centr_sum = sum(centrality_col.values())
            for k,g in enumerate(dGcol.nodes()):
                if centrality_col[g]!=0:
                    if idf_par=="icw" or idf_par=="tf-icw" or idf_par=="icw+idf":
                        icw_col[g] = math.log10(float(centr_sum)/centrality_col[g])
                else:
                    icw_col[g] = 0

        # elif idf_par=="idf":
        idf_col = {}
        for x in term_num_docs:
            if idf_par=="idf":
                idf_col[x] = math.log10((float(num_documents)+1.0) / term_num_docs[x])
            elif idf_par=="icw+idf":
                idf_col[x] = math.log10((float(num_documents)+1.0) / term_num_docs[x])

        dGcol_nodes = dGcol.nodes()

    # for the testing set
    else:

        if idf_par=="idf":
            idf_col = idf_learned
        elif idf_par=="icw" or idf_par=="tf-icw":
            icw_col = icw_learned
        elif idf_par=="icw+idf":
            idf_col = idf_learned
            icw_col = icw_learned

        collection_count_nodes = 0
        collection_count_edges = 0

    # nx.write_edgelist(dGcol,"test.edgelist",data=True,delimiter="\t")

    print "Creating the graph of words for each document..."
    totalNodes = 0
    totalEdges = 0

    corrs_per_category = [[] for i in range(4)]

    for i in range( 0,num_documents ):

        if centrality_par=="pagerank_centrality" or centrality_par=="in_degree_centrality" or centrality_par=="out_degree_centrality" or centrality_par=="closeness_centrality_directed" or centrality_par=="betweenness_centrality_directed":
            dG = nx.DiGraph()
        else:
            dG = nx.Graph()

        wordList1 = clean_train_documents[i].split(None)
        wordList2 = [string.rstrip(x.lower(), ',.!?;') for x in wordList1]
        docLen = len(wordList2)

        if len(wordList2)>1:
            for k, word in enumerate(wordList2):
                for j in xrange(1,sliding_window):
                    try:
                        next_word = wordList2[k + j]
                        
                        if not dG.has_node(word):
                            dG.add_node(word)
                            dG.node[word]['count'] = 1
                        else:
                            dG.node[word]['count'] += 1

                        if not dG.has_node(next_word):
                            dG.add_node(next_word)
                            dG.node[next_word]['count'] = 1

                        if not dG.has_edge(word, next_word):
                            dG.add_edge(word, next_word, weight = 1)
                        else:
                            dG.edge[word][next_word]['weight'] += 1
                    except IndexError:
                        if not dG.has_node(word):
                            dG.add_node(word)
                            dG.node[word]['count'] = 1
                        else:
                            dG.node[word]['count'] += 1
                    except:
                        raise

            dG.remove_edges_from(dG.selfloop_edges())
            for node1, node2 in dG.edges_iter():
                dG.edge[node1][node2]['inv_weight'] = 1.0 / dG.edge[node1][node2]['weight']

            if train_par:
                if(kcore_par=="B1" or kcore_par=="B2"):
                    max_core_doc = []
                    document_core = nx.core_number(dG)
                    max_core = max(document_core.values())
                    # print "Max core of document:"+str(max_core)
                    # core_Size_Distribution(document_core)
                    for k,g in enumerate(dG.nodes()):
                        if kcore_par=="B1":
                            # B1 method: remove features and then rank
                            for x in range(0,kcore_par_int):
                                if document_core[g]==max_core-x:
                                    dG.remove_node(g)
                        else:
                            # B2 method: rank first and then remove features
                            for x in range(0,kcore_par_int):
                                if document_core[g]==max_core-x:
                                    max_core_doc.append(g)
                                    if g not in max_core_feat:
                                        max_core_feat.append(g)
            
            # centrality = nx.degree_centrality(dG)
            #centrality = nx.core_number(dG)
            if centrality_par == "degree_centrality":
                centrality = nx.degree_centrality(dG)
            elif centrality_par == "in_degree_centrality":
                centrality = nx.in_degree_centrality(dG)
            elif centrality_par == "out_degree_centrality":
                centrality = nx.out_degree_centrality(dG)
            elif centrality_par == "pagerank_centrality":
                # centrality = pg.pagerank(dG,max_iter=1000)
                centrality = nx.pagerank(dG)
            elif centrality_par =="betweenness_centrality" or centrality_par=="betweenness_centrality_directed":
                centrality = nx.betweenness_centrality(dG,weight="weight")
            elif centrality_par =="triangles":
                centrality = nx.triangles(dG)
            elif centrality_par =="eigenvector_centrality":
                centrality = nx.eigenvector_centrality_numpy(dG)
            elif centrality_par =="core_number":
                centrality = nx.core_number(dG)
            elif centrality_par =="clustering_coefficient":
                centrality = nx.clustering(dG)
            elif centrality_par == "closeness_centrality" or centrality_par=="closeness_centrality_directed":
                centrality = nx.closeness_centrality(dG)
            elif centrality_par == "closeness_centrality_weighted":
                centrality = nx.closeness_centrality(dG,distance='weight')
            elif centrality_par == "communicability_centrality":
                centrality = nx.communicability_centrality(dG)
            elif centrality_par == "closeness_centrality_not_normalized":
                centrality = nx.closeness_centrality(dG,normalized=False)
            elif centrality_par == "degree_centrality_weighted":
                centrality = weighted_degree_centrality(dG)
            #print "Number of self-loops:"+str(dG.number_of_selfloops())
            #centrality = nx.out_degree_centrality(dG)
            #centrality = pg.pagerank(dG,max_iter=1000)
            #centrality = nx.katz_centrality(dG,max_iter=10000)

            totalNodes += dG.number_of_nodes()
            totalEdges += dG.number_of_edges()

            tfs = []
            centralities = []
            centr_sum_doc = sum(centrality.values())

            for k, g in enumerate(dG.nodes()):
                if g in dGcol_nodes:
                    if kcore_par=="B2":
                        if g in max_core_feat:
                            # Degree centrality (local feature)
                            if g in unique_words:
                                #features[i,unique_words.index(g)] = dG.degree(nbunch=g,weight='weight') * idf_col[g]
                                if idf_par=="no":
                                    features[i,unique_words.index(g)] = centrality[g]/(1-b+(b*(float(docLen)/avgLen)))
                                elif idf_par=="idf":
                                    features[i,unique_words.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * idf_col[g]
                                    # features[i,unique_words.index(g)] = centrality[g] * idf_col[g]
                                elif idf_par=="icw":
                                    features[i,unique_words.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * icw_col[g]
                                    # features[i,unique_words.index(g)] = centrality[g] * icw_col[g]
                                elif idf_par=="icw+idf":
                                    features[i,unique_words.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * icw_col[g] * idf_col[g]
                                    # features[i,unique_words.index(g)] = centrality[g] * math.log10(icw_col[g] * idf_col[g])

                            elif g in bigrams:
                                #features[i,unique_words.index(g)] = dG.degree(nbunch=g,weight='weight') * idf_col[g]
                                if idf_par=="no":
                                    features[i,unique_words_len+bigrams.index(g)] = centrality[g]/(1-b+(b*(float(docLen)/avgLen)))
                                elif idf_par=="idf":
                                    features[i,unique_words_len+bigrams.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * idf_col[g]
                                    # features[i,unique_words.index(g)] = centrality[g] * idf_col[g]
                                elif idf_par=="icw":
                                    features[i,unique_words_len+bigrams.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * icw_col[g]
                                    # features[i,unique_words.index(g)] = centrality[g] * icw_col[g]
                                elif idf_par=="icw+idf":
                                    features[i,unique_words_len+bigrams.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * icw_col[g] * idf_col[g]
                                    # features[i,unique_words.index(g)] = centrality[g] * math.log10(icw_col[g] * idf_col[g])
                    else:
                        if g in unique_words:
                            #features[i,unique_words.index(g)] = dG.degree(nbunch=g,weight='weight') * idf_col[g]
                            if idf_par=="no":
                                features[i,unique_words.index(g)] = centrality[g]/(1-b+(b*(float(docLen)/avgLen)))
                                tfs.append(wordList2.count(g))
                                centralities.append(centrality[g])
                            elif idf_par=="tf-icw":
                                tf_g = 1+math.log(1+math.log(wordList2.count(g)))
                                features[i,unique_words.index(g)] = (tf_g/(1-b+(b*(float(docLen)/avgLen)))) * icw_col[g]
                            elif idf_par=="idf":
                                features[i,unique_words.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * idf_col[g]
                                # features[i,unique_words.index(g)] = centrality[g] * idf_col[g]
                            elif idf_par=="icw":
                                features[i,unique_words.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * icw_col[g]
                                # features[i,unique_words.index(g)] = centrality[g] * icw_col[g]
                            elif idf_par=="icw+idf":
                                features[i,unique_words.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * icw_col[g] * idf_col[g]
                                # features[i,unique_words.index(g)] = centrality[g] * math.log10(icw_col[g] * idf_col[g])

                        elif g in bigrams:
                            #features[i,unique_words.index(g)] = dG.degree(nbunch=g,weight='weight') * idf_col[g]
                            if idf_par=="no":
                                features[i,unique_words_len+bigrams.index(g)] = centrality[g]/(1-b+(b*(float(docLen)/avgLen)))
                            elif idf_par=="idf":
                                features[i,unique_words_len+bigrams.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * idf_col[g]
                                # features[i,unique_words.index(g)] = centrality[g] * idf_col[g]
                            elif idf_par=="icw":
                                features[i,unique_words_len+bigrams.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * icw_col[g]
                                # features[i,unique_words.index(g)] = centrality[g] * icw_col[g]
                            elif idf_par=="icw+idf":
                                features[i,unique_words_len+bigrams.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * icw_col[g] * idf_col[g]
                                # features[i,unique_words.index(g)] = centrality[g] * math.log10(icw_col[g] * idf_col[g])
    #     if train_par:
    #         # pears = pearsonr(tfs,centralities)

    #         ind_tfs = sorted(range(len(tfs)), key=lambda k: tfs[k])[-20:]
    #         ind_centr = sorted(range(len(centralities)), key=lambda k: centralities[k])[-20:]
    #         tau, p_value = kendalltau([unique_words[k] for k in ind_tfs],[unique_words[k] for k in ind_centr])
            
    #         corrs_per_category[int(y[i])-1].append(tau)
    
    # if train_par:

    #     text_file = open("kendal_tfs_tws_output_tw_idf_"+idf_par+"_centr_"+centrality_par+"_sliding_"+str(sliding_window)+"_kcore_"+kcore_par+".txt", "w")
        
    #     text_file.write(str(corrs_per_category))
    #     text_file.close()

    #     fig = plt.figure()
    #     ax = fig.add_subplot(111)

    #     ax.boxplot(corrs_per_category[:])

    #     plt.show()


    if idf_par=="no":
        idfs = {}
        icws = {}
    if idf_par=="idf":
        idfs = idf_col
        icws = {}
    elif idf_par=="icw" or idf_par=="tf-icw":
        idfs = {}
        icws = icw_col
    elif idf_par=="icw+idf":
        idfs = idf_col
        icws = icw_col

    if train_par:
        if kcore_par=="B2":
            feature_reduction = float(len(max_core_feat))/len(dGcol_nodes)
            print "Percentage of features kept:"+str(feature_reduction)
        print "Average number of nodes:"+str(float(totalNodes)/num_documents)
        print "Average number of edges:"+str(float(totalEdges)/num_documents)
    
    return features, idfs,icws,collection_count_nodes, collection_count_edges, dGcol_nodes,max_core_col,feature_reduction, max_core_feat,avgLen
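
The feature assignments above repeat the same BM25-style length-normalized weighting. As a readability sketch (the tw_weight helper and its name are not part of the original code), the expression reduces to:

def tw_weight(tw, doc_len, avg_len, b, idf=1.0, icw=1.0):
    # tw  : term weight, i.e. the node's centrality in the document graph
    # b   : slope of the pivoted document-length normalization (as in BM25)
    # idf : inverse document frequency factor (1.0 when idf_par == "no")
    # icw : inverse collection-graph weight factor (1.0 when unused)
    return (tw / (1.0 - b + b * (float(doc_len) / avg_len))) * idf * icw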
Example 35
def createGraphFeatures(num_documents, clean_train_documents, unique_words, bigrams, sliding_window, b, idf_par,
                        centrality_par, centrality_col_par,
                        train_par, idf_learned, icw_learned, dGcol_nodes, avgLen, path, y_train):
    #features = np.zeros((num_documents,len(unique_words)))
    features = lil_matrix((num_documents,len(unique_words)))
    unique_words_len = len(unique_words)
    term_num_docs = {}
    if train_par:
        print("Training set...")
        if centrality_col_par=="weighted_degree_centrality" or centrality_col_par=="weighted_pagerank_centrality":
            tf_par = "word2vec"
            getOnlyDataWord2VecModel(clean_train_documents)
        else:
            tf_par = "word2ve"
        print("sliding_window:"+str(sliding_window))
        idfs = {}
        dGcol_nodes = {}
        icws = {}
        max_core_feat = []
        ## this is for the label graphs
        dGlabels = []
        totalLen = 0
        totalDiam = 0
        for label in list(set(y_train)):
            dGlabels.append(nx.Graph())
        # ## IDW
        # print("Creating the graph of documents (IDW).."
        # # getOnlyDataWord2VecModel(clean_train_documents)
        #
        # all_doc_nodes = []
        # for i in range( 0,num_documents ):
        #     all_doc_nodes.append(i)
        #
        # edges = combinations(all_doc_nodes, 2)
        # dGdocs = nx.Graph()
        #
        # vectorizer = TfidfVectorizer(min_df=1)
        # tf_idf_matrix = vectorizer.fit_transform(clean_train_documents)
        # for e in edges:
        #     # dGdocs.add_edge(e,weight=metrics.pairwise.cosine_similarity(w2v.wv.wmdistance(clean_train_documents[e[0]],clean_train_documents[e[1]])))
        #     vect = TfidfVectorizer(min_df=1)
        #     tfidf = vect.fit_transform([clean_train_documents[e[0]],clean_train_documents[e[1]]])
        #     dGdocs.add_edge(e[0],e[1],weight=tfidf[0,1])
        # t1 = time.time()
        # matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), len(clean_train_documents), 0)
        # t = time.time()-t1
        # print("SELFTIMED:"+str(t)
        #
        # # matches_df = get_matches_df(matches, clean_train_documents)
        # for e in edges:
        #     dGdocs.add_edge(e[0],e[1],weight=matches[e[0],e[1]])
        # del matches
        if not os.path.exists(path+"_collection_"+str(sliding_window)+"_"+str(tf_par)+"_graph.edgelist"):
            print("Creating the graph of words for collection...")
            if centrality_col_par=="pagerank_centrality" or centrality_col_par=="in_degree_centrality" or \
                centrality_col_par=="out_degree_centrality" or centrality_col_par=="closeness_centrality_directed" or \
                centrality_col_par=="betweenness_centrality_directed" or centrality_col_par=="weighted_pagerank_centrality":
                dGcol = nx.DiGraph()
            else:
                dGcol = nx.Graph()
            totalLen = 0
            totalDiam = 0
            for i in range(num_documents ):
                # dG = nx.Graph()
                graphVizualizeFlag=False
                if graphVizualizeFlag:
                    if i != 0 and i % 25 == 0:
                        print(i)
                        print("dGcol.number_of_nodes()", dGcol.number_of_nodes())
                        st = time.time()
                        save_graph(dGcol, "graph_"+str(i)+".pdf")
                        fi = time.time() - st
                        print('time:', fi)
                        print('=================================')
                lg = int(y_train[i])
                found_unique_words = []
                wordList1 = clean_train_documents[i].split(None)
                wordList2 = [x.rstrip(',.!?;') for x in wordList1]
                docLen = len(wordList2)
                # print(clean_train_documents[i]
                #if len(wordList2)>1:
                totalLen += docLen
                for k, word in enumerate(wordList2):
                    if word not in found_unique_words:
                        found_unique_words.append(word)
                        if word not in term_num_docs:
                            term_num_docs[word] = 1
                        else:
                            term_num_docs[word] += 1
                    for j in range(1, sliding_window):
                        try:
                            next_word = wordList2[k + j]
                            # print(word+"\t"+next_word
                            # time.sleep(2)
                            if not dGcol.has_node(word):
                                dGcol.add_node(word)
                                dGcol.node[word]['count'] = 1
                            else:
                                dGcol.node[word]['count'] += 1
                            if not dGcol.has_node(next_word):
                                dGcol.add_node(next_word)
                                dGcol.node[next_word]['count'] = 1
                            else:
                                dGcol.node[next_word]['count'] +=1
                            if not dGcol.has_edge(word, next_word):
                                dGcol.add_edge(word, next_word, weight = 1)
                                # dGcol.edge[word][next_word]['w2vec'] = 0.01
                                if tf_par=="word2vec":
                                    if word in model.wv.vocab and next_word in model.wv.vocab:
                                        dGcol.edge[word][next_word]['w2vec'] = model.wv.similarity(word,next_word)
                                        # dGcol.edge[word][next_word]['w2vec'] = np.linalg.norm(model[word]-model[next_word])
                            else:
                                dGcol.edge[word][next_word]['weight'] += 1
                            ## this is for label graphs
                            if not dGlabels[lg].has_node(word):
                                dGlabels[lg].add_node(word)
                                dGlabels[lg].node[word]['count'] = 1
                            else:
                                dGlabels[lg].node[word]['count'] += 1
                            if not dGlabels[lg].has_node(next_word):
                                dGlabels[lg].add_node(next_word)
                                dGlabels[lg].node[next_word]['count'] = 1
                            else:
                                dGlabels[lg].node[next_word]['count'] +=1

                            if not dGlabels[lg].has_edge(word, next_word):
                                dGlabels[lg].add_edge(word, next_word, weight = 1)
                                # dGcol.edge[word][next_word]['w2vec'] = 0.01
                                if tf_par=="word2vec":

                                    if word in model.wv.vocab and next_word in model.wv.vocab:
                                        dGlabels[lg].edge[word][next_word]['w2vec'] = model.wv.similarity(word,next_word)
                            else:
                                dGlabels[lg].edge[word][next_word]['weight'] += 1
                            # # # again for average,5,6,7,8,9
                            # if not dG.has_edge(word, next_word):
                            #     dG.add_edge(word, next_word, weight = 1)
                            # else:
                            #     dG.edge[word][next_word]['weight'] += 1
                        except IndexError:
                            if not dGcol.has_node(word):
                                dGcol.add_node(word)
                                dGcol.node[word]['count'] = 1
                            else:
                                dGcol.node[word]['count'] += 1
                            if not dGlabels[lg].has_node(word):
                                dGlabels[lg].add_node(word)
                                dGlabels[lg].node[word]['count'] = 1
                            else:
                                dGlabels[lg].node[word]['count'] += 1
                        except:
                            raise
                # nx.draw(dG,pos=nx.spring_layout(dG))
                # plt.show()
                # nx.write_edgelist(dG,path+"_YO_"+str(sliding_window)+"_"+str(tf_par)+"_graph.edgelist")
                # raw_input("enter")
                # totalDiam += nx.diameter(dG)
            # nx.write_edgelist(dGcol,path+"_collection_"+str(sliding_window)+"_"+str(tf_par)+"_graph.edgelist")
            # json.dump(term_num_docs,open(path+"_term_num_docs"+str(sliding_window)+".txt","w"))
        else:
            print("Parsing the graph of words for collection...")
            # term_num_docs = json.load(open(path+"_term_num_docs"+str(sliding_window)+".txt","r"))
            # dGcol = nx.read_edgelist(path+"_collection_"+str(sliding_window)+"_"+str(tf_par)+"_graph.edgelist")

        print("Number of self-loops for collection graph:"+str(dGcol.number_of_selfloops()))
        dGcol.remove_edges_from(dGcol.selfloop_edges())
        collection_count_nodes = dGcol.number_of_nodes()
        collection_count_edges = dGcol.number_of_edges()
        print("Number of nodes in collection graph:"+str(collection_count_nodes))
        print("Number of edges in collection graph:"+str(collection_count_edges))

        # plot_degree_histogram(dGcol)
        # raw_input("enter")

        # avgLen = float(totalLen)/num_documents
        avgLen = 0
        # colDiam = nx.diameter(dGcol)
        # avgDiam = float(totalDiam)/num_documents

        print("Average document length:"+str(avgLen))

        if idf_par=="icw" or idf_par=="icw+idf" or idf_par=="tf-icw" or idf_par=="icw-lw":
            icw_col = {}
            if tf_par=="word2vec":
                for u,v,d in dGcol.edges(data=True):
                    if 'w2vec' in d:
                        ## my w2v similarity
                        dGcol.edge[u][v]['w2vec'] = np.arccos(d['w2vec'])/math.pi
                        dGcol.edge[u][v]['w2vec'] = 1-dGcol.edge[u][v]['w2vec']
                        dGcol.edge[u][v]['weight'] = dGcol.edge[u][v]['w2vec']
                        ## attraction score
                        # f_u_v = float(dGcol.node[u]['count']*dGcol.node[v]['count'])/(d['w2vec']**2)
                        # dice = float(2*d['weight'])/(dGcol.node[u]['count']+dGcol.node[v]['count'])
                        # dGcol.edge[u][v]['weight'] = f_u_v * dice

                        #dGcol.edge[u][v]['weight'] = d['weight']*dGcol.edge[u][v]['w2vec']
                        #dGcol.edge[u][v]['weight'] = float(d['weight'])/(dGcol.edge[u][v]['w2vec']**2)
                    else:
                        # dGcol.edge[u][v]['weight'] = np.arccos(0.0001)/math.pi
                        dGcol.edge[u][v]['weight'] = 0.0001


            if centrality_col_par == "degree_centrality":
                centrality_col = nx.degree_centrality(dGcol)
            elif centrality_col_par == "weighted_degree_centrality":
                # centrality_col = nx.degree_centrality(dGcol,weight='weight')
                centrality_col = dGcol.degree(weight='weight')
            elif centrality_col_par=="in_degree_centrality":
                centrality_col = nx.in_degree_centrality(dGcol)
            elif centrality_col_par=="out_degree_centrality":
                centrality_col = nx.out_degree_centrality(dGcol)
            elif centrality_col_par == "pagerank_centrality":
                centrality_col = nx.pagerank(dGcol)
            elif centrality_col_par == "weighted_pagerank_centrality":
                centrality_col = nx.pagerank(dGcol,weight="weight")
            elif centrality_col_par == "eigenvector_centrality":
                centrality_col = nx.eigenvector_centrality(dGcol,max_iter=1000)
            elif centrality_col_par == "betweenness_centrality" or centrality_col_par=="betweenness_centrality_directed":
                centrality_col = nx.betweenness_centrality(dGcol)
            elif centrality_col_par == "triangles":
                centrality_col = nx.triangles(dGcol)
            elif centrality_col_par == "clustering_coefficient":
                centrality_col = nx.clustering(dGcol)
            elif centrality_col_par == "core_number":
                centrality_col = nx.core_number(dGcol)
            elif centrality_col_par == "closeness_centrality" or centrality_col_par=="closeness_centrality_directed":
                centrality_col = nx.closeness_centrality(dGcol)
            elif centrality_col_par == "closeness_centrality_weighted":
                centrality_col = nx.closeness_centrality(dGcol)
            elif centrality_col_par == "communicability_centrality":
                centrality_col = nx.communicability_centrality(dGcol)


            centrality_labels = []

            # partition = community.best_partition(dGcol)
            #
            # all_nodes = []
            # partitions = []
            # count = 0
            # for com in set(partition.values()):
            #     count = count + 1
            #     list_nodes = [nodes for nodes in partition.keys() if partition[nodes] == com]
            #     partitions.append(list_nodes)
            #
            #
            # print("Clusters:"+str(len(partitions))
            #
            #
            # lens = [len(partition) for partition in partitions]
            # print(lens
            # t = lens.index(max(lens))
            #
            # print("len of biggest cluster:"+str(len(partitions[t]))

            # raw_input("enter")

            for i, dGlabel in enumerate(dGlabels):
                # centrality_labels.append(nx.pagerank(dGlabel))
                # centrality_labels.append(nx.degree_centrality(dGlabel))
                # print("before:"+str(dGlabel.number_of_nodes())

                ## this is for clustering
                # partition = community.best_partition(dGlabel)
                # all_nodes = []
                # count = 0
                # for com in set(partition.values()):
                #     count = count + 1
                #     list_nodes = [nodes for nodes in partition.keys() if partition[nodes] == com]
                #     all_nodes.append(list_nodes)
                #
                # partitions.append(all_nodes)
                #
                # print(str(i)+": "+str(count)+" clusters" )
                # G = dGlabel.copy()
                # setA = set(dGlabel.nodes())
                # setB = set(partitions[t])
                # dGlabel.remove_nodes_from(list(setB))
                # setB = set(partitions[1])
                # dGlabel.remove_nodes_from(list(setB))
                # setB = set(partitions[1])
                # dGlabel.remove_nodes_from(list(setB))
                print("after:"+str(dGlabel.number_of_nodes()))
                centrality_labels.append(nx.degree_centrality(dGlabel))
                # raw_input("en")
            centr_sum = sum(centrality_col.values())
            # centr_sum = max(centrality_col.values())
            # print(centr_sum)
            minc = [min(d.values()) for d in centrality_labels]
            minc = min(minc)
            for k,g in enumerate(dGcol.nodes()):
                if centrality_col[g]!=0:
                    if idf_par=="icw" or idf_par=="tf-icw" or idf_par=="icw+idf":
                        #print(centrality_col[g])
                        # icw_col[g] = math.log10(float(centr_sum*num_documents)/(centrality_col[g]*term_num_docs[g]))
                        # print(g)
                        seq = [x.get(g, 0) for x in centrality_labels]
                        centr_max_c = max(seq)
                        ind_max = seq.index(centr_max_c)
                        # print(g)
                        # topics = []
                        # for i, partition in enumerate(partitions):
                        # for w in partitions[ind_max]:
                        #     if g in w:
                        #         topics = w
                        # print(str(topics))
                        # raw_input("enter")
                        # all_words = centrality_labels[ind_max].keys -
                        # sum_all_topics = sum([centrality_labels[ind_max].get(word, 0) for word in topics])
                        # sum_all_topics = sum([centrality_col.get(word, 0) for word in topics])
                        # G = dGlabels[ind_max].copy()
                        # # print("before:"+str(G.number_of_nodes())
                        # G = dGcol.copy()
                        # setA = set(G.nodes())
                        # setB = set(partitions[t])
                        # G.remove_nodes_from(list(setB))
                        # # G.remove_nodes_from(list(setB))
                        #
                        # # print("after:"+str(G.number_of_nodes())
                        # if G.degree(g):
                        #     centr_max_c = G.degree(g)
                        # # print(sum_all_topics)
                        # # raw_input("enter")
                        centr_sum_c = sum(seq)
                        n_el = sum(s>0 for s in seq)
                        # dGlab = seq.index(centr_max_c)
                        del seq[ind_max]
                        # centr_sum_lab = sum(seq)
                        # print(seq)
                        # raw_input("enter")
                        term_graphs = []
                        for j,doc in enumerate(dGdocs.nodes()):
                            if g in clean_train_documents[j].split():
                                term_graphs.append(dGdocs.degree(j,weight='weight'))
                        avg_term = np.mean(term_graphs)
                        # print(avg_term)
                        max_term = sum(term_graphs)
                        #.
                        # icw_col[g] = math.log10((float(centr_sum)/centrality_col[g]) * (float(max_term)/avg_term))
                        # icw_col[g] = math.log10(float(max_term)/avg_term)
                        icw_col[g] = math.log10((float(centr_sum)/centrality_col[g]) * (float(centr_max_c)/max(np.mean(seq),minc)))
                        # a = np.mean(seq)
                        # crc = 2 + ((centr_max_c/max(a,minc)*(float(len(centrality_labels))/n_el)))
                        # icw_col[g] = math.log(crc,2)
                        # icw_col[g] = math.log10((float(centr_sum)/centrality_col[g])) * math.log(crc,2)
                    elif idf_par=="icw-lw":
                        icw_col[g] = math.log10((float(centr_sum)/centrality_col[g]))
                else:
                    icw_col[g] = 0
        # elif idf_par=="idf":
        idf_col = {}
        if idf_par=="idf" or idf_par=="icw+idf":
            for x in term_num_docs:
                idf_col[x] = math.log10(float(num_documents) / term_num_docs[x])
        dGcol_nodes = dGcol.nodes()
        #save_graph(dGcol, "graph_split.pdf")
        dGcol.clear()
    # for the testing set
    else:

        if idf_par=="idf":
            idf_col = idf_learned
        elif idf_par=="icw" or idf_par=="tf-icw":
            icw_col = icw_learned
        elif idf_par=="icw+idf":
            idf_col = idf_learned
            icw_col = icw_learned
        collection_count_nodes = 0
        collection_count_edges = 0

    totalNodes = 0
    totalEdges = 0
    corrs_per_category = [[] for i in range(4)]
    counter_word2vec = []
    # print("number of word2vec words in docs:"+str(len(counter_word2vec))

    if idf_par=="no":
        idfs = {}
        icws = {}
    if idf_par=="idf":
        idfs = idf_col
        icws = {}
    elif idf_par=="icw" or idf_par=="tf-icw" or idf_par=="icw-lw":
        idfs = {}
        icws = icw_col
    elif idf_par=="icw+idf":
        idfs = idf_col
        icws = icw_col

    processes = cpu_count()
    # processes=1
    all_pairs,idx = chunkIt(clean_train_documents,processes)

    y_final = []

    pool = Pool(processes)
    print("Number of processes:"+str(processes))
    results = [pool.apply_async( splitGraphFeatures, (t, idx[k], idf_par, centrality_par, dGcol_nodes, idfs, icws, sliding_window, unique_words, train_par,path)) for k, t in enumerate(all_pairs)]
    count_rows = 0

    for i,result in enumerate(results):
        r,y = result.get()
        for y_ind,row in enumerate(r):
            features[count_rows,:] = row[:]
            #y_final.append(y_train[y_ind])
            count_rows += 1
    pool.close()

    # if train_par:
        # print("Average number of nodes:"+str(float(totalNodes)/num_documents)
        # print("Average number of edges:"+str(float(totalEdges)/num_documents)

    # all_pairs,idx = chunkIt(clean_train_documents,1)
    # r,y = splitGraphFeatures(all_pairs[0],idx[0], idf_par,centrality_par, dGcol_nodes,idfs,icws, sliding_window,unique_words,train_par,path)
    #
    # count_rows = 0
    # for y_ind,row in enumerate(r):
    #     features[count_rows,:] = row[:]
    #     count_rows += 1

    return features, idfs, icws, collection_count_nodes, collection_count_edges, dGcol_nodes, avgLen
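
chunkIt(), used to split the documents across the multiprocessing Pool above, is not defined in this snippet; a plausible sketch (an assumption inferred from how its two return values are used) divides the list into roughly equal chunks and also returns each chunk's original indices:

def chunkIt(seq, num):
    # Split seq into num roughly equal chunks, keeping the original indices
    # so per-chunk results can be written back to the feature matrix in order.
    avg = len(seq) / float(num)
    chunks, indices, last = [], [], 0.0
    while last < len(seq):
        lo, hi = int(last), min(int(last + avg), len(seq))
        chunks.append(seq[lo:hi])
        indices.append(list(range(lo, hi)))
        last += avg
    return chunks, indices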
Example 36
def splitGraphFeatures(documents,idx,idf_par,centrality_par,dGcol_nodes, idf_col,icw_col,sliding_window,unique_words,train_par,path):
    features = np.zeros((len(documents),len(unique_words)))
    # features = csr_matrix((len(documents),len(unique_words)))
    # features = lil_matrix((len(documents),len(unique_words)))
    if centrality_par=="weighted_degree_centrality" or centrality_par=="weighted_pagerank_centrality":
        tf_par = "word2vec"
        global model
    else:
        tf_par = "word2ve"
    if not train_par:
        path = path+"test_"
    for i, doc in enumerate(documents):
        ind = idx[i]
        if not os.path.exists(path+str(ind)+"_sliding_"+str(sliding_window)+"_"+str(tf_par)+"_graph.edgelist"):
            # print("Creating the graph of words for documents...")
            if centrality_par=="pagerank_centrality" or centrality_par=="in_degree_centrality" or centrality_par=="out_degree_centrality" or centrality_par=="closeness_centrality_directed" or centrality_par=="betweenness_centrality_directed" or centrality_par=="weighted_pagerank_centrality":
                dG = nx.DiGraph()
            else:
                dG = nx.Graph()
            wordList1 = doc.split(None)
            wordList2 = [x.rstrip(',.!?;') for x in wordList1]
            docLen = len(wordList2)
            #if len(wordList2)>1:
            for k, word in enumerate(wordList2):
                for j in range(1,sliding_window):
                    try:
                        next_word = wordList2[k + j]
                        if not dG.has_node(word):
                            dG.add_node(word)
                            dG.node[word]['count'] = 1
                        else:
                            dG.node[word]['count'] += 1
                        if not dG.has_node(next_word):
                            dG.add_node(next_word)
                            dG.node[next_word]['count'] = 1
                        else:
                            dG.node[next_word]['count'] += 1
                        if not dG.has_edge(word, next_word):
                            dG.add_edge(word, next_word, weight = 1)
                            # dG.edge[word][next_word]['w2vec'] = 0.0001
                            if tf_par=="word2vec":
                                if word in model.wv.vocab and next_word in model.wv.vocab:
                                    dG.edge[word][next_word]['w2vec'] = model.wv.similarity(word,next_word)
                                    # dG.edge[word][next_word]['w2vec'] = np.linalg.norm(model[word]-model[next_word])
                        else:
                            dG.edge[word][next_word]['weight'] += 1

                    except IndexError:
                        if not dG.has_node(word):
                            dG.add_node(word)
                            dG.node[word]['count'] = 1
                        else:
                            dG.node[word]['count'] += 1
                    except:
                        raise
            dG.remove_edges_from(dG.selfloop_edges())
            # for node1, node2 in dG.edges_iter():
            #     dG.edge[node1][node2]['inv_weight'] = 1.0 / dG.edge[node1][node2]['weight']
                    ## best until now
                    # d['weight'] = d['weight']*((d['w2vec'])**2)
                    # d['weight'] = dice*f
            # nx.write_edgelist(dG,path+str(ind)+"_sliding_"+str(sliding_window)+"_"+str(tf_par)+"_graph.edgelist",data=True)
        else:
            print("Parsing the graph of words for documents...")
            # dG = nx.read_edgelist(path+str(ind)+"_sliding_"+str(sliding_window)+"_"+str(tf_par)+"_graph.edgelist")
        if tf_par=="word2vec":
            for u,v,d in dG.edges(data=True):
                if 'w2vec' in d:
                    # dice = (2*d['weight'])/(dG.node[u]['count']+dG.node[v]['count'])
                    # dG.edge[u][v]['weight'] = dice * (dG.node[u]['count']*dG.node[v]['count'])/((d['w2vec'])**2)
                    # d['weight'] = (dG.node[u]['count']*dG.node[v]['count'])/((1-d['w2vec']))
                    ## angular
                    # dice = (2*d['weight'])/(dG.node[u]['count']+dG.node[v]['count'])
                    # f = (dG.node[u]['count']*dG.node[v]['count'])/(d['w2vec']**2)
                    # print(d['w2vec']
                    # d['weight'] = d['weight']/(d['w2vec'])
                    # if u not in counter_word2vec:
                    #     counter_word2vec.append(u)
                    #
                    # if v not in counter_word2vec:
                    #     counter_word2vec.append(v)

                    ## my_w2v_similarity
                    dG.edge[u][v]['w2vec'] = np.arccos(d['w2vec'])/math.pi
                    dG.edge[u][v]['w2vec'] = 1-dG.edge[u][v]['w2vec']
                    dG.edge[u][v]['weight'] = dG.edge[u][v]['w2vec']

                    ## attraction score
                    # d['w2vec'] = np.arccos(d['w2vec'])/math.pi
                    # f_u_v = float(dG.node[u]['count']*dG.node[v]['count'])/(d['w2vec']**2)
                    # dice = float(2*d['weight'])/(dG.node[u]['count']+dG.node[v]['count'])
                    # dG.edge[u][v]['weight'] = f_u_v * dice

                else:
                    dG.edge[u][v]['weight'] = 0.0001
                    # dG.edge[u][v]['weight'] = 1-dG.edge[u][v]['weight']
        #if len(dG)>1:
        if centrality_par == "degree_centrality":
            centrality = nx.degree_centrality(dG)
        elif centrality_par == "weighted_degree_centrality":
            centrality = dG.degree(weight="weight")
            # centrality = weighted_degree_centrality(dG)
        elif centrality_par == "in_degree_centrality":
            centrality = nx.in_degree_centrality(dG)
        elif centrality_par == "out_degree_centrality":
            centrality = nx.out_degree_centrality(dG)
        elif centrality_par == "pagerank_centrality":
            centrality = nx.pagerank(dG)
        elif centrality_par == "weighted_pagerank_centrality":
            centrality = nx.pagerank(dG,weight="weight")
        elif centrality_par =="betweenness_centrality" or centrality_par=="betweenness_centrality_directed":
            centrality = nx.betweenness_centrality(dG,weight="weight")
        elif centrality_par =="triangles":
            centrality = nx.triangles(dG)
        elif centrality_par =="eigenvector_centrality":
            centrality = nx.eigenvector_centrality_numpy(dG)
        elif centrality_par =="core_number":
            centrality = nx.core_number(dG)
        elif centrality_par =="clustering_coefficient":
            centrality = nx.clustering(dG)
        elif centrality_par == "closeness_centrality" or centrality_par=="closeness_centrality_directed":
            centrality = nx.closeness_centrality(dG)
        elif centrality_par == "closeness_centrality_weighted":
            centrality = nx.closeness_centrality(dG,distance='weight')
        elif centrality_par == "communicability_centrality":
            centrality = nx.communicability_centrality(dG)
        elif centrality_par == "closeness_centrality_not_normalized":
            centrality = nx.closeness_centrality(dG,normalized=False)

        #print("Number of self-loops:"+str(dG.number_of_selfloops())
        #centrality = nx.out_degree_centrality(dG)
        #centrality = pg.pagerank(dG,max_iter=1000)
        #centrality = nx.katz_centrality(dG,max_iter=10000)

        # totalNodes += dG.number_of_nodes()
        # totalEdges += dG.number_of_edges()

        #if len(dG)>1:
        for k, g in enumerate(dG.nodes()):
            if g in dGcol_nodes:
                if idf_par=="no":
                    features[i,unique_words.index(g)] = centrality[g]
                    #tfs.append(wordList2.count(g))
                    # centralities.append(centrality[g])
                elif idf_par=="tf-icw":
                    #tf_g = 1+math.log(1+math.log(wordList2.count(g)))
                    tf_g = wordList2.count(g)
                    # features[i,unique_words.index(g)] = (tf_g/(1-b+(b*(float(docLen)/avgLen)))) * icw_col[g]
                    features[i,unique_words.index(g)] = tf_g * icw_col[g]
                elif idf_par=="idf":
                    features[i,unique_words.index(g)] = centrality[g] * idf_col[g]
                    # features[i,unique_words.index(g)] = centrality[g] * idf_col[g]
                elif idf_par=="icw" or idf_par=="icw-lw":
                    features[i,unique_words.index(g)] = centrality[g] * icw_col[g]
                    # features[i,unique_words.index(g)] = centrality[g]/(1-b+(b*(float(docDiam)/avgDiam))) * icw_col[g]
                elif idf_par=="icw+idf":
                    tf_g = wordList2.count(g)
                    #features[i,unique_words.index(g)] = (centrality[g]/(1-b+(b*(float(docLen)/avgLen)))) * icw_col[g] * idf_col[g]
                    features[i,unique_words.index(g)] = centrality[g] * icw_col[g] * idf_col[g]
        #save_graph(dG, "my_graph.pdf")
        dG.clear()
    return features,idx
Example n. 37
sys.stdout.write(" done\n")

sys.stdout.write("calculating PageRank Centrality . . .")
PageRankDict = nx.pagerank(G)
sys.stdout.write(" done\n")

sys.stdout.write("calculating Closeness Centrality . . .")
ClosenessDict = nx.closeness_centrality(G)
sys.stdout.write(" done\n")

sys.stdout.write("calculating Betweenness Centrality . . .")
BetweennessDict = nx.betweenness_centrality(G)
sys.stdout.write(" done\n")

sys.stdout.write("calculating Communicability Centrality . . .")
CommunicabilityDict = nx.communicability_centrality(G)
sys.stdout.write(" done\n")

print "=" * 100

import re
csvRegion = list(csv.reader(open("Regions.csv")))
regions = {}
for i in range(1, len(csvRegion)):
    regions[csvRegion[i][2].strip()] = [
        re.sub(r'(,)', '/', csvRegion[i][3]),
        re.sub(r'(,)', '/', csvRegion[i][4])
    ]

import matplotlib.pyplot as plt
nx.draw(G)  # networkx draw()
Example n. 38
def calculate_centrality(G):
    # dc_dumps = json.dumps(nx.degree_centrality(G).items(),sort_keys=True,indent=4)
    # dc_loads = json.loads(dc_dumps)
    dc_sorted = sorted(nx.degree_centrality(G).items(),
                       key=itemgetter(0),
                       reverse=True)
    bc_sorted = sorted(nx.betweenness_centrality(G).items(),
                       key=itemgetter(0),
                       reverse=True)
    clc_sorted = sorted(nx.closeness_centrality(G).items(),
                        key=itemgetter(0),
                        reverse=True)
    coc_sorted = sorted(nx.communicability_centrality(G).items(),
                        key=itemgetter(0),
                        reverse=True)
    lc_sorted = sorted(nx.load_centrality(G).items(),
                       key=itemgetter(0),
                       reverse=True)
    cfbc_sorted = sorted(nx.current_flow_betweenness_centrality(G).items(),
                         key=itemgetter(0),
                         reverse=True)
    cfcc_sorted = sorted(nx.current_flow_closeness_centrality(G).items(),
                         key=itemgetter(0),
                         reverse=True)
    # print ec_sorted[0]

    developer_centrality = []

    with open("public/wordpress/developer.json") as developer_file:
        developers = json.load(developer_file)
    for developer in developers:
        degree = 0
        betweenness = 0
        closeness = 0
        communicability = 0
        load = 0
        current_flow_betweenness = 0
        current_flow_closeness = 0
        for i in range(0, len(dc_sorted)):
            # if ( not dc_sorted[i][0] == bc_sorted[i][0] == clc_sorted[i][0] == coc_sorted[i][0] == lc_sorted[i][0] == cfbc_sorted[i][0]):
            # 	print 'false'
            if (developer['developer'] == dc_sorted[i][0]):
                degree = dc_sorted[i][1]
                betweenness = bc_sorted[i][1]
                closeness = clc_sorted[i][1]
                communicability = coc_sorted[i][1]
                load = lc_sorted[i][1]
                current_flow_betweenness = cfbc_sorted[i][1]
                current_flow_closeness = cfcc_sorted[i][1]

        developer_centrality.append({
            'name': developer['developer'],
            'degree': degree,
            'betweenness': betweenness,
            'closeness': closeness,
            'communicability': communicability,
            'load': load,
            'current_flow_betweenness': current_flow_betweenness,
            'current_flow_closeness': current_flow_closeness,
        })

    return developer_centrality
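Since every centrality above is already a dict keyed by node name, the per-developer scan over the sorted lists can be replaced by direct lookups. A minimal sketch under the same assumptions (graph G and the developer JSON layout are taken from the snippet; the function name and default path are illustrative):

import json
import networkx as nx

def calculate_centrality_by_lookup(G, developer_path="public/wordpress/developer.json"):
    # Each centrality is already a {node: value} dict, so compute once and look up directly.
    metrics = {
        'degree': nx.degree_centrality(G),
        'betweenness': nx.betweenness_centrality(G),
        'closeness': nx.closeness_centrality(G),
        'communicability': nx.communicability_centrality(G),
        'load': nx.load_centrality(G),
        'current_flow_betweenness': nx.current_flow_betweenness_centrality(G),
        'current_flow_closeness': nx.current_flow_closeness_centrality(G),
    }
    with open(developer_path) as fh:
        developers = json.load(fh)
    result = []
    for dev in developers:
        name = dev['developer']
        entry = {'name': name}
        for metric, values in metrics.items():
            # Developers absent from the graph keep 0, matching the loop above.
            entry[metric] = values.get(name, 0)
        result.append(entry)
    return result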
Example n. 39
    details = f.read().split('\n')
    for line in details:
        if len(line) > 0 and line[0] != '#':
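            # Each data line is assumed to look like "node0 node1<TAB>weight",
            # i.e. a space between the endpoints and a tab before the weight.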
            node0 = line.split(' ')[0]
            node1 = line.split(' ')[1].split('\t')[0]
            weight = line.split('\t')[1]
            collab_graph.add_edge(int(node0), int(node1), weight=int(weight))

distances = distance_from_erdos(collab_graph, 1095)
output_centralities(distances, 'Distance_From_Erdos')

# Calculate and output the centralities
degree_centrality = nx.degree_centrality(collab_graph)
output_centralities(degree_centrality, 'Degree_Centrality')

degree_centrality_weighted = weighted_degree_centrality(collab_graph)
output_centralities(degree_centrality_weighted, 'Weighted_Degree_Centrality')

normalised_closeness_centrality = nx.closeness_centrality(collab_graph)
output_centralities(normalised_closeness_centrality, "Closeness_Centrality")

betweenness_centrality = nx.betweenness_centrality(collab_graph,
                                                   endpoints=True)
output_centralities(betweenness_centrality, "Betweenness_Centrality")

katz_centrality = nx.katz_centrality(collab_graph, alpha=0.005)
output_centralities(katz_centrality, "Katz_Centrality")

communicability_centrality = nx.communicability_centrality(collab_graph)
output_centralities(communicability_centrality, "Communicability_Centrality")
    def avg_communicability_centrality(self):
        """
		Communicability centrality, also called subgraph centrality, of a node n is the sum of closed walks of all lengths starting and ending at node n.
		"""
        return sum(nx.communicability_centrality(self.graph).values()) / self.n
Example n. 41
n = 80
p = 10. / n
G = nx.fast_gnp_random_graph(n, p, seed=42)


def to_list(dict_):
    return [dict_[k] for k in G.nodes()]


graph_colors = [
    ("degree", to_list(nx.degree_centrality(G))),
    ("betweenness", to_list(nx.betweenness_centrality(G))),
    ("load", to_list(nx.load_centrality(G))),
    ("eigenvector", to_list(nx.eigenvector_centrality_numpy(G))),
    ("closeness_centrality", to_list(nx.closeness_centrality(G))),
    ("current_flow_closeness",
     to_list(nx.current_flow_closeness_centrality(G))),
    ("current_flow_betweenness",
     to_list(nx.current_flow_betweenness_centrality(G))),
    ("katz", to_list(nx.katz_centrality_numpy(G))),
    ("communicability", to_list(nx.communicability_centrality(G))),
]

fig = plot_multigraph.plot_color_multigraph(G,
                                            graph_colors,
                                            3,
                                            3,
                                            node_size=50)
plt.savefig('graphs/centrality.png', facecolor=fig.get_facecolor())
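The raw centrality values sit on very different scales (communicability can be orders of magnitude larger than degree centrality), so rescaling each color list to [0, 1] before plotting keeps the nine panels visually comparable. A small helper, purely illustrative and assuming the graph_colors list built above:

def normalize(values):
    lo, hi = min(values), max(values)
    # A constant list maps to all zeros to avoid division by zero.
    return [0.0 if hi == lo else (v - lo) / (hi - lo) for v in values]

graph_colors_normalized = [(name, normalize(vals)) for name, vals in graph_colors]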
Example n. 42
def g2o(input_graph, degree_threshold, step_size, heuristic="degree"):

    ## heuristic selection
    if heuristic == "degree":
        heuristic_hash = input_graph.degree()
    elif heuristic == "pagerank":
        heuristic_hash = nx.pagerank_numpy(input_graph, alpha=0.9)
    elif heuristic == "pagerank_scipy":
        heuristic_hash = nx.pagerank_scipy(input_graph, alpha=0.9)
    elif heuristic == "eigenvector":
        heuristic_hash = nx.eigenvector_centrality_numpy(input_graph)
    elif heuristic == "communicability":
        heuristic_hash = nx.communicability_centrality(input_graph)
    elif heuristic == "flow_betweenness":
        heuristic_hash = nx.current_flow_betweenness_centrality(input_graph)
    elif heuristic == "closeness":
        heuristic_hash = nx.closeness_centrality(input_graph)
    elif heuristic == "betweenness":
        heuristic_hash = nx.betweenness_centrality(input_graph)
    else:
        raise ValueError("Please select a valid heuristic..")

    ## first identify the triplets
    G = input_graph
    result_triplets = []
    crossed = set()
    for node in G:
        crossed.add(node)
        done_count = set()
        neighbours = set(G[node])

        for neigh in neighbours:
            if neigh in crossed:
                continue
            done_count.add(neigh)
            for both in neighbours.intersection(G[neigh]):
                if both in crossed or both in done_count:
                    continue
                result_triplets.append((node, neigh, both))

    ## remove triplets in some manner
    for triplet in result_triplets:

        ## get the node degrees
        triplet_degrees = {heuristic_hash[node]: node for node in triplet}
        sorted_keys = sorted(list(triplet_degrees.keys()))
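        # Ties in the heuristic score collapse in the dict above, so the length check
        # below only prunes triplets whose three nodes have distinct scores.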
        if len(sorted_keys) == 3:
            try:
                input_graph.remove_edge(triplet_degrees[sorted_keys[0]],
                                        triplet_degrees[sorted_keys[1]])
            except nx.NetworkXError:
                ## edge was already removed while pruning an earlier triplet
                pass

    outgraph = nx.DiGraph()
    degree_list = [heuristic_hash[node] for node in heuristic_hash]
    threshold_degree = np.percentile(degree_list, degree_threshold)
    candidate_hotspots = [
        node for node, value in heuristic_hash.items()
        if value > threshold_degree
    ]

    print("Nodes to begin the iteration: ", len(candidate_hotspots))

    ## a queue of nodes to be processed..
    to_process = []

    ## a list of already processed nodes..
    already_processed = []

    ## initiate the nodes
    for node in candidate_hotspots:
        to_process.insert(0, node)

    while len(to_process) != 0:
        for step in (range(0, int(step_size))):
            ## go to a specific depth
            if len(to_process) != 0:
                start_node = to_process.pop()
            else:
                break
            if start_node not in already_processed:
                already_processed.append(start_node)
                for neigh in set(input_graph[start_node]):
                    if neigh not in already_processed and neigh not in candidate_hotspots:
                        ## Querying
                        if step > 0:
                            to_process.append(neigh)
                        else:
                            to_process.insert(0, neigh)
                        ## Edge construction step
                        if heuristic_hash[neigh] < heuristic_hash[start_node]:
                            outgraph.add_edge(start_node, neigh)
                        else:
                            outgraph.add_edge(neigh, start_node)

    print(nx.info(outgraph))
    if nx.is_directed_acyclic_graph(outgraph):
        return outgraph
    else:
        raise ValueError('Graph could not be converted to a DAG.')
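A minimal usage sketch for g2o, assuming the NetworkX 1.x dict semantics the function relies on (G.degree() returning a plain dict); the threshold and step size are illustrative, not tuned:

import networkx as nx

toy = nx.karate_club_graph()
# Treat roughly the top quartile of nodes (by degree) as hotspots and expand two hops per step.
dag = g2o(toy, degree_threshold=75, step_size=2, heuristic="degree")
print(nx.is_directed_acyclic_graph(dag))  # g2o raises ValueError instead of returning a cyclic graph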
Example n. 43
import networkx as nx
import plot_multigraph
import matplotlib.pylab as plt

n = 80
p = 10. / n
G = nx.fast_gnp_random_graph(n, p, seed=42)

def to_list(dict_):
  return [dict_[k] for k in G.nodes()]

graph_colors = [
  ("degree", to_list(nx.degree_centrality(G))),
  ("betweenness", to_list(nx.betweenness_centrality(G))),
  ("load", to_list(nx.load_centrality(G))),
  ("eigenvector", to_list(nx.eigenvector_centrality_numpy(G))),
  ("closeness_centrality", to_list(nx.closeness_centrality(G))),
  ("current_flow_closeness", to_list(nx.current_flow_closeness_centrality(G))),
  ("current_flow_betweenness", to_list(nx.current_flow_betweenness_centrality(G))),
  ("katz", to_list(nx.katz_centrality_numpy(G))),
  ("communicability", to_list(nx.communicability_centrality(G))),
]

fig = plot_multigraph.plot_color_multigraph(G, graph_colors, 3, 3, node_size=50)
plt.savefig('graphs/centrality.png', facecolor=fig.get_facecolor())
def com_center(net):
    return distriCentra(nx.communicability_centrality(net).values(),
                        nx.communicability_centrality(star(net)).values(),
                        'communicability')
def make_net(centrality_name, in_path, out_path):
	#sample code
		#import _2_time_based_data_network_feature
		#make_net_in_path = "../3.time_based_data/1.cite_relation_devide/"
		#make_net_out_path = "../3.time_based_data/2.centrality_data/"
		#_2_time_based_data.make_net( "in_degree", make_net_in_path, make_net_out_path)

	# Build the network, compute the chosen centrality, and save it.
	import networkx as nx
	global Dump
	Dump = {}
	make_net_initialize(in_path)
	start_time = time.time()
	temp_start_time = time.time()

	print "=============		make_net start:" + centrality_name + "		=============="
	print "=============		from 1951 to 2015		=============="

	for year in range(1951, 2016):
		print year
		f_in = open(in_path + str(year) + "_cite.csv","r")
		lines = f_in.readlines()
		f_in.close()
		edge_list = []

		for line in lines:
			data = line.split(",")
			data_tuple = (data[0].strip(), data[1].strip())
			edge_list.append(data_tuple)

		Net = nx.DiGraph(edge_list)
		Cen_in = {}
		if (centrality_name == "in_degree"):
			Cen_in = nx.in_degree_centrality(Net)
		elif (centrality_name == "degree"):
			Cen_in = nx.degree_centrality(Net)
		elif (centrality_name == "eigenvector"):
			Cen_in = nx.eigenvector_centrality_numpy(Net)
		elif (centrality_name == "katz"):
			Cen_in = nx.katz_centrality(Net)
		elif (centrality_name == "pagerank"):
			Cen_in = nx.pagerank(Net)
		elif (centrality_name == "communicability"):
			Net = nx.Graph(edge_list)
			Cen_in = nx.communicability_centrality(Net)
		elif (centrality_name == "load"):
			Cen_in = nx.load_centrality(Net)
		
		for j in Cen_in:
			key = j
			val = Cen_in[j]
			Dump[key][year] = val

	# Code that saves the results
	f_out = open(out_path + centrality_name +"_centrality.csv", "w")
	for key in Dump:
		line = str(key)
		for year in range(1951, 2016):
			data = Dump[key].get(year, 0)
			line = line + ","+ str(data)
		line = line + "\n"
		f_out.write(line)
	f_out.close()

	print "=============		make_net end			=============="
	print(centrality_name + " takes %s seconds" % (time.time() - temp_start_time))
	temp_start_time = time.time()
def com_center(net):
    return distriCentra(
        nx.communicability_centrality(net).values(),
        nx.communicability_centrality(star(net)).values(), 'communicability')
sys.stdout.write(" done\n")

sys.stdout.write("calculating PageRank Centrality . . .")
PageRankDict = nx.pagerank(G)
sys.stdout.write(" done\n")

sys.stdout.write("calculating Closeness Centrality . . .")
ClosenessDict = nx.closeness_centrality(G)
sys.stdout.write(" done\n")

sys.stdout.write("calculating Betweenness Centrality . . .")
BetweennessDict = nx.betweenness_centrality(G)
sys.stdout.write(" done\n")

sys.stdout.write("calculating Communicability Centrality . . .")
CommunicabilityDict = nx.communicability_centrality(G)
sys.stdout.write(" done\n")

print "="*100

import re
csvRegion = list(csv.reader(open("Regions.csv")))
regions = {}
for i in range(1,len(csvRegion)) :
    regions[csvRegion[i][2].strip()] = [re.sub(r'(,)', '/',csvRegion[i][3]), re.sub(r'(,)', '/',csvRegion[i][4])]

import matplotlib.pyplot as plt
nx.draw(G)  # networkx draw()
plt.show()

sys.stdout.write("Clustering by depth 3 . . .")
Example n. 48
	def communicability_centrality_sum(self):
		if self.communicability_centrality_dict is None:
			self.communicability_centrality_dict = nx.communicability_centrality(self.graph)
		return self.communicability_centrality_dict[self.node_1] + self.communicability_centrality_dict[self.node_2]
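The method above expects an enclosing object that carries the graph, the node pair, and a cache slot; a minimal sketch of such a wrapper (every name beyond those referenced in the method body is an assumption):

import networkx as nx

class NodePairFeatures(object):  # hypothetical container, name assumed
    def __init__(self, graph, node_1, node_2):
        self.graph = graph
        self.node_1 = node_1
        self.node_2 = node_2
        # Lazily filled by communicability_centrality_sum so repeated pair
        # queries on the same graph reuse one expensive computation.
        self.communicability_centrality_dict = None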