forked from sashagutfraind/musketeer
/
graphutils.py
1250 lines (1075 loc) · 45.3 KB
/
graphutils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# -*- coding: utf-8 -*-
'''
Multiscale Entropic Network Generator 2 (MUSKETEER2)
Copyright (c) 2011-2015 by Alexander Gutfraind and Ilya Safro.
All rights reserved.
Use and redistribution of this file is governed by the license terms in
the LICENSE file found in the project's top-level directory.
Code to assist the generator
'''
'''
Contains code derived from NetworkX - Copyright (c) by various authors.
'''
import os
import time
import numpy as np
import scipy.sparse
import numpy.random as npr
import random, sys
import networkx as nx
import pdb
import cPickle
import algorithms
import community
# Make numpy raise an exception on every floating-point error category instead of warning.
np.seterr(all='raise')
# Current local time formatted as a string, e.g. for building timestamped names.
timeNow = lambda : time.strftime('%Y_%m_%d__%H_%M_%S', time.localtime())
# Example command line for invoking the generator script.
MUSKETEER_EXAMPLE_CMD = 'python musketeer.py -p "{\'edge_edit_rate\':[0.1,0.01]}" -f data-samples/karate.adjlist -t adjlist -o output/karate_replica.edges'
None_node = None # sentinel key indicating a node that is not real
#those are short methods (like lambda, but supporting pickling)
def a_avg_degree(G):
    '''Return the average of the node degrees reported by nx.degree(G).'''
    degree_by_node = nx.degree(G)
    return np.average(degree_by_node.values())
def a_degree_connectivity(G):
    '''Return the average of the values of nx.average_degree_connectivity(G).'''
    connectivity_by_degree = nx.average_degree_connectivity(G)
    return np.average(connectivity_by_degree.values())
def a_s_metric(G):
    '''Return the un-normalized s-metric of G as computed by nx.s_metric.'''
    s_value = nx.s_metric(G, normalized=False)
    return s_value
def a_eccentricity(G):
    '''Return the average of the node eccentricities reported by nx.eccentricity(G).'''
    eccentricity_by_node = nx.eccentricity(G)
    return np.average(eccentricity_by_node.values())
def a_avg_shortest(G):
    '''Return the sampled estimate of the average shortest-path length (at most 100 sources).'''
    estimate = average_all_pairs_shortest_path_estimate(G, max_num_sources=100)
    return estimate
def a_avg_harmonic(G):
    '''Return the sampled inverse-shortest-path estimate of G (at most 100 sources).'''
    estimate = average_all_pairs_inverse_shortest_path_estimate(G, max_num_sources=100)
    return estimate
def a_avg_between(G):
    '''Return the average normalized betweenness centrality over the nodes of G.'''
    betweenness_by_node = nx.betweenness_centrality(G, normalized=True)
    return np.average(betweenness_by_node.values())
# Sentinel returned by metric functions in this module when the metric cannot be
# computed for the given graph (for example, when the graph is disconnected).
METRIC_ERROR = -999999
#any negative value is suitable, but not None or -np.inf (either might complicate code for statistical analysis)
def average_all_pairs_shortest_path(G):
    '''
    Exact mean shortest-path length of G.

    The mean is taken over every (source, target) entry reported by
    nx.all_pairs_shortest_path_length, so each node's zero distance to itself
    is included.
    Returns METRIC_ERROR when G has more than one connected component.
    '''
    if nx.number_connected_components(G) > 1:
        return METRIC_ERROR
    all_lengths = nx.all_pairs_shortest_path_length(G)
    distance_total = 0.0
    pair_count = 0.0
    for src in all_lengths:
        row = all_lengths[src]
        for dst in row:
            distance_total += row[dst]
            pair_count += 1.0
    return distance_total/pair_count
def average_all_pairs_shortest_path_GT(G):
    '''
    Mean shortest-path length of G computed with graph_tool.

    The sum of all pairwise distances is divided by n*n, so self-pairs are
    counted.  Edge weights are not passed to graph_tool.
    Returns METRIC_ERROR when G has more than one connected component.
    '''
    if nx.number_connected_components(G) > 1:
        return METRIC_ERROR
    import graph_tool, graph_tool.topology
    converted = make_gt_graph(G)
    gt_graph = converted['G']
    distance_map = graph_tool.topology.shortest_distance(g=gt_graph, source=None, target=None, dense=False)
    total_distance = 0.
    for vertex in gt_graph.vertices():
        # .a exposes the distances from this vertex as an array
        total_distance += sum(distance_map[vertex].a)
    vertex_count = gt_graph.num_vertices()
    return total_distance / (vertex_count * vertex_count)
def average_all_pairs_shortest_path_estimate(G, max_num_sources=100):
    '''
    Estimate the mean shortest-path length of G from a sample of source nodes.

    For each sampled source the average distance to all nodes reachable from
    it (the source itself included, at distance 0) is computed; the estimate
    is the mean of these per-source averages.

    The sampled sources are stored in
    G._musketeer_data['all_pairs_shortest_path_estimate_SOURCES'] and reused
    by later calls on the same graph object, after dropping any that are no
    longer in G.

    Parameters
    ----------
    G : NetworkX graph
    max_num_sources : upper bound on the number of source nodes to use

    Returns
    -------
    The estimate, or METRIC_ERROR when G is disconnected or no source can be
    sampled (empty graph or max_num_sources < 1).
    '''
    if nx.number_connected_components(G) > 1:
        return METRIC_ERROR
    num_sources = min(G.number_of_nodes(), max_num_sources)
    if num_sources < 1:
        return METRIC_ERROR

    cache_key = 'all_pairs_shortest_path_estimate_SOURCES'
    if not hasattr(G, '_musketeer_data'):
        G._musketeer_data = {}
    # keep previously used sources that still exist, but never more than needed
    baseline_sources = [s for s in G._musketeer_data.get(cache_key, []) if s in G]
    baseline_sources = baseline_sources[:num_sources]
    # top up only from nodes not already chosen, so no source is counted twice
    already_chosen = set(baseline_sources)
    candidates = [u for u in G.nodes() if u not in already_chosen]
    sampled_sources = baseline_sources + random.sample(candidates, num_sources - len(baseline_sources))
    G._musketeer_data[cache_key] = sampled_sources

    total_distance = 0.0
    for source in sampled_sources:
        lengths = nx.single_source_shortest_path_length(G, source)
        total_distance += np.average(list(lengths.values()))
    return total_distance / num_sources
def average_all_pairs_shortest_path_estimate_GT(G, max_num_sources=100):
    '''
    Sampled estimate of the mean shortest-path length of G, using graph_tool.

    Up to max_num_sources source nodes are drawn with random.sample; for each
    one the average of the graph_tool distance array is taken, and the mean
    of those averages is returned.
    Returns METRIC_ERROR when G has more than one connected component.
    '''
    if nx.number_connected_components(G) > 1:
        return METRIC_ERROR
    import graph_tool, graph_tool.topology
    converted = make_gt_graph(G)
    gt_graph = converted['G']
    index_of = converted['node_to_num']
    num_sources = min(G.number_of_nodes(), max_num_sources)
    distance_sum = 0.0
    for source in random.sample(G.nodes(), num_sources):
        gt_source = gt_graph.vertex(index_of[source])
        per_source = graph_tool.topology.shortest_distance(g=gt_graph, source=gt_source, target=None, dense=False)
        distance_sum += np.average(per_source.a)
    return distance_sum / num_sources
def average_all_pairs_inverse_shortest_path_estimate(G, max_num_sources=100):
    '''
    Sampled estimate related to the ''efficiency'' of the graph.

    For every sampled source the inverse distances 1/d(source, node) to all
    other reachable nodes are summed; the function returns
        num_sources / (sum of all those inverse distances).
    Unreachable nodes simply contribute nothing, so the value is defined for
    disconnected graphs as well.

    The sampled sources are stored in
    G._musketeer_data['all_pairs_inverse_shortest_path_estimate_SOURCES'] and
    reused by later calls on the same graph object, after dropping any that
    are no longer in G.

    Parameters
    ----------
    G : NetworkX graph
    max_num_sources : upper bound on the number of source nodes to use

    Returns
    -------
    The estimate, or METRIC_ERROR when G has no edges, when no source can be
    sampled, or when none of the sampled sources reaches another node.
    '''
    if G.number_of_edges() < 1:
        return METRIC_ERROR
    num_sources = min(G.number_of_nodes(), max_num_sources)
    if num_sources < 1:
        return METRIC_ERROR

    cache_key = 'all_pairs_inverse_shortest_path_estimate_SOURCES'
    if not hasattr(G, '_musketeer_data'):
        G._musketeer_data = {}
    # keep previously used sources that still exist, but never more than needed
    baseline_sources = [s for s in G._musketeer_data.get(cache_key, []) if s in G]
    baseline_sources = baseline_sources[:num_sources]
    # top up only from nodes not already chosen, so no source is counted twice
    already_chosen = set(baseline_sources)
    candidates = [u for u in G.nodes() if u not in already_chosen]
    sampled_sources = baseline_sources + random.sample(candidates, num_sources - len(baseline_sources))
    G._musketeer_data[cache_key] = sampled_sources

    tally = 0.0
    for source in sampled_sources:
        lengths = nx.single_source_shortest_path_length(G, source)
        tally += sum(1.0/lengths[node] for node in lengths if node != source)
    if tally == 0.0:
        # every sampled source is isolated: the ratio below would divide by zero
        return METRIC_ERROR
    return num_sources*(1.0/tally)
def average_flow_closeness(G):
    '''
    Mean current-flow closeness centrality over the nodes of G.
    Returns METRIC_ERROR when G has more than one connected component.
    '''
    if nx.number_connected_components(G) > 1:
        return METRIC_ERROR
    closeness = nx.algorithms.current_flow_closeness_centrality(G)
    running_total = 0.0
    node_count = 0.0
    for node in closeness:
        running_total += closeness[node]
        node_count += 1
    return running_total/node_count
def average_eigenvector_centrality(G):
    '''
    Mean eigenvector centrality over the nodes of G.

    warning: this algorithm might not be suitable for disconnected graphs,
    since it creates additional zero eigenvalues; for such graphs
    METRIC_ERROR is returned instead.
    '''
    if nx.number_connected_components(G) > 1:
        return METRIC_ERROR
    # positional arguments 500 and 0.0001 are passed through unchanged
    centrality = nx.algorithms.eigenvector_centrality(G, 500, 0.0001)
    running_total = 0.0
    node_count = 0.0
    for node in centrality:
        running_total += centrality[node]
        node_count += 1
    return running_total/node_count
def algebraic_distance_dense(G, params={}):
    '''
    Algebraic distance between adjacent nodes, using dense matrices.

    takes: graph G, computational parameters (dict, all optional):
        'metric'             - only 'Linfinity' is implemented (default)
        'num_relaxations_r'  - relaxation sweeps per test vector (default 10)
        'num_test_vectors_K' - number of random test vectors (default 20)
        'lazy_walk_param_w'  - relaxation weight w (default 0.3)
    returns:
        a distance dictionary, d[node1][node2] giving the distance between
        the nodes.  Entries exist for every pair of adjacent nodes (both
        orders) and for the diagonal d[node][node] == 0.
    raises:
        Exception if a metric other than 'Linfinity' is requested
    ref:
        RELAXATION-BASED COARSENING AND
        MULTISCALE GRAPH ORGANIZATION
        DORIT RON, ILYA SAFRO, AND ACHI BRANDT
    '''
    metric = params.get('metric', 'Linfinity')
    num_relaxations_r = params.get('num_relaxations_r', 10)
    num_test_vectors_K = params.get('num_test_vectors_K', 20)
    lazy_walk_param_w = params.get('lazy_walk_param_w', 0.3)
    if metric != 'Linfinity':
        raise Exception('Metric other than Linifinity not implemented')

    H = nx.convert_node_labels_to_integers(G, label_attribute='orig_label')
    # sorted so that row i of every matrix/vector below belongs to node label i,
    # which is how the test vectors are indexed further down
    all_nodes = sorted(H.nodes())

    distance = {}
    for node1 in all_nodes:
        distance[node1] = {}
        for node2 in H.neighbors(node1):
            if node1 < node2: #save time
                distance[node1][node2] = -np.inf

    #wishlist: sparse matrices
    diag_vec = []
    for u in all_nodes:
        val = 0. + sum(edge.get('weight', 1.) for edge in H.edge[u].values())
        if val == 0.:
            #for singletons, we set diag=1 so that the inverse below is defined
            val = 1.
        diag_vec.append(val)
    diag_vec = np.array(diag_vec)
    diag_vec_inv = 1./diag_vec
    DIAG = np.diag(diag_vec)
    DIAGinv = np.diag(diag_vec_inv)
    # the adjacency matrix must come from H: all_nodes are H's integer labels
    ADJ = nx.adj_matrix(H, nodelist=all_nodes)
    # NOTE(review): the operator applied is w * D^-1 * (D - A).  The relaxation in
    # the cited paper presumably uses w * D^-1 * A -- confirm before changing,
    # since it alters every distance produced here.
    w_times_Dinv_times_LAP = lazy_walk_param_w * np.dot(DIAGinv,DIAG-ADJ)

    for t in range(num_test_vectors_K):
        x = npr.rand(H.number_of_nodes(),1) - 0.5
        x = x / x.sum()
        for iteration in range(num_relaxations_r):
            x = (1-lazy_walk_param_w)*x + np.dot(w_times_Dinv_times_LAP, x)
        #maximize over the trial vectors: d(i,j) = max_{t=1..K} |x_t(i) - x_t(j)|
        for node1 in all_nodes:
            for node2 in H.neighbors(node1):
                dis = np.abs((x[node1]-x[node2])[0,0])
                if node1 < node2 and dis > distance[node1][node2]: #to save time, compute just the upper triangle of the matrix
                    distance[node1][node2] = dis

    #generate the distance dictionary in the original node labels, and including the diagonal and lower triangle
    ret = {}
    for node1 in G:
        ret[node1] = {node1:0.}
    for u in H:
        node1 = H.node[u]['orig_label']
        for v in H.neighbors(u):
            node2 = H.node[v]['orig_label']
            if u < v:
                d = distance[u][v]
            elif v < u:
                d = distance[v][u]
            else:
                d = 0.
            ret[node1][node2] = d
            ret[node2][node1] = d
    return ret
def algebraic_distance_sparse(G, params={}):
    '''
    Algebraic distance between nodes, using scipy sparse matrices.

    takes: graph G, computational parameters (dict, all optional):
        'metric'              - only 'Linfinity' is implemented (default)
        'num_relaxations_r'   - relaxation sweeps per test vector (default 10)
        'num_test_vectors_K'  - number of random test vectors (default 50)
        'lazy_walk_param_w'   - relaxation weight w (default 0.5)
        'all_pairs_distance'  - if true, report distances for every pair of
                                non-singleton nodes; otherwise only for
                                adjacent nodes (default False)
    returns:
        a distance dictionary, d[node1][node2] giving the distance between
        the nodes.  Every node of G, including degree-0 nodes, has at least
        its diagonal entry d[node][node] == 0.
    raises:
        Exception if a metric other than 'Linfinity' is requested
    ref:
        RELAXATION-BASED COARSENING AND
        MULTISCALE GRAPH ORGANIZATION
        DORIT RON, ILYA SAFRO, AND ACHI BRANDT
    '''
    metric = params.get('metric', 'Linfinity')
    num_relaxations_r = params.get('num_relaxations_r', 10)
    num_test_vectors_K = params.get('num_test_vectors_K', 50)
    lazy_walk_param_w = params.get('lazy_walk_param_w', 0.5)
    all_pairs_distance = params.get('all_pairs_distance', False)
    if metric != 'Linfinity':
        raise Exception('Metric other than Linifinity is not implemented')

    # degree-0 nodes would give a zero on the diagonal, so they are left out of the relaxation
    singletons = [u for u in G.nodes() if G.degree(u) == 0]
    if len(singletons) == 0:
        Gcopy = G
    else:
        Gcopy = G.copy()
        Gcopy.remove_nodes_from(singletons)
    H = nx.convert_node_labels_to_integers(Gcopy, label_attribute='orig_label') #this does preserve node and edge weights
    nn = H.number_of_nodes()
    full_range = list(range(nn))

    def targets_of(node):
        # nodes whose distance from <node> is tracked
        return full_range if all_pairs_distance else H.neighbors(node)

    ret = {}
    for node1 in G:
        ret[node1] = {node1:0.} #important for singletons
    if nn == 0:
        # nothing but singletons (or an empty graph): only the diagonal exists
        return ret

    distance = {}
    for node1 in full_range:
        distance[node1] = {}
        for node2 in targets_of(node1):
            if node1 < node2: #save time
                distance[node1][node2] = -np.inf

    # NOTE(review): the diagonal uses the unweighted degree while the adjacency
    # matrix below carries edge weights -- confirm this is intended.
    diag_vec = np.array([H.degree(u) for u in full_range], dtype=float)
    diag_vec_inv = 1./diag_vec
    DIAG = scipy.sparse.csr_matrix((diag_vec, (full_range,full_range)), shape=(nn,nn))
    DIAGinv = scipy.sparse.csr_matrix((diag_vec_inv, (full_range,full_range)), shape=(nn,nn))
    # rows are ordered 0..nn-1 so that vector index == H node label
    ADJ = nx.to_scipy_sparse_matrix(H, nodelist=full_range, format='csr')
    w_times_Dinv_times_LAP = lazy_walk_param_w * DIAGinv.dot(DIAG-ADJ)

    for t in range(num_test_vectors_K):
        x = npr.rand(nn,1)
        for iteration in range(num_relaxations_r):
            x = (1-lazy_walk_param_w)*x + w_times_Dinv_times_LAP.dot(x)
        #maximize over the trial vectors: d(i,j) = max_{t=1..K} |x_t(i) - x_t(j)|
        for node1 in full_range:
            for node2 in targets_of(node1):
                if node1 < node2: #to save time, compute just the upper triangle of the matrix
                    dis = abs(x[node1,0]-x[node2,0])
                    if dis > distance[node1][node2]:
                        distance[node1][node2] = dis

    #generate the distance dictionary in the original node labels, and including the diagonal and lower triangle
    for u in full_range:
        node1 = H.node[u]['orig_label']
        for v in targets_of(u):
            node2 = H.node[v]['orig_label']
            if u < v:
                d = distance[u][v]
            elif v < u:
                d = distance[v][u]
            else:
                d = 0.
            ret[node1][node2] = d
            ret[node2][node1] = d
    return ret
# Implementation bound to the name algebraic_distance: the dense variant is
# active; the sparse variant is kept below as a commented-out alternative.
algebraic_distance = algebraic_distance_dense
#algebraic_distance = algebraic_distance_sparse
def bfs_distance_with_horizon(G, source, horizon=4, blocked_node=None):
    '''
    Breadth-first hop distances from source, limited to <horizon> hops.

    Returns a dict mapping each node reached within <horizon> hops to its hop
    count (source maps to 0).  Nodes further away are absent from the dict,
    i.e. considered infinitely far.  No path is allowed through blocked_node,
    and blocked_node itself is never assigned a distance.
    '''
    adjacency = G.adj
    distance_source = {source: 0}
    frontier = set(adjacency[source].keys())
    for hops in range(1, horizon + 1):
        next_frontier = []
        for node in frontier:
            if node in distance_source or node == blocked_node:
                continue
            distance_source[node] = hops
            next_frontier.extend(adjacency[node].keys())
        frontier = set(next_frontier)
    return distance_source
def color_by_3d_distances(G, verbose):
    '''
    Color every edge of G by the Euclidean distance between its endpoints.

    Each node must carry a 'pos' attribute holding comma-separated
    coordinates.  Edge lengths are divided by the longest edge length and
    mapped through the matplotlib 'RdBu' colormap; the first three color
    components are stored as a space-separated string in the edge's 'color'
    attribute.  G is modified in place and returned.
    '''
    import matplotlib.pylab as pylab
    cm = pylab.get_cmap('RdBu') #UFL
    if verbose:
        print('Computing edge colors ...')

    positions = {}

    def position_of(node):
        # parse a node's 'pos' string once and remember the result
        if node not in positions:
            positions[node] = np.array([float(p) for p in G.node[node]['pos'].split(',')])
        return positions[node]

    max_dis = 0
    for u, v, data in G.edges_iter(data=True):
        u_pos = position_of(u)
        v_pos = position_of(v)
        dis = np.sqrt(np.sum(np.power(u_pos-v_pos, 2)))
        max_dis = max(max_dis, dis)
        # parked on the edge until the longest edge is known
        data['dis'] = dis

    for u, v, data in G.edges_iter(data=True):
        dis = data.pop('dis')
        data['color'] = '%.3f %.3f %.3f'%tuple(cm(dis/max_dis)[:3])
    return G
def color_new_nodes_and_edges(G, original, params=None):
    '''
    Highlight the components of G that are absent from the graph <original>.

    Nodes and edges also present in <original> are colored 'black', all
    others 'blue'; every node label is set to the empty string.  G is
    modified in place and returned.
    use the option 'post_processor':graphutils.color_new_nodes_and_edges
    '''
    for node in G:
        attrs = G.node[node]
        attrs['label'] = ''
        attrs['color'] = 'black' if node in original else 'blue'
    for u, v, edge_data in G.edges_iter(data=True):
        edge_data['color'] = 'black' if original.has_edge(u, v) else 'blue'
    return G
def compare_nets(old_G, new_G, metrics=None, params={}):
    '''
    Report on the differences between two networks

    Parameters
    ----------
    old_G, new_G : the graphs to compare
    metrics : iterable of dicts with the keys 'name', 'function', 'weight',
        'optional' and 'runningtime'; defaults to default_metrics.  Metrics
        with optional > 0 or runningtime above the bound are skipped.
    params : dict, optional keys:
        'verbose' (default True), 'metric_runningtime_bound' (default 2),
        'reporting_precision' (default 2)

    Returns
    -------
    (mean_error, errors) where errors[name] = (old_value, new_value, error)
    and error is the weighted relative difference (NaN when it cannot be
    computed).  mean_error averages the absolute non-NaN errors and is NaN
    when there are none.
    '''
    if metrics is None:
        metrics = default_metrics
    verbose = params.get('verbose', True)
    runningtime_bound = params.get('metric_runningtime_bound', 2)
    precision = params.get('reporting_precision', 2)
    sci_format = '\t%.'+str(precision)+'e'
    fixed_format = '\t%.'+str(precision)+'f '

    #TODO: at the moment, graph_graph_delta cannot find edges which were deleted then inserted back: it changes the edge attribute data
    errors = {}
    if verbose:
        delta = graph_graph_delta(old_G, new_G)
        num_changed_nodes = len(delta['new_nodes']) + len(delta['del_nodes'])
        num_changed_edges = len(delta['new_edges']) + len(delta['del_edges'])
        if old_G.number_of_nodes() > 0:
            print('New or deleted Nodes: %d (%.1f%%)'%(num_changed_nodes, 100*float(num_changed_nodes)/old_G.number_of_nodes()))
            if old_G.number_of_edges() > 0:
                # guarded: an edgeless old graph would otherwise divide by zero
                print('New or deleted Edges: %d (%.1f%%)'%(num_changed_edges, 100*float(num_changed_edges)/old_G.number_of_edges()))
            print('')
        print('Name\t\t\tOld G\t\tNew G\t\tRelative Error')
        print('statistics start ------------------------------------------------------------')

    for met_info in metrics:
        met_name = met_info['name']
        met_func = met_info['function']
        met_wt = met_info['weight']
        if met_info['optional'] > 0 or met_info['runningtime'] > runningtime_bound:
            continue
        try:
            if verbose:
                sys.stdout.write(met_name.center(20))
                sys.stdout.flush()
            old_value = met_func(old_G)
            new_value = met_func(new_G)
            # a relative error needs a non-zero baseline and two successfully computed values
            if old_value != 0. and abs(old_value-METRIC_ERROR) > 1 and abs(new_value-METRIC_ERROR) > 1:
                error = met_wt*float(new_value-old_value)/old_value
            else:
                error = np.nan
            if verbose:
                outstr = ''
                for value in (old_value, new_value):
                    # very small or very large values are shown in scientific notation
                    if np.abs(value) < 0.1 or np.abs(value) > 1000:
                        outstr += sci_format%value
                    else:
                        outstr += fixed_format%value
                outstr += ('\t%.'+str(precision)+'f%%')%(100*error)
                print(outstr)
            errors[met_name] = (old_value, new_value, error)
        except Exception as inst:
            print('')
            print('Warning: could not compute '+met_name + ': '+str(inst))

    # NaN never compares equal to anything (itself included), so it has to be
    # filtered with isnan; otherwise one NaN turns the whole mean into NaN
    usable_errors = [np.abs(v[2]) for v in errors.values() if not np.isnan(v[2]) and abs(v[2]-METRIC_ERROR) > 1]
    if usable_errors:
        mean_error = np.average(usable_errors)
    else:
        mean_error = np.nan
    if verbose:
        print('statistics end ------------------------------------------------------------')
        print('Mean absolute difference: %.2f%%'%(100*mean_error))
    return mean_error, errors
def degree_assortativity(G):
    '''
    Degree assortativity of G.

    this wrapper helps avoid error due to change in interface name: the
    first of nx.degree_assortativity_coefficient / nx.degree_assortativity
    that exists is used.  Raises ValueError when neither is available.
    '''
    for function_name in ('degree_assortativity_coefficient', 'degree_assortativity'):
        if hasattr(nx, function_name):
            return getattr(nx, function_name)(G)
    raise ValueError('Cannot compute degree assortativity: method not available')
def drake_hougardy_slow(G):
    '''
    Weighted matching by path growing, in an implementation close to the
    pseudo-code in the Drake-Hougardy paper.

    Works on a scratch copy of G from which visited nodes are removed.
    Returns a dict mate with mate[v] == w and mate[w] == v for every matched
    pair; unmatched nodes are absent.
    '''
    assert not G.is_directed()
    H = nx.Graph()
    for u, v, d in G.edges(data=True):
        H.add_edge(u, v, d)
    H_adj = H.adj

    matchings = ([], [])
    weights = [0., 0.]
    ind = 0
    node_iter = G.nodes_iter() #use G, not H
    while H.number_of_edges() > 0:
        try:
            x = next(node_iter)
        except StopIteration:
            break
        # grow a path from x, assigning its edges alternately to the two matchings
        while x in H and len(H_adj[x]) > 0:
            nbs = H.edge[x]
            candidates = [(nb, nbs[nb].get('weight', 1.0)) for nb in nbs]
            y, wt_y = max(candidates, key=lambda pair: pair[1])
            matchings[ind].append((x, y))
            weights[ind] += wt_y
            H.remove_node(x)
            x = y
            ind = (ind + 1)%2

    if weights[0] > weights[1]:
        chosen = matchings[0]
    else:
        chosen = matchings[1]
    return dict(chosen + [(b, a) for (a, b) in chosen])
def drake_hougardy(G, maximize=True):
    '''Compute a weighted matching of G using the Drake-Hougardy path growing algorithm.[1]
    The matching is guaranteed to have weight >= 0.5 of the maximum weight matching

    Parameters
    ----------
    G: NetworkX undirected graph
       edge weights are read from the 'weight' edge attribute (default 1.0)
    maximize: add an additional step to find edges missed by the matching, to return a maximal matching [2]

    Returns
    -------
    mate : dictionary
       The matching is returned as a dictionary, mate, such that
       mate[v] == w if node v is matched to node w.  Unmatched nodes do not occur as a key in mate.
       for convenience, iff mate[v] == w then mate[w] == v
       A node is never matched to itself, even if it has a self-loop.

    References
    ----------
    .. [1] "A simple approximation algorithm for the weighted matching problem"
       Doratha E. Drake, Stefan Hougardy. Information Processing Letters, 2002.
    .. [2] "Linear time local improvements for weighted matchings in graphs"
       Doratha E. Drake, Stefan Hougardy. Report.
    '''
    assert not G.is_directed()
    G_adj = G.adj

    Matchings = ([],[])
    Weights = [0.,0.]
    ind = 0
    inspected = set() #iff u in inspected, it has been already visited by the path growing
    num_inspected_halfedges = 0
    num_edges = G.number_of_edges()
    nodes = list(G.nodes())
    try:
        npr.shuffle(nodes) #wishlist: why does it fail on npr.shuffle([u'903', 1]) ??
    except Exception:
        random.shuffle(nodes) #wishlist: might be too slow...

    for x in nodes:
        if num_inspected_halfedges >= 2*num_edges:
            # every edge has been seen from both ends: nothing left to grow
            break
        # grow a path from x, assigning its edges alternately to the two matchings
        while x not in inspected:
            inspected.add(x)
            nbs = G_adj[x]
            num_inspected_halfedges += len(nbs)
            nb_weights = [(nb,nbs[nb].get('weight', 1.0)) for nb in nbs if nb not in inspected]
            if not nb_weights:
                break
            y,wt_y = max(nb_weights, key=lambda pair:pair[1])
            Matchings[ind].append((x,y))
            Weights[ind] += wt_y
            x = y
            ind = (ind + 1)%2

    if Weights[0] > Weights[1]:
        heavier = Matchings[0]
    else:
        heavier = Matchings[1]
    best_matching = dict(heavier + [(y,x) for (x,y) in heavier])

    if not maximize:
        return best_matching

    # greedily add edges whose two endpoints were both left unmatched
    for x in G:
        if x in best_matching:
            continue
        nbs = G_adj[x]
        # nb != x: a self-loop must not match a node to itself
        nb_weights = [(nb,nbs[nb].get('weight', 1.0)) for nb in nbs if nb != x and nb not in best_matching]
        if not nb_weights:
            continue
        y,wt_y = max(nb_weights, key=lambda pair:pair[1])
        best_matching[x] = y
        best_matching[y] = x

    return best_matching
def graph_graph_delta(G, new_G, **kwargs):
    '''
    lists the changes in the two graphs, and reports the Jaccard similarity
    coefficient for nodes and for edges

    Returns a dict with the keys 'new_nodes', 'del_nodes', 'new_edges',
    'del_edges' (lists) and 'jaccard_nodes', 'jaccard_edges' (floats; 0. when
    the corresponding union is empty).
    '''
    del_nodes = [node for node in G if node not in new_G]
    del_edges = [edge for edge in G.edges() if not new_G.has_edge(*edge)]
    new_nodes = [node for node in new_G if node not in G]
    new_edges = [edge for edge in new_G.edges() if not G.has_edge(*edge)]

    ret = {'new_nodes':new_nodes, 'del_nodes':del_nodes, 'new_edges':new_edges, 'del_edges':del_edges}

    # Jaccard = |kept| / |union|, where union = original + newly added
    num_nodes_original = G.number_of_nodes()
    num_edges_original = G.number_of_edges()
    node_union = num_nodes_original + len(new_nodes)
    edge_union = num_edges_original + len(new_edges)
    ret['jaccard_nodes'] = float(num_nodes_original-len(del_nodes))/node_union if node_union > 0 else 0.
    ret['jaccard_edges'] = float(num_edges_original-len(del_edges))/edge_union if edge_union > 0 else 0.
    return ret
def graph_sanity_test(G, params=None):
    '''
    Check whether G is usable; return True when it is.

    A message is printed and False returned for the first problem found: no
    nodes, a node labelled None, no edges, or a directed graph.  Otherwise
    self-loops are looked for; if any exist they are removed from G (in
    place) and False is returned.
    '''
    problem = None
    if G.number_of_nodes() == 0:
        problem = 'Warning: no nodes'
    elif G.has_node(None):
        problem = 'Node with label "None" is in the graph.'
    elif G.number_of_edges() == 0:
        problem = 'Warning: no edges'
    elif G.is_directed():
        problem = 'Warning: the algorithm DOES NOT support directed graphs for now'
    if problem is not None:
        print(problem)
        return False

    selfloops = G.selfloop_edges()
    if selfloops != []:
        print('Warning: self-loops detected (%d)'%len(selfloops))
        print('Deleting!')
        G.remove_edges_from(selfloops)
        return False
    return True
def load_graph(path, params={}, list_types_and_exit=False):
    '''reads graph from path, using automatic detection of graph type
       to attempt AUTODETECTION use params['graph_type'] = AUTODETECT

    Parameters
    ----------
    path : file to read
    params : dict, optional keys:
        'graph_type'  - a key of the loader table, or 'AUTODETECT' (default)
        'read_params' - extra keyword arguments for the explicitly requested loader
        'skip_sanity' - accept graphs without nodes (default False)
        'verbose'     - print progress messages (default True)
    list_types_and_exit : if true, only return the names of the known graph types

    Detection order when no graph has been loaded yet: the loader implied by
    the file extension, then a raw parse of the file's lines for that type,
    then every known loader in turn.

    Raises
    ------
    ValueError if path does not exist
    Exception if the requested type is unknown or no loader succeeds
    '''
    loaders = {
            'adjlist':nx.read_adjlist,
            'adjlist_implicit':read_adjlist_implicit,
            'adjlist_implicit_prefix':read_adjlist_implicit_prefix,
            'graph6':nx.read_graph6,
            'shp':nx.read_shp,
            'dot':nx.read_dot,
            'xdot':nx.read_dot,
            'graph6_list':nx.read_graph6_list,
            'sparse6':nx.read_sparse6,
            'edges':nx.read_edgelist,
            'elist':nx.read_edgelist,
            'edgelist':nx.read_edgelist,
            'graphml':nx.read_graphml,
            'sparse6_list':nx.read_sparse6_list,
            'gexf':nx.read_gexf,
            'leda':nx.read_leda,
            'weighted_edgelist':nx.read_weighted_edgelist,
            'gml':nx.read_gml,
            'multiline_adjlist':nx.read_multiline_adjlist,
            'yaml':nx.read_yaml,
            'gpickle':nx.read_gpickle,
            'pajek':nx.read_pajek,}
    raw_loaders = {
            'adjlist':nx.parse_adjlist,
            'elist':nx.parse_edgelist,
            'edgelist':nx.parse_edgelist,
            'gml':nx.parse_gml,
            'leda':nx.parse_leda,
            'multiline_adjlist':nx.parse_multiline_adjlist,
            'pajek':nx.parse_pajek,
            }
    known_extensions = {
            'gml':'gml',
            'dot':'dot',
            'xdot':'dot',
            'edges':'edgelist',
            'elist':'edgelist',
            'edgelist':'edgelist',
            'welist':'weighted_edgelist',
            'wdgelist':'weighted_edgelist',
            'weighted_edgelist':'weighted_edgelist',
            'adj':'adjlist',
            'alist':'adjlist',
            'adjlist':'adjlist',
            'adjlistImp':'adjlist_implicit',
            'adjlistImpPre':'adjlist_implicit_prefix',
            'pajek':'pajek',
            'net':'pajek',
            }

    if list_types_and_exit:
        return loaders.keys()

    def sane_graph(G, params={}):
        # a loaded graph is considered sane when it has at least one node
        return G.number_of_nodes() > 0

    def require_sane(G):
        # explicit check instead of assert, so it also runs under python -O
        if not (sane_graph(G) or skip_sanity):
            raise ValueError('Loaded graph failed the sanity test (it has no nodes)')

    G = None
    graph_type = params.get('graph_type', 'AUTODETECT')
    read_params= params.get('read_params', {})
    skip_sanity= params.get('skip_sanity', False)
    verbose    = params.get('verbose', True)

    if not os.path.exists(path):
        raise ValueError('Path does not exist: %s'%path)

    # 1. explicitly requested type
    if graph_type in loaders:
        if graph_type in ['edges', 'elist', 'edgelist']:
            print("Default weight is 1. To indicate weight, each line should use the format: node1 node2 {'weight':positive_wt}")
        if graph_type in ['adjlist']:
            print('Adjlist format: WARNING! assuming that the lines are: "u neighbor1 neighbor2 etc". Implicit "u" is not allowed')
        try:
            G = loaders[graph_type](path=path, **read_params)
            if not sane_graph(G) and not skip_sanity:
                # only a warning: the graph that was read is still returned
                print('Warning: Sanity test failed!')
                print('')
                graph_type = None
        except Exception:
            print('Graph read error.')
            raise

    if G is None and graph_type != 'AUTODETECT':
        raise Exception('Unable to load graphs of type '+str(graph_type))

    # 2. type guessed from the file extension
    extension_guess = os.path.splitext(path)[1][1:]
    if G is None and extension_guess in known_extensions:
        print('Attempting auto-detection of graph type.')
        if verbose:
            print('Warning: Trying to auto-detect graph type by extension')
        graph_type = known_extensions[extension_guess]
        if verbose:
            print('Guessing type: '+str(graph_type))
        try:
            G = loaders[graph_type](path=path)
            require_sane(G)
        except Exception as inst:
            print('Graph read error. This might be caused by malformed edge data or unicode errors.')
            print(inst)
            # discard a failed result so that the fallbacks below still run
            G = None

    # 3. parse the raw lines with the parser for the guessed type
    if G is None and graph_type in raw_loaders:
        if verbose:
            print('Trying raw read...')
        try:
            with open(path, 'rb') as f:
                lines = f.readlines()
            G = raw_loaders[graph_type](lines=lines)
            del lines
            require_sane(G)
        except Exception as inst:
            print('Graph read error:')
            print(inst)
            G = None

    # 4. last resort: try every loader
    if G is None:
        if verbose:
            print('Warning: Trying to guess graph type iteratively: this often FAILS')
        for graph_type in loaders:
            try:
                if verbose:
                    sys.stdout.write(graph_type + '? ')
                G = loaders[graph_type](path=path)
                if sane_graph(G) or skip_sanity:
                    if verbose:
                        print(' Yes!')
                        print('Successfully detected type: '+str(graph_type))
                    break
                else:
                    if verbose:
                        print(' No.')
                    G = None
                #wishlist: attempt edgelist before adjlist
            except Exception:
                # covers loaders that raise as well as loaders returning a non-graph object
                if verbose:
                    print(' No.')
                G = None

    if G is None:
        raise Exception('Could not load graph. None of the available loaders succeeded.')

    #postprocessing
    if graph_type == 'dot':
        G.name = os.path.split(path)[1] #otherwise the output is terrible
    try:
        if not hasattr(G, 'name') or G.name == '':
            G.name = os.path.split(path)[1]
    except Exception:
        # naming the graph is best-effort only
        pass

    return G
def make_gt_graph(nxG):
    '''
    Convert a NetworkX graph into a graph_tool graph.

    Returns a dict with the keys
        'G'            - the graph_tool graph (directed iff nxG is directed)
        'edge_weights' - "double" edge property holding each edge's 'weight'
                         attribute (1.0 when absent)
        'node_to_num'  - maps each NetworkX node to its graph_tool vertex index
    '''
    import graph_tool, graph_tool.topology
    node_to_num = {}
    for index, node in enumerate(nxG.nodes()):
        node_to_num[node] = index

    gtG = graph_tool.Graph(directed=nxG.is_directed())
    edge_weights = gtG.new_edge_property("double")
    gtG.add_vertex(nxG.number_of_nodes())
    for u, v, data in nxG.edges(data=True):
        new_edge = gtG.add_edge(node_to_num[u], node_to_num[v])
        edge_weights[new_edge] = data.get('weight', 1.0)

    return {'G':gtG, 'edge_weights':edge_weights, 'node_to_num':node_to_num}
def powerlaw_mle(G, xmin=6.):
    '''
    estimate the power law exponent based on Clauset et al., http://arxiv.org/abs/0706.1062,
    for simplicity, we avoid the MLE calculation of Eq. (3.5) and instead use the approximation of Eq. (3.7)

    Only nodes of degree >= xmin enter the estimate:
        1 + n / sum(ln(degree / (xmin - 0.5)))
    where n is the number of such nodes.  A warning is printed for xmin < 6.
    '''
    degseq = G.degree().values()
    if xmin < 6:
        print('Warning: the estimator uses an approximation which is not suitable for xmin < 6')
    shifted_xmin = xmin-0.5
    # summed in increasing order to reduce underflow
    log_ratios = sorted(np.log(xi/shifted_xmin) for xi in degseq if xi >= xmin)
    return 1. + len(log_ratios) / sum(log_ratios)
def read_adjlist_implicit(path, comments = '#', delimiter = None,
                          create_using = None, nodetype = int):
    """Read a graph from an adjacency-list file with IMPLICIT source nodes.

    The n-th counted line of the file lists the neighbors of node n, starting
    with node 1; the source node itself is not written.  For example, a file
    containing the three lines

        2 5
        3 4
        5

    produces the edges (1, 2), (1, 5), (2, 3), (2, 4), (3, 5).

    Everything from the comment marker to the end of a line is ignored.  A
    line that has nothing left after that (the marker is in the first column)
    is skipped and is not counted.  A blank line is counted and creates a
    node without neighbors.

    Parameters
    ----------
    path : string
       File to read
    comments : string, optional
       Marker for comment lines
    delimiter : string, optional
       Separator for node labels.  The default is whitespace.
    create_using: NetworkX graph container
       Use given NetworkX graph for holding nodes or edges.
       The graph is cleared first.
    nodetype : Python type, optional
       Convert nodes to this type.

    Returns
    -------
    G: NetworkX graph
        The graph corresponding to the lines in adjacency list format.

    Raises
    ------
    TypeError
        If create_using cannot be cleared, or a node cannot be converted to
        nodetype.

    See Also
    --------
    read_adjlist
    """
    if create_using is None:
        G = nx.Graph()
    else:
        try:
            G = create_using
            G.clear()
        except Exception:
            raise TypeError("Input graph is not a NetworkX graph type")

    with open(path, 'r') as adj_file:
        lines = adj_file.readlines()

    linenum = 1
    for line in lines:
        p = line.find(comments)
        if p >= 0:
            line = line[:p]
        if not len(line):
            # comment-only line: does not consume a node number
            continue
        vlist = line.strip().split(delimiter)
        # the source node is the running line number
        u = linenum
        if nodetype is not None:
            try:
                u = nodetype(linenum)
            except Exception:
                raise TypeError("Failed to convert node (%s) to type %s"\
                                %(u,nodetype))
        G.add_node(u)
        if nodetype is not None:
            try:
                # built eagerly so that a bad label is reported here
                vlist = [nodetype(v) for v in vlist]
            except Exception:
                raise TypeError("Failed to convert nodes (%s) to type %s"\
                                %(','.join(vlist),nodetype))
        G.add_edges_from([(u, v) for v in vlist])
        linenum += 1
    return G
def read_adjlist_implicit_prefix(path, comments = '#', create_using=None):
'''
reads network files formatted as:
"
15606 45878
2 3 6 7
1 4 6 9
"
and so on.
first line: num_nodes num_edges
second lines (and the rest of the lines in the file):
[implicit node = line number - 1] neighbor1 neighbor2 ...
empty lines are degree=0 nodes
'''
if create_using == None:
G = nx.Graph()
else:
G = create_using()
try:
with open(path, 'r') as file_handle:
header_data = file_handle.next().split(' ')
node_num = 1
for line in file_handle:
p=line.find(comments)
if p>=0:
line = line[:p]
if not len(line):
continue
line = line.strip()
if line == '':
G.add_node(node_num)
else:
G.add_edges_from([(node_num,int(v)) for v in line.split(' ')])
node_num += 1
except Exception,inst:
if 'node_num' not in locals():
raise