/
UIDalgorithm.py
249 lines (228 loc) · 10.9 KB
/
UIDalgorithm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
def emcomps(n,k):
"""Returns the email probability for each composition of n emails among k people in a list"""
if n < 0 or k < 0:
return
elif k == 0:
# the empty sum, by convention, is zero, so only return something if
# n is zero
if n == 0:
yield []
return
elif k == 1:
yield emprob(n) #emprob is the probability that a person has n emails
return
else:
for i in range(0,n+1):
for comp in emcomps(n-i,k-1):
yield emprob(i)*comp
def cccomps(n,k):
"""Returns the card probability for each composition of n credit cards among k people in a list"""
if n < 0 or k < 0:
return
elif k == 0:
# the empty sum, by convention, is zero, so only return something if
# n is zero
if n == 0:
yield []
return
elif k == 1:
yield ccprob(n) #ccprob is the probability that a person has n credit cards
return
else:
for i in range(0,n+1):
for comp in cccomps(n-i,k-1):
yield ccprob(i)*comp
def emprob(n):
"""This returns the probability of one person having n email accounts
according to some data on the internet"""
#This numbers could and should be changed depending on the data
if n == 0:
return 0 #This is the probability that a person has 0 emails
if n == 1:
return 0.2417+.0668 #This is the probability that a person has 1 emails
if n == 2:
return 0.3829 #This is the probability that a person has 2 emails
if n == 3:
return 0.2417 #This is the probability that a person has 3 emails
if n == 4:
return 0.0606 #This is the probability that a person has 4 emails
if n >= 5:
return 0.0062 #This is the probability that a person has 5 emails
def ccprob(n):
"""This returns the probability of one person having n credit cards
according to some data on the internet"""
#This numbers could and should be changed depending on the data
if n == 0:
return 0.18 #This is the probability that a person has 0 credit/debit cards
if n == 1 or n == 2:
return 0.48 #This is the probability that a person has 1 or 2 credit/debit cards
if n == 3 or n == 4:
return 0.18 #This is the probability that a person has 3 or 4 credit/debit cards
if n == 5 or n == 6:
return 0.09 #This is the probability that a person has 5 or 6 credit/debit cards
if n >= 7:
return 0.07 #This is the probability that a person has 7+ credit/debit cards
def feas2(em,cc,log):
"""Returns the most feasible number of people associated with em emails
and cc credit cards"""
problist = [] # initialize the list of probabilities
for i in range(1,em+log+1): # i is the number of people we're splitting them among
emlist_i = list(emcomps(em+log,i)) # This gives the email probability for each composition into i parts in a list.
ccprob_i = sum(list(cccomps(cc,i))) # This gives the SUM of the card probabilities for all the compositions into i parts.
totprob_i = sum(ccprob_i*np.array(emlist_i)) # This multiplies each email probability by the sum of the card probabilities, then sums it all up
problist.append(totprob_i) # Throw it in the list
if i>1:
if problist[0]<sum(problist[2:]):
return 1
return 0 # plus 1 to get the actual number of people (since it starts at zero)
def GraphFeas(G):
"""Uses the feas2 function to compute the most probably number of people represented in the graph.
This is based on the number of emails, logins, and credit cards."""
#Count the number of 'email' nodes in the graph G
email_nodes=[n for n in G.nodes() if G.node[n]['type']=='email']
emails=len(email_nodes)
#Count the 'creditcard' nodes in the graph G
cc_nodes=[n for n in G.nodes() if G.node[n]['type']=='creditcard']
ccs=len(cc_nodes)
#Count the 'login' nodes in the graph G
login_nodes=[n for n in G.nodes() if G.node[n]['type']=='login']
logins=len(login_nodes)
return feas2(emails,ccs,logins)
def edges_to_merge(G):
"""This function decides how the graph G should be merged before cutting.
It does this by looking at each cookie and credit card node and merging it
with the email or login node it is connected to with the highest weight.
If there is a tie is grabs the first in the list of its neighbors that are
an email or a login."""
cook_cc_nodes=[]
merge_edges=[]
#First grab all of the cookie or credit card nodes
for n0 in G.nodes():
if G.node[n0]['type']=='cookie' or G.node[n0]['type']=='creditcard':
cook_cc_nodes=cook_cc_nodes+[n0]
#Now we look at the neighbors that are a login or email node of each cookie/credit card node.
#Then grab the edge with the highest weight.
for N in cook_cc_nodes:
L=list(nx.all_neighbors(G,N))
P=[]
#This will make P a list of the neighbors that are logins or emails
for n1 in L:
if G.node[n1]['type']=='login' or G.node[n1]['type']=='email':
P=P+[n1]
#Grab and store, in a list W, the edge data from original node and its neighbors from P
W=[]
for n2 in P:
b=G.get_edge_data(N,n2)
W=W+[b]
#w will be the weight of the edges that have been stored in W
w=[]
for i in range(0,len(W)):
w=w+[W[i]['weight']]
i=i+1
m=max(w)
#enumerate w and store the exact position the maximum weight occurs
#position=[k for k, j in enumerate(w) if j == m]
position=[]
for k,j in enumerate(w):
if j==m:
position=position+[k]
#grab the node from P that is in the position stored in position. This is the node it should merge with.
#maxnhbr=[(P[pos]) for pos in position]
maxnhbr=[P[position[0]]]
#Now store the edges the need to be merged
for n3 in maxnhbr:
merge_edges=merge_edges+[(N,n3)]
return merge_edges
def merge(G,edge):
"""Returns a new graph with edge merged, and the new node containing the
information lost in the merge. The weights from common neighbors
are added together, a la Stoer-Wagner algorithm."""
if G.node[edge[1]]['type'] == 'login' or G.node[edge[1]]['type'] == 'email':
edge = edge[::-1] # If login/email is on the right, flip the order, so it's always on the left
nx.set_node_attributes(G,'list',{edge[0]:G.node[edge[0]]['list']+[edge[1]]})
J = nx.contracted_edge(G,(edge[0],edge[1]),self_loops = False) #Contract edge without self-loop
# Weight stuff
N = nx.common_neighbors(G,edge[0],edge[1]) #find common nodes
for i in N:
J[i][edge[0]]['weight'] = G[edge[0]][i]['weight'] + G[edge[1]][i]['weight'] #modify the weight after contraction
return J
def drawnice(G):
"""Much better than nx.draw"""
posG=nx.spring_layout(G,k=0.7)
labelsG = nx.get_edge_attributes(G,'weight')
nx.draw_networkx_nodes(G,posG,node_size=800)
nx.draw_networkx_labels(G,posG,font_size=8,font_family='sans-serif')
nx.draw_networkx_edges(G,posG,width=1)
nx.draw_networkx_edge_labels(G,posG,edge_labels=labelsG,fontsize='1')
def target_cut(G):
"""Returns a list of lists each of which contain nodes that correspond to unique people represented
in the graph G."""
#First separate the graph into its connected components
graphs = list(nx.connected_component_subgraphs(G))
after_cut = []
#For each connected component we decide whether or the graph should be cut, based
#on our feas2 function. Then the component is merged and finally cut.
for sub_G in graphs:
#If this returns 1 or True then the component should be cut
if_cut = GraphFeas(sub_G)
#In the case the component is not cut then store the nodes associated
#to that component and move on.
if if_cut == 0:
after_cut = after_cut + [sub_G.nodes()]
#Other wise we merge and cut the component
else:
#First we look at all the edges that should be merged via the
#edges_to_merge function
edges = edges_to_merge(sub_G)
A = sub_G
#Then the edges are merged via the merge function
for Eg in edges:
A = merge(A,Eg)
#After all of the merging is done the graph is cut along a minimum
#cut into two connected components via stoer_wagner
cut_val, cuts = nx.stoer_wagner(A)
#This is a small note that would take the weights into consideration
#before cutting. Thus eliminating possible false cuts on high weights.
# if cut_val>=some reasonably big number:
# after_cut=after_cut+A.nodes()
# else:
# Indent the following code to be in this else statement
nodelist1 = []
#The nodes for each component after cutting are stored
for nd in cuts[0]:
#To get the nodes in these components we must look "inside" at the
#list attribute of the merged nodes
nodelist1 = nodelist1 + A.node[nd]['list']
nodelist1 = nodelist1 + [nd]
nodelist2 = []
#Do the samething for the second component
for nd in cuts[1]:
nodelist2 = nodelist2 + A.node[nd]['list']
nodelist2 = nodelist2 + [nd]
#Turn the node lists into subgraphs so the target_cut can be
#reinitiated on them until they should not be cut anymore.
sub1 = sub_G.subgraph(nodelist1)
sub2 = sub_G.subgraph(nodelist2)
after_cut = after_cut + target_cut(sub1)
after_cut = after_cut + target_cut(sub2)
return after_cut
########################################################################################################
##################### THE ABOVE FUNCTIONS ARE USED IN CODE BELOW ############################
def graph_construction(G):
"""Returns a list of subgraphs of G found by the target_cut function."""
#Run target_cut on the graph G to generate a list of lists which contains the nodes that should
#coorespond to subgraphs of G that represent unique people.
A = target_cut(G)
graph_list = []
#From the list of nodes in A we recreate the subgraphs in G that contain these nodes
for lst in A:
graph_list = graph_list + [G.subgraph(lst)]
return graph_list
def graph_the_components(graph_list):
"""This function graphs all the components constructed by graph_construction"""
for g in graph_list:
plt.figure()
drawnice(g)