forked from manelone/Walk-This-Way
-
Notifications
You must be signed in to change notification settings - Fork 0
/
edge_connects.py
292 lines (238 loc) · 9.96 KB
/
edge_connects.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
from pandas import pandas as pd
import collections, math, random, sys, time, datetime
from copy import deepcopy
# Weight given to each crime category when scoring streets and hotspots.
CRIME_TYPE_WEIGHTS = {
    'ROBBERY': 5,
    'SEX OFFENSES, FORCIBLE': 6,
    'DRUG/NARCOTIC': 2,
    'KIDNAPPING': 7,
    'SEX OFFENSES, NON FORCIBLE': 3,
    'ASSAULT': 9,
}

# Number of regions (clusters) the city is divided into by k-means.
NUM_REGIONS = 10

# Input tables, loaded once when the module is imported.
edges = pd.read_csv("trimmed_edges.csv")
# Alternative crime table, currently disabled:
#crimes = pd.read_csv("crimes_with_streets.csv")
crimes = pd.read_csv("mini_crimes_set.csv")
testCrimes = pd.read_csv("test_crime_data.csv")

# Maps each EdgeID to its CrimeStreet object (filled in by estStreets).
streets = {}

# Maps each CrimeStreet to a list of (crime type, datetime) tuples read from
# testCrimes (filled in by readKnownCrimes).
knownCrimes = {}
def getDistance(a, b):
    """Return the squared Euclidean distance between 2-D points a and b.

    No square root is taken, so the result is only a true distance after
    math.sqrt(); it is still valid for comparing which point is closer.
    """
    deltaFirst = a[0] - b[0]
    deltaSecond = a[1] - b[1]
    return deltaFirst * deltaFirst + deltaSecond * deltaSecond
def kmeans(crimes, K, maxIters):
    '''
    Cluster crime locations into K groups with k-means.

    crimes: list of (location, extra) pairs where location is a (lat, long)
        tuple. Only the location (element 0) is used for clustering.
    K: number of desired clusters. Must satisfy 0 < K <= len(crimes).
    maxIters: maximum number of assign/update rounds to run; the loop stops
        early once the assignments no longer change.
    Return: (length K list of cluster centroids,
            list of assignments, i.e. if crimes[i] belongs to centroids[j],
            then assignments[i] = j)
    Raises: ValueError if K is outside the range 0 < K <= len(crimes).
    '''
    numCrimes = len(crimes)
    if not 0 < K <= numCrimes:
        raise ValueError('K must satisfy 0 < K <= len(crimes); got K=%r for %d crimes'
                         % (K, numCrimes))

    #returns the index of the closest centroid to the given crime's location
    def closestCentroid(centroids, example):
        # float('inf') works on both Python 2 and 3 (sys.maxint is Python 2 only)
        minDistance = float('inf')
        closest = 0
        for i in range(K):
            dist = getDistance(centroids[i], example[0])
            if dist < minDistance:
                closest = i
                minDistance = dist
        return closest

    #initialize centroids deterministically (instead of random.sample) so
    #that repeated runs give the same clusters. The fixed rows below are the
    #historical seeds; when the data set is too small for them, or K is
    #larger than the seed list, fall back to evenly spaced rows.
    fixedSeeds = [1, 10, 20, 21, 90, 101, 8, 33, 12, 87]
    if K <= len(fixedSeeds) and max(fixedSeeds[:K]) < numCrimes:
        seedIndices = fixedSeeds[:K]
    else:
        seedIndices = [(i * numCrimes) // K for i in range(K)]
    centroids = [deepcopy(crimes[idx][0]) for idx in seedIndices]
    assignments = [0] * numCrimes

    #run k-means clustering
    for iteration in range(maxIters):
        #update assignments
        newAssignments = [closestCentroid(centroids, crime) for crime in crimes]
        #check for convergence; never on the first round, because the
        #initial all-zero assignments are only a placeholder
        converged = iteration > 0 and newAssignments == assignments
        assignments = newAssignments
        if converged:
            break
        #accumulate coordinate sums and member counts per cluster in one pass
        sums = [(0, 0)] * K
        counts = [0] * K
        for cluster, crime in zip(assignments, crimes):
            loc = crime[0]
            sums[cluster] = (sums[cluster][0] + loc[0], sums[cluster][1] + loc[1])
            counts[cluster] += 1
        #move each centroid to the mean of its members; a cluster that lost
        #all its members keeps its previous centroid instead of jumping to (0,0)
        for k in range(K):
            if counts[k] != 0:
                size = float(counts[k])
                centroids[k] = (sums[k][0] / size, sums[k][1] / size)
    return (centroids, assignments)
def inRange(start, end, timeString):
    """Return True when timeString falls within [start, end], inclusive.

    start, end: datetime.datetime bounds.
    timeString: text in the form "<weekday name>,<mm/dd/yy>,<HH:MM>",
        e.g. "Monday,01/05/15,12:30"; it is interpreted as local time.
    """
    parsed = time.strptime(timeString, "%A,%m/%d/%y,%H:%M")
    moment = datetime.datetime.fromtimestamp(time.mktime(parsed))
    return start <= moment <= end
class CrimeStreet():
    """A street segment (graph edge) together with the crimes recorded on it.

    Attributes:
        edgeID: identifier of the edge.
        start, end: (lat, long) tuples of the two end points.
        st_length: length of the street as given by the caller.
        crimes: Counter mapping crime type -> accumulated weight.
        crimeList: list of (crime type, time string) tuples added so far.
        regionCrimeScore: score contributed by nearby crime hotspots.
        numCrimes: number of crimes added via addCrime.
        streetCrimeScore: sum of the weights in `crimes`, refreshed by
            getCrimeScore.
    """

    # Historical years averaged by getTimedCrimeScore.
    HISTORY_YEARS = range(2003, 2014)
    # Whole weeks used to shift a timestamp by one year; shifting by weeks
    # keeps the day of the week unchanged. (The code previously used 72,
    # which does not correspond to a year.)
    WEEKS_PER_YEAR = 52
    # Lower bound on a hotspot distance so a hotspot lying exactly on the
    # street cannot cause a division by zero in setRegionScore.
    MIN_HOTSPOT_DISTANCE = 1e-9

    def __init__(self, edgeID, start, end, length):
        self.edgeID = edgeID
        self.start = start
        self.end = end
        self.st_length = length
        self.crimes = collections.Counter()
        self.crimeList = []
        self.regionCrimeScore = 0
        self.numCrimes = 0
        # defined up front so the attribute exists before getCrimeScore runs
        self.streetCrimeScore = 0

    #regionScore is the sum over all hotspots of (hotspot crime score /
    #distance of the street from that hotspot), divided by NUM_REGIONS^2
    #regionCrimeScores is a dictionary of hotspot locations to their scores
    def setRegionScore(self, regionCrimeScores):
        regionCrimeScore = 0
        for centroid in regionCrimeScores:
            dist = max(self.distFromStreet(centroid), self.MIN_HOTSPOT_DISTANCE)
            regionCrimeScore += regionCrimeScores[centroid] / dist
        self.regionCrimeScore = regionCrimeScore * 1.0 / (NUM_REGIONS ** 2)

    #getTimedCrimeScore returns the regional crime score plus the weighted
    #crimes on this street that fall in the window [t - 30 min, t + 60 min],
    #where t is startTime shifted to each historical year; the weighted sum
    #is averaged over the number of historical years
    def getTimedCrimeScore(self, startTime):
        crimeScore = 0
        for year in self.HISTORY_YEARS:
            numWeeks = (startTime.year - year) * self.WEEKS_PER_YEAR
            standardStartTime = startTime - datetime.timedelta(weeks=numWeeks)
            start = standardStartTime - datetime.timedelta(minutes=30)
            end = standardStartTime + datetime.timedelta(minutes=60)
            # accumulate across years; resetting here would discard every
            # year except the last one
            for crime in self.crimeList:
                if inRange(start, end, crime[1]):
                    crimeScore += CRIME_TYPE_WEIGHTS[crime[0]]
        crimeScore /= 1.0 * len(self.HISTORY_YEARS)
        return crimeScore + self.regionCrimeScore

    #returns the summed weights of the known crimes (from knownCrimes) on this
    #street that happened within [startTime - 30 min, startTime + 60 min];
    #each matching crime is also printed
    def knownCrimeScore(self, startTime):
        crimeScore = 0
        start = startTime - datetime.timedelta(minutes=30)
        end = startTime + datetime.timedelta(minutes=60)
        # a single dictionary lookup; streets without known crimes score 0
        for crime in knownCrimes.get(self, []):
            crimeType = crime[0]
            crimeTime = crime[1]
            if start <= crimeTime <= end:
                print(crime)
                crimeScore += CRIME_TYPE_WEIGHTS[crimeType]
        return crimeScore

    #returns regional crime score
    def getregionCrimeScore(self):
        return self.regionCrimeScore

    #returns the sum of the regional crime score and the sum of all crimes
    #(weighted by type) that have ever occurred on that street; a street with
    #no recorded crimes scores 0 regardless of its regional score
    def getCrimeScore(self):
        if self.numCrimes == 0:
            return 0
        self.streetCrimeScore = sum(self.crimes.values())
        return self.streetCrimeScore + self.regionCrimeScore

    #adds a given crime to the crimes counter, the crimes list, and increments
    #the total number of crimes seen on this street
    #crimeOccurence is a (crime type, time string) tuple
    def addCrime(self, crimeOccurence):
        self.numCrimes += 1
        self.crimes[crimeOccurence[0]] += CRIME_TYPE_WEIGHTS[crimeOccurence[0]]
        self.crimeList.append(crimeOccurence)

    #calculates the distance from a given location to the street, i.e. to the
    #closest point of the segment between self.start and self.end
    def distFromStreet(self, loc):
        startX, startY = self.start[0], self.start[1]
        deltaX = float(self.end[0] - startX)
        deltaY = float(self.end[1] - startY)
        segLenSq = deltaX * deltaX + deltaY * deltaY
        if segLenSq == 0:
            # start and end coincide: the street is a single point
            return math.sqrt((loc[0] - startX) ** 2 + (loc[1] - startY) ** 2)
        # position of the perpendicular foot along the segment (0 = start,
        # 1 = end); this form needs no slope, so vertical and horizontal
        # streets do not divide by zero
        t = ((loc[0] - startX) * deltaX + (loc[1] - startY) * deltaY) / segLenSq
        # a foot outside the segment means the nearest point is an end point
        t = max(0.0, min(1.0, t))
        footX = startX + t * deltaX
        footY = startY + t * deltaY
        return math.sqrt((footX - loc[0]) ** 2 + (footY - loc[1]) ** 2)
def estStreets():
    """Populate the module-level `streets` dictionary and score every street.

    Steps:
      1. create one CrimeStreet per row of `edges`;
      2. attach every row of `crimes` to its matching street;
      3. cluster the crime locations into NUM_REGIONS hotspots with kmeans;
      4. score each hotspot and derive every street's regional score.
    Returns nothing; results are stored in `streets`.
    """
    # NOTE(review): eval() runs whatever expression the CSV cell contains.
    # It is only acceptable while the CSV files are trusted; consider
    # ast.literal_eval if they can come from elsewhere.
    for _, row in edges.iterrows():
        street = CrimeStreet(row['EdgeID'], eval(row['startCoords']),
                             eval(row['endCoords']), float(row['distance']))
        streets[row['EdgeID']] = street
    print('established streets as CrimeStreet vars')

    # (location, category) pairs fed to k-means
    crimesList = []
    for _, record in crimes.iterrows():
        edgeID = record['StreetMatch']
        timeString = record['DayOfWeek'] + ',' + record['Date'] + ',' + record['Time']
        streets[edgeID].addCrime((record['Category'], timeString))
        crimesList.append((eval(record['Location']), record['Category']))
    print('added crimes to streets and established crimesList for k-means clustering')

    hotspots, assignments = kmeans(crimesList, NUM_REGIONS, 10)
    print('established 10 crime hotspot assignments using k-means clustering')

    # each crime adds its weight, damped by its (squared) distance from the
    # hotspot it was assigned to
    hotspotCrimeScores = collections.Counter()
    for clusterIndex, (crimeLoc, category) in zip(assignments, crimesList):
        hotspot = hotspots[clusterIndex]
        hotspotCrimeScores[hotspot] += (CRIME_TYPE_WEIGHTS[category] / (getDistance(hotspot, crimeLoc) + 1))

    # average each hotspot's score over the number of crimes assigned to it
    clusterSizes = collections.Counter(assignments)
    for clusterIndex, hotspot in enumerate(hotspots):
        size = clusterSizes[clusterIndex]
        if size > 0:
            hotspotCrimeScores[hotspot] /= (size * 1.0)
    print('updated hotspot scores')

    for street in streets.values():
        street.setRegionScore(hotspotCrimeScores)
        street.getCrimeScore()
    print('updated crimeRegionScore for each crimeStreet')
def nodeDict():
    """Build a dictionary from intersections to the streets that touch them.

    Calls estStreets() first and readKnownCrimes() last, so the module-level
    `streets` and `knownCrimes` dictionaries are filled in as a side effect.

    Return: dict mapping each end-point coordinate tuple to the set of
        CrimeStreet objects that start or end there. Streets with an end
        point outside the hard-coded bounding box are left out.
    """
    def outOfBounds(coords):
        # coords[1] is compared against the longitude-like limits and
        # coords[0] against the latitude-like limits
        return (coords[1] > -121.888 or coords[1] < -122.729 or
                coords[0] > 38.5 or coords[0] < 37.5)

    edge_dict = {}
    estStreets()
    for street in streets.values():
        if outOfBounds(street.start) or outOfBounds(street.end):
            continue
        for node in (street.start, street.end):
            edge_dict.setdefault(node, set()).add(street)
    readKnownCrimes()
    return edge_dict
def readKnownCrimes():
    """Fill the module-level `knownCrimes` dictionary from `testCrimes`.

    Each row is looked up in `streets` by its 'StreetMatch' value and stored
    as a (crime type, datetime) tuple in that street's list:
    {CrimeStreet: [(type, datetime), ...]}. Returns nothing.
    """
    for _, record in testCrimes.iterrows():
        edgeID = record['StreetMatch']
        timeString = record['DayOfWeek'] + ',' + record['Date'] + ',' + record['Time']
        street = streets[edgeID]
        # create the street's list on first sight, then reuse it
        occurrences = knownCrimes.setdefault(street, [])
        parsed = time.strptime(timeString, "%A,%m/%d/%y,%H:%M")
        occurredAt = datetime.datetime.fromtimestamp(time.mktime(parsed))
        occurrences.append((record['Category'], occurredAt))
    print('finished reading crime_test_data')
#edge_dict = nodeDict()
#knownCrimes = readKnownCrimes()
#print sum(1.0*len(edge_dict[node]) for node in edge_dict) / len(edge_dict.keys())
# streets = estStreets()
# for st in streets:
# print streets[st].crimes