-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
269 lines (223 loc) · 10.4 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
'''Code for the program to perform the analysis and crime category prediction
Author: Prasanth Varikallu'''
#Packages needed
from tkinter import *
import pandas as pd,glob,math,random
from haversine import haversine
import geocoder
from string import capwords
import matplotlib.pyplot as plt
import numpy as np,seaborn as sns
#Fields to display in the GUI
fields = ('Zipcode', 'Hour of the day')
txt =""
#A dictionary holding the police district and their corresponding zip codes.
DistLatLon = {1:'19145',2:'19149',3:'19147',5:'19128',6:'19107',7:'19115',
8:'19154',9:'19130',12:'19142',14:'19144',15:'19149',16:'19104',
17:'19146',18:'19143',19:'19151',22:'19121',24:'19124',25:'19124',
26:'19125',35:'19141',39:'19140',77:'19153'}
'''The below steps are used to create a pandas data frame from multiple csv files
and parse date time columns in the created data frame.'''
frame = pd.DataFrame()
list_ = []
dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d')
allFiles = glob.glob("./Incidents/*.csv")
for file_ in allFiles:
df = pd.read_csv(file_,index_col=None, header=0, parse_dates=['DISPATCH_DATE'], date_parser=dateparse)
list_.append(df)
frame = pd.concat(list_)
'''In the below code, we operate on the data by selecting only the necessary columns
and also processing the date and time parts of the code'''
frame=frame[["DC_DIST","DISPATCH_TIME","TEXT_GENERAL_CODE","DISPATCH_DATE"]]
frame['DISPATCH_TIME'] = frame['DISPATCH_TIME'].str.extract('(\d+):\d+:\d+')
frame['DISPATCH_TIME'] = frame['DISPATCH_TIME'].astype(int)
frame['WEEKDAY'] = frame['DISPATCH_DATE'].dt.dayofweek
'''The function used below is used to create a bar graph.
It was taken from a script in kaggle.com'''
def plot_bar(df, title, filename):
grids = ['darkgrid', 'whitegrid', 'dark', 'white', 'ticks']
sns.set_style(random.choice(grids))#Setting the style for the plot display window
sns.despine()
p = (
'Set2', 'Paired', 'colorblind', 'husl', #This set is used for the random color generation in the bar plots
'Set1', 'coolwarm', 'RdYlGn', 'spectral'
)
vals = ['bar','barh']
bar = df.plot(kind=random.choice(vals),#Setting the bar plot characteristics
title=title,
fontsize=8,
figsize=(12,8),
stacked=False,
width=1,
color = sns.color_palette(np.random.choice(p), len(df)),
)
bar.figure.savefig(filename)#Save the figure locally.
plt.show()#Shows the bar plot in a window.
#The below function is used to set up the top crime data to be plotted into a graph.
def plot_top_crimes(df, column, title, fname, items=0):
by_col = df.groupby(column)
col_freq = by_col.size()#Creates a frequency count data format.
col_freq.index = col_freq.index.map(capwords)
col_freq.sort(ascending=True, inplace=True)#Sorting the column.
plot_bar(col_freq[slice(-1, - items, -1)], title, fname)#Calling the plot_bar function
'''The below functions do the same thing as the above function,
except that they give different statistics.
1. weekday_stats will give weekly crime distribution.
2. district_wise will give district wise counts of the crime categories
3. time_of_day will give hourly distribution of the categories.'''
def weekday_stats(df):
by_col = df.groupby("WEEKDAY")
col_freq = by_col.size()
col_freq.sort(ascending=True, inplace=True)
plot_bar(col_freq, "Daywise Counts", "weekday_stats.png")
def district_wise(df):
by_col = df.groupby("DC_DIST")
col_freq = by_col.size()
col_freq.sort(ascending=True, inplace=True)
plot_bar(col_freq, "Districtwise Counts", "district_stats.png")
def time_of_day(df):
by_col = df.groupby("DISPATCH_TIME")
col_freq = by_col.size()
plot_bar(col_freq, "Crime Counts based on time of day", "timeofday_stats.png")
def total_counts(df):
by_col = df.groupby("TEXT_GENERAL_CODE")
col_freq = by_col.size()
col_freq.sort(ascending=True, inplace=True)
plot_bar(col_freq, "Total Crime Counts", "Crime_stats.png")
#The below function gives the frequency counts of the crimes given a time and zipcode
def freq_counts(entries,frame):
district = 0
min_dis = 25.0
time = int(entries['Hour of the day'].get()) #The .get() method is used to fetch the values from the GUI text box
myzip = entries['Zipcode'].get()
zipcd = geocoder.google(myzip) #Converts the entered zip code to latitudes and longitudes.
for key in DistLatLon:
if min_dis == 0.0:
break
i = DistLatLon[key]
pzip = geocoder.google(i)
if (pzip.latlng):
dis = haversine(zipcd.latlng,pzip.latlng,miles = True)#Haversine function calculates the distance
#between two locations given their latitudes and longitudes.
if dis < min_dis:
min_dis = dis
district = key#Identifying which district the given zipcode falls under.
else: #The else block des the same thing as the if block. This is due a bug in geocoder library.
#Sometimes, it doesn't fetch the latitudes and lonitudes as expected.
pzip = geocoder.google(i)
dis = haversine(zipcd.latlng,pzip.latlng,miles = True)
if dis < min_dis:
min_dis = dis
district = key
#Filtering the data frames with the district and time given.
frame= frame[(frame["DC_DIST"]==district) & (frame["DISPATCH_TIME"]==time)]
frame=frame[["DC_DIST","DISPATCH_TIME","TEXT_GENERAL_CODE"]]
#Calling the plotting function to set it ready for plotting.
plot_top_crimes(frame, 'TEXT_GENERAL_CODE','Top Crime Categories','category.png')
#The below function applies the naive bayes classifier to the training data and predict the results.
def naive_bayes(entries,frame):
txt = ""
district = 0
min_dis = 25.0
docsCount = len(frame) #Counting the total number of rows.
#Creating the vocabulary.
vocabulary = pd.unique(frame["DC_DIST"]).tolist()
time_list = pd.unique(frame["DISPATCH_TIME"]).tolist()
vocabulary.extend(time_list)
vocabulary = sorted(set(vocabulary))
v_len = len(vocabulary)#Calculating the length of library.
#The .get() method is used to fetch the values from the GUI text box
time = int(entries['Hour of the day'].get())
myzip = entries['Zipcode'].get()
#Calculating the number of documents in each class
by_col = frame.groupby("TEXT_GENERAL_CODE")
col_freq = by_col.size().to_dict()
def COUNTTOKENSOFTERM(df, t): #Function to calculate number of occurances of a term in the documents related to the class
cnt = 0
cnt += len(df[df['DC_DIST']==t])
cnt += len(df[df['DISPATCH_TIME']==t])
return cnt
#Getting the classes as a list.
classes = list(col_freq.keys())
prior = {}#Dictionary to calculate prior class probability
condprob = []#List to hold conditional probility.
for cls in classes:
NinC = col_freq[cls]
prior[cls] = NinC/docsCount
denominator = col_freq[cls] * 2 + v_len
df = frame[frame['TEXT_GENERAL_CODE'] == cls]
for t in vocabulary:
Tct = COUNTTOKENSOFTERM(df[["DC_DIST","DISPATCH_TIME"]], t) #count of all occurances of a term in the class.
condprob.append((t,cls,(Tct+1)/denominator)) #Tct+1 indicates the laplace smoothing
#Converting zipcode to latitides and logitudes.
#The if and else staments do the same thing. They identify the police district for the given zipcode.
zipcd = geocoder.google(myzip)
for key in DistLatLon:
if min_dis == 0.0:
break
i = DistLatLon[key]
pzip = geocoder.google(i)
if (pzip.latlng):
dis = haversine(zipcd.latlng,pzip.latlng,miles = True)
if dis < min_dis:
min_dis = dis
district = key
else:
pzip = geocoder.google(i)
dis = haversine(zipcd.latlng,pzip.latlng,miles = True)
if dis < min_dis:
min_dis = dis
district = key
txt+="The entered zipcode falls under police district number: "+str(district)+"\n\n"
doc_rd = [district,time]
best_class_scores = {}
for c in prior: #loop to calculate for each class
best_class_scores[c] = math.log(prior[c])
for w in doc_rd: #loop to iterate through all words in the document and claculate the score
for tup in condprob:
if tup[0] == w and tup[1] == c: #matching the tuple and class to get the probability value
best_class_scores[c] += math.log(tup[2]) #log calculations are used to avoid floating point underflow
txt+="For the given location and time:\n\n"
for c in best_class_scores:
t1 = "Crime Category: "+c
t2 = "Score: "+str('{0:.2f}'.format(best_class_scores[c]))
txt+=t1.ljust(60,'-') + t2+"\n"
txt+="\n\nBEST MATCHED CATEGORY: "+str(max(best_class_scores, key=best_class_scores.get))+"\n\n"
msg.config(text = txt) #This function puts the output in the GUI window.
#This fuction is used to create the GUI for the project.
def makeform(root, fields):
entries = {}
for field in fields:
row = Frame(root)
lab = Label(row, width=25, text=field+": ", anchor='w')
ent = Entry(row)
ent.insert(0,"0")
row.pack(side=TOP, fill=X, padx=5, pady=5)
lab.pack(side=LEFT)
ent.pack(side=RIGHT, expand=YES, fill=X)
entries[field] = ent
return entries
#This is the main program.
if __name__ == '__main__':
root = Tk()
root.state('zoomed')
ents = makeform(root, fields)
#The buttons are declared below.
b = Button(root, text='Using Naive Bayes',
command=(lambda e=ents: naive_bayes(e,frame)))
b.pack()
b = Button(root, text='Frequency Counts',
command=(lambda e=ents: freq_counts(e,frame)))
b.pack()
b = Button(root, text='Weekday Counts',
command=(lambda e=ents: weekday_stats(frame)))
b.pack(side = LEFT)
b = Button(root, text='Districtwise Counts',
command=(lambda e=ents: district_wise(frame)))
b.pack(side = LEFT)
b = Button(root, text='Hourly Counts',
command=(lambda e=ents: time_of_day(frame)))
b.pack(side = LEFT)
msg = Message(root,relief = SUNKEN)
msg.pack(side = RIGHT,padx = 30)
root.mainloop()