-
Notifications
You must be signed in to change notification settings - Fork 0
/
dwd_project.py
364 lines (266 loc) · 14.2 KB
/
dwd_project.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
# -*- coding: utf-8 -*-
"""DWD - Project.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1ZNmatGNUC3YOea0LfxZqAG3ea3ahyQEs
"""
import pandas as pd
import numpy as np
from google.colab import files

# --- NYC hospital extraction -------------------------------------------------
# Load all NYS health facilities (CSV uploaded from our Google Drive) and
# reduce them to a clean table of NYC hospitals: name, address, geo columns.
data = pd.read_csv('Health_Facility_General_Information.csv')

# Keep only the columns needed downstream.
clean_data = data[['Facility Name', 'Description', 'Facility Address 1',
                   'Facility City', 'Facility Zip Code', 'Facility County',
                   'Facility Latitude', 'Facility Longitude', 'Facility Location']]

# Restrict to facilities described as hospitals.
hospital_data = clean_data[clean_data['Description'].isin(['Hospital'])]

# Restrict to the five NYC boroughs' counties.
bronx_county = hospital_data[hospital_data['Facility County'].str.contains('Bronx')]
kings_county = hospital_data[hospital_data['Facility County'].str.contains('Kings')]
new_york_county = hospital_data[hospital_data['Facility County'].str.contains('New York')]
queens_county = hospital_data[hospital_data['Facility County'].str.contains('Queens')]
richmond_county = hospital_data[hospital_data['Facility County'].str.contains('Richmond')]
nyc_counties = [bronx_county, kings_county, new_york_county, queens_county, richmond_county]
county_data = pd.concat(nyc_counties)

# Minor touch-ups.
county_data = county_data.drop_duplicates(['Facility Name'], keep='first')   # one row per hospital
county_data.reset_index(drop=True, inplace=True)                             # drop the stale index
del county_data['Description']                                               # constant column ('Hospital') now
county_data['Facility Zip Code'] = county_data['Facility Zip Code'].str[:5]  # normalize to 5-digit zips

# Rename columns to their final, human-friendly headers.
final_data = county_data.rename(columns={'Facility Name': 'Hospital',
                                         'Facility Address 1': 'Address',
                                         'Facility City': 'City',
                                         'Facility Zip Code': 'Zip Code',
                                         'Facility County': 'County'})

# BUG FIX: the index was reset above precisely to avoid an extra number
# column, but to_csv wrote the index anyway; index=False keeps the file clean.
final_data.to_csv('nyc_hospitals.csv', index=False)
files.download('nyc_hospitals.csv')
"""# Determining Postive COVID-19 Cases"""
from google.colab import drive
drive.mount('/content/drive')
import re
from re import search
import pandas as pd
import sys
from google.colab import files
if sys.version_info[0] < 3:
from StringIO import StringIO
else:
from io import StringIO
import requests
url = 'https://api.github.com/repos/nychealth/coronavirus-data/commits'
html = requests.get(url).json()
today = html[0]['sha']
yest= html[0]['parents'][0]['sha']
##
today_url= ('https://raw.githubusercontent.com/nychealth/coronavirus-data/'+today+'/tests-by-zcta.csv')
yest_url= ('https://raw.githubusercontent.com/nychealth/coronavirus-data/'+yest+'/tests-by-zcta.csv')
today_html = StringIO(requests.get(today_url).text)
yest_html = StringIO(requests.get(yest_url).text)
#convert into dataframe and export
cases_data_t = pd.read_csv(today_html, sep=",")
cases_data_t.columns = ["zipcode", "t_positive_cases", "total_cases", "cumulative_perc_positive"]
cases_data_t.drop(index=[0]) #remove row because zipcode= NA
cases_data_y = pd.read_csv(yest_html, sep=",")
cases_data_y.columns = ["zipcode", "y_positive_cases", "total_cases", "cumulative_perc_positive"]
cases_data_y.drop(index=[0]) #remove row because zipcode= NA
new_cases = cases_data_t['t_positive_cases']- cases_data_y['y_positive_cases']
zipcodes= cases_data_t['zipcode']
total_cases= cases_data_t['total_cases']
cumulative_perc_positive= cases_data_t['cumulative_perc_positive']
final_cases_data = pd.concat([zipcodes, new_cases, total_cases, cumulative_perc_positive], axis=1, sort=True)
final_cases_data.columns = ["zipcode", "positive_cases", "total_cases", "cumulative_perc_positive"]
final_cases_data.drop(index=[0]).to_csv('covid_cases_nyc.csv', index= False) #remove row because zipcode= NA
files.download('covid_cases_nyc.csv')
"""# Merging Datasets"""
# This section takes a dataframe with zipcodes and appends on the
# neighborhood and borough as additional columns
# Data is sourced from below and imported as simple csv
# https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoods.htm
import pandas as pd
def map_zipcode(input_df, key_column, join_type):
    """Append NYC 'Neighborhood' and 'Borough' columns to *input_df* by zipcode.

    Parameters
    ----------
    input_df : pd.DataFrame
        Frame containing a column of NYC zip codes (as strings).
    key_column : str
        Name of the zip-code column in *input_df*.
    join_type : str
        Merge strategy passed straight to ``pd.merge`` (e.g. 'left', 'right').

    Returns
    -------
    pd.DataFrame
        *input_df* merged against the zipcode lookup, gaining the
        'Zipcode_new', 'Neighborhood', and 'Borough' columns.
    """
    # Source table (one row per neighborhood, comma-separated zip list),
    # sourced from:
    # https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoods.htm
    raw = pd.read_csv("nyc_zipcode_map.csv")
    raw['ZIP Codes'] = raw['ZIP Codes'].str.strip()
    raw['ZIP Codes'] = raw['ZIP Codes'].str.replace(" ", "", regex=False)
    raw['ZIP Codes List'] = raw['ZIP Codes'].str.split(",")
    # Build one row per (zipcode, neighborhood, borough). Collecting dicts
    # and constructing the frame once replaces the former per-row
    # DataFrame.append() loop, which was quadratic and was removed entirely
    # in pandas >= 2.0.
    rows = []
    for index in raw.index.tolist():
        for zipcode in raw.loc[index, 'ZIP Codes List']:
            rows.append({'Zipcode_new': zipcode,
                         'Neighborhood': raw.loc[index, 'Neighborhood'],
                         'Borough': raw.loc[index, 'Borough']})
    clean_zipcode_map = pd.DataFrame(rows, columns=['Zipcode_new', 'Neighborhood', 'Borough'])
    # Merge the clean lookup onto the caller's data.
    # NOTE(review): zip code 11040 is absent from the source table, so rows
    # carrying it will not match — confirm whether it needs a manual entry.
    df_new = pd.merge(input_df, clean_zipcode_map,
                      left_on=key_column, right_on='Zipcode_new', how=join_type)
    return df_new
def calc_drivers_needed(input_df, driver_hrs_per_day, driver_passengers_per_trip, neighborhood_time_to_hospital_in_minutes):
    """Attach a 'Drivers_Needed' column estimating drivers per neighborhood.

    Assumes one round trip per positive case (hence the factor of 2) and a
    capacity discount when a driver carries more than one passenger.
    """
    # With multi-passenger trips a driver won't hit 100% uptime, so discount
    # the effective throughput.
    fudge_factor = 0.65 if driver_passengers_per_trip > 1 else 1
    minutes_available = driver_hrs_per_day * 60
    round_trip_minutes = neighborhood_time_to_hospital_in_minutes * 2
    demand_minutes = input_df['positive_cases'] * fudge_factor * round_trip_minutes
    # NOTE(review): int() truncates toward zero — rounding up (ceil) may be
    # what was intended for a "needed" count; confirm.
    input_df['Drivers_Needed'] = (demand_minutes / minutes_available).apply(int)
    return input_df
# --- Merge cases with hospitals at neighborhood level ------------------------
# Carry over cases_data from the first part of the project, clean it, and map
# zipcodes to neighborhoods.
# BUG FIX: the original plain assignment aliased final_cases_data, so the
# column assignment below renamed the original frame's columns too; work on
# a copy instead.
test_case_data = final_cases_data.copy()
test_case_data.columns = ["zipcode", "positive_cases", "total_cases", "cumulative_perc_positive"]
test_case_data = test_case_data.drop(index=[0])  # first row has zipcode == NA
test_case_data['zipcode'] = test_case_data['zipcode'].astype(int).apply(str)
zipcodes_with_neighborhoods = map_zipcode(test_case_data, 'zipcode', 'right')

# Carry over hospital data from above and map its zipcodes the same way.
test_hospital_data = final_data
hospitals_with_neighborhoods = map_zipcode(final_data, 'Zip Code', 'left')

# Aggregate case counts with neighborhood as the key.
neighborhood_cases = zipcodes_with_neighborhoods.groupby('Neighborhood').agg({'positive_cases': 'sum', 'total_cases': 'sum'})
neighborhood_cases = neighborhood_cases.reset_index()
neighborhood_cases['cumulative_perc_positive'] = round((neighborhood_cases['positive_cases'].apply(float) / neighborhood_cases['total_cases'].apply(float)), 4)
neighborhood_cases['cumulative_perc_positive'] = neighborhood_cases['cumulative_perc_positive'].apply(str)
neighborhood_cases['cumulative_perc_positive'] = neighborhood_cases['cumulative_perc_positive'].replace('nan', '0')  # 0/0 divisions

# Reduce hospital data to one row per neighborhood with its hospital list.
neighborhood_hospitals = hospitals_with_neighborhoods.groupby('Neighborhood')
neighborhood_hospitals = neighborhood_hospitals.apply(lambda x: x['Hospital'].unique()).reset_index(name='Hospital')

# Outer merge so neighborhoods with cases but no hospital (and vice versa)
# are all retained.
neighborhood_aggregate = pd.merge(neighborhood_cases, neighborhood_hospitals,
                                  left_on='Neighborhood', right_on='Neighborhood', how='outer')

# Estimate drivers needed: 8 driver-hours/day, 1 passenger/trip, 10 min to
# hospital per neighborhood.
neighborhood_aggregate = calc_drivers_needed(neighborhood_aggregate, 8, 1, 10)

# Export (comment out to skip downloading).
neighborhood_aggregate.to_csv('neighborhood_aggregate.csv')
files.download('neighborhood_aggregate.csv')
"""# Creating Map to Display Recommendations"""
# Commented out IPython magic to ensure Python compatibility.
# # Install fiona, geopandas, earthpy - need to comment out for transfer to live site.
# # Turn on for running in a notebook
# %%capture
# #!pip install fiona
# #!pip install geopandas
# !pip install earthpy
# Import libraries
import pandas as pd
import numpy as np
import math
import os
import matplotlib.pyplot as plt
import geopandas as gpd
import earthpy as et
import json
from bokeh.io import output_notebook, show, output_file
from bokeh.plotting import figure
from bokeh.models import GeoJSONDataSource, LinearColorMapper, ColorBar, NumeralTickFormatter
from bokeh.palettes import brewer
from bokeh.io.doc import curdoc
from bokeh.models import Slider, HoverTool, Select
from bokeh.layouts import widgetbox, row, column
# --- Load the neighborhood aggregate and the NYC geometry --------------------
# Load NYC COVID-19 cases produced by the merge step above.
nyc_data = pd.read_csv('neighborhood_aggregate.csv')
# Check the data (these bare `.head()` expressions only display output in a
# notebook; in a plain script they are no-ops).
print(nyc_data.shape)
nyc_data.head(5)
# Coerce types: counts to int, hospital list to pandas' string dtype.
nyc_data = nyc_data.astype({'positive_cases': 'int'})
nyc_data = nyc_data.astype({'Drivers_Needed': 'int'})
nyc_data = nyc_data.astype({'Hospital': 'string'})
nyc_data.head()
# Read the geojson map file for Realtor Neighborhoods into a GeoDataFrame.
nyc = gpd.read_file('nyc_neighborhoods.geojson')
# Set the Coordinate Reference System (CRS) for projections.
# EPSG code 4326 is the WGS84 lat-long projection.
# NOTE(review): the {'init': ...} dict form is deprecated in modern
# geopandas/pyproj — `nyc = nyc.set_crs('EPSG:4326', allow_override=True)`
# is the current API; confirm the installed geopandas version before changing.
nyc.crs = {'init': 'epsg:4326'}
# Keep only the columns needed for plotting.
nyc = nyc[['PO_NAME', 'COUNTY', 'SHAPE_LEN', 'geometry']]
# Rename PO_NAME to Neighborhood to match nyc_data's key column.
nyc = nyc.rename(columns={'geometry': 'geometry','PO_NAME':'Neighborhood'}).set_geometry('geometry')
# Fix two neighborhood names truncated in the geojson source so they join
# against the case data.
nyc['Neighborhood']= nyc['Neighborhood'].str.replace('Inwood and Washington Height', 'Inwood and Washington Heights')
nyc['Neighborhood']= nyc['Neighborhood'].str.replace('Gramercy Park and Murray Hil', 'Gramercy Park and Murray Hill')
nyc.head(50)
# Quick sanity plot of the geometry (colored by an arbitrary column).
nyc.plot(column = 'SHAPE_LEN')
#plt.show()
# Per-field plot formatting: (field, color range min/max, tick format, label).
format_data = [('positive_cases', 0, 210,'0,0', 'Number of Positive COVID-19 Cases'),
               ('Drivers_Needed', 0, 10,'0,0', 'Number of Drivers Needed')
              ]
# Create a DataFrame object from the list of tuples.
format_df = pd.DataFrame(format_data, columns = ['field' , 'min_range', 'max_range' , 'format', 'verbage'])
format_df.head(10)
# Build the GeoJSON payload consumed by the Bokeh GeoJSONDataSource.
from google.colab import files
def json_data(positiveCases):
    """Return the NYC neighborhood map merged with case data as a GeoJSON string.

    The ``positiveCases`` argument is kept for compatibility with existing
    callers but is unused: the payload is always built from the full
    module-level ``nyc_data`` table. (The original also filtered ``nyc_data``
    into an unused local — that dead code is removed.)
    """
    # Merge the GeoDataFrame (nyc) with the neighborhood summary (nyc_data);
    # left join keeps every polygon even when it has no case data.
    merged = pd.merge(nyc, nyc_data, on='Neighborhood', how='left')
    # Neighborhoods without case data render as zeros / empty hospital list.
    values = {'positive_cases': 0, 'Drivers_Needed': 0, 'Hospital': ''}
    merged = merged.fillna(value=values)
    # Bokeh expects a geojson string; round-trip through json to get a plain
    # string-like object from the GeoDataFrame's to_json() output.
    nyc_json = json.loads(merged.to_json())
    return json.dumps(nyc_json)
# Callback: rebuild the plot when the user changes the criteria dropdown.
def update_plot(attr, old, new):
    """Bokeh ``on_change`` callback for the criteria Select widget."""
    # Rebuild the GeoJSON payload (json_data ignores its argument).
    new_data = json_data(nyc_data)
    # Map the human-readable selection back to its dataframe field name.
    cr = select.value
    input_field = format_df.loc[format_df['verbage'] == cr, 'field'].iloc[0]
    # Redraw the plot for the newly selected field.
    p = make_plot(input_field)
    # BUG FIX: the original layout referenced `slider`, which is never
    # defined (its creation is commented out elsewhere) and would raise
    # NameError the first time this callback fired.
    layout = column(p, widgetbox(select))
    curdoc().clear()
    curdoc().add_root(layout)
    # Push the refreshed data to the shared source.
    geosource.geojson = new_data
# Build the choropleth figure for a given data field.
def make_plot(field_name):
    """Return a Bokeh choropleth of ``field_name`` over NYC neighborhoods."""
    # Pull this field's formatting row from format_df once.
    fmt_row = format_df.loc[format_df['field'] == field_name]
    low = fmt_row['min_range'].iloc[0]
    high = fmt_row['max_range'].iloc[0]
    tick_format = fmt_row['format'].iloc[0]
    verbage = fmt_row['verbage'].iloc[0]
    # Linear color scale over the configured range, with a matching color bar.
    color_mapper = LinearColorMapper(palette=palette, low=low, high=high)
    color_bar = ColorBar(color_mapper=color_mapper,
                         label_standoff=18,
                         formatter=NumeralTickFormatter(format=tick_format),
                         border_line_color=None,
                         location=(0, 0))
    # Bare map canvas: no grid lines, no axes.
    p = figure(title=verbage + ' by NYC Neighborhood',
               plot_height=650, plot_width=850,
               toolbar_location="below")
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None
    p.axis.visible = False
    # Neighborhood polygons, filled by the selected field's color mapping.
    p.patches('xs', 'ys', source=geosource,
              fill_color={'field': field_name, 'transform': color_mapper},
              line_color='black', line_width=0.25, fill_alpha=1)
    p.add_layout(color_bar, 'right')
    p.add_tools(hover)
    return p
# --- Wire up the Bokeh document ----------------------------------------------
# Shared GeoJSON source backing every plot.
geosource = GeoJSONDataSource(geojson = json_data(nyc_data))
input_field = 'positive_cases'
# Sequential multi-hue blue palette, reversed so dark blue marks the most
# COVID-19 cases.
palette = brewer['Blues'][4]
palette = palette[::-1]
# Hover tooltip shown over each neighborhood polygon.
hover = HoverTool(tooltips = [ ('Neighborhood','@Neighborhood'),
                               ('# of Positive Cases', '@positive_cases'),
                               ('# of Drivers Needed', '@Drivers_Needed'),
                               ('Hospital(s)', '@Hospital')])
# Initial plot.
p = make_plot(input_field)
# Criteria dropdown.
# BUG FIX: a Select's initial `value` must be one of its `options`; the
# original options list omitted 'Number of Positive COVID-19 Cases'.
select = Select(title='Select Criteria:',
                value='Number of Positive COVID-19 Cases',
                options=['Number of Positive COVID-19 Cases',
                         'Number of Drivers Needed'])
select.on_change('value', update_plot)
# Make a column layout of the plot and widgets and add it to the current
# document (enable when serving with `bokeh serve`).
#layout = column(p, widgetbox(select))
#curdoc().add_root(layout)
# Use the following code to test in a notebook
# Interactive features will not show in notebook
#output_notebook()
#show(p)