-
Notifications
You must be signed in to change notification settings - Fork 0
/
dwd_project.py
364 lines (266 loc) · 14.2 KB
/
dwd_project.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
# -*- coding: utf-8 -*-
"""DWD - Project.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1ZNmatGNUC3YOea0LfxZqAG3ea3ahyQEs
"""
import pandas as pd
import numpy as np
from google.colab import files

# --- NYC hospital extraction -------------------------------------------------
# Load all NYS health facilities (CSV uploaded from our Google Drive) and
# reduce them to a clean table of NYC hospitals: name, address, geo columns.
data = pd.read_csv('Health_Facility_General_Information.csv')

# Keep only the columns needed downstream.
clean_data = data[['Facility Name', 'Description', 'Facility Address 1',
                   'Facility City', 'Facility Zip Code', 'Facility County',
                   'Facility Latitude', 'Facility Longitude', 'Facility Location']]

# Restrict to facilities described as hospitals.
hospital_data = clean_data[clean_data['Description'].isin(['Hospital'])]

# Restrict to the five NYC boroughs' counties.
bronx_county = hospital_data[hospital_data['Facility County'].str.contains('Bronx')]
kings_county = hospital_data[hospital_data['Facility County'].str.contains('Kings')]
new_york_county = hospital_data[hospital_data['Facility County'].str.contains('New York')]
queens_county = hospital_data[hospital_data['Facility County'].str.contains('Queens')]
richmond_county = hospital_data[hospital_data['Facility County'].str.contains('Richmond')]
nyc_counties = [bronx_county, kings_county, new_york_county, queens_county, richmond_county]
county_data = pd.concat(nyc_counties)

# Minor touch-ups.
county_data = county_data.drop_duplicates(['Facility Name'], keep='first')   # one row per hospital
county_data.reset_index(drop=True, inplace=True)                             # drop the stale index
del county_data['Description']                                               # constant column ('Hospital') now
county_data['Facility Zip Code'] = county_data['Facility Zip Code'].str[:5]  # normalize to 5-digit zips

# Rename columns to their final, human-friendly headers.
final_data = county_data.rename(columns={'Facility Name': 'Hospital',
                                         'Facility Address 1': 'Address',
                                         'Facility City': 'City',
                                         'Facility Zip Code': 'Zip Code',
                                         'Facility County': 'County'})

# BUG FIX: the index was reset above precisely to avoid an extra number
# column, but to_csv wrote the index anyway; index=False keeps the file clean.
final_data.to_csv('nyc_hospitals.csv', index=False)
files.download('nyc_hospitals.csv')
"""# Determining Postive COVID-19 Cases"""
from google.colab import drive
drive.mount('/content/drive')
import re
from re import search
import pandas as pd
import sys
from google.colab import files
if sys.version_info[0] < 3:
from StringIO import StringIO
else:
from io import StringIO
import requests
url = 'https://api.github.com/repos/nychealth/coronavirus-data/commits'
html = requests.get(url).json()
today = html[0]['sha']
yest= html[0]['parents'][0]['sha']
##
today_url= ('https://raw.githubusercontent.com/nychealth/coronavirus-data/'+today+'/tests-by-zcta.csv')
yest_url= ('https://raw.githubusercontent.com/nychealth/coronavirus-data/'+yest+'/tests-by-zcta.csv')
today_html = StringIO(requests.get(today_url).text)
yest_html = StringIO(requests.get(yest_url).text)
#convert into dataframe and export
cases_data_t = pd.read_csv(today_html, sep=",")
cases_data_t.columns = ["zipcode", "t_positive_cases", "total_cases", "cumulative_perc_positive"]
cases_data_t.drop(index=[0]) #remove row because zipcode= NA
cases_data_y = pd.read_csv(yest_html, sep=",")
cases_data_y.columns = ["zipcode", "y_positive_cases", "total_cases", "cumulative_perc_positive"]
cases_data_y.drop(index=[0]) #remove row because zipcode= NA
new_cases = cases_data_t['t_positive_cases']- cases_data_y['y_positive_cases']
zipcodes= cases_data_t['zipcode']
total_cases= cases_data_t['total_cases']
cumulative_perc_positive= cases_data_t['cumulative_perc_positive']
final_cases_data = pd.concat([zipcodes, new_cases, total_cases, cumulative_perc_positive], axis=1, sort=True)
final_cases_data.columns = ["zipcode", "positive_cases", "total_cases", "cumulative_perc_positive"]
final_cases_data.drop(index=[0]).to_csv('covid_cases_nyc.csv', index= False) #remove row because zipcode= NA
files.download('covid_cases_nyc.csv')
"""# Merging Datasets"""
# This section takes a dataframe with zipcodes and appends on the
# neighborhood and borough as additional columns
# Data is sourced from below and imported as simple csv
# https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoods.htm
import pandas as pd
def map_zipcode(input_df, key_column, join_type):
    """Append NYC 'Neighborhood' and 'Borough' columns to *input_df* by zipcode.

    Parameters
    ----------
    input_df : pd.DataFrame
        Frame containing a column of NYC zip codes (as strings).
    key_column : str
        Name of the zip-code column in *input_df*.
    join_type : str
        Merge strategy passed straight to ``pd.merge`` (e.g. 'left', 'right').

    Returns
    -------
    pd.DataFrame
        *input_df* merged against the zipcode lookup, gaining the
        'Zipcode_new', 'Neighborhood', and 'Borough' columns.
    """
    # Source table (one row per neighborhood, comma-separated zip list),
    # sourced from:
    # https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoods.htm
    raw = pd.read_csv("nyc_zipcode_map.csv")
    raw['ZIP Codes'] = raw['ZIP Codes'].str.strip()
    raw['ZIP Codes'] = raw['ZIP Codes'].str.replace(" ", "", regex=False)
    raw['ZIP Codes List'] = raw['ZIP Codes'].str.split(",")
    # Build one row per (zipcode, neighborhood, borough). Collecting dicts
    # and constructing the frame once replaces the former per-row
    # DataFrame.append() loop, which was quadratic and was removed entirely
    # in pandas >= 2.0.
    rows = []
    for index in raw.index.tolist():
        for zipcode in raw.loc[index, 'ZIP Codes List']:
            rows.append({'Zipcode_new': zipcode,
                         'Neighborhood': raw.loc[index, 'Neighborhood'],
                         'Borough': raw.loc[index, 'Borough']})
    clean_zipcode_map = pd.DataFrame(rows, columns=['Zipcode_new', 'Neighborhood', 'Borough'])
    # Merge the clean lookup onto the caller's data.
    # NOTE(review): zip code 11040 is absent from the source table, so rows
    # carrying it will not match — confirm whether it needs a manual entry.
    df_new = pd.merge(input_df, clean_zipcode_map,
                      left_on=key_column, right_on='Zipcode_new', how=join_type)
    return df_new
def calc_drivers_needed(input_df, driver_hrs_per_day, driver_passengers_per_trip, neighborhood_time_to_hospital_in_minutes):
    """Attach a 'Drivers_Needed' column estimating drivers per neighborhood.

    Assumes one round trip per positive case (hence the factor of 2) and a
    capacity discount when a driver carries more than one passenger.
    """
    # With multi-passenger trips a driver won't hit 100% uptime, so discount
    # the effective throughput.
    fudge_factor = 0.65 if driver_passengers_per_trip > 1 else 1
    minutes_available = driver_hrs_per_day * 60
    round_trip_minutes = neighborhood_time_to_hospital_in_minutes * 2
    demand_minutes = input_df['positive_cases'] * fudge_factor * round_trip_minutes
    # NOTE(review): int() truncates toward zero — rounding up (ceil) may be
    # what was intended for a "needed" count; confirm.
    input_df['Drivers_Needed'] = (demand_minutes / minutes_available).apply(int)
    return input_df
# --- Merge cases with hospitals at neighborhood level ------------------------
# Carry over cases_data from the first part of the project, clean it, and map
# zipcodes to neighborhoods.
# BUG FIX: the original plain assignment aliased final_cases_data, so the
# column assignment below renamed the original frame's columns too; work on
# a copy instead.
test_case_data = final_cases_data.copy()
test_case_data.columns = ["zipcode", "positive_cases", "total_cases", "cumulative_perc_positive"]
test_case_data = test_case_data.drop(index=[0])  # first row has zipcode == NA
test_case_data['zipcode'] = test_case_data['zipcode'].astype(int).apply(str)
zipcodes_with_neighborhoods = map_zipcode(test_case_data, 'zipcode', 'right')

# Carry over hospital data from above and map its zipcodes the same way.
test_hospital_data = final_data
hospitals_with_neighborhoods = map_zipcode(final_data, 'Zip Code', 'left')

# Aggregate case counts with neighborhood as the key.
neighborhood_cases = zipcodes_with_neighborhoods.groupby('Neighborhood').agg({'positive_cases': 'sum', 'total_cases': 'sum'})
neighborhood_cases = neighborhood_cases.reset_index()
neighborhood_cases['cumulative_perc_positive'] = round((neighborhood_cases['positive_cases'].apply(float) / neighborhood_cases['total_cases'].apply(float)), 4)
neighborhood_cases['cumulative_perc_positive'] = neighborhood_cases['cumulative_perc_positive'].apply(str)
neighborhood_cases['cumulative_perc_positive'] = neighborhood_cases['cumulative_perc_positive'].replace('nan', '0')  # 0/0 divisions

# Reduce hospital data to one row per neighborhood with its hospital list.
neighborhood_hospitals = hospitals_with_neighborhoods.groupby('Neighborhood')
neighborhood_hospitals = neighborhood_hospitals.apply(lambda x: x['Hospital'].unique()).reset_index(name='Hospital')

# Outer merge so neighborhoods with cases but no hospital (and vice versa)
# are all retained.
neighborhood_aggregate = pd.merge(neighborhood_cases, neighborhood_hospitals,
                                  left_on='Neighborhood', right_on='Neighborhood', how='outer')

# Estimate drivers needed: 8 driver-hours/day, 1 passenger/trip, 10 min to
# hospital per neighborhood.
neighborhood_aggregate = calc_drivers_needed(neighborhood_aggregate, 8, 1, 10)

# Export (comment out to skip downloading).
neighborhood_aggregate.to_csv('neighborhood_aggregate.csv')
files.download('neighborhood_aggregate.csv')
"""# Creating Map to Display Recommendations"""
# Commented out IPython magic to ensure Python compatibility.
# # Install fiona, geopandas, earthpy - need to comment out for transfer to live site.
# # Turn on for running in a notebook
# %%capture
# #!pip install fiona
# #!pip install geopandas
# !pip install earthpy
# Import libraries
import pandas as pd
import numpy as np
import math
import os
import matplotlib.pyplot as plt
import geopandas as gpd
import earthpy as et
import json
from bokeh.io import output_notebook, show, output_file
from bokeh.plotting import figure
from bokeh.models import GeoJSONDataSource, LinearColorMapper, ColorBar, NumeralTickFormatter
from bokeh.palettes import brewer
from bokeh.io.doc import curdoc
from bokeh.models import Slider, HoverTool, Select
from bokeh.layouts import widgetbox, row, column
# --- Load the neighborhood aggregate and the NYC geometry --------------------
# Load NYC COVID-19 cases produced by the merge step above.
nyc_data = pd.read_csv('neighborhood_aggregate.csv')
# Check the data (these bare `.head()` expressions only display output in a
# notebook; in a plain script they are no-ops).
print(nyc_data.shape)
nyc_data.head(5)
# Coerce types: counts to int, hospital list to pandas' string dtype.
nyc_data = nyc_data.astype({'positive_cases': 'int'})
nyc_data = nyc_data.astype({'Drivers_Needed': 'int'})
nyc_data = nyc_data.astype({'Hospital': 'string'})
nyc_data.head()
# Read the geojson map file for Realtor Neighborhoods into a GeoDataFrame.
nyc = gpd.read_file('nyc_neighborhoods.geojson')
# Set the Coordinate Reference System (CRS) for projections.
# EPSG code 4326 is the WGS84 lat-long projection.
# NOTE(review): the {'init': ...} dict form is deprecated in modern
# geopandas/pyproj — `nyc = nyc.set_crs('EPSG:4326', allow_override=True)`
# is the current API; confirm the installed geopandas version before changing.
nyc.crs = {'init': 'epsg:4326'}
# Keep only the columns needed for plotting.
nyc = nyc[['PO_NAME', 'COUNTY', 'SHAPE_LEN', 'geometry']]
# Rename PO_NAME to Neighborhood to match nyc_data's key column.
nyc = nyc.rename(columns={'geometry': 'geometry','PO_NAME':'Neighborhood'}).set_geometry('geometry')
# Fix two neighborhood names truncated in the geojson source so they join
# against the case data.
nyc['Neighborhood']= nyc['Neighborhood'].str.replace('Inwood and Washington Height', 'Inwood and Washington Heights')
nyc['Neighborhood']= nyc['Neighborhood'].str.replace('Gramercy Park and Murray Hil', 'Gramercy Park and Murray Hill')
nyc.head(50)
# Quick sanity plot of the geometry (colored by an arbitrary column).
nyc.plot(column = 'SHAPE_LEN')
#plt.show()
# Per-field plot formatting: (field, color range min/max, tick format, label).
format_data = [('positive_cases', 0, 210,'0,0', 'Number of Positive COVID-19 Cases'),
               ('Drivers_Needed', 0, 10,'0,0', 'Number of Drivers Needed')
              ]
# Create a DataFrame object from the list of tuples.
format_df = pd.DataFrame(format_data, columns = ['field' , 'min_range', 'max_range' , 'format', 'verbage'])
format_df.head(10)
# Build the GeoJSON payload consumed by the Bokeh GeoJSONDataSource.
from google.colab import files
def json_data(positiveCases):
    """Return the NYC neighborhood map merged with case data as a GeoJSON string.

    The ``positiveCases`` argument is kept for compatibility with existing
    callers but is unused: the payload is always built from the full
    module-level ``nyc_data`` table. (The original also filtered ``nyc_data``
    into an unused local — that dead code is removed.)
    """
    # Merge the GeoDataFrame (nyc) with the neighborhood summary (nyc_data);
    # left join keeps every polygon even when it has no case data.
    merged = pd.merge(nyc, nyc_data, on='Neighborhood', how='left')
    # Neighborhoods without case data render as zeros / empty hospital list.
    values = {'positive_cases': 0, 'Drivers_Needed': 0, 'Hospital': ''}
    merged = merged.fillna(value=values)
    # Bokeh expects a geojson string; round-trip through json to get a plain
    # string-like object from the GeoDataFrame's to_json() output.
    nyc_json = json.loads(merged.to_json())
    return json.dumps(nyc_json)
# Callback: rebuild the plot when the user changes the criteria dropdown.
def update_plot(attr, old, new):
    """Bokeh ``on_change`` callback for the criteria Select widget."""
    # Rebuild the GeoJSON payload (json_data ignores its argument).
    new_data = json_data(nyc_data)
    # Map the human-readable selection back to its dataframe field name.
    cr = select.value
    input_field = format_df.loc[format_df['verbage'] == cr, 'field'].iloc[0]
    # Redraw the plot for the newly selected field.
    p = make_plot(input_field)
    # BUG FIX: the original layout referenced `slider`, which is never
    # defined (its creation is commented out elsewhere) and would raise
    # NameError the first time this callback fired.
    layout = column(p, widgetbox(select))
    curdoc().clear()
    curdoc().add_root(layout)
    # Push the refreshed data to the shared source.
    geosource.geojson = new_data
# Build the choropleth figure for a given data field.
def make_plot(field_name):
    """Return a Bokeh choropleth of ``field_name`` over NYC neighborhoods."""
    # Pull this field's formatting row from format_df once.
    fmt_row = format_df.loc[format_df['field'] == field_name]
    low = fmt_row['min_range'].iloc[0]
    high = fmt_row['max_range'].iloc[0]
    tick_format = fmt_row['format'].iloc[0]
    verbage = fmt_row['verbage'].iloc[0]
    # Linear color scale over the configured range, with a matching color bar.
    color_mapper = LinearColorMapper(palette=palette, low=low, high=high)
    color_bar = ColorBar(color_mapper=color_mapper,
                         label_standoff=18,
                         formatter=NumeralTickFormatter(format=tick_format),
                         border_line_color=None,
                         location=(0, 0))
    # Bare map canvas: no grid lines, no axes.
    p = figure(title=verbage + ' by NYC Neighborhood',
               plot_height=650, plot_width=850,
               toolbar_location="below")
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None
    p.axis.visible = False
    # Neighborhood polygons, filled by the selected field's color mapping.
    p.patches('xs', 'ys', source=geosource,
              fill_color={'field': field_name, 'transform': color_mapper},
              line_color='black', line_width=0.25, fill_alpha=1)
    p.add_layout(color_bar, 'right')
    p.add_tools(hover)
    return p
# --- Wire up the Bokeh document ----------------------------------------------
# Shared GeoJSON source backing every plot.
geosource = GeoJSONDataSource(geojson = json_data(nyc_data))
input_field = 'positive_cases'
# Sequential multi-hue blue palette, reversed so dark blue marks the most
# COVID-19 cases.
palette = brewer['Blues'][4]
palette = palette[::-1]
# Hover tooltip shown over each neighborhood polygon.
hover = HoverTool(tooltips = [ ('Neighborhood','@Neighborhood'),
                               ('# of Positive Cases', '@positive_cases'),
                               ('# of Drivers Needed', '@Drivers_Needed'),
                               ('Hospital(s)', '@Hospital')])
# Initial plot.
p = make_plot(input_field)
# Criteria dropdown.
# BUG FIX: a Select's initial `value` must be one of its `options`; the
# original options list omitted 'Number of Positive COVID-19 Cases'.
select = Select(title='Select Criteria:',
                value='Number of Positive COVID-19 Cases',
                options=['Number of Positive COVID-19 Cases',
                         'Number of Drivers Needed'])
select.on_change('value', update_plot)
# Make a column layout of the plot and widgets and add it to the current
# document (enable when serving with `bokeh serve`).
#layout = column(p, widgetbox(select))
#curdoc().add_root(layout)
# Use the following code to test in a notebook
# Interactive features will not show in notebook
#output_notebook()
#show(p)