class ZipCodes():
    """Resolve a zip code for every row of a latitude/longitude dataframe.

    Parameters
    ----------
    my_df : pandas.DataFrame
        Frame containing one latitude and one longitude column.
    my_latitude : str
        Name of the latitude column in ``my_df``.
    my_longitude : str
        Name of the longitude column in ``my_df``.
    """

    def __init__(self, my_df, my_latitude, my_longitude):
        self.my_df = my_df
        self.my_latitude = my_latitude
        self.my_longitude = my_longitude
        # simple_zipcode=True loads the lightweight database (no demographic
        # fields), which is all a coordinate -> zipcode lookup needs.
        self.search = SearchEngine(simple_zipcode=True)

    def get_zip_code_df(self):
        """Return ``my_df`` with a ``'zipcode'`` column filled in.

        Looks up the nearest zip code within a 2-mile radius for each row.
        Rows with no match inside the radius are left unset (NaN) instead
        of raising, which the original tuple-unpacking would have done.
        """
        for index, row in self.my_df.iterrows():
            latitude = row.loc[self.my_latitude]
            longitude = row.loc[self.my_longitude]
            matches = self.search.by_coordinates(
                lat=latitude, lng=longitude, radius=2, returns=1)
            if not matches:
                # No zip code within 2 miles of this point.
                continue
            self.my_df.loc[index, 'zipcode'] = int(
                matches[0].to_dict().get('zipcode'))
        return self.my_df
def create_crime_zip_codes(crime_df):
    '''
    Use the uszipcode library to identify a zip code for each unique pair
    of latitude and longitude coordinates in the Crime Dataset. Merges zip
    code information back into the Crime Dataset to later join with ACS data.

    NOTE: the uszipcode library can take a while to run (this is normal)

    Input:
        crime_df (dataframe): original crime dataframe
    Output:
        crime_df (dataframe): new crime dataframe including zip codes
    '''
    # Normalize both coordinate columns to floats before any lookup.
    for coord_col in ('latitude', 'longitude'):
        crime_df.loc[:, coord_col] = crime_df[coord_col].astype(float)

    # One lookup per unique block keeps the slow coordinate search small.
    unique_blocks = (crime_df[['block', 'latitude', 'longitude']]
                     .drop_duplicates(subset=['block'])
                     .dropna())

    engine = SearchEngine(simple_zipcode=True)
    unique_blocks['zip_code'] = unique_blocks.apply(
        lambda r: engine.by_coordinates(r['latitude'],
                                        r['longitude'])[0].zipcode,
        axis=1)

    # Attach the per-block zip codes back onto the full dataset.
    with_zips = pd.merge(crime_df, unique_blocks,
                         on=['block', 'latitude', 'longitude'], how='left')
    with_zips.loc[:, 'zip_code'] = pd.to_numeric(with_zips['zip_code'],
                                                 errors='coerce')
    return with_zips
def search_by_zipcode(self, category='Confirmed', zipcode="21029", radius=50):
    """Collect the (county, state) pairs surrounding a zip code.

    Looks up the coordinates of ``zipcode`` and gathers the counties of
    every zip code within ``radius`` miles (up to 100 results).

    Args:
        category: kept for interface compatibility; not used by the lookup.
        zipcode: center zip code, as a string.
        radius: search radius in miles.

    Returns:
        A list of unique ``(county, state)`` tuples, with any trailing
        'County' suffix stripped and the state expanded to its full name.
    """
    search = SearchEngine()
    zipinfo = search.by_zipcode(zipcode)
    nei = search.by_coordinates(zipinfo.lat, zipinfo.lng, radius,
                                sort_by='dist', ascending=True, returns=100)
    # De-duplicate before the per-county normalization below.
    nei = list(set([(n.county, n.state) for n in nei]))
    nei_rec = []
    for neib in nei:
        try:
            county = neib[0]
            if 'County' in county:
                county = county.split('County')[0].strip()
            # us.states.lookup returns None for unknown abbreviations,
            # which raises AttributeError on `.name`; a None county raises
            # TypeError on the membership test. Both are skipped.
            state = us.states.lookup(neib[1]).name
            nei_rec.append((county, state))
        except (AttributeError, TypeError):
            # Best-effort: drop neighbors whose state/county is unresolvable.
            pass
    return nei_rec  # return a list of (county, state)
def get_zipcode(self, df):
    """Return the zip code string for a row's coordinates, or None.

    Args:
        df: a mapping/Series exposing 'latitude' and 'longitude'.

    Returns:
        The nearest zip code as a string, or None when nothing is found.
    """
    from uszipcode import SearchEngine
    search = SearchEngine(simple_zipcode=True)
    matches = search.by_coordinates(df['latitude'], df['longitude'])
    if not matches:
        return None
    # Use the explicit .zipcode attribute; the original `.values()[0]`
    # relied on a particular field ordering inside the result object.
    return matches[0].zipcode
def get_zip_code(latitude, longitude):
    """Return the nearest zip code (within 5 miles) as an int.

    Falls back to 10001 when the lookup fails or finds no match,
    preserving the original best-effort contract.
    """
    try:
        search = SearchEngine(simple_zipcode=True)
        result = search.by_coordinates(latitude, longitude,
                                       radius=5, returns=1)
        if not result:
            # No zip code inside the radius: use the default explicitly
            # rather than relying on an IndexError being swallowed.
            return 10001
        return int(result[0].zipcode)
    except Exception:
        # Best-effort fallback keeps callers running on any lookup failure.
        return 10001
def get_zip(airbnb):
    """Return a list of zip codes, one per row of ``airbnb``.

    Args:
        airbnb: DataFrame with 'latitude' and 'longitude' columns.

    Returns:
        list[str]: the nearest zip code per row, or "-1" when no zip code
        lies within a 2-mile radius.
    """
    search = SearchEngine()
    zips = []
    # Unpack (index, row) from iterrows() instead of indexing the tuple.
    for _, listing in airbnb.iterrows():
        matches = search.by_coordinates(listing["latitude"],
                                        listing["longitude"],
                                        radius=2, returns=1)
        zips.append(matches[0].zipcode if matches else "-1")
    return zips
def _add_missing_zip_codes(self):
    '''
    Uses coordinates to add missing zipcodes.

    For each row whose 'zip' is None, looks up the nearest zip code
    within 30 miles from the (y, x) columns and writes it back.
    '''
    search = SearchEngine(simple_zipcode=True)
    for index, value in self.dataframe.iterrows():
        zipcode = value['zip']
        x = value['x']
        y = value['y']
        # `is None` is the correct identity test for missing values here.
        if zipcode is None:
            # assumes y is latitude and x is longitude (call order from
            # the original) -- TODO confirm against the data source.
            result = search.by_coordinates(y, x, radius=30, returns=1)
            if result:
                # .loc avoids chained assignment, which can silently
                # write to a temporary copy instead of self.dataframe.
                self.dataframe.loc[index, 'zip'] = result[0].zipcode
def zipcode(table, sc):
    """Append a zipcode column to ``table`` via uszipcode lookups.

    Collects all coordinates once -- the original filtered and collected
    the DataFrame separately for every row (O(n^2) scans) and joined on a
    0..n-1 counter, which assumed monotonically_increasing_id produces
    contiguous ids. Using the actual ids fixes both issues.

    Args:
        table: Spark DataFrame with 'latitude' and 'longitude' columns.
        sc: SparkContext used to parallelize the lookup results.

    Returns:
        Spark DataFrame: the original columns plus 'id', 'pid', 'zipcode'.
    """
    search = SearchEngine()
    csvi = table.select("*").withColumn("id", monotonically_increasing_id())
    # Single driver-side collect of just the columns we need.
    coords = csvi.select("id", "latitude", "longitude").collect()
    row = Row("pid", "zipcode")
    lookups = []
    for rec in coords:
        matches = search.by_coordinates(float(rec["latitude"]),
                                        float(rec["longitude"]), radius=3)
        # NOTE(review): like the original, this raises IndexError when no
        # zip code lies within the 3-mile radius.
        lookups.append(row(rec["id"], matches[0].zipcode))
    new_df = sc.parallelize(lookups).toDF()
    station = csvi.join(new_df, csvi.id == new_df.pid).select("*")
    return station
def search(self, zip, range, results=0):
    """Return zip code records within ``range`` miles of ``zip``.

    Args:
        zip: center zip code (parameter name kept for compatibility,
            although it shadows the builtin).
        range: search radius in miles (also shadows a builtin).
        results: maximum number of results to return.

    Returns:
        A list of zipcode records, or '' when ``zip`` is unknown.
    """
    if not self.if_exists(zip):
        return ''
    # The original left this construction commented out, so `search` was
    # an undefined name and the calls below raised NameError.
    # set simple_zipcode=False to use rich info database
    search = SearchEngine(simple_zipcode=True, db_file_dir="/tmp")
    zipcode = search.by_zipcode(zip)
    if VERBOSITY:
        print('ZIP search results:')
        print(zipcode)
    if VERBOSITY:
        print('Center coordiante: Lat:{} Long:{}'.format(
            zipcode.lat, zipcode.lng))
    res = search.by_coordinates(zipcode.lat, zipcode.lng,
                                radius=range, returns=results)
    return res
def _in_radius(self, row, rad):
    '''
    The essential method for this class is _in_radius(). It returns the list
    of remaining zip codes and those that were filtered out as a tuple. As a
    rule of thumb use 50 radius, if you're scraping at 100 radius (units are
    in miles).
    '''
    z = row
    try:
        search = SearchEngine(simple_zipcode=True)
    except NameError as error:
        # Output expected ImportErrors.
        # NOTE(review): if this fires, `search` stays unbound and the
        # lookup below raises NameError anyway -- the print is
        # informational only.
        print('NameError: {} module from {} package not found'.format(
            'SearchEngine', 'uszipcode'))
    except Exception as exception:
        # Output unexpected Exceptions.
        print('Exception: {}'.format(exception))
    # All 'Standard'-type zip codes within `rad` miles of this row's
    # coordinates.
    zipcodes = [
        i.zipcode for i in search.by_coordinates(lat=z.loc['lat'],
                                                 lng=z.loc['lng'],
                                                 radius=rad,
                                                 zipcode_type='Standard',
                                                 returns=100000)
    ]
    zipcodes = set(zipcodes)
    try:
        # Success path: the row's own zip appears in the radius search, so
        # it is removed from the "other" set and the row is kept as unique.
        zipcodes.remove(z.loc['zip'])
        other = zipcodes
        unique = z
    except:
        # If the zip code is of type STANDARD they'll be covered by a different zip code.
        # The output provides indicators to the population of the zip code in question.
        faulty_zip = search.by_zipcode(z.zip)
        if faulty_zip.population or faulty_zip.land_area_in_sqmi:
            print(
                'We couldn\'t remove {}. The zip code is of the type {} and the Population or Land Area is not None. Make sure this is ok. The zip code is kept.'
                .format(z.zip, z.type))
            print('** Population: {} \n Land Area (in sqmi): {}'.format(
                faulty_zip.population, faulty_zip.land_area_in_sqmi))
            other = set()
            unique = z
        else:
            # NOTE(review): set(z.zip) builds a set of the *characters* of
            # the zip string -- confirm this is the intended value for
            # `other` (perhaps {z.zip} was meant).
            other = set(z.zip)
            unique = pd.Series()
    return (unique, other)
def start(outfilename, limit=0):
    """Resolve city and zip for Block_Group rows lacking a city; write CSV.

    Connects to the configured Postgres database, selects block groups
    whose city is NULL (optionally capped at ``limit``), reverse-geocodes
    each coordinate pair, and writes ``bg_geo_id,zipcode,city`` lines to
    ``outfilename``.

    Args:
        outfilename: path of the output file (overwritten).
        limit: maximum number of rows to process; 0 means no limit.

    Returns:
        None (also on connection failure, after printing the error).
    """
    config = read_config()
    database_config = config['Database']
    try:
        conn = psycopg2.connect(host=database_config['host_ip'],
                                port=database_config['port'],
                                database=database_config['database'],
                                user=database_config['user'],
                                password=database_config['password'])
    except Exception as e:
        print('Database could not be connected to:')
        print(e)
        return None
    cur = conn.cursor()
    sql_statement = "select bg.bg_geo_id, bg.longitude, bg.latitude from \"Block_Group\" as bg " \
                    "where bg.city is null order by bg.bg_geo_id"
    if limit > 0:
        sql_statement += " limit {}".format(limit)
    cur.execute(sql_statement)
    rows = cur.fetchall()  # todo Combine all the long/lat into an array and call rg only once
    # Build the engine once; constructing it per row (as the original did)
    # reloads the zipcode database on every iteration.
    search = SearchEngine(simple_zipcode=True)
    count = 0
    # `with` guarantees the output file is closed on every path; the
    # original opened it before connecting and leaked it on DB failure.
    with open(outfilename, 'w') as outfile:
        for row in rows:
            print(row)
            bg_geo = row[0]
            long = row[1]
            lat = row[2]
            result = rg.search((lat, long))
            zip = search.by_coordinates(lat, long, 30, returns=1)
            print(bg_geo, zip[0].zipcode, result[0]['name'], sep=",")
            print(bg_geo, zip[0].zipcode, result[0]["name"], sep=",",
                  file=outfile)
            count += 1
            if limit and (count >= limit):
                print("Limit reached")
                break
    return None
class ZipSearch:
    """Zip-code centric lookups backed by a uszipcode SearchEngine."""

    def __init__(self, simple=True):
        # simple=True selects the lightweight zipcode database.
        self.engine = SearchEngine(simple_zipcode=simple)

    def search(self, zipcode):
        """Return the LatLongCoordinate for ``zipcode``.

        Falls back to a city/state search when the zip record carries no
        coordinates of its own.
        """
        data = self.engine.by_zipcode(zipcode)
        if data.lat and data.lng:
            return LatLongCoordinate(data.lat, data.lng)
        else:
            return CityStateSearch(engine=self.engine).search(
                data.major_city, data.state)

    def search_bulk(self, zipcodes):
        """Return coordinates for each zip code in ``zipcodes``."""
        # Comprehension replaces the manual append loop.
        return [self.search(zc) for zc in zipcodes]

    def get_county(self, coordinates):
        """Return the most common county among zip codes near ``coordinates``."""
        zipcodes = self.engine.by_coordinates(
            coordinates.latitude, coordinates.longitude)
        counties = Counter(z.county for z in zipcodes)
        # most_common(1) avoids ranking the entire tally.
        return counties.most_common(1)[0][0]
import csv import geopy import datetime import shapely import numpy import copy from datetime import date from geopy import distance from shapely.geometry import Polygon, Point from numpy import arange from uszipcode import SearchEngine search = SearchEngine(simple_zipcode=False) #zipcode = search.by_state("Florida",returns=5000) zipcode = search.by_coordinates(28,-82,radius=1500,returns=50000) hurdatFile = open('weatherparsing/hurdat2.csv') windOutputFile = open('weatherparsing/hurWindZip2.csv','w') hurdatReader = csv.reader(hurdatFile) curStormName = 'Unknown' minLat = 20.0 maxLat = 40.0 minLon = -100.0 maxLon = -75.0 resolution = 0.3 output = [] for row in hurdatReader: if row[0].startswith("AL"): curStormName = row[1] else: curHur = {} curHur['stormName'] = curStormName.strip()
for i in range(0, len(tweets["distance_from_centeroid"])): tweets.ix[i,"distance_from_centeroid"] = tweets.ix[i,"distance_from_centeroid"][0] ''' ##GEOLOCATION #from uszipcode import Zipcode from uszipcode import SearchEngine search = SearchEngine(simple_zipcode=True) ''' tweets["zipcode"] = [search.by_coordinates(val1[1], val1[0], radius=r, returns=1) for val1 in tweets["location_centeroid"] for r in tweets["distance_from_centeroid"]] ''' final['language'] = tweets['language'] #final['country'] = tweets['country'] tweets["zipcode"] = [ search.by_coordinates(val1[1], val1[0], radius=4, returns=1) if val1 != None else None for val1 in tweets["location_centeroid"] ] lister1 = [] lister2 = [] tweets_zipcode = tweets["zipcode"] for ijk in range(tweets_zipcode.size): entry = tweets_zipcode[ijk] if entry != None and len(entry) > 0: lister1.append(entry[0].county) else: lister1.append([]) for ijk in range(tweets_zipcode.size): entry = tweets_zipcode[ijk]
import pandas as pd from uszipcode import SearchEngine from data_preprocess.taxi_zone_format.zipcode_info import get_zipcode_neighborhood from data_preprocess.taxi_zone_format.zone_distance import get_zone_centroid taxi_zones = pd.read_csv("../../data/trip_data/taxi_zones.csv") zipcode_neighborhood = get_zipcode_neighborhood() search = SearchEngine(simple_zipcode=True) zones_info = [] for index, row in taxi_zones.iterrows(): x, y = get_zone_centroid(row['the_geom']) results = search.by_coordinates(y, x) #33912 #10010 zip_code_info = '' for result in results: if result.state == 'NY': zip_code_info = result break elif row['borough'] == 'EWR' and result.state == 'NJ': zip_code_info = result break if zip_code_info: zone_info = { 'location_id': row['LocationID'], 'zone_name': row['zone'], 'borough': row['borough'],
zipc = row['zip'] if len(zipc) > 5: row['zip'] = zipc[:5] lat, lon = row['x'], row['y'] latf, lonf = True, True try: row['x'] = float(lat) except ValueError: latf = False try: row['y'] = float(lon) except ValueError: lonf = False if latf and lonf: try: result = search.by_coordinates(float(lon), float(lat), radius=30) except: print(lat, lon) if len(result) < 1: print("*", lon, lat) row["zip_from_xy"] = "lookup_zip_manually" else: row["zip_from_xy"] = result[0].zipcode else: row["zip_from_xy"] = "" #print(row) writer.writerow(row) line_count += 1 #if line_count == 6: # break
"year_built", "zipcode", "eui", "cvrmse", "nmbe", ], ) writer.writeheader() with open("./assets/sf_buildings_clean.csv") as f: reader = csv.DictReader(f) for row in reader: out = {} lat = float(row["lat"]) lng = float(row["lng"]) result = search.by_coordinates(lat, lng) out["lat"] = lat out["lng"] = lng out["zipcode"] = result[0].zipcode out["sqft"] = random.randint(1000, 30000) out["floors"] = random.randint(1, 15) out["year_built"] = random.randint(1900, 2000) out["eui"] = random.randint(0, 2000) out["cvrmse"] = random.randint(0, 10) out["nmbe"] = random.randint(0, 50) writer.writerow(out)
from uszipcode import SearchEngine

# Lightweight database: sufficient for coordinate -> zipcode lookups.
search = SearchEngine(simple_zipcode=True)

# Sample coordinate (appears to be the Orlando, FL area -- TODO confirm).
lat = 28.43180352
lng = -81.30852827

# Fetch only the single nearest zip code and dump its full record as JSON.
zipcode = search.by_coordinates(lat, lng, returns=1)
print(zipcode[0].to_json())
import csv
from uszipcode import SearchEngine, SimpleZipcode, Zipcode

# Re-emit outputCleanData.csv as outputPostCode.csv with column 4 replaced
# by the zip code nearest to the (lat, lon) values in columns 2 and 3.
search = SearchEngine()
with open('outputPostCode.csv', 'w') as f, \
        open('outputCleanData.csv', 'r') as csvFile:
    reader = csv.reader(csvFile)
    for row in reader:
        lat_text = row[2].replace("'", "").strip()
        lon_text = row[3].replace("'", "").strip()
        # Rows with missing coordinates are skipped entirely (not written).
        if lat_text == "" or lon_text == "":
            continue
        result = search.by_coordinates(float(lat_text), float(lon_text),
                                       radius=30, returns=1)
        for zipcode in result:
            print(lat_text, "and", lon_text, "and", zipcode.zipcode)
            row[4] = "'" + str(zipcode.zipcode) + "'"
        # NOTE(review): a plain join does not quote embedded commas; fine
        # only while upstream fields contain none.
        f.write(','.join(row) + "\n")
# Both files are closed by the `with` statement; the original's explicit
# close() calls inside the blocks were redundant no-ops.
def clean_col(data_clean_p1, col): for i in range(data_clean_p1.shape[0]): try: if np.isnan(data_clean_p1[col][i]) == True: data_clean_p1 = data_clean_p1.drop(i, axis=0) except: pass return data_clean_p1 for i in range(data_clean_p1.shape[0]): try: result = search.by_coordinates(data_clean_p1['Latitude'][i], data_clean_p1['Longitude'][i], radius=5, returns=1) array.append(result[0].zipcode) except: array.append('None') data_clean_p1['zipcode'] = array for i in range(data_clean_p1.shape[0]): ''' if data_clean_p1['Location'][i].isnull(): try: location = geolocator.geocode(str(data_clean_p1['Location'][i])) try: if (abs(location.latitude - float(data_clean_p1['Latitude'][i]))) > 3.0 or (abs(location.longitude - float(data_clean_p1['Longitude'][i]))) > 3.0: data_clean_p1.drop(data_clean_p1.index[i]) #time.sleep(10) except:
# Read (latitude, longitude, aggregate_total) triples from zipcode.csv and
# write the nearest zip code (1-mile radius) for each point to
# newzipcode.csv. Points with no nearby zip code are skipped.
with open('zipcode.csv', mode='r', encoding='utf-8-sig') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    with open('newzipcode.csv', mode='w') as csv_file:
        fieldnames = ['latitude', 'longitude', 'zip_code']
        # extrasaction='ignore' silently drops 'aggregate_total' from the
        # written rows even though it is present in the dicts below.
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames,
                                extrasaction='ignore')
        writer.writeheader()
        for record in readCSV:
            latitude = float(record[0])
            longitude = float(record[1])
            aggregate_total = int(record[2])
            matches = search.by_coordinates(latitude, longitude, radius=1)
            # Take only the first (nearest) match, if any -- equivalent to
            # the original's counter-and-break loop.
            if matches:
                writer.writerow({
                    'latitude': latitude,
                    'longitude': longitude,
                    'zip_code': matches[0].zipcode,
                    'aggregate_total': aggregate_total
                })
def execute(trial = False):
    '''Transform waste data for city of Boston'''
    # Pipeline: pull hwgen/aul/waste collections from MongoDB, geocode or
    # reproject each site, attach a zip code and an 11-digit census-tract
    # FIPS id, then persist the merged list as misn15.waste_all.
    startTime = datetime.datetime.now()
    # Set up the database connection.
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('misn15', 'misn15')
    # Materialize the three source cursors via deepcopy.
    hwgen = []
    hwgen = repo['misn15.hwgen'].find()
    hwgen = copy.deepcopy(hwgen)
    aul = []
    aul = repo['misn15.aul'].find()
    aul = copy.deepcopy(aul)
    waste = []
    waste = repo['misn15.waste'].find()
    waste = copy.deepcopy(waste)
    # search for coordinates based on address
    geolocator = Nominatim(user_agent = "mis", timeout = 5)
    # get csr for coordinate search
    inProj = Proj(init='epsg:26986')
    outProj = Proj(init='epsg:4326')
    # project coordinates as US census tract number and zipcode
    search = SearchEngine(simple_zipcode=True)
    if trial:
        # Trial mode: cap hwgen at 5 records and aul/waste at 25 each.
        # filter hwgen
        hwgen_list = []
        i = 0
        for x in hwgen:
            if i < 5:
                hwgen_list += [[x['Name'], x['Address'], x['Town'],
                                x['ZIP Code'], x['RCRA Gen Status']]]
                i += 1
            else:
                break
        # get coordinates for hwgen (geocode the street address)
        for x in hwgen_list:
            full_address = str(x[1] + ' ' + x[2] + ' ' + 'MASSACHUSETTS')
            location = geolocator.geocode(full_address)
            if location is not None:
                x += [[location.longitude, location.latitude]]
                params = urllib.parse.urlencode({'latitude': location.latitude,
                                                 'longitude': location.longitude,
                                                 'format': 'json'})
                url = 'https://geo.fcc.gov/api/census/block/find?' + params
                response = requests.get(url)
                data = response.json()
                # First 11 FIPS digits = state + county + census tract.
                geoid = data['Block']['FIPS'][0:11]
                x += [geoid]
        # get zipcodes in correct format
        for x in hwgen_list:
            # NOTE(review): `x[3][0] == 0` compares a character with the
            # int 0 (always False); and when the zip already starts with
            # '0' and has 5 digits, zipcode_num is carried over from the
            # previous iteration -- confirm both are intended.
            if x[3][0] != '0':
                zipcode_num = '0' + x[3]
            elif len(x[3]) != 5:
                zipcode_num = x[3][0:5]
                if x[3][0] == 0 and x[3][1] == 0:
                    zipcode_num = x[3][1:6]
            x[3] = zipcode_num
        # filter aul
        aul_list = []
        i = 0
        for x in aul:
            if i < 25:
                if x['properties']['TOWN'] == 'BOSTON':
                    # GeoJSON coordinate order is [longitude, latitude].
                    result = search.by_coordinates(
                        x['geometry']['coordinates'][1],
                        x['geometry']['coordinates'][0], returns=1)
                    params = urllib.parse.urlencode(
                        {'latitude': x['geometry']['coordinates'][1],
                         'longitude': x['geometry']['coordinates'][0],
                         'format': 'json'})
                    url = 'https://geo.fcc.gov/api/census/block/find?' + params
                    response = requests.get(url)
                    data = response.json()
                    result = result[0]
                    geoid = data['Block']['FIPS'][0:11]
                    aul_list += [[x['properties']['NAME'],
                                  x['properties']['ADDRESS'],
                                  x['properties']['TOWN'], result.zipcode,
                                  x['properties']['STATUS'],
                                  x['geometry']['coordinates'], geoid]]
                    i += 1
            else:
                break
        # filter waste
        waste_list = []
        i = 0
        for x in waste:
            if i < 25:
                if x['properties']['TOWN'] == 'BOSTON':
                    # Reproject from MA state plane (EPSG:26986) to WGS84.
                    long, lat = transform(inProj, outProj,
                                          x['geometry']['coordinates'][0],
                                          x['geometry']['coordinates'][1])
                    result = search.by_coordinates(lat, long, returns=1)
                    result = result[0]
                    params = urllib.parse.urlencode({'latitude': lat,
                                                     'longitude': long,
                                                     'format': 'json'})
                    url = 'https://geo.fcc.gov/api/census/block/find?' + params
                    response = requests.get(url)
                    data = response.json()
                    geoid = data['Block']['FIPS'][0:11]
                    waste_list += [[x['properties']['NAME'],
                                    x['properties']['ADDRESS'],
                                    x['properties']['TOWN'], result.zipcode,
                                    x['properties']['STATUS'], [long, lat],
                                    geoid]]
                    i += 1
            else:
                break
        # merge all waste data
        waste_all = hwgen_list + aul_list + waste_list
    else:
        # Full run: same pipeline without record caps.
        # filter hwgen
        hwgen_list = []
        for x in hwgen:
            hwgen_list += [[x['Name'], x['Address'], x['Town'],
                            x['ZIP Code'], x['RCRA Gen Status']]]
        # get coordinates for hwgen
        for x in hwgen_list:
            full_address = str(x[1] + ' ' + x[2] + ' ' + 'MASSACHUSETTS')
            location = geolocator.geocode(full_address)
            if location is not None:
                x += [[location.longitude, location.latitude]]
                params = urllib.parse.urlencode(
                    {'latitude': location.latitude,
                     'longitude': location.longitude, 'format': 'json'})
                url = 'https://geo.fcc.gov/api/census/block/find?' + params
                response = requests.get(url)
                data = response.json()
                # NOTE(review): unlike the trial branch, geoid is computed
                # but never appended to x here -- confirm intentional.
                geoid = data['Block']['FIPS'][0:11]
        # get zipcodes in correct format
        for x in hwgen_list:
            if x[3][0] != '0':
                zipcode_num = '0' + x[3]
            elif len(x[3]) != 5:
                zipcode_num = x[3][0:5]
                if x[3][0] == 0 and x[3][1] == 0:
                    zipcode_num = x[3][1:6]
            x[3] = zipcode_num
        # filter aul
        aul_list = []
        for x in aul:
            if x['properties']['TOWN'] == 'BOSTON':
                result = search.by_coordinates(
                    x['geometry']['coordinates'][1],
                    x['geometry']['coordinates'][0], returns=1)
                params = urllib.parse.urlencode(
                    {'latitude': x['geometry']['coordinates'][1],
                     'longitude': x['geometry']['coordinates'][0],
                     'format': 'json'})
                url = 'https://geo.fcc.gov/api/census/block/find?' + params
                response = requests.get(url)
                data = response.json()
                result = result[0]
                geoid = data['Block']['FIPS'][0:11]
                aul_list += [[x['properties']['NAME'],
                              x['properties']['ADDRESS'],
                              x['properties']['TOWN'], result.zipcode,
                              x['properties']['STATUS'],
                              x['geometry']['coordinates'], geoid]]
        # filter waste
        waste_list = []
        for x in waste:
            if x['properties']['TOWN'] == 'BOSTON':
                long, lat = transform(inProj, outProj,
                                      x['geometry']['coordinates'][0],
                                      x['geometry']['coordinates'][1])
                result = search.by_coordinates(lat, long, returns=1)
                result = result[0]
                params = urllib.parse.urlencode({'latitude': lat,
                                                 'longitude': long,
                                                 'format': 'json'})
                url = 'https://geo.fcc.gov/api/census/block/find?' + params
                response = requests.get(url)
                data = response.json()
                geoid = data['Block']['FIPS'][0:11]
                waste_list += [[x['properties']['NAME'],
                                x['properties']['ADDRESS'],
                                x['properties']['TOWN'], result.zipcode,
                                x['properties']['STATUS'],
                                [round(long, 2), round(lat, 2)], geoid]]
        # merge all waste data
        waste_all = hwgen_list + aul_list + waste_list
    # Persist the merged records.
    repo.dropCollection("misn15.waste_all")
    repo.createCollection("misn15.waste_all")
    for x in waste_all:
        entry = {'Name': x[0], 'Address': x[1], 'Zip Code': x[3],
                 'Coordinates': x[5], 'Status': x[4], 'FIPS': x[6]}
        repo['misn15.waste_all'].insert_one(entry)
    repo['misn15.waste_all'].metadata({'complete':True})
    print(repo['misn15.waste_all'].metadata())
    repo.logout()
    endTime = datetime.datetime.now()
    return {"start":startTime, "end":endTime}
def execute(trial = False):
    '''Transform waste data for city of Boston'''
    # Pipeline: pull oil + waste collections from MongoDB, attach WGS84
    # coordinates, zip codes and census-tract FIPS ids, then persist the
    # merged list as misn15.waste_merged.
    # NOTE(review): the `trial` parameter is accepted but never used here.
    startTime = datetime.datetime.now()
    # Set up the database connection.
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('misn15', 'misn15')
    oil = []
    oil = repo['misn15.oil'].find()
    oil = copy.deepcopy(oil)
    #filter oil dataset (Boston sites only)
    oil_subset = []
    for x in oil:
        if x['properties']['TOWN'] == 'BOSTON':
            oil_subset += [[x['properties']['NAME'],
                            x['properties']['ADDRESS'],
                            x['geometry']['coordinates']]]
    # get csr for coordinate search
    inProj = Proj(init='epsg:26986')
    outProj = Proj(init='epsg:4326')
    # project coordinates as US census tract number and zipcode
    search = SearchEngine(simple_zipcode=True)
    for x in oil_subset:
        # Reproject MA state plane -> WGS84, round to 2 decimals, and
        # attach the nearest zip code.
        long, lat = transform(inProj, outProj, x[2][0], x[2][1])
        x[2][0] = round(long, 2)
        x[2][1] = round(lat, 2)
        result = search.by_coordinates(lat, long, returns = 1)
        result = result[0]
        x += [result.zipcode]
    # get FIPS census tract number using coordinates
    for x in oil_subset:
        lat = x[2][1]
        lon = x[2][0]
        params = urllib.parse.urlencode({'latitude': lat, 'longitude':lon,
                                         'format':'json'})
        url = 'https://geo.fcc.gov/api/census/block/find?' + params
        response = requests.get(url)
        data = response.json()
        # First 11 digits of the block FIPS = census tract id.
        geoid = data['Block']['FIPS'][0:11]
        x += [geoid]
    waste = []
    waste = repo['misn15.waste'].find()
    waste = copy.deepcopy(waste)
    waste_list = []
    for x in waste:
        # [0, 0] is a placeholder coordinate pair filled in below.
        waste_list += [[x['Name'], x['Address'], [0, 0], x['ZIP Code']]]
    # get coordinates and fips for waste data
    for x in waste_list:
        # NOTE(review): `x[3][0] == 0` compares a character to the int 0
        # (always False), and zipcode_num may carry over from the previous
        # iteration when neither branch fires -- confirm intended.
        if x[3][0] != '0':
            zipcode_num = '0' + x[3]
        elif len(x[3]) != 5:
            zipcode_num = x[3][0:5]
            if x[3][0] == 0 and x[3][1] == 0:
                zipcode_num = x[3][1:6]
        zipcode = zipcodes.matching(zipcode_num)
        if len(zipcode) != 0:
            x[2][0] = zipcode[0]['long']
            x[2][1] = zipcode[0]['lat']
    # get FIPS census tract number for waste data
    for x in waste_list:
        lat = x[2][1]
        lon = x[2][0]
        if lat != 0:
            params = urllib.parse.urlencode({'latitude': lat,
                                             'longitude':lon,
                                             'format':'json'})
            url = 'https://geo.fcc.gov/api/census/block/find?' + params
            response = requests.get(url)
            data = response.json()
            geoid2 = data['Block']['FIPS'][0:11]
            x += [geoid2]
        else:
            # No coordinates resolved for this zip: placeholder FIPS.
            x += ['0']
    # merge oil sites with hazardous waste sites
    waste_merged = oil_subset + waste_list
    repo.dropCollection("misn15.waste_merged")
    repo.createCollection("misn15.waste_merged")
    for x in waste_merged:
        entry = {'Name': x[0], 'Address': x[1], 'Coordinates': x[2],
                 'Zip Code': x[3], 'FIPS': x[4]}
        repo['misn15.waste_merged'].insert_one(entry)
    repo['misn15.waste_merged'].metadata({'complete':True})
    print(repo['misn15.waste_merged'].metadata())
    repo.logout()
    endTime = datetime.datetime.now()
    return {"start":startTime, "end":endTime}
def execute(trial=False):
    '''Transform health and waste data for city of Boston'''
    # Pipeline: join waste sites to census-tract health measures by zip
    # code, aggregate disease prevalence per site, and persist the result
    # as misn15.waste_health.
    # NOTE(review): the `trial` parameter is accepted but never used here.
    startTime = datetime.datetime.now()
    # Set up the database connection.
    client = dml.pymongo.MongoClient()
    repo = client.repo
    repo.authenticate('misn15', 'misn15')
    # make deepcopy of data
    waste = []
    waste = repo['misn15.waste'].find()
    waste = copy.deepcopy(waste)
    health = []
    health = repo['misn15.health'].find()
    health = copy.deepcopy(health)
    health_pd = pd.DataFrame(health)
    # Drop rows missing a census tract id or a measurement value.
    health_pd = health_pd[pd.isnull(health_pd['tractfips']) == False]
    health_pd = health_pd[pd.isnull(health_pd['data_value']) == False]
    #filter health
    health_subset = health_pd[[
        "category", "measure", "data_value_unit", "data_value", "categoryid",
        "geolocation", "measureid", "short_question_text", "tractfips"
    ]]
    # convert to a list (one row per list entry, column order as above)
    health_list = []
    for x in range(len(health_subset)):
        health_list += [list(health_subset.iloc[x, :])]
    # add zipcodes
    search = SearchEngine(simple_zipcode=True)
    for x in health_list:
        # Index 5 is the 'geolocation' value; GeoJSON order is [lon, lat].
        long = round(x[5]["coordinates"][0], 2)
        lat = round(x[5]["coordinates"][1], 2)
        result = search.by_coordinates(lat, long, returns=1)
        result = result[0]
        x += [result.zipcode]
    # get the product of waste and health data sets and project to
    product = []
    for row in waste:
        for i in health_list:
            product += [[row, i]]
    # Keep pairs whose zip codes match; waste zips are stored without the
    # leading '0', hence the '0' + str(...) on the left-hand side.
    # Tuple layout: (site name, site address, category, data_value,
    # measureid, zipcode).
    projection = [(x[0]['Name'], x[0]['Address'], x[1][0], x[1][3], x[1][6],
                   x[1][-1]) for x in product
                  if '0' + str(x[0]['ZIP Code']) == x[1][-1]]
    # filter out prevention; we only want actual illness
    no_prev = [x for x in projection if x[2] != "Prevention"]
    #get all the different types of illnesses
    keys = []
    for x in no_prev:
        keys += [x[4]]
    keys = set(keys)
    # append a dictionary of all illnesses and prevalence rates for every waste site
    agg_health = []
    prev = ''
    dict_disease = {}
    # Seed with the first record; assumes no_prev is non-empty.
    dict_disease[no_prev[0][4]] = [no_prev[0][3]]
    for x in no_prev:
        if x[0] == prev and x[4] not in dict_disease.keys():
            # Same site, first time seeing this measure.
            dict_disease[x[4]] = [x[3]]
            prev = x[0]
        elif x[0] == prev and x[4] in dict_disease.keys():
            # Same site, measure already seen: accumulate the value.
            dict_disease[x[4]] += [x[3]]
            prev = x[0]
        else:
            # New site encountered: flush the accumulated diseases.
            # NOTE(review): the flushed row uses the *current* x, so the
            # record appears attributed to the next site -- confirm.
            agg_health += [[x[0], x[1], x[5], dict_disease]]
            dict_disease = {}
            dict_disease[x[4]] = [x[3]]
            prev = x[0]
    repo.dropCollection("misn15.waste_health")
    repo.createCollection("misn15.waste_health")
    for x in agg_health:
        entry = {
            'Name': x[0],
            "Address": x[1],
            "Zip code": x[2],
            "Disease": x[3]
        }
        repo['misn15.waste_health'].insert_one(entry)
    repo['misn15.waste_health'].metadata({'complete': True})
    print(repo['misn15.waste_health'].metadata())
    repo.logout()
    endTime = datetime.datetime.now()
    return {"start": startTime, "end": endTime}
matching_city = (matching_zip["county"] + ", " + matching_zip['state']) #Let the users know their input with a pop-up window sg.Popup( 'Input Verification', "So you will be studying or working in " + matching_city + ". And you are willing to commute up to " + str(miles_to_commute) + " miles. You want a " + housing_style + " and your budget is " + budget + ". Click Ok to Proceed...") #using built-in data within uszipcode package to determine latitude and longtitude of the target zipcode/location latitude = matching_zip["lat"] longtitude = matching_zip["lng"] #using a built-in function within uszipcode package to search nearby neighborhodds, setting maximum 10 neighborhoods for now to shorten search time nearby_neighborhoods = search.by_coordinates(latitude, longtitude, radius=int(miles_to_commute), returns=10) where_to_live = [] housing_listing = [] print("Below is a list of neighborhoods you can consider: \n") #search and append the results for i in range(0, len(nearby_neighborhoods)): city = nearby_neighborhoods[i].post_office_city if city not in where_to_live: where_to_live.append(city) if housing_style == "studio": housing_listing.append( nearby_neighborhoods[i]. monthly_rent_including_utilities_studio_apt)
notfound = 0 latmisscount = 0 nazipcount = 0 search = SearchEngine(simple_zipcode=True) for rowcount in range(0, len(df)): flag = 0 stateflag = 0 # if latitude/logitude is missing then zipcalc will be empty if (pd.isnull(df['y'][rowcount]) or pd.isnull(df['x'][rowcount])): latmisscount = latmisscount + 1 zipcalc.append("") # executes below when latitude/logitude is not empty else: # gets 30 mile radius list of zip codes data = search.by_coordinates(df['y'][rowcount], df['x'][rowcount], radius=50, returns=10) # if actual zipcode is empty count this as mismatch if (pd.isnull(df.zip[rowcount])): nazipcount = nazipcount + 1 notfound = notfound + 1 #zipcalc.append(df.zip[rowcount]) # is zipcode is empty and database was not able to find any zipcodes leave the zipcalc to empty if (len(data) == 0): zipcalc.append("") else: # if zipcode is empty but database is not then set zipcalc to closest zipcode zipcalc.append(data[0].zipcode) else: #if zipcode is found in database then loop through data t osee if zipcode is in 30mile radius of datbase data if (len(data) > 0):
['RIT-MANHATTAN', (43.0844, -77.6749)], ['RIT-ROOSEVELT', (43.0844, -77.6749)]] # In[58]: station_lat_long = lat_long + missing # In[60]: # search latitude and longitude to get zipcode, population, population density, median household income search = SearchEngine(simple_zipcode=True) zip_list = [] for i in range(len(station_lat_long)): try: zipcode = search.by_coordinates(station_lat_long[i][1][0], station_lat_long[i][1][1], returns=1) zip_list.append([ station_lat_long[i][0], station_lat_long[i][1][0], station_lat_long[i][1][1], zipcode[0].zipcode, zipcode[0].population, zipcode[0].population_density, zipcode[0].median_household_income ]) # see if there is no match except: print(station_lat_long[i][0]) # In[61]: # make the list into a dataframe station_zip_demo = pd.DataFrame(zip_list,