Example 1
from uszipcode import SearchEngine


class ZipCodes:
    """
    Takes a dataframe with latitude and longitude columns as input.

    Returns a dataframe with the corresponding zip code for each
    latitude/longitude pair.
    """
    def __init__(self, my_df, my_latitude, my_longitude):
        self.my_df = my_df
        self.my_latitude = my_latitude
        self.my_longitude = my_longitude
        self.search = SearchEngine(simple_zipcode=True)

    # Could we pass this function inside the get_zip_code_df function?
    # def get_zip_code_list(self):
    #     self.zipcode, = self.search.by_coordinates(lat=self.my_latitude, lng=self.my_longitude, radius=2, returns=1)
    #     self.zipcode = self.zipcode.to_dict()
    #     self.zipcode = int(self.zipcode.get('zipcode'))
    #     return self.zipcode

    def get_zip_code_df(self):
        for index, row in self.my_df.iterrows():
            latitude = row.loc[self.my_latitude]
            longitude = row.loc[self.my_longitude]
            # returns=1 with tuple unpacking raises if no zip code is found
            zipcode, = self.search.by_coordinates(lat=latitude, lng=longitude, radius=2, returns=1)
            zipcode = zipcode.to_dict()
            zipcode = int(zipcode.get('zipcode'))
            self.my_df.loc[index, 'zipcode'] = zipcode

        return self.my_df
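
A minimal usage sketch (assuming a hypothetical pandas dataframe with 'lat' and 'lon' columns; the values are illustrative):

import pandas as pd

df = pd.DataFrame({'lat': [40.7506, 34.0522], 'lon': [-73.9972, -118.2437]})
zc = ZipCodes(df, 'lat', 'lon')
result = zc.get_zip_code_df()  # adds a 'zipcode' column to df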
Example 2
import pandas as pd
from uszipcode import SearchEngine


def create_crime_zip_codes(crime_df):
    '''
    Use the uszipcode library to identify a zip code for each unique
    pair of latitude and longitude coordinates in the Crime Dataset.
    Merges zip code information back into the Crime Dataset to later join with
    ACS data.

    NOTE: the uszipcode library can take a while to run (this is normal)

    Input:
        crime_df (dataframe): original crime dataframe

    Output:
        crime_df (dataframe): new crime dataframe including zip codes
    '''
    crime_df.loc[:, 'latitude'] = crime_df.latitude.astype(float)
    crime_df.loc[:, 'longitude'] = crime_df.longitude.astype(float)
    truncated = crime_df[['block', 'latitude',
                          'longitude']].drop_duplicates(subset=['block'])
    truncated = truncated.dropna()
    search = SearchEngine(simple_zipcode=True)
    truncated['zip_code'] = truncated.apply(
        lambda x: search.by_coordinates(x['latitude'], x['longitude'])[0].zipcode,
        axis=1)
    merged_df = pd.merge(crime_df, truncated,
                         on=['block', 'latitude', 'longitude'], how='left')
    merged_df.loc[:, 'zip_code'] = pd.to_numeric(merged_df['zip_code'],
                                                 errors='coerce')

    return merged_df
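
A short usage sketch with a hypothetical crime dataframe (the 'block', 'latitude', and 'longitude' columns are the ones the function expects; the rows are illustrative):

crime_df = pd.DataFrame({
    'block': ['001XX N STATE ST', '012XX W ADDISON ST'],
    'latitude': ['41.8838', '41.9470'],
    'longitude': ['-87.6278', '-87.6567'],
})
crime_df = create_crime_zip_codes(crime_df)
print(crime_df[['block', 'zip_code']])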
Example 3
    def search_by_zipcode(self,
                          category='Confirmed',
                          zipcode="21029",
                          radius=50):

        # df_conf = self.ds.dataSet[category]

        # date_cols = [c for c in df_conf.columns if '2020-' in c]
        search = SearchEngine()
        zipinfo = search.by_zipcode(zipcode)
        nei = search.by_coordinates(zipinfo.lat,
                                    zipinfo.lng,
                                    radius,
                                    sort_by='dist',
                                    ascending=True,
                                    returns=100)
        nei = list(set([(n.county, n.state) for n in nei]))
        nei_rec = []
        for neib in nei:
            try:
                county = neib[0]
                if 'County' in county:
                    county = county.split('County')[0].strip()
                state = us.states.lookup(neib[1]).name
                nei_rec.append((county, state))
                # df_local = df_conf[(df_conf['County_Name']==county)&(df_conf['State_Name']==state)][date_cols]
                # if df_local.shape[0] > 0 and df_local.iloc[0,-1] > 0:
                #     nei_rec['{},{}'.format(county,state)] = {'category':category,}
            except Exception:
                # skip neighbors whose state cannot be resolved
                pass
        return nei_rec  # return a list of (county, state)
Example 4
def get_zipcode(self, df):
    from uszipcode import SearchEngine
    search = SearchEngine(simple_zipcode=True)
    zipcode = search.by_coordinates(df['latitude'], df['longitude'])
    if not zipcode:
        return None
    # return the zip code string of the closest match
    return zipcode[0].zipcode
def get_zip_code(latitude, longitude):
    try:
        search = SearchEngine(simple_zipcode=True)
        result = search.by_coordinates(latitude,
                                       longitude,
                                       radius=5,
                                       returns=1)
        return int(result[0].zipcode)
    except Exception:
        # fall back to a default zip code (10001, Manhattan) when the lookup fails
        return 10001
def get_zip(airbnb):
    search = SearchEngine()
    zips = []
    for _, row in airbnb.iterrows():
        zipcode = search.by_coordinates(row["latitude"],
                                        row["longitude"],
                                        radius=2,
                                        returns=1)
        if zipcode:
            zips.append(zipcode[0].zipcode)
        else:
            zips.append("-1")
    return zips
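
A quick usage sketch for get_zip_code above (the coordinates are illustrative; SearchEngine must already be imported from uszipcode):

# coordinates near the Empire State Building; expect a Manhattan zip code
print(get_zip_code(40.7484, -73.9857))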
Example 7
    def _add_missing_zip_codes(self):
        '''
        Uses coordinates to add missing zipcodes.
        '''

        search = SearchEngine(simple_zipcode=True)
        for index, value in self.dataframe.iterrows():
            zipcode = value['zip']
            x = value['x']
            y = value['y']
            if zipcode is None:
                try:
                    result = search.by_coordinates(y, x, radius=30, returns=1)
                    # .loc avoids pandas chained-assignment pitfalls
                    self.dataframe.loc[index, 'zip'] = result[0].zipcode
                except IndexError:
                    # no zip code found within the radius; leave it missing
                    pass
def zipcode(table, sc):
    from pyspark.sql import Row
    from pyspark.sql.functions import monotonically_increasing_id

    # NOTE: collecting row-by-row is slow; acceptable only for small tables.
    search = SearchEngine()
    csvi = table.select("*").withColumn("id", monotonically_increasing_id())
    list1 = []
    for i in range(csvi.count()):
        lon1 = csvi.where(
            csvi.id == i).select('longitude').collect()[0]['longitude']
        lon = float(lon1)
        lat1 = csvi.where(
            csvi.id == i).select('latitude').collect()[0]['latitude']
        lat = float(lat1)
        zipcode = search.by_coordinates(lat, lon, radius=3)
        list1.append(zipcode[0].zipcode)
    row = Row("pid", "zipcode")
    new_df = sc.parallelize([row(i, list1[i])
                             for i in range(0, len(list1))]).toDF()
    station = csvi.join(new_df, csvi.id == new_df.pid).select("*")
    return station
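
A hedged usage sketch for the Spark variant above (assumes a running Spark environment; the 'latitude' and 'longitude' column names are the ones the function reads):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
table = spark.createDataFrame([("40.7484", "-73.9857")],
                              ["latitude", "longitude"])
stations = zipcode(table, spark.sparkContext)
stations.show()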
Example 9
    def search(self, zip, range, results=0):
        # set simple_zipcode=False to use the rich info database
        search = SearchEngine(simple_zipcode=True, db_file_dir="/tmp")
        if not self.if_exists(zip):
            return ''
        zipcode = search.by_zipcode(zip)
        if VERBOSITY:
            print('ZIP search results:')
            print(zipcode)
            print('Center coordinate: Lat:{} Long:{}'.format(
                zipcode.lat, zipcode.lng))
        res = search.by_coordinates(zipcode.lat,
                                    zipcode.lng,
                                    radius=range,
                                    returns=results)
        return res
Example 10
    def _in_radius(self, row, rad):
        '''
        The essential method for this class is _in_radius(). It returns a tuple
        of the remaining zip codes and those that were filtered out.
        As a rule of thumb, use a radius of 50 if you're scraping at a 100-mile
        radius (units are miles).
        '''
        z = row
        try:
            search = SearchEngine(simple_zipcode=True)
        except NameError:
            # SearchEngine was never imported; report it and re-raise, since
            # the lookup below cannot proceed without it.
            print('NameError: {} module from {} package not found'.format(
                'SearchEngine', 'uszipcode'))
            raise
        except Exception as exception:
            # Output unexpected exceptions.
            print('Exception: {}'.format(exception))
            raise
        zipcodes = [
            i.zipcode for i in search.by_coordinates(lat=z.loc['lat'],
                                                     lng=z.loc['lng'],
                                                     radius=rad,
                                                     zipcode_type='Standard',
                                                     returns=100000)
        ]
        zipcodes = set(zipcodes)
        try:
            zipcodes.remove(z.loc['zip'])
            other = zipcodes
            unique = z
        except KeyError:
            # If the zip code is of type STANDARD it will be covered by a
            # different zip code. The output provides indicators of the
            # population of the zip code in question.
            faulty_zip = search.by_zipcode(z.zip)
            if faulty_zip.population or faulty_zip.land_area_in_sqmi:
                print(
                    'We couldn\'t remove {}. The zip code is of the type {} and '
                    'the Population or Land Area is not None. Make sure this is '
                    'ok. The zip code is kept.'.format(z.zip, z.type))
                print('** Population: {} \n Land Area (in sqmi): {}'.format(
                    faulty_zip.population, faulty_zip.land_area_in_sqmi))
                other = set()
                unique = z
            else:
                # {z.zip} keeps the whole zip string; set(z.zip) would split
                # it into individual characters.
                other = {z.zip}
                unique = pd.Series()
        return (unique, other)
Example 11
def start(outfilename, limit=0):
    outfile = open(outfilename, 'w')
    count = 0
    config = read_config()
    database_config = config['Database']
    try:
        conn = psycopg2.connect(host=database_config['host_ip'],
                                port=database_config['port'],
                                database=database_config['database'],
                                user=database_config['user'],
                                password=database_config['password'])
    except Exception as e:
        print('Database could not be connected to:')
        print(e)
        return None
    cur = conn.cursor()
    # cur.execute("select * from \"Block_Group\" as bg where bg.city is null")
    sql_statement = "select bg.bg_geo_id, bg.longitude, bg.latitude from \"Block_Group\" as bg " \
                    "where bg.city is null order by bg.bg_geo_id"
    if limit > 0:
        sql_statement += " limit {}".format(limit)
    cur.execute(sql_statement)
    # todo: combine all the long/lat into an array and call rg only once
    rows = cur.fetchall()
    # create the search engine once, outside the loop, instead of per row
    search = SearchEngine(simple_zipcode=True)
    for row in rows:
        print(row)
        bg_geo = row[0]
        long = row[1]
        lat = row[2]
        result = rg.search((lat, long))
        zip = search.by_coordinates(lat, long, 30, returns=1)
        # print(zip)
        print(bg_geo, zip[0].zipcode, result[0]['name'], sep=",")
        print(bg_geo, zip[0].zipcode, result[0]["name"], sep=",", file=outfile)
        count += 1
        if limit and (count >= limit):
            print("Limit reached")
            break
    outfile.close()
    return None
Example 12
from collections import Counter


class ZipSearch:

    def __init__(self, simple=True):
        self.engine = SearchEngine(simple_zipcode=simple)

    def search(self, zipcode):
        data = self.engine.by_zipcode(zipcode)
        if data.lat and data.lng:
            return LatLongCoordinate(data.lat, data.lng)
        else:
            return CityStateSearch(engine=self.engine).search(
                data.major_city, data.state)

    def search_bulk(self, zipcodes):
        latlong_coordinates = []
        for zc in zipcodes:
            latlong_coordinates.append(self.search(zc))
        return latlong_coordinates

    def get_county(self, coordinates):
        zipcodes = self.engine.by_coordinates(
            coordinates.latitude, coordinates.longitude)
        c = Counter([z.county for z in zipcodes])
        return c.most_common()[0][0]
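
A minimal usage sketch, assuming the LatLongCoordinate and CityStateSearch helpers are defined elsewhere in this project (they are not part of uszipcode):

zs = ZipSearch()
coord = zs.search("10001")                   # LatLongCoordinate for that zip
coords = zs.search_bulk(["10001", "60601"])
print(zs.get_county(coord))                  # most common nearby county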
Example 13
import csv
import geopy
import datetime
import shapely
import numpy
import copy
from datetime import date
from geopy import distance
from shapely.geometry import Polygon, Point
from numpy import arange
from uszipcode import SearchEngine

search = SearchEngine(simple_zipcode=False)
#zipcode = search.by_state("Florida", returns=5000)
zipcode = search.by_coordinates(28, -82, radius=1500, returns=50000)
hurdatFile = open('weatherparsing/hurdat2.csv')
windOutputFile = open('weatherparsing/hurWindZip2.csv', 'w')
hurdatReader = csv.reader(hurdatFile)
curStormName = 'Unknown'
minLat = 20.0
maxLat = 40.0
minLon = -100.0
maxLon = -75.0
resolution = 0.3
output = []
for row in hurdatReader:
        if row[0].startswith("AL"):
            curStormName = row[1]
        else:
            curHur = {}
            curHur['stormName'] = curStormName.strip()
for i in range(0, len(tweets["distance_from_centeroid"])):
    # .loc replaces the deprecated .ix indexer
    tweets.loc[i, "distance_from_centeroid"] = tweets.loc[i, "distance_from_centeroid"][0]

##GEOLOCATION
#from uszipcode import Zipcode
from uszipcode import SearchEngine
search = SearchEngine(simple_zipcode=True)
'''
tweets["zipcode"] = [search.by_coordinates(val1[1], val1[0], radius=r, returns=1) 
                     for val1 in tweets["location_centeroid"] for r in tweets["distance_from_centeroid"]]
'''
final['language'] = tweets['language']
#final['country'] = tweets['country']
tweets["zipcode"] = [
    search.by_coordinates(val1[1], val1[0], radius=4, returns=1)
    if val1 != None else None for val1 in tweets["location_centeroid"]
]
lister1 = []
lister2 = []
tweets_zipcode = tweets["zipcode"]

for ijk in range(tweets_zipcode.size):
    entry = tweets_zipcode[ijk]
    if entry is not None and len(entry) > 0:
        lister1.append(entry[0].county)
    else:
        lister1.append([])

for ijk in range(tweets_zipcode.size):
    entry = tweets_zipcode[ijk]
Example 15
import pandas as pd
from uszipcode import SearchEngine
from data_preprocess.taxi_zone_format.zipcode_info import get_zipcode_neighborhood
from data_preprocess.taxi_zone_format.zone_distance import get_zone_centroid

taxi_zones = pd.read_csv("../../data/trip_data/taxi_zones.csv")
zipcode_neighborhood = get_zipcode_neighborhood()

search = SearchEngine(simple_zipcode=True)
zones_info = []

for index, row in taxi_zones.iterrows():
    x, y = get_zone_centroid(row['the_geom'])
    results = search.by_coordinates(y, x)
    #33912 #10010
    zip_code_info = ''
    for result in results:
        if result.state == 'NY':
            zip_code_info = result
            break
        elif row['borough'] == 'EWR' and result.state == 'NJ':
            zip_code_info = result
            break
    if zip_code_info:
        zone_info = {
            'location_id': row['LocationID'],
            'zone_name': row['zone'],
            'borough': row['borough'],
Example 16
    zipc = row['zip']
    if len(zipc) > 5:
        row['zip'] = zipc[:5]
    # 'x' is longitude and 'y' is latitude, so these names are swapped;
    # the by_coordinates call below swaps them back into (lat, lng) order.
    lat, lon = row['x'], row['y']
    latf, lonf = True, True
    try:
        row['x'] = float(lat)
    except ValueError:
        latf = False
    try:
        row['y'] = float(lon)
    except ValueError:
        lonf = False
    if latf and lonf:
        try:
            result = search.by_coordinates(float(lon), float(lat), radius=30)
        except Exception:
            print(lat, lon)
            result = []  # avoid a NameError below when the lookup fails
        if len(result) < 1:
            print("*", lon, lat)
            row["zip_from_xy"] = "lookup_zip_manually"
        else:
            row["zip_from_xy"] = result[0].zipcode
    else:
        row["zip_from_xy"] = ""

    #print(row)
    writer.writerow(row)
    line_count += 1
    #if line_count == 6:
    #    break
            "year_built",
            "zipcode",
            "eui",
            "cvrmse",
            "nmbe",
        ],
    )

    writer.writeheader()
    with open("./assets/sf_buildings_clean.csv") as f:
        reader = csv.DictReader(f)
        for row in reader:
            out = {}

            lat = float(row["lat"])
            lng = float(row["lng"])
            result = search.by_coordinates(lat, lng)
            out["lat"] = lat
            out["lng"] = lng
            out["zipcode"] = result[0].zipcode

            out["sqft"] = random.randint(1000, 30000)
            out["floors"] = random.randint(1, 15)
            out["year_built"] = random.randint(1900, 2000)

            out["eui"] = random.randint(0, 2000)
            out["cvrmse"] = random.randint(0, 10)
            out["nmbe"] = random.randint(0, 50)

            writer.writerow(out)
Example 18
from uszipcode import SearchEngine
search = SearchEngine(simple_zipcode=True)
lat = 28.43180352
lng = -81.30852827
zipcode = search.by_coordinates(lat, lng, returns=1)
print(zipcode[0].to_json())
import csv
from uszipcode import SearchEngine, SimpleZipcode, Zipcode
search = SearchEngine()
with open('outputPostCode.csv', 'w') as f:
    #writer =csv.writer(f)
    with open('outputCleanData.csv', 'r') as csvFile:
        reader = csv.reader(csvFile)
        for row in reader:
            newstr1 = row[2].replace("'", "").strip()
            newstr2 = row[3].replace("'", "").strip()
            if newstr1 == "" or newstr2 == "":
                continue
            result = search.by_coordinates(float(newstr1),
                                           float(newstr2),
                                           radius=30,
                                           returns=1)
            for zipcode in result:
                print(newstr1, "and", newstr2, "and", zipcode.zipcode)
                row[4] = "'" + str(zipcode.zipcode) + "'"
                str1 = ','.join(row)
                f.write(str1 + "\n")
# both files are closed automatically when the with-blocks exit

def clean_col(data_clean_p1, col):
    # drop rows whose value in `col` is NaN
    for i in range(data_clean_p1.shape[0]):
        try:
            if np.isnan(data_clean_p1[col][i]):
                data_clean_p1 = data_clean_p1.drop(i, axis=0)
        except Exception:
            pass
    return data_clean_p1


for i in range(data_clean_p1.shape[0]):
    try:
        result = search.by_coordinates(data_clean_p1['Latitude'][i],
                                       data_clean_p1['Longitude'][i],
                                       radius=5,
                                       returns=1)
        array.append(result[0].zipcode)
    except Exception:
        array.append('None')
data_clean_p1['zipcode'] = array
for i in range(data_clean_p1.shape[0]):
    '''
    if data_clean_p1['Location'][i].isnull():
        try:
            location = geolocator.geocode(str(data_clean_p1['Location'][i]))
            try:
                if (abs(location.latitude - float(data_clean_p1['Latitude'][i]))) > 3.0 or (abs(location.longitude - float(data_clean_p1['Longitude'][i]))) > 3.0:
                    data_clean_p1.drop(data_clean_p1.index[i])
                #time.sleep(10)
            except:
Example 21
with open('zipcode.csv', mode='r', encoding='utf-8-sig') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')

    with open('newzipcode.csv', mode='w') as csv_file:
        fieldnames = ['latitude', 'longitude', 'zip_code']
        writer = csv.DictWriter(csv_file,
                                fieldnames=fieldnames,
                                extrasaction='ignore')

        writer.writeheader()

        for row in readCSV:
            latitude = float(row[0])
            longitude = float(row[1])
            aggregate_total = int(row[2])
            result = search.by_coordinates(latitude, longitude, radius=1)
            # write only the first (closest) zip code for each row
            if result:
                writer.writerow({
                    'latitude': latitude,
                    'longitude': longitude,
                    'zip_code': result[0].zipcode,
                    # extrasaction='ignore' drops this extra key
                    'aggregate_total': aggregate_total
                })
                #print(row[0], row[1], result[0].zipcode, aggregate_total)
    def execute(trial = False):
        '''Transform waste data for city of Boston'''
        startTime = datetime.datetime.now()

        # Set up the database connection.
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('misn15', 'misn15')

        hwgen = []
        hwgen = repo['misn15.hwgen'].find()
        hwgen = copy.deepcopy(hwgen)

        aul = []
        aul = repo['misn15.aul'].find()
        aul = copy.deepcopy(aul)

        waste = []
        waste = repo['misn15.waste'].find()
        waste = copy.deepcopy(waste)

        # search for coordinates based on address
        geolocator = Nominatim(user_agent = "mis", timeout = 5)

        # get csr for coordinate search
        inProj = Proj(init='epsg:26986')
        outProj = Proj(init='epsg:4326')

        # project coordinates as US census tract number and zipcode
        search = SearchEngine(simple_zipcode=True)

        if trial:
            # filter hwgen
            hwgen_list = []
            i = 0
            for x in hwgen:
                if i < 5:
                    hwgen_list += [[x['Name'], x['Address'], x['Town'], x['ZIP Code'], x['RCRA Gen Status']]]
                    i += 1
                else:
                    break
            
            # get coordinates for hwgen
            for x in hwgen_list:
                full_address = str(x[1] + ' ' + x[2] + ' ' + 'MASSACHUSETTS')
                location = geolocator.geocode(full_address)
                if location is not None:
                    x += [[location.longitude, location.latitude]]
                    params = urllib.parse.urlencode({'latitude': location.latitude, 'longitude': location.longitude, 'format': 'json'})
                    url = 'https://geo.fcc.gov/api/census/block/find?' + params
                    response = requests.get(url)
                    data = response.json()
                    geoid = data['Block']['FIPS'][0:11]
                    x += [geoid]

            # get zipcodes in correct format
            for x in hwgen_list:
                zipcode_num = x[3]
                if x[3][0] != '0':
                    zipcode_num = '0' + x[3]
                elif len(x[3]) != 5:
                    zipcode_num = x[3][0:5]
                if x[3][0] == '0' and x[3][1] == '0':
                    zipcode_num = x[3][1:6]
                x[3] = zipcode_num

            # filter aul
            aul_list = []
            i = 0
            for x in aul:
                if i < 25:
                    if x['properties']['TOWN'] == 'BOSTON':
                        result = search.by_coordinates(x['geometry']['coordinates'][1], x['geometry']['coordinates'][0], returns=1)
                        params = urllib.parse.urlencode({'latitude': x['geometry']['coordinates'][1], 'longitude': x['geometry']['coordinates'][0], 'format': 'json'})
                        url = 'https://geo.fcc.gov/api/census/block/find?' + params
                        response = requests.get(url)
                        data = response.json()
                        result = result[0]
                        geoid = data['Block']['FIPS'][0:11]
                        aul_list += [[x['properties']['NAME'], x['properties']['ADDRESS'], x['properties']['TOWN'],
                                      result.zipcode, x['properties']['STATUS'], x['geometry']['coordinates'], geoid]]
                        i += 1
                else:
                    break

            # filter waste
            waste_list = []
            i = 0
            for x in waste:
                if i < 25:
                    if x['properties']['TOWN'] == 'BOSTON':
                        long, lat = transform(inProj, outProj, x['geometry']['coordinates'][0], x['geometry']['coordinates'][1])
                        result = search.by_coordinates(lat, long, returns=1)
                        result = result[0]
                        params = urllib.parse.urlencode({'latitude': lat, 'longitude': long, 'format': 'json'})
                        url = 'https://geo.fcc.gov/api/census/block/find?' + params
                        response = requests.get(url)
                        data = response.json()
                        geoid = data['Block']['FIPS'][0:11]
                        waste_list += [[x['properties']['NAME'], x['properties']['ADDRESS'], x['properties']['TOWN'],
                                        result.zipcode, x['properties']['STATUS'], [long, lat], geoid]]
                        i += 1
                else:
                    break

            # merge all waste data
            waste_all = hwgen_list + aul_list + waste_list

        else:
            # filter hwgen
            hwgen_list = []
            for x in hwgen:
                hwgen_list += [[x['Name'], x['Address'], x['Town'], x['ZIP Code'], x['RCRA Gen Status']]]

            # get coordinates for hwgen
            for x in hwgen_list:
                full_address = str(x[1] + ' ' + x[2] + ' ' + 'MASSACHUSETTS')
                location = geolocator.geocode(full_address)
                if location is not None:
                    x += [[location.longitude, location.latitude]]
                    params = urllib.parse.urlencode(
                        {'latitude': location.latitude, 'longitude': location.longitude, 'format': 'json'})
                    url = 'https://geo.fcc.gov/api/census/block/find?' + params
                    response = requests.get(url)
                    data = response.json()
                    geoid = data['Block']['FIPS'][0:11]
                    # append the FIPS geoid, matching the trial branch above
                    x += [geoid]

            # get zipcodes in correct format
            for x in hwgen_list:
                zipcode_num = x[3]
                if x[3][0] != '0':
                    zipcode_num = '0' + x[3]
                elif len(x[3]) != 5:
                    zipcode_num = x[3][0:5]
                if x[3][0] == '0' and x[3][1] == '0':
                    zipcode_num = x[3][1:6]
                x[3] = zipcode_num

            # filter aul
            aul_list = []
            for x in aul:
                if x['properties']['TOWN'] == 'BOSTON':
                    result = search.by_coordinates(x['geometry']['coordinates'][1], x['geometry']['coordinates'][0], returns=1)
                    params = urllib.parse.urlencode({'latitude': x['geometry']['coordinates'][1],
                                                     'longitude': x['geometry']['coordinates'][0], 'format': 'json'})
                    url = 'https://geo.fcc.gov/api/census/block/find?' + params
                    response = requests.get(url)
                    data = response.json()
                    result = result[0]
                    geoid = data['Block']['FIPS'][0:11]
                    aul_list += [[x['properties']['NAME'], x['properties']['ADDRESS'], x['properties']['TOWN'],
                                  result.zipcode, x['properties']['STATUS'], x['geometry']['coordinates'], geoid]]

            # filter waste
            waste_list = []
            for x in waste:
                if x['properties']['TOWN'] == 'BOSTON':
                    long, lat = transform(inProj, outProj, x['geometry']['coordinates'][0], x['geometry']['coordinates'][1])
                    result = search.by_coordinates(lat, long, returns=1)
                    result = result[0]
                    params = urllib.parse.urlencode({'latitude': lat, 'longitude': long, 'format': 'json'})
                    url = 'https://geo.fcc.gov/api/census/block/find?' + params
                    response = requests.get(url)
                    data = response.json()
                    geoid = data['Block']['FIPS'][0:11]
                    waste_list += [[x['properties']['NAME'], x['properties']['ADDRESS'], x['properties']['TOWN'],
                                    result.zipcode, x['properties']['STATUS'], [round(long, 2), round(lat, 2)], geoid]]

            # merge all waste data
            waste_all = hwgen_list + aul_list + waste_list

        repo.dropCollection("misn15.waste_all")
        repo.createCollection("misn15.waste_all")

        for x in waste_all:
            entry = {'Name': x[0], 'Address': x[1], 'Zip Code': x[3], 'Coordinates': x[5], 'Status': x[4], 'FIPS': x[6]}
            repo['misn15.waste_all'].insert_one(entry)

        repo['misn15.waste_all'].metadata({'complete':True})
        print(repo['misn15.waste_all'].metadata())

        repo.logout()

        endTime = datetime.datetime.now()

        return {"start":startTime, "end":endTime}
    def execute(trial = False):
        '''Transform waste data for city of Boston'''
        startTime = datetime.datetime.now()

        # Set up the database connection.
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('misn15', 'misn15')

        oil = []
        oil = repo['misn15.oil'].find()
        oil = copy.deepcopy(oil)

        #filter oil dataset
        oil_subset = []
        for x in oil:
            if x['properties']['TOWN'] == 'BOSTON':
                oil_subset += [[x['properties']['NAME'], x['properties']['ADDRESS'], x['geometry']['coordinates']]]

        # get csr for coordinate search
        inProj = Proj(init='epsg:26986')
        outProj = Proj(init='epsg:4326')
        

        # project coordinates as US census tract number and zipcode
        search = SearchEngine(simple_zipcode=True)
        for x in oil_subset:
            long, lat = transform(inProj, outProj, x[2][0], x[2][1])
            x[2][0] = round(long, 2)
            x[2][1] = round(lat, 2)
            result = search.by_coordinates(lat, long, returns = 1)
            result = result[0]
            x += [result.zipcode]

        # get FIPS census tract number using coordinates
        for x in oil_subset:
            lat = x[2][1]
            lon = x[2][0]
            params = urllib.parse.urlencode({'latitude': lat, 'longitude':lon, 'format':'json'})
            url = 'https://geo.fcc.gov/api/census/block/find?' + params
            response = requests.get(url)
            data = response.json()
            geoid = data['Block']['FIPS'][0:11]
            x += [geoid]
            
        waste = []
        waste = repo['misn15.waste'].find()
        waste = copy.deepcopy(waste)

        waste_list = []
        for x in waste:
            waste_list += [[x['Name'], x['Address'], [0, 0], x['ZIP Code']]]

        # get coordinates and fips for waste data
        for x in waste_list:
            zipcode_num = x[3]
            if x[3][0] != '0':
                zipcode_num = '0' + x[3]
            elif len(x[3]) != 5:
                zipcode_num = x[3][0:5]
            if x[3][0] == '0' and x[3][1] == '0':
                zipcode_num = x[3][1:6]
            zipcode = zipcodes.matching(zipcode_num)
            if len(zipcode) != 0:
                x[2][0] = zipcode[0]['long']
                x[2][1] = zipcode[0]['lat']           

        # get FIPS census tract number for waste data
        for x in waste_list:
            lat = x[2][1]
            lon = x[2][0]
            if lat != 0:
                params = urllib.parse.urlencode({'latitude': lat, 'longitude':lon, 'format':'json'})
                url = 'https://geo.fcc.gov/api/census/block/find?' + params
                response = requests.get(url)
                data = response.json()
                geoid2 = data['Block']['FIPS'][0:11]
                x += [geoid2]
            else:
                x += ['0']             

        # merge oil sites with hazardous waste sites
            
        waste_merged = oil_subset + waste_list

        repo.dropCollection("misn15.waste_merged")
        repo.createCollection("misn15.waste_merged")

        for x in waste_merged:
            entry = {'Name': x[0], 'Address': x[1], 'Coordinates': x[2], 'Zip Code': x[3], 'FIPS': x[4]}
            repo['misn15.waste_merged'].insert_one(entry)        

        repo['misn15.waste_merged'].metadata({'complete':True})
        print(repo['misn15.waste_merged'].metadata())

        repo.logout()

        endTime = datetime.datetime.now()

        return {"start":startTime, "end":endTime}
    def execute(trial=False):
        '''Transform health and waste data for city of Boston'''
        startTime = datetime.datetime.now()

        # Set up the database connection.
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('misn15', 'misn15')

        # make deepcopy of data

        waste = []
        waste = repo['misn15.waste'].find()
        waste = copy.deepcopy(waste)

        health = []
        health = repo['misn15.health'].find()
        health = copy.deepcopy(health)

        health_pd = pd.DataFrame(health)
        health_pd = health_pd[pd.isnull(health_pd['tractfips']) == False]
        health_pd = health_pd[pd.isnull(health_pd['data_value']) == False]

        #filter health
        health_subset = health_pd[[
            "category", "measure", "data_value_unit", "data_value",
            "categoryid", "geolocation", "measureid", "short_question_text",
            "tractfips"
        ]]

        # convert to a list
        health_list = []
        for x in range(len(health_subset)):
            health_list += [list(health_subset.iloc[x, :])]

        # add zipcodes
        search = SearchEngine(simple_zipcode=True)
        for x in health_list:
            long = round(x[5]["coordinates"][0], 2)
            lat = round(x[5]["coordinates"][1], 2)
            result = search.by_coordinates(lat, long, returns=1)
            result = result[0]
            x += [result.zipcode]

        # get the product of the waste and health data sets and project
        # to the fields we need
        product = []
        for row in waste:
            for i in health_list:
                product += [[row, i]]

        projection = [(x[0]['Name'], x[0]['Address'], x[1][0], x[1][3],
                       x[1][6], x[1][-1]) for x in product
                      if '0' + str(x[0]['ZIP Code']) == x[1][-1]]

        # filter out prevention; we only want actual illness
        no_prev = [x for x in projection if x[2] != "Prevention"]

        #get all the different types of illnesses
        keys = []
        for x in no_prev:
            keys += [x[4]]
        keys = set(keys)

        # append a dictionary of all illnesses and prevalence rates for every waste site

        agg_health = []
        prev = ''
        dict_disease = {}
        dict_disease[no_prev[0][4]] = [no_prev[0][3]]
        for x in no_prev:
            if x[0] == prev and x[4] not in dict_disease.keys():
                dict_disease[x[4]] = [x[3]]
                prev = x[0]
            elif x[0] == prev and x[4] in dict_disease.keys():
                dict_disease[x[4]] += [x[3]]
                prev = x[0]
            else:
                #print(dict_disease)
                agg_health += [[x[0], x[1], x[5], dict_disease]]
                #agg_health += [dict_disease]
                dict_disease = {}
                dict_disease[x[4]] = [x[3]]
                prev = x[0]

        repo.dropCollection("misn15.waste_health")
        repo.createCollection("misn15.waste_health")

        for x in agg_health:
            entry = {
                'Name': x[0],
                "Address": x[1],
                "Zip code": x[2],
                "Disease": x[3]
            }
            repo['misn15.waste_health'].insert_one(entry)

        repo['misn15.waste_health'].metadata({'complete': True})
        print(repo['misn15.waste_health'].metadata())

        repo.logout()

        endTime = datetime.datetime.now()

        return {"start": startTime, "end": endTime}
Example 25
        matching_city = (matching_zip["county"] + ", " + matching_zip['state'])

    #Let the users know their input with a pop-up window
    sg.Popup(
        'Input Verification', "So you will be studying or working in " +
        matching_city + ". And you are willing to commute up to " +
        str(miles_to_commute) + " miles. You want a " + housing_style +
        " and your budget is " + budget + ". Click Ok to Proceed...")

    #using built-in data within the uszipcode package to determine the latitude and longitude of the target zipcode/location
    latitude = matching_zip["lat"]
    longitude = matching_zip["lng"]

    #using a built-in function within the uszipcode package to search nearby neighborhoods, capped at 10 neighborhoods for now to shorten search time
    nearby_neighborhoods = search.by_coordinates(latitude,
                                                 longitude,
                                                 radius=int(miles_to_commute),
                                                 returns=10)

    where_to_live = []
    housing_listing = []

    print("Below is a list of neighborhoods you can consider: \n")
    #search and append the results
    for i in range(0, len(nearby_neighborhoods)):
        city = nearby_neighborhoods[i].post_office_city
        if city not in where_to_live:
            where_to_live.append(city)
            if housing_style == "studio":
                housing_listing.append(
                    nearby_neighborhoods[i].
                    monthly_rent_including_utilities_studio_apt)
Example 26
    notfound = 0
    latmisscount = 0
    nazipcount = 0
    search = SearchEngine(simple_zipcode=True)
    for rowcount in range(0, len(df)):
        flag = 0
        stateflag = 0
        # if latitude/longitude is missing then zipcalc will be empty
        if (pd.isnull(df['y'][rowcount]) or pd.isnull(df['x'][rowcount])):
            latmisscount = latmisscount + 1
            zipcalc.append("")
        # executes below when latitude/longitude is not empty
        else:
            # gets a 50-mile-radius list of up to 10 zip codes
            data = search.by_coordinates(df['y'][rowcount],
                                         df['x'][rowcount],
                                         radius=50,
                                         returns=10)
            # if the actual zipcode is empty, count this as a mismatch
            if (pd.isnull(df.zip[rowcount])):
                nazipcount = nazipcount + 1
                notfound = notfound + 1
                #zipcalc.append(df.zip[rowcount])
                # if the zipcode is empty and the database found no zipcodes, leave zipcalc empty
                if (len(data) == 0):
                    zipcalc.append("")
                else:
                    # if the zipcode is empty but the database result is not, use the closest zipcode
                    zipcalc.append(data[0].zipcode)
            else:
                # if a zipcode is present, loop through data to see if it is within the radius of the database results
                if (len(data) > 0):
Example 27
           ['RIT-MANHATTAN', (43.0844, -77.6749)],
           ['RIT-ROOSEVELT', (43.0844, -77.6749)]]

# In[58]:

station_lat_long = lat_long + missing

# In[60]:

# search latitude and longitude to get zipcode, population, population density, median household income
search = SearchEngine(simple_zipcode=True)
zip_list = []
for i in range(len(station_lat_long)):
    try:
        zipcode = search.by_coordinates(station_lat_long[i][1][0],
                                        station_lat_long[i][1][1],
                                        returns=1)
        zip_list.append([
            station_lat_long[i][0], station_lat_long[i][1][0],
            station_lat_long[i][1][1], zipcode[0].zipcode,
            zipcode[0].population, zipcode[0].population_density,
            zipcode[0].median_household_income
        ])
    # print the station name if there is no match
    except Exception:
        print(station_lat_long[i][0])

# In[61]:

# make the list into a dataframe
station_zip_demo = pd.DataFrame(zip_list,