def build_addresses(self):
    """Geocode the facility addresses and build the facilities_addresses table.

    Reads every row of the ``facilities`` partition, formats a one-line
    address from its ``dba_address1``, ``dba_city`` and ``dba_zip_code``
    columns (the state is hard-coded to ``CA``), passes the addresses to
    the DSTK geocoder and inserts one record per geocoder result into the
    ``facilities_addresses`` partition. Each inserted record carries the
    ``id`` of its source row in ``facilities_id``.
    """
    from ambry.geo.geocoders import DstkGeocoder

    facilities = self.partitions.find(table='facilities')

    def address_gen():
        # Yield (address string, source row) pairs; the source row comes
        # back from geocode() as the third element of each result tuple.
        for row in facilities.query("SELECT * FROM facilities"):
            address = "{}, {}, {} {}".format(
                row['dba_address1'], row['dba_city'], 'CA', row['dba_zip_code'])
            yield (address, row)

    dstk_service = self.config.service('dstk')
    dstk_gc = DstkGeocoder(dstk_service, address_gen())

    p = self.partitions.find_or_new(table='facilities_addresses')
    p.clean()

    lr = self.init_log_rate(500)

    with p.inserter() as ins:
        for i, (k, r, inp_row) in enumerate(dstk_gc.geocode()):
            lr("Addresses " + str(i))

            # Guard against a falsy result (presumably None when an address
            # cannot be geocoded -- confirm against DstkGeocoder). Without
            # the guard the item assignment below raises TypeError and
            # aborts the whole build; with it, the facility id is still
            # recorded so the failure is visible in the output table.
            record = r if r else {}
            record['facilities_id'] = inp_row['id']
            ins.insert(record)
def build_dstk_geocoder(self):
    """Geocode the business addresses with the Data Science Toolkit.

    Reads every row of the ``businesses`` partition, formats an address
    from its ``address``, ``city``, ``state`` and ``zip`` columns, passes
    the addresses to the DSTK geocoder and inserts one record per geocoder
    result into the ``dstk_addresses`` partition. Every record carries the
    source row's ``id`` in ``businesses_id``; when the geocoder result is
    truthy, its fields are copied in and a few of them are duplicated
    under shorter column names.

    When ``self.run_args.test`` is set, the loop stops once the result
    index exceeds 500.
    """
    from ambry.geo.geocoders import DstkGeocoder

    lr = self.init_log_rate(250)

    businesses = self.partitions.find(table='businesses')

    def address_gen():
        # Yield (address string, source row) pairs; the source row comes
        # back from geocode() as the third element of each result tuple.
        for row in businesses.query("SELECT * FROM businesses"):
            address = "{}, {}, {} {}".format(
                row['address'], row['city'], row['state'], row['zip'])
            yield (address, row)

    dstk_service = self.config.service('dstk')
    dstk_gc = DstkGeocoder(dstk_service, address_gen())

    p = self.partitions.find_or_new(table='dstk_addresses')
    p.clean()

    # (output column, key in the geocoder result) pairs.
    column_sources = (
        ('number', 'street_number'),
        ('name', 'street_name'),
        ('city', 'locality'),
        ('state', 'region'),
        ('lat', 'latitude'),
        ('lon', 'longitude'),
        ('county', 'fips_county'),
    )

    with p.inserter() as ins:
        for i, (k, r, inp_row) in enumerate(dstk_gc.geocode()):
            row = {'businesses_id': inp_row['id']}

            if r:
                row.update(dict(r))
                # Missing keys map to None, so every aliased column is
                # always present on a geocoded record.
                for column, source_key in column_sources:
                    row[column] = r.get(source_key)

            lr("Geocode DSTK")

            # Rows that failed to geocode are still inserted, carrying
            # only the businesses_id.
            ins.insert(row)

            if self.run_args.test and i > 500:
                break
def test_dstk_geocoding(self):
    """Run a sample of unmatched address strings through DSTK and print the results.

    Pulls up to 20 ``text`` values from the ``addresses`` table of the
    ``sandiegodata.org-bad_addresses-casnd-addresses`` partition where
    ``address_id`` is NULL, geocodes them, and for each result prints a
    separator, the confidence (blank when the result is falsy) next to the
    input key, and then the pretty-printed result.
    """
    from ambry.geo.geocoders import DstkGeocoder
    import pprint

    library = self.bundle.library
    partition = library.get('sandiegodata.org-bad_addresses-casnd-addresses').partition

    dstk_service = self.rc.service('dstk')

    def address_gen():
        # Kept as a generator function so the query only runs once the
        # geocoder starts consuming addresses.
        sql = "SELECT text from addresses where address_id is NULL limit 20"
        for record in partition.query(sql):
            yield record.text

    geocoder = DstkGeocoder(dstk_service, address_gen())

    for key, result in geocoder.geocode():
        print('---')

        if result:
            confidence = str(result['confidence'])
        else:
            confidence = ''

        print("{:6s} {}".format(confidence, key))
        pprint.pprint(result)
def test_dstk_geocoding(self):
    """Geocode a sample of unmatched addresses with DSTK and print each result.

    Selects up to 20 ``text`` values from the ``addresses`` table of the
    ``sandiegodata.org-bad_addresses-casnd-addresses`` partition where
    ``address_id`` is NULL, and passes them to the DSTK geocoder. For each
    result it prints a ``---`` separator, a line with the result's
    confidence (empty when the result is falsy) followed by the input
    key, and the pretty-printed result.
    """
    from ambry.geo.geocoders import DstkGeocoder
    import pprint

    library = self.bundle.library
    p = library.get('sandiegodata.org-bad_addresses-casnd-addresses').partition

    dstk_service = self.rc.service('dstk')

    def address_gen():
        for row in p.query(
                "SELECT text from addresses where address_id is NULL limit 20"):
            yield row.text

    dstk_gc = DstkGeocoder(dstk_service, address_gen())

    for k, r in dstk_gc.geocode():
        # Single-argument parenthesized print behaves the same whether or
        # not the module enables print_function.
        print('---')
        print("{:6s} {}".format(str(r['confidence']) if r else '', k))
        pprint.pprint(r)
def generate_agencies(self):
    """Geocode the partner and agency addresses and yield one record per result.

    Addresses are drawn first from the ``sdfb_partners`` partition (built
    from ``addr1``, ``city`` and ``zip``, with the state hard-coded to
    ``CA``) and then from the ``agency_list`` partition (the ``address``
    column as-is). Address text is decoded as ASCII with undecodable bytes
    dropped before being handed to the DSTK geocoder.

    Yields:
        dict with the keys

        * ``i`` -- zero-based index of the geocoder result,
        * ``address`` -- the address key returned by the geocoder,
        * ``geocoded`` -- the geocoder result itself,
        * ``row`` -- dict with ``agency_id``, ``site_id``, ``name`` and
          ``orig_address``; ``geocoded_address``, ``city``, ``lat`` and
          ``lon`` are added only when the geocoder result is truthy.
    """
    from ambry.geo.geocoders import DstkGeocoder

    def address_gen():
        # Each item is (address string, (agency_id, site_id, name)); the
        # identity tuple comes back from geocode() as the third element.
        for row in self.partitions.find(table='sdfb_partners').rows:
            address = "{} {}, CA {}".format(
                row['addr1'].decode('ascii', 'ignore'), row['city'], row['zip'])
            # Partner rows have no site id, hence the None placeholder.
            yield (address,
                   (row['agencyref'].strip(), None, row['agencyname'].strip()))

        for row in self.partitions.find(table='agency_list').rows:
            yield (row['address'].decode('ascii', 'ignore'),
                   (row['agency_id'], row['site_id'], row['name'].strip()))

    dstk_gc = DstkGeocoder(self.config.service('dstk'), address_gen())

    header = 'agency_id site_id name orig_address geocoded_address city lat lon'.split()

    for i, (k, r, o) in enumerate(dstk_gc.geocode()):
        row = [o[0], o[1], o[2], k]

        if r:
            row += [r['street_address'], r['locality'], r['latitude'], r['longitude']]

        # zip() stops at the shorter sequence, so a row without geocoded
        # values simply omits the last four header keys.
        yield dict(i=i, address=k, geocoded=r, row=dict(zip(header, row)))