def test_ba_geocoding(self): from ambry.geo.geocoder import Geocoder l = self.bundle.library gp = l.get('clarinova.com-geocode-casnd-geocoder->=2.0.8').partition g = Geocoder(gp) p = l.get('sandiegodata.org-bad_addresses-casnd-addresses').partition for row in p.query("SELECT text from addresses where address_id is NULL limit 10"): text = row.text text = text.replace('La Jolla', 'San Diego') addr_id, r, parsed = g.parse_and_code(text) score = r['score'] if r else None print '------', score, addr_id print row.p print '> ', text print '< ', parsed
def test_csv_geocoding(self): from ambry.geo.geocoder import Geocoder import test.support as ts import os.path import csv l = self.bundle.library gp = l.get('clarinova.com-geocode-casnd-geocoder->=2.0.8').partition g = Geocoder(gp) with open(os.path.join(os.path.dirname(ts.__file__), 'bad_geocodes.csv')) as f: reader = csv.DictReader(f) for row in reader: text = row['text'] addr_id, r, parsed = g.parse_and_code(text) score = r['score'] if r else None print '------', score, addr_id print '> ', text print '< ', parsed
def test_geocoding_csv_geocoder(self): from ambry.geo.geocoder import Geocoder import test.support as ts import os.path import csv l = self.bundle.library gp = l.get('clarinova.com-geocode-casnd-geocoder->=2.0.8').partition g = Geocoder(gp) for row in gp.query("select * from geocoder where number > 0 limit 1000"): text = "{number} {dir} {name} {suffix}, {city}, {state} {zip}".format( number = row.number, name=row.name, state=row.state, city=row.city if row.city else '', dir = row.direction if row.direction != '-' else '', suffix=row.suffix if row.suffix != '-' else '', zip = row.zip if row.zip > 0 else '' ) addr_id, r, parsed = g.parse_and_code(text) if not r: score = r['score'] if r else None print '------', score, addr_id print '> ', text print '< ', parsed
def test_txt_geocoding(self): from ambry.geo.geocoder import Geocoder import test.support as ts import os.path import csv l = self.bundle.library gp = l.get('clarinova.com-geocode-casnd-geocoder->=2.0.8').partition city_subs = { 'La Jolla': 'San Diego' } g = Geocoder(gp, city_subs) with open(os.path.join(os.path.dirname(ts.__file__), 'bad_geocodes.txt')) as f: for line in f: text = line.strip() addr_id, r, parsed = g.parse_and_code(text) score = r['score'] if r else None print '------', score, addr_id print '> ', text print '< ', parsed
def test_csv_geocoding(self): from ambry.geo.geocoder import Geocoder import test.support as ts import os.path import csv l = self.bundle.library gp = l.get('clarinova.com-geocode-casnd-geocoder->=2.0.8').partition g = Geocoder(gp) with open( os.path.join(os.path.dirname(ts.__file__), 'bad_geocodes.csv')) as f: reader = csv.DictReader(f) for row in reader: text = row['text'] addr_id, r, parsed = g.parse_and_code(text) score = r['score'] if r else None print '------', score, addr_id print '> ', text print '< ', parsed
def test_ba_geocoding(self): from ambry.geo.geocoder import Geocoder l = self.bundle.library gp = l.get('clarinova.com-geocode-casnd-geocoder->=2.0.8').partition g = Geocoder(gp) p = l.get('sandiegodata.org-bad_addresses-casnd-addresses').partition for row in p.query( "SELECT text from addresses where address_id is NULL limit 10" ): text = row.text text = text.replace('La Jolla', 'San Diego') addr_id, r, parsed = g.parse_and_code(text) score = r['score'] if r else None print '------', score, addr_id print row.p print '> ', text print '< ', parsed
def test_txt_geocoding(self): from ambry.geo.geocoder import Geocoder import test.support as ts import os.path import csv l = self.bundle.library gp = l.get('clarinova.com-geocode-casnd-geocoder->=2.0.8').partition city_subs = {'La Jolla': 'San Diego'} g = Geocoder(gp, city_subs) with open( os.path.join(os.path.dirname(ts.__file__), 'bad_geocodes.txt')) as f: for line in f: text = line.strip() addr_id, r, parsed = g.parse_and_code(text) score = r['score'] if r else None print '------', score, addr_id print '> ', text print '< ', parsed
def test_geocoding_csv_geocoder(self): from ambry.geo.geocoder import Geocoder import test.support as ts import os.path import csv l = self.bundle.library gp = l.get('clarinova.com-geocode-casnd-geocoder->=2.0.8').partition g = Geocoder(gp) for row in gp.query( "select * from geocoder where number > 0 limit 1000"): text = "{number} {dir} {name} {suffix}, {city}, {state} {zip}".format( number=row.number, name=row.name, state=row.state, city=row.city if row.city else '', dir=row.direction if row.direction != '-' else '', suffix=row.suffix if row.suffix != '-' else '', zip=row.zip if row.zip > 0 else '') addr_id, r, parsed = g.parse_and_code(text) if not r: score = r['score'] if r else None print '------', score, addr_id print '> ', text print '< ', parsed
def x_test_crime(self):
    """Geocode crime incident block addresses and log problem cases to CSV.

    Up to 100000 incident rows are read from the 'crime' dependency.  Each
    block address is passed to ``Geocoder.geocode_semiblock``; rows with no
    candidates ('norsp'), with more than one candidate key ('mcities') or
    with more than three entries under the single candidate ('maddr') are
    reported through ``self.write_error_row`` into ``errors.csv``.  Running
    counts and percentages are passed to the bundle's rate logger.
    """
    from ambry.geo.address import Parser
    from ambry.geo.geocoder import Geocoder
    import csv

    geocoder = Geocoder(self.bundle.library, addresses_ds='geoaddresses')

    _, incidents = self.bundle.library.dep('crime')

    log_rate = self.bundle.init_log_rate(1000)

    parser = Parser()

    header = ['code', 'arg', 'block_address', 'city',
              'number', 'dir', 'street', 'type']
    progress = "{} cities={}, {}% addr={}, {}% nrp={}, {}%"

    with open(self.bundle.filesystem.path('errors.csv'), 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(header)

        # Counters are floats so the percentage divisions below are not
        # truncated by integer division.
        multi_cities = 0.0
        multi_addr = 0.0
        no_response = 0.0

        query = incidents.query("SELECT * FROM incidents limit 100000")

        for i, inct in enumerate(query):
            row = dict(inct)

            candidates = geocoder.geocode_semiblock(
                row['blockaddress'], row['city'], 'CA')

            n_candidates = len(candidates)

            if n_candidates == 0:
                no_response += 1
                self.write_error_row('norsp', 0, parser, writer,
                                     row['blockaddress'], row['city'])
                continue

            if n_candidates != 1:
                multi_cities += 1
                self.write_error_row('mcities', n_candidates, parser, writer,
                                     row['blockaddress'], row['city'])
                continue

            # Exactly one candidate key remains; take its value.
            s = candidates.popitem()[1]

            if len(s) > 3:
                self.write_error_row('maddr', len(s), parser, writer,
                                     row['blockaddress'], row['city'])
                multi_addr += 1

            # Skip the first row to avoid dividing by zero.
            if i > 0:
                log_rate(progress.format(
                    i,
                    multi_cities, int(multi_cities / i * 100),
                    multi_addr, int(multi_addr / i * 100),
                    no_response, int(no_response / i * 100)))
def test_basic(self): from ambry.geo.geocoder import Geocoder g = Geocoder(self.bundle.library) filename = "good_segments" f_input = os.path.join(os.path.dirname(__file__), '../support',filename + '.txt') f_output = os.path.join(os.path.dirname(__file__), '../support',filename + '.out.csv') with open(f_input) as f: for line in f: addr = line.strip() r = g.geocode_address(addr) print "==", addr print "->",r if r: print " ", r['codedaddress']
def build_alcohol(self, p):
    """Geocode alcohol licence premises addresses and insert them into ``p``.

    Every row of the ``licenses`` table in the 'alcohol' dependency that
    has a ``premisesaddress`` is passed to ``Geocoder.parse_and_code``.
    The parsed address arguments, together with the normalised text, the
    original text, a source tag of 'alco', the address id and (when a
    result is returned) the score, are inserted through ``p.inserter()``.
    Rows whose geocoding raises ``AttributeError`` are printed and skipped.

    :param p: destination partition; must provide ``inserter()``.
    :return: ``True``.
    """
    from address_parser import Parser
    from ambry.geo.geocoder import Geocoder

    geocoder_partition = self.library.dep('geocoder').partition
    geocoder = Geocoder(geocoder_partition)

    # Instantiated but not used further in this method.
    ap = Parser()

    licenses = self.library.dep('alcohol').partition

    tick = self.init_log_rate(1000)

    with p.inserter() as ins:
        for row in licenses.query("SELECT * FROM licenses"):
            tick()

            premises = row['premisesaddress']
            if not premises:
                continue

            try:
                address_id, result, parsed = geocoder.parse_and_code(
                    row['premisesaddress'])
            except AttributeError as e:
                print(e)
                continue

            record = parsed.args
            record['text'] = str(parsed)
            record['orig_text'] = row['premisesaddress']
            record['source'] = 'alco'
            record['address_id'] = address_id

            if result:
                record['score'] = result['score']

            ins.insert(record)

    return True
def build_masterlist(self, p):
    """Geocode un-coded master-list businesses and insert them into ``p``.

    Rows of the ``businesses`` table in the 'masterlist' dependency whose
    ``address_id`` is NULL are parsed with ``address_parser.Parser`` and
    then passed to ``Geocoder.parse_and_code``.  The parsed address
    arguments, the normalised text, the original address string, a source
    tag of 'sdbml' and the address id are inserted through
    ``p.inserter()``.  The first row seen for each
    (direction, name, suffix) street key is flagged ``for_testing = 'y'``.

    :param p: destination partition; must provide ``inserter()``.
    :return: ``True``.
    :raises AttributeError: printed and re-raised when ``parse_and_code``
        raises it.
    """
    from address_parser import Parser
    from ambry.geo.geocoder import Geocoder

    gp = self.library.dep('geocoder').partition
    g = Geocoder(gp)
    ap = Parser()
    ip = self.library.dep('masterlist').partition
    lr = self.init_log_rate(1000)

    # Street keys already seen; used to flag one sample row per street.
    streets = set()

    with p.inserter() as ins:
        for row in ip.query("SELECT * FROM businesses WHERE address_id IS NULL"):
            row = dict(row)

            # Normalise the city once, then fold La Jolla into San Diego.
            city = row['city'].strip().title() if row['city'] else ''
            if city == 'La Jolla':
                city = 'San Diego'
            row['city'] = city

            ps = ap.parse(row['address'], row['city'], row['state'], row['zip'])

            try:
                address_id, result, parsed = g.parse_and_code(str(ps))
            except AttributeError as e:
                # Show the failure, then let it propagate unchanged.
                print(e)
                raise

            d = ps.args
            d['text'] = str(ps)
            d['orig_text'] = "{}, {}, {} {}".format(
                row['address'], row['city'], row['state'], row['zip'])
            d['source'] = 'sdbml'
            d['address_id'] = address_id

            k = (d['direction'], d['name'], d['suffix'])

            if k not in streets:
                streets.add(k)
                d['for_testing'] = 'y'

            ins.insert(d)
            lr()

    return True
def build_ck_geocoder(self):
    """Create a crosswalk to CK geocoded addresses, which link to SANDAG data.

    Every row of the ``businesses`` partition that has a city is passed to
    ``Geocoder.parse_and_code`` (city title-cased, state fixed to "CA").
    When a result comes back its fields are merged into the output row and
    a display ``name`` is assembled from direction, name and suffix; the
    row counts as "good".  A row with a city but no result counts as "bad".
    Every business row, geocoded or not, is inserted into the
    ``ck_addresses`` partition, which is cleaned first.

    When ``self.run_args.test`` is set, processing stops after row index
    500 has been handled.

    :raises Exception: any exception raised while geocoding a row is
        reported through ``self.error`` and re-raised.
    """
    from ambry.geo.geocoder import Geocoder

    city_subs = {'La Jolla': 'San Diego'}

    g = Geocoder(self.library.dep('geocoder').partition, city_subs)

    lr = self.init_log_rate(250)

    businesses = self.partitions.find(table='businesses')

    p = self.partitions.find_or_new(table='ck_addresses')
    p.clean()

    good = 0
    bad = 0

    with p.inserter() as ins:
        for i, bus in enumerate(businesses.rows):
            row = {'businesses_id': bus['id']}

            try:
                # This just lets us know what addresses aren't geocoding.
                # We'll use the failures as bad addresses in a geocoder
                # update.
                if bus['city']:
                    row['address_id'], result, parsed = g.parse_and_code(
                        bus['address'],
                        city=bus['city'].title(),
                        state="CA",
                        zip=bus['zip'])

                    row['parsed_addr'] = "{}, {}, CA {}".format(
                        parsed.text,
                        parsed.locality.city,
                        parsed.locality.zip)

                    if result:
                        row.update(result)

                        # Build each optional part separately.  A single
                        # chained conditional expression binds more loosely
                        # than '+', so it returned only the direction
                        # whenever one was present.
                        prefix = row['direction'] + ' ' if row['direction'] else ''
                        tail = ' ' + row['suffix'] if row['suffix'] else ''
                        row['name'] = prefix + row['name'] + tail

                        row['id'] = None
                        good += 1
                    else:
                        bad += 1

            except Exception as e:
                self.error("Failed to parse row {}: {} : {} ".format(
                    i, bus['address'], e.message))
                raise

            # Rows without a city are not counted, so the total can still
            # be zero here; report 0.0% instead of dividing by zero.
            total = good + bad
            if total:
                pct_good = round(float(good) / float(total) * 100, 1)
            else:
                pct_good = 0.0

            lr("Geocode CK: {} good / {} bad ( {}%) of {}".format(
                good, bad, pct_good, total))

            ins.insert(row)

            if self.run_args.test and i > 500:
                break