def test_geocode(self): from ambry.geo.geocoder import Geocoder from address_parser import Parser import csv import ambry import os import csv l = ambry.library() gp = l.get('clarinova.com-geocode-casnd-geocoder').partition f_intersections = os.path.join(os.path.dirname(__file__), 'support', 'intersections.csv') q = """ SELECT * FROM geocoder WHERE name = :name AND direction = :direction AND suffix = suffix """ p = Parser() with open(f_intersections) as f: reader = csv.DictReader(f) for r in reader: ps = p.parse('1000 '+r['primary_rd']) print ps.road.dict for qr in gp.query(q,**ps.road.dict): print " ", qr
def test_geocode(self): from ambry.geo.geocoder import Geocoder from address_parser import Parser import csv import ambry import os import csv l = ambry.library() gp = l.get('clarinova.com-geocode-casnd-geocoder').partition f_intersections = os.path.join(os.path.dirname(__file__), 'support', 'intersections.csv') q = """ SELECT * FROM geocoder WHERE name = :name AND direction = :direction AND suffix = suffix """ p = Parser() with open(f_intersections) as f: reader = csv.DictReader(f) for r in reader: ps = p.parse('1000 ' + r['primary_rd']) print ps.road.dict for qr in gp.query(q, **ps.road.dict): print " ", qr
def test_address_files(self): import os from address_parser import Parser import csv parser = Parser() success = 0 failure = 0 total = 0 filename = "crime_addresses" f_input = os.path.join(os.path.dirname(__file__), 'support', filename + '.txt') f_output = os.path.join(os.path.dirname(__file__), 'support', filename + '.out.csv') with open(f_output, 'w') as out: writer = csv.DictWriter(out, self.header) writer.writeheader() with open(f_input) as f: for line in f: total += 1 print '----' print line.strip() try: ps = parser.parse(line) if not ps: failure += 1 continue except Exception as e: print "ERROR", e failure += 1 continue print ps continue d = ps.dict d['input'] = line.strip() d['output'] = str(ps) #writer.writerow(d) print d.keys() if not ps.city: failure += 1 print d print ps print else: success += 1 print print "total={} success={} failure={} rate={}".format( total, success, failure, round((float(failure) / float(total) * 100), 3))
def test_hash(self):
    """Exercise address normalization on two spellings of one address.

    a1 and a2 are the same physical address written differently; parsing
    both and printing the dicts shows whether they normalize alike.
    """
    from pprint import pprint

    a1 = '119 WEST WINTON AVENUE, HAYWARD, CA, 94544'
    a2 = '119 Winton Ave., Hayward, Ca, 94544-5000'

    parser = Parser()

    # The original parsed only a2 and left a1 unused; parse both so the
    # two forms can actually be compared.
    r1 = parser.parse(a1)
    r2 = parser.parse(a2)

    pprint(r1.dict)
    pprint(r2.dict)
def test_address_files(self): import os from address_parser import Parser import csv parser = Parser() success = 0 failure = 0 total = 0 filename = "crime_addresses" f_input = os.path.join(os.path.dirname(__file__), 'support', filename + '.txt') f_output = os.path.join(os.path.dirname(__file__), 'support', filename + '.out.csv') with open(f_output, 'w') as out: writer = csv.DictWriter(out, self.header) writer.writeheader() with open(f_input) as f: for line in f: total += 1 print '----' print line.strip() try: ps = parser.parse(line) if not ps: failure += 1 continue except Exception as e: print "ERROR", e failure += 1 continue print ps continue d = ps.dict d['input'] = line.strip() d['output'] = str(ps) # writer.writerow(d) print d.keys() if not ps.city: failure += 1 print d print ps print else: success += 1 print print "total={} success={} failure={} rate={}".format(total, success, failure, round((float(failure) / float(total) * 100), 3))
def address_gen():
    """Produce block addresses that are randomized within the 100 block,
    if possible, and return the original address if it isn't.

    Yields (address, row) tuples for each crimeb row with a block address.
    NOTE(review): reads `self` from the enclosing scope — assumes this is
    a closure inside a bundle method; confirm against the full file.
    """
    from random import randint
    from address_parser import Parser

    parser = Parser()

    p = self.partition(table='crimeb')

    for row in p:
        if not row.block_address:
            continue

        # Normalize the one known abbreviation the parser trips on.
        ba = str(row.block_address).replace('EL CAM', 'El Camino')

        ps = parser.parse(ba)

        if ps and ps.number.number and ps.number.number > 0:
            # Snap to the hundred block, then pick a random house number
            # inside it. randint's bounds are inclusive, so the upper
            # bound is 99 — the original's randint(0, 100) could push the
            # address into the next block.
            ps.number.number = int(round(ps.number.number, -2)) + randint(0, 99)
            street_num = str(ps)
        else:
            # Fall back to the raw text, minus the 'BLOCK' marker.
            street_num = ba.replace('BLOCK', '')

        city = row.city

        # Use the reporting agency as the city, except for the sheriff,
        # whose jurisdiction spans cities. ('SHERRIF' matches the data's
        # spelling.)
        if not row.city and row.agency != 'SHERRIF':
            city = row.agency

        if not city:
            city = ''

        zipcode = ', {}'.format(row.zipcode) if row.zipcode else ''

        address = '{} {} CA{}'.format(street_num, city, zipcode)

        yield (address, row)
def chunked_geocode(addresses, state=None, chunk_size=250):
    """Geocode addresses in batches of chunk_size.

    :param addresses: iterable of (unique_id, address) tuples.
    :param state: optional state override applied to every address.
    :param chunk_size: number of addresses per geocoding request.

    Yields (row_number, True, result_dict) for every row returned.
    """
    parser = Parser()

    row_n = 0
    request_rows = []

    def _flush(rows):
        """Send one batch to the geocoder and yield its rows as dicts."""
        # Returned row columns are: unique_id input_address match quality
        # match_address latlon tiger_id side_of_street state_fips
        # county_fips tract_fips block_fips
        for row in make_request(rows):
            yield mkdict(row)

    for uid, address_line in addresses:
        p = parser.parse(address_line)

        rr = [uid,
              p.street_str(),
              p.locality.city,
              state or p.locality.state,
              p.locality.zip]

        request_rows.append(rr)

        # Flush a full chunk. The original tested '>', which sent batches
        # of chunk_size + 1 rows.
        if len(request_rows) >= chunk_size:
            for d in _flush(request_rows):
                yield row_n, True, d
                row_n += 1
            request_rows = []

    # Flush the final partial chunk, skipping the request entirely when
    # nothing is pending.
    if request_rows:
        for d in _flush(request_rows):
            yield row_n, True, d
            row_n += 1
class Geocoder(object):
    """Geocodes parsed addresses against a partition holding a 'geocoder'
    table of the structure used in clarinova.com-geocode-casnd."""

    def __init__(self, partition, city_subs=None):
        """
        :param partition: partition containing the geocoder table.
        :param city_subs: optional dict mapping alternate city spellings to
            canonical names; keys are matched case-insensitively.
        """
        from address_parser import Parser

        self.p = partition

        # Memoizes parse_and_code results by parsed-address hash; failed
        # lookups are cached as None so they are not retried.
        self.address_cache = {}

        self.city_subs = {k.lower(): v for k, v in city_subs.items()} if city_subs else {}

        self.parser = Parser()

    def parse_and_code(self, addrstr, city=None, state=None, zip=None):
        """Parse an address string and geocode it, with caching.

        :returns: (address_id, geocoder_row_or_None, parsed_address).
        """
        adr = self.parser.parse(addrstr, city=city, state=state, zip=zip)

        if adr.hash in self.address_cache:
            r = self.address_cache[adr.hash]
            address_id = r['address_id'] if r else None
        else:
            r = self.geocode(**adr.args)
            if r:
                address_id = r['address_id']
                self.address_cache[adr.hash] = r
            else:
                self.address_cache[adr.hash] = None
                address_id = None

        return address_id, r, adr

    def geocode(self, number, name, direction=None, suffix=None, city=None, state=None, zip=None):
        """Return a record from the geocoder table, or None.

        Candidates on the same street within +/-100 of the house number
        are scored 10 points each for matching city, zip and suffix; a
        score of at least 20 is required, and the nearest house number
        wins.
        """
        # Missing components are stored as '-' in the geocoder table.
        direction = direction.upper() if direction else '-'
        suffix = suffix.title() if suffix else '-'
        city = city.title() if city else '-'

        # Apply the configured city-name substitutions.
        if city.lower() in self.city_subs:
            city = self.city_subs[city.lower()].title()

        # Strip a ZIP+4 suffix; only the 5-digit prefix is stored.
        if isinstance(zip, basestring) and '-' in zip:
            zip, zsuffix = zip.split('-')

        zip = zip if zip else -1

        try:
            zip = int(zip)
        except (TypeError, ValueError):
            # Non-numeric ZIPs can never match; -1 matches nothing.
            # (The original used a bare except:, which also swallowed
            # KeyboardInterrupt and friends.)
            zip = -1

        suffix = suffix.lower()

        # We don't need to check for nulls in direction, b/c entries without
        # directions have the value '-'
        q = """ SELECT *, ( CASE WHEN city = :city THEN 10 ELSE 0 END + CASE WHEN zip = :zip THEN 10 ELSE 0 END + CASE WHEN suffix = :suffix THEN 10 ELSE 0 END ) AS score, ABS(number - :number) as ndist FROM geocoder WHERE name = :name AND direction = :direction AND score >= 20 AND number BETWEEN (:number-100) AND (:number+100) ORDER BY ABS(number - :number), score LIMIT 1; """

        r = self.p.query(q, number=number, name=name, direction=direction,
                         suffix=suffix, city=city, state=state, zip=zip).first()

        if not r:
            return None

        r = dict(r)

        # Confidence: start at 100%, subtract whatever score was missed
        # (max 30) and half the house-number distance, scaled to 0..1.
        r['confidence'] = round(
            (100.0 - (30.0 - r['score']) - (r['ndist'] / 2.0)) / 100.0, 3)

        # Coordinates are stored as integers scaled by 1e8.
        r['lat'] = float(r['lat']) / 100000000.0
        r['lon'] = float(r['lon']) / 100000000.0

        return r

    def geocode_intersection(self, street1, street2):
        # Not implemented.
        pass
# Load the accumulated ticket data written by the scanning pass.
with open('tickets.pkl', 'rb') as f:
    tickets_acc = pickle.load(f)

from collections import defaultdict

# Per-key pair of counters, initialized to [0, 0].
d = defaultdict(lambda: [0, 0])


def mkstreet(number, name, suffix):
    """Format a block number and road parts into a street-block key."""
    return "{} {} {}".format(number, name, suffix)


# Create a dict of date/street pairs, then mark them for if the pair
# was swept, then if the pair was ticketed
for base_street, dates in gps_acc.items():
    ps = parser.parse(base_street)

    num = ps.number.number
    road = ps.road.name
    sfx = ps.road.suffix

    # Expand each street block to the 100 block before and after, to deal
    # with possible missing GPS reverse-geocodes
    streets = [mkstreet(num + off, road, sfx) for off in (0, 100, 200, 300)]

    for off in (100, 200):
        if num >= off:
            streets.append(mkstreet(num - off, road, sfx))
def build_masterlist(self, p): from address_parser import Parser from ambry.geo.geocoder import Geocoder gp = self.library.dep('geocoder').partition g = Geocoder(gp) ap = Parser() ip = self.library.dep('masterlist').partition lr = self.init_log_rate(1000) streets = set() with p.inserter() as ins: for row in ip.query("SELECT * FROM businesses WHERE address_id IS NULL"): row = dict(row) row['city'] = row['city'].strip().title() if row['city'] else '' if row['city'].strip().title() == 'La Jolla': row['city'] = 'San Diego' ps = ap.parse(row['address'], row['city'], row['state'], row['zip']) try: address_id, result, parsed = g.parse_and_code(str(ps)) except AttributeError as e: print e raise continue d = ps.args d['text'] = str(ps) d['orig_text'] = "{}, {}, {} {}".format(row['address'], row['city'], row['state'], row['zip']) d['source'] = 'sdbml' d['address_id'] = address_id k = (d['direction'], d['name'], d['suffix']) if not k in streets: streets.add(k) d['for_testing'] = 'y' ins.insert(d) lr() #print ps return True
from address_parser import Parser
import re

# Demo: extract the parsed address parts, an email and a phone number.
address = '387 View Ave apt4 Twin Falls, ID 83301'  # was misspelled 'addresss'
info = 'Email: [email protected] Phone: +1 (956) 8574114'

# Crude email matcher: any non-space run around an '@'. Raw strings keep
# the regex escapes out of Python's string-escape namespace.
email = re.findall(r'\S+@\S+', info)

# Phone: 3-3-4 digits, (XXX) XXX-XXXX, or bare 3-4 digits, with optional
# '-', '.' or space separators.
ok = re.findall(
    r'\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4}',
    info)

parser = Parser()
adr = parser.parse(address)

print(f'{adr.number.number} {adr.road.direction} {adr.road.name} {adr.road.suffix}')
print(f'{adr.text}')
print(f'{email[0]}')
print(f'{ok[0]}')
import time f = MPRowsFile('/Users/eric/proj/virt/ambry10/library/build/nbcuni.com/streetsweep/nbcuni.com/streetsweep-0.0.1/tickets.mpr') parser = Parser() start = time.time() s = 0 from collections import defaultdict acc = defaultdict(set) with f.reader as r: for i, row in enumerate(r, 1): adr = row.locationdesc1 if adr: ps = parser.parse(adr) dt = row.issuedate if ps.number.number > 0 and dt: number = int(ps.number.number / 100) * 100 key = "{} {} {}".format(number, ps.road.name, ps.road.suffix) print row acc[key].add(dt) if i % 10000 == 0: print i, len(acc), round( float(i) / ( time.time() - start), 2) with open('tickets.pkl', 'wb') as f: pickle.dump(acc, f)
def get_pubmed_data(self, query, searched_zipcode, date, maximum_number_of_value=3):
    """Query PubMed and write matching article/author data to a CSV file.

    :param query: PubMed query string.
    :param searched_zipcode: zipcode filter; any non-decimal value
        disables zipcode filtering.
    :param date: 'YYYY/MM/DD' string used to name the output CSV.
    :param maximum_number_of_value: maximum PubMed results to fetch.
    """
    csv_data = {
        "affiliation": [],
        "number_of_authors": [],
        "authors_name": [],
        "authors_institute": [],
        "authors_address": [],
        "authors_zipcode": [],
        "paper_title": [],
        "publication_date": [],
        "journal": []
    }

    pubmed = PubMed(tool="MyTool", email="*****@*****.**")
    parser = Parser()

    results = pubmed.query(query, max_results=maximum_number_of_value)

    # A purely decimal searched_zipcode enables zipcode filtering.
    is_queried_by_zipcode = searched_zipcode.isdecimal()
    if is_queried_by_zipcode:
        searched_zipcode = int(searched_zipcode)

    for article in results:
        jsonData = json.loads(article.toJSON())
        authors_list = jsonData['authors']

        authors_name = ""
        authors_institute = ""
        authors_affiliation = ""
        authors_address = ""
        authors_zipcode = ""

        # len() is never falsy-but-nonzero; the original 'or 0' was a no-op.
        num_authors = len(authors_list)

        counted_matched = 0
        if is_queried_by_zipcode:
            counted_matched = self.has_match_zipcode_of_authprs(
                authors_list, searched_zipcode)

        # Keep the article when not filtering by zipcode, or when at
        # least one author's zipcode matched.
        article_matches = (not is_queried_by_zipcode) or counted_matched > 0

        if article_matches:
            for index in range(num_authors):
                affiliation = authors_list[index]["affiliation"] or "<NOT_AVAILABLE>"
                zipcode = str(self.get_address_with_zipcode(affiliation))

                first = authors_list[index]['firstname']
                last = authors_list[index]['lastname']
                # The original wrote first + " " + last or "<NOT_AVAILABLE>";
                # '+' binds tighter than 'or', so the fallback could never
                # fire and a None name raised TypeError. Fall back explicitly.
                if first and last:
                    author_name = first + " " + last
                else:
                    author_name = "<NOT_AVAILABLE>"

                author_institute = self.get_organization(affiliation=affiliation) + " "

                authors_affiliation += affiliation
                authors_name += author_name
                authors_institute += author_institute
                authors_address += str(parser.parse(affiliation))
                authors_zipcode += zipcode

                # '||' separates authors; none after the last one.
                if num_authors != index + 1:
                    authors_name += "||"
                    authors_institute += "||"
                    authors_affiliation += "||"
                    authors_address += "||"
                    authors_zipcode += "||"

        paper_title = jsonData['title'] or "<NOT_AVAILABLE>"
        publication_date = jsonData['publication_date'] or "<NOT_AVAILABLE>"
        journal = jsonData['journal'] or "<NOT_AVAILABLE>"

        # NOTE(review): self.is_us is presumably set elsewhere to flag a
        # US-based article — confirm; it is consumed and reset here.
        if self.is_us:
            if article_matches:
                csv_data["authors_name"].append(authors_name)
                csv_data["affiliation"].append(authors_affiliation)
                csv_data["authors_institute"].append(authors_institute)
                csv_data["paper_title"].append(paper_title)
                csv_data["publication_date"].append(publication_date)
                csv_data["journal"].append(journal)
                csv_data["authors_address"].append(authors_address)
                csv_data["number_of_authors"].append(num_authors)
                csv_data["authors_zipcode"].append(authors_zipcode)
            self.is_us = False

    print("Size of csv ", len(csv_data["paper_title"]))

    if len(csv_data["paper_title"]) > 0:
        df = pd.DataFrame(csv_data)
        print(df.head())

        # Name the output file after the supplied query date.
        datetimeobject = datetime.datetime.strptime(date, '%Y/%m/%d')
        csv_file_name = "PubMedData_From_" + datetimeobject.strftime('%Y_%m_%d') + ".csv"
        print(csv_file_name)

        df.to_csv(csv_file_name, index=False)
def test_address_files(self):
    """Parse each address fixture file and report the parse failure rate.

    Every input line is parsed, written to a .out.csv file, and counted
    as a failure when parsing raises, returns nothing, or does not
    round-trip back to the input text.
    """
    import os
    import csv

    parser = Parser()

    success = 0
    failure = 0
    total = 0

    for filename in ["crime_addresses"]:
        f_input = os.path.join(os.path.dirname(__file__), 'support', filename + '.txt')
        f_output = os.path.join(os.path.dirname(__file__), 'support', filename + '.out.csv')

        with open(f_output, 'w') as out:
            writer = csv.DictWriter(out, self.header)
            writer.writeheader()

            with open(f_input) as f:
                for line in f:
                    total += 1

                    try:
                        ps = parser.parse(line)
                        if not ps:
                            failure += 1
                            continue
                    except TypeError:
                        # A TypeError indicates a bug, not bad data.
                        raise
                    except Exception as e:
                        print("ERROR", e)
                        failure += 1
                        continue

                    d = ps.dict
                    d['input'] = line.strip()
                    d['output'] = str(ps)

                    # Copy d (dict(d) is the idiomatic copy, vs the
                    # original dict(d.items())) and drop the nested /
                    # derived fields the CSV header doesn't carry.
                    d2 = dict(d)
                    for key in ('hash', 'locality', 'text', 'road'):
                        del d2[key]

                    writer.writerow(d2)

                    # The parser strips 'BLOCK', and '/' is an intersection,
                    # so those lines are not expected to round-trip.
                    if line.strip() != str(ps) and 'block' not in line.lower() and '/' not in line:
                        failure += 1
                        print('-----')
                        print(line.strip())
                        print(ps)
                        print()
                    else:
                        success += 1

    print()
    print("total={} success={} failure={} rate={}".format(
        total, success, failure,
        round((float(failure) / float(total) * 100), 3)))