def test_geocode(self): from ambry.geo.geocoder import Geocoder from address_parser import Parser import csv import ambry import os import csv l = ambry.library() gp = l.get('clarinova.com-geocode-casnd-geocoder').partition f_intersections = os.path.join(os.path.dirname(__file__), 'support', 'intersections.csv') q = """ SELECT * FROM geocoder WHERE name = :name AND direction = :direction AND suffix = suffix """ p = Parser() with open(f_intersections) as f: reader = csv.DictReader(f) for r in reader: ps = p.parse('1000 '+r['primary_rd']) print ps.road.dict for qr in gp.query(q,**ps.road.dict): print " ", qr
def test_geocode(self): from ambry.geo.geocoder import Geocoder from address_parser import Parser import csv import ambry import os import csv l = ambry.library() gp = l.get('clarinova.com-geocode-casnd-geocoder').partition f_intersections = os.path.join(os.path.dirname(__file__), 'support', 'intersections.csv') q = """ SELECT * FROM geocoder WHERE name = :name AND direction = :direction AND suffix = suffix """ p = Parser() with open(f_intersections) as f: reader = csv.DictReader(f) for r in reader: ps = p.parse('1000 ' + r['primary_rd']) print ps.road.dict for qr in gp.query(q, **ps.road.dict): print " ", qr
def test_address_files(self): import os from address_parser import Parser import csv parser = Parser() success = 0 failure = 0 total = 0 filename = "crime_addresses" f_input = os.path.join(os.path.dirname(__file__), 'support', filename + '.txt') f_output = os.path.join(os.path.dirname(__file__), 'support', filename + '.out.csv') with open(f_output, 'w') as out: writer = csv.DictWriter(out, self.header) writer.writeheader() with open(f_input) as f: for line in f: total += 1 print '----' print line.strip() try: ps = parser.parse(line) if not ps: failure += 1 continue except Exception as e: print "ERROR", e failure += 1 continue print ps continue d = ps.dict d['input'] = line.strip() d['output'] = str(ps) #writer.writerow(d) print d.keys() if not ps.city: failure += 1 print d print ps print else: success += 1 print print "total={} success={} failure={} rate={}".format( total, success, failure, round((float(failure) / float(total) * 100), 3))
def test_hash(self):
    """Parse one sample address and pretty-print its parsed field dict."""
    from pprint import pprint

    # Two spellings of the same address; only the second is parsed here.
    a1 = '119 WEST WINTON AVENUE, HAYWARD, CA, 94544'
    a2 = '119 Winton Ave., Hayward, Ca, 94544-5000'

    parsed = Parser().parse(a2)
    pprint(parsed.dict)
def test_address_files(self): import os from address_parser import Parser import csv parser = Parser() success = 0 failure = 0 total = 0 filename = "crime_addresses" f_input = os.path.join(os.path.dirname(__file__), 'support', filename + '.txt') f_output = os.path.join(os.path.dirname(__file__), 'support', filename + '.out.csv') with open(f_output, 'w') as out: writer = csv.DictWriter(out, self.header) writer.writeheader() with open(f_input) as f: for line in f: total += 1 print '----' print line.strip() try: ps = parser.parse(line) if not ps: failure += 1 continue except Exception as e: print "ERROR", e failure += 1 continue print ps continue d = ps.dict d['input'] = line.strip() d['output'] = str(ps) # writer.writerow(d) print d.keys() if not ps.city: failure += 1 print d print ps print else: success += 1 print print "total={} success={} failure={} rate={}".format(total, success, failure, round((float(failure) / float(total) * 100), 3))
def __init__(self, partition, city_subs=None):
    """Set up the geocoder over *partition*, with optional city-name
    substitutions (keys matched case-insensitively).
    """
    from address_parser import Parser

    self.p = partition
    self.address_cache = {}  # memoized results, keyed by parsed-address hash

    if city_subs:
        # Lowercase the substitution keys so lookups are case-insensitive.
        self.city_subs = dict((name.lower(), sub) for name, sub in city_subs.items())
    else:
        self.city_subs = {}

    self.parser = Parser()
def address_gen():
    """Produce blocks addresses that are randomized within the 100 block,
    if possible, and return the original address if it isn't
    """
    from random import randint
    from address_parser import Parser

    parser = Parser()
    p = self.partition(table='crimeb')

    for row in p:
        if not row.block_address:
            continue

        block_addr = str(row.block_address).replace('EL CAM', 'El Camino')
        ps = parser.parse(block_addr)

        if ps and ps.number.number and ps.number.number > 0:
            # Snap to the enclosing 100 block, then scatter within it.
            ps.number.number = int(round(ps.number.number, -2)) + randint(0, 100)
            street_num = str(ps)
        else:
            # Unparseable: fall back to the raw text, minus the 'BLOCK' tag.
            street_num = block_addr.replace('BLOCK', '')

        city = row.city
        if not row.city and row.agency != 'SHERRIF':
            city = row.agency
        if not city:
            city = ''

        zipcode = ', {}'.format(row.zipcode) if row.zipcode else ''

        address = '{} {} CA{}'.format(street_num, city, zipcode)
        yield (address, row)
def chunked_geocode(addresses, state=None, chunk_size=250):
    """Geocode an iterable of (unique_id, address) tuples in batches.

    Parses each address, accumulates request rows, and sends them to the
    bulk geocoding service *chunk_size* rows at a time, yielding
    (row_number, True, result_dict) per returned row.
    """
    # Each address entry must be a tuple of (unique_id, address)
    parser = Parser()

    row_n = 0
    request_rows = []

    def _flush(rows):
        """Send one batch and yield each response row as a dict."""
        # row colums are:
        # unique_id input_address match quality match_address latlon
        # tiger_id side_of_street state_fips county_fips tract_fips block_fips
        for row in make_request(rows):
            yield mkdict(row)

    for uid, address_line in addresses:
        p = parser.parse(address_line)

        request_rows.append([uid, p.street_str(), p.locality.city,
                             state or p.locality.state, p.locality.zip])

        # BUG FIX: the original tested '> chunk_size', producing batches of
        # chunk_size + 1 rows instead of chunk_size.
        if len(request_rows) >= chunk_size:
            for d in _flush(request_rows):
                yield row_n, True, d
                row_n += 1
            request_rows = []

    # BUG FIX: only flush the remainder when there is one, so an empty
    # request is never sent when the input divides evenly into chunks.
    if request_rows:
        for d in _flush(request_rows):
            yield row_n, True, d
            row_n += 1
def build_masterlist(self, p): from address_parser import Parser from ambry.geo.geocoder import Geocoder gp = self.library.dep('geocoder').partition g = Geocoder(gp) ap = Parser() ip = self.library.dep('masterlist').partition lr = self.init_log_rate(1000) streets = set() with p.inserter() as ins: for row in ip.query("SELECT * FROM businesses WHERE address_id IS NULL"): row = dict(row) row['city'] = row['city'].strip().title() if row['city'] else '' if row['city'].strip().title() == 'La Jolla': row['city'] = 'San Diego' ps = ap.parse(row['address'], row['city'], row['state'], row['zip']) try: address_id, result, parsed = g.parse_and_code(str(ps)) except AttributeError as e: print e raise continue d = ps.args d['text'] = str(ps) d['orig_text'] = "{}, {}, {} {}".format(row['address'], row['city'], row['state'], row['zip']) d['source'] = 'sdbml' d['address_id'] = address_id k = (d['direction'], d['name'], d['suffix']) if not k in streets: streets.add(k) d['for_testing'] = 'y' ins.insert(d) lr() #print ps return True
# Scraper bootstrap: third-party imports, a shared address Parser, and a
# headless Chrome webdriver created at import time with a hard-coded
# chromedriver path.
# NOTE(review): 'requests' is imported twice, and fill_data_base() is cut off
# here — its body continues beyond this chunk, so the code is left unchanged.
import pandas as pd from scrapy.selector import Selector from bs4 import BeautifulSoup import re import time import requests import xml.etree.ElementTree as etree import pyap from urllib.parse import urlparse from selenium import webdriver from shutil import which from scrapy_selenium import SeleniumRequest from address_parser import Parser import requests address_parser = Parser() #yield SeleniumRequest(url=url, callback=self.parse_result) """SELENIUM_DRIVER_NAME = 'chrome' driver_path = which('/home/val/coding/chromedriver') SELENIUM_DRIVER_ARGUMENTS=['-headless']""" options = webdriver.ChromeOptions() options.add_argument('headless') driver = webdriver.Chrome(executable_path='/home/val/coding/chromedriver', options=options) def fill_data_base(frame): # Automatic filling the data base with scraped information
# Demo script: pull an email address, a phone number, and a parsed street
# address out of sample strings, then print each piece.
from address_parser import Parser
import re

addresss = '387 View Ave apt4 Twin Falls, ID 83301'
info = 'Email: [email protected] Phone: +1 (956) 8574114'

# FIX: regex patterns must be raw strings — '\S' and '\d' inside plain
# strings emit SyntaxWarning on modern Python (and will become errors).
# The resulting pattern text is byte-identical.
email = re.findall(r'\S+@\S+', info)
ok = re.findall(r'\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4}', info)

parser = Parser()
adr = parser.parse(addresss)

print(f'{adr.number.number} {adr.road.direction} {adr.road.name} {adr.road.suffix}')
print(f'{adr.text}')
print(f'{email[0]}')
print(f'{ok[0]}')
# Fragment: runs a PubMed query and prints each author's affiliation; the
# address-parsing calls are commented out. Relies on 'query', 'pubmed',
# 'json', and 'Parser' defined earlier, outside this chunk — code left
# byte-identical.
print(query) # Execute the query against the API results = pubmed.query(query, max_results=3) # Loop over the retrieved articles for article in results: # Print the type of object we've found (can be either PubMedBookArticle or PubMedArticle) # print(type(article)) # Print a JSON representation of the object #print(article.toJSON()) # parse x: jsonData = json.loads(article.toJSON()) # the result is a Python dictionary: # print(jsonData['authors']) authors_list = jsonData['authors'] parser = Parser() for index in range(0, len(authors_list)): print("affiliation ", authors_list[index]["affiliation"]) # print("Author number ",index," ", authors_list[index]['firstname']," ",authors_list[index]["lastname"]) # line = re.sub(r"[\w\W]* ((Hospital|University|Centre|Law School|School|Academy|Department)[\w -]*)[\w\W]*$", # r"\1", authors_list[index]["affiliation"]) # print("Institute name ",line) affiliation = authors_list[index]["affiliation"] # adr = parser.parse(affiliation) # print(adr.dict) # print(adr)
class Geocoder(object):
    """Geocodes parsed street addresses against a partition that holds a
    'geocoder' table (same structure as clarinova.com-geocode-casnd).
    """

    def __init__(self, partition, city_subs=None):
        """Wrap *partition*; *city_subs* maps city names (any case) to
        replacement names.
        """
        from address_parser import Parser
        self.p = partition
        self.address_cache = {}  # memoized results, keyed by address hash
        self.city_subs = {
            k.lower(): v for k, v in city_subs.items()} if city_subs else {}
        self.parser = Parser()

    def parse_and_code(self, addrstr, city=None, state=None, zip=None):
        """Parse *addrstr* and geocode it, memoizing by the parsed hash.

        Returns (address_id_or_None, geocoder_row_or_None, parsed_address).
        """
        adr = self.parser.parse(addrstr, city=city, state=state, zip=zip)

        if adr.hash in self.address_cache:
            r = self.address_cache[adr.hash]
            if r:
                address_id = r['address_id']
            else:
                address_id = None
        else:
            r = self.geocode(**adr.args)
            if r:
                address_id = r['address_id']
                self.address_cache[adr.hash] = r
            else:
                self.address_cache[adr.hash] = None
                address_id = None

        return address_id, r, adr

    def geocode(self, number, name, direction=None, suffix=None,
                city=None, state=None, zip=None):
        """Return a record from the geocoder table, or None if no street
        within +/- 100 house numbers matches.

        This function expects a partition, p, that holds a table named
        'geocoder', of the same structure as used in
        clarinova.com-geocode-casnd
        """
        # Missing components are normalized to '-', matching how the
        # geocoder table stores empty values.
        direction = direction.upper() if direction else '-'
        suffix = suffix.title() if suffix else '-'
        city = city.title() if city else '-'

        if city.lower() in self.city_subs:
            city = self.city_subs[city.lower()].title()

        # BUG FIX: split at most once so a malformed ZIP with several
        # hyphens no longer raises ValueError here.
        if isinstance(zip, basestring) and '-' in zip:
            zip, zsuffix = zip.split('-', 1)

        zip = zip if zip else -1

        # BUG FIX: narrowed the bare 'except:' (which also swallowed
        # KeyboardInterrupt/SystemExit) to the conversion errors int() can
        # actually raise.
        try:
            zip = int(zip)
        except (TypeError, ValueError):
            zip = -1

        suffix = suffix.lower()

        # We don't need to check for nulls in direction, b/c entries without
        # directions have the value '-'
        q = """ SELECT *, ( CASE WHEN city = :city THEN 10 ELSE 0 END + CASE WHEN zip = :zip THEN 10 ELSE 0 END + CASE WHEN suffix = :suffix THEN 10 ELSE 0 END ) AS score, ABS(number - :number) as ndist FROM geocoder WHERE name = :name AND direction = :direction AND score >= 20 AND number BETWEEN (:number-100) AND (:number+100) ORDER BY ABS(number - :number), score LIMIT 1; """

        r = self.p.query(
            q, number=number, name=name, direction=direction,
            suffix=suffix, city=city, state=state, zip=zip).first()

        if not r:
            return None

        r = dict(r)

        # Confidence folds the match score and the house-number distance
        # into a 0..1 value.
        r['confidence'] = round(
            (100.0 - (30.0 - r['score']) - (r['ndist'] / 2.0)) / 100.0, 3)
        # Stored lat/lon are fixed-point integers scaled by 1e8.
        r['lat'] = float(r['lat']) / 100000000.0
        r['lon'] = float(r['lon']) / 100000000.0

        return r

    def geocode_intersection(self, street1, street2):
        # Not implemented.
        pass
# Python 2 analysis script (cPickle): loads pickled GPS and ticket
# accumulators and builds 100-block street keys from parsed addresses.
# NOTE(review): 'Parser' is imported twice, and the 'streets' list literal is
# truncated at the end of this chunk — code left byte-identical.
from ambry_sources.mpf import MPRowsFile from address_parser import Parser import cPickle as pickle from address_parser import Parser parser = Parser() with open('gps_dump.pkl', 'rb') as f: gps_acc = pickle.load(f) with open('tickets.pkl', 'rb') as f: tickets_acc = pickle.load(f) from collections import defaultdict d = defaultdict(lambda : [0,0]) def mkstreet(number, name, suffix): return "{} {} {}".format(number,name, suffix) # Create a dict of date/street pairs, then mark them for if the pair # was swept, then if the pair was ticketed for base_street, dates in gps_acc.items(): ps = parser.parse(base_street) # Expand each street block to the 100 block before and after, to deal # with possible missing GPS reverse-geocodes streets = [mkstreet(ps.number.number, ps.road.name, ps.road.suffix), mkstreet(ps.number.number+100, ps.road.name, ps.road.suffix),
# Queries PubMed, concatenates per-author fields with '||' separators into
# one CSV cell per article, and writes a dated CSV when any rows matched.
# NOTE(review): the nesting of 'if self.is_us:' relative to 'self.is_us =
# False' cannot be recovered from this collapsed text, so the code is left
# byte-identical. The method name it calls, 'has_match_zipcode_of_authprs',
# appears to contain a typo ('authprs') — defined elsewhere; confirm before
# renaming.
def get_pubmed_data(self, query, searched_zipcode, date, maximum_number_of_value=3): csv_data = { "affiliation": [], "number_of_authors": [], "authors_name": [], "authors_institute": [], "authors_address": [], "authors_zipcode": [], "paper_title": [], "publication_date": [], "journal": [] } pubmed = PubMed(tool="MyTool", email="*****@*****.**") parser = Parser() results = pubmed.query(query, max_results=maximum_number_of_value) is_queried_by_zipcode = searched_zipcode.isdecimal() if is_queried_by_zipcode: searched_zipcode = int(searched_zipcode) for article in results: jsonData = json.loads(article.toJSON()) authors_list = jsonData['authors'] authors_name = "" authors_institute = "" authors_affiliation = "" authors_address = "" authors_zipcode = "" num_authors = len(authors_list) or 0 counted_matched = 0 if is_queried_by_zipcode: counted_matched = self.has_match_zipcode_of_authprs( authors_list, searched_zipcode) if (not is_queried_by_zipcode) or (is_queried_by_zipcode and counted_matched > 0): for index in range(0, num_authors): affiliation = authors_list[index][ "affiliation"] or "<NOT_AVAILABLE>" zipcode = str(self.get_address_with_zipcode(affiliation)) # print(type(zipcode)) # print(zipcode) author_name = authors_list[index][ 'firstname'] + " " + authors_list[index][ "lastname"] or "<NOT_AVAILABLE>" author_institute = "" author_institute += self.get_organization( affiliation=affiliation) + " " authors_affiliation += affiliation authors_name += author_name authors_institute += author_institute authors_address += str(parser.parse(affiliation)) authors_zipcode += zipcode if num_authors != index + 1: authors_name += "||" authors_institute += "||" authors_affiliation += "||" authors_address += "||" authors_zipcode += "||" else: break paper_title = jsonData['title'] or "<NOT_AVAILABLE>" publication_date = jsonData['publication_date'] or "<NOT_AVAILABLE>" journal = jsonData['journal'] or "<NOT_AVAILABLE>" if self.is_us: if not is_queried_by_zipcode or 
(is_queried_by_zipcode and counted_matched > 0): csv_data["authors_name"].append(authors_name) csv_data["affiliation"].append(authors_affiliation) csv_data["authors_institute"].append(authors_institute) csv_data["paper_title"].append(paper_title) csv_data["publication_date"].append(publication_date) csv_data["journal"].append(journal) csv_data["authors_address"].append(authors_address) csv_data["number_of_authors"].append(num_authors) csv_data["authors_zipcode"].append(authors_zipcode) self.is_us = False # if not is_queried_by_zipcode or (is_queried_by_zipcode and counted_matched > 0): # # df = pd.DataFrame(csv_data) # # print(df.head()) # df.to_csv("PubMedData_from.csv", index=False) print("Size of csv ", len(csv_data["paper_title"])) if len(csv_data["paper_title"]) > 0: df = pd.DataFrame(csv_data) print(df.head()) datetimeobject = datetime.datetime.strptime(date, '%Y/%m/%d') csv_file_name = "PubMedData_From_" + datetimeobject.strftime( '%Y_%m_%d') + ".csv" print(csv_file_name) df.to_csv(csv_file_name, index=False)
# Python 2 script: streams ticket rows from an MPR file, parses each
# location, and accumulates issue dates per 100-block street key.
# NOTE(review): truncated at 'if i % 10000 == 0:' — the conditional's body
# continues beyond this chunk, so the code is left byte-identical. The
# 'print row' inside the hot loop is presumably leftover debugging.
from ambry_sources.mpf import MPRowsFile from address_parser import Parser import cPickle as pickle import time f = MPRowsFile('/Users/eric/proj/virt/ambry10/library/build/nbcuni.com/streetsweep/nbcuni.com/streetsweep-0.0.1/tickets.mpr') parser = Parser() start = time.time() s = 0 from collections import defaultdict acc = defaultdict(set) with f.reader as r: for i, row in enumerate(r, 1): adr = row.locationdesc1 if adr: ps = parser.parse(adr) dt = row.issuedate if ps.number.number > 0 and dt: number = int(ps.number.number / 100) * 100 key = "{} {} {}".format(number, ps.road.name, ps.road.suffix) print row acc[key].add(dt) if i % 10000 == 0:
def test_address_files(self):
    """Round-trip each fixture file through the parser, write the parsed
    rows to a CSV, and print a total/success/failure summary.
    """
    import os
    import csv

    parser = Parser()
    success = 0
    failure = 0
    total = 0

    for filename in ["crime_addresses"]:
        support_dir = os.path.join(os.path.dirname(__file__), 'support')
        f_input = os.path.join(support_dir, filename + '.txt')
        f_output = os.path.join(support_dir, filename + '.out.csv')

        with open(f_output, 'w') as out:
            writer = csv.DictWriter(out, self.header)
            writer.writeheader()

            with open(f_input) as f:
                for line in f:
                    total += 1

                    try:
                        ps = parser.parse(line)
                        if not ps:
                            failure += 1
                            continue
                    except TypeError:
                        # Programming errors should surface, not be counted.
                        raise
                    except Exception as e:
                        print("ERROR", e)
                        failure += 1
                        continue

                    record = ps.dict
                    record['input'] = line.strip()
                    record['output'] = str(ps)

                    # Drop the composite fields that don't belong in the CSV.
                    omitted = ('hash', 'locality', 'text', 'road')
                    row = {k: v for k, v in record.items() if k not in omitted}
                    writer.writerow(row)

                    # THe parser strips 'BLOCK', and '/' is an intersection
                    changed = line.strip() != str(ps)
                    if changed and 'block' not in line.lower() and '/' not in line:
                        failure += 1
                        print('-----')
                        print(line.strip())
                        print(ps)
                        print()
                    else:
                        success += 1

    print()
    print("total={} success={} failure={} rate={}".format(
        total, success, failure,
        round((float(failure) / float(total) * 100), 3)))