def test_geocode(self): from ambry.geo.geocoder import Geocoder from address_parser import Parser import csv import ambry import os import csv l = ambry.library() gp = l.get('clarinova.com-geocode-casnd-geocoder').partition f_intersections = os.path.join(os.path.dirname(__file__), 'support', 'intersections.csv') q = """ SELECT * FROM geocoder WHERE name = :name AND direction = :direction AND suffix = suffix """ p = Parser() with open(f_intersections) as f: reader = csv.DictReader(f) for r in reader: ps = p.parse('1000 ' + r['primary_rd']) print ps.road.dict for qr in gp.query(q, **ps.road.dict): print " ", qr
def test_address_files(self): import os from address_parser import Parser import csv parser = Parser() success = 0 failure = 0 total = 0 filename = "crime_addresses" f_input = os.path.join(os.path.dirname(__file__), 'support', filename + '.txt') f_output = os.path.join(os.path.dirname(__file__), 'support', filename + '.out.csv') with open(f_output, 'w') as out: writer = csv.DictWriter(out, self.header) writer.writeheader() with open(f_input) as f: for line in f: total += 1 print '----' print line.strip() try: ps = parser.parse(line) if not ps: failure += 1 continue except Exception as e: print "ERROR", e failure += 1 continue print ps continue d = ps.dict d['input'] = line.strip() d['output'] = str(ps) #writer.writerow(d) print d.keys() if not ps.city: failure += 1 print d print ps print else: success += 1 print print "total={} success={} failure={} rate={}".format( total, success, failure, round((float(failure) / float(total) * 100), 3))
def test_hash(self):
    """Parse a messy variant of a known address and dump the parsed dict.

    NOTE(review): despite the name, this test asserts nothing and never
    used `a1` (the canonical spelling of the same address). Presumably it
    should parse both forms and compare their hashes -- TODO confirm.
    """
    from pprint import pprint

    # Canonical form, previously assigned to an unused local `a1`:
    #   '119 WEST WINTON AVENUE, HAYWARD, CA, 94544'
    a2 = '119 Winton Ave., Hayward, Ca, 94544-5000'

    parser = Parser()
    r = parser.parse(a2)
    pprint(r.dict)
def __init__(self, partition, city_subs=None):
    """Bind this geocoder to a data partition.

    City-substitution keys are lower-cased up front so later lookups can
    be case-insensitive.
    """
    from address_parser import Parser

    self.p = partition
    self.address_cache = {}

    if city_subs:
        self.city_subs = dict(
            (name.lower(), sub) for name, sub in city_subs.items())
    else:
        self.city_subs = {}

    self.parser = Parser()
def chunked_geocode(addresses, state=None, chunk_size=250):
    """Geocode an iterable of (unique_id, address) pairs in batches.

    Each address is parsed, reduced to the request-row shape the geocoding
    service expects, and sent in batches of `chunk_size`. Yields
    (row_number, True, result_dict) per response row. When `state` is given
    it overrides the state parsed from each address.
    """
    # Each address entry must be a tuple of (unique_id, address)
    parser = Parser()

    row_n = 0
    request_rows = []

    for uid, address_line in addresses:
        p = parser.parse(address_line)

        request_rows.append([
            uid,
            p.street_str(),
            p.locality.city,
            state or p.locality.state,
            p.locality.zip,
        ])

        # BUG FIX: the original tested `> chunk_size`, so each request
        # carried chunk_size + 1 rows; `>=` honors the advertised size.
        if len(request_rows) >= chunk_size:
            for row in make_request(request_rows):
                # row columns are: unique_id input_address match quality
                # match_address latlon tiger_id side_of_street state_fips
                # county_fips tract_fips block_fips
                yield row_n, True, mkdict(row)
                row_n += 1

            request_rows = []

    # BUG FIX: guard the tail flush -- the original always issued one final
    # request, even with an empty batch.
    if request_rows:
        for row in make_request(request_rows):
            # Same column layout as above.
            yield row_n, True, mkdict(row)
            row_n += 1
# Scraper module setup: parsing, HTTP, and browser-automation dependencies,
# plus module-level parser and WebDriver instances shared by the helpers.
import pandas as pd
from scrapy.selector import Selector
from bs4 import BeautifulSoup
import re
import time
import requests
import xml.etree.ElementTree as etree
import pyap
from urllib.parse import urlparse
from selenium import webdriver
from shutil import which
from scrapy_selenium import SeleniumRequest
from address_parser import Parser
import requests  # NOTE(review): duplicate of the import above -- harmless but redundant

# Single shared address parser for the module.
address_parser = Parser()

#yield SeleniumRequest(url=url, callback=self.parse_result)

# Disabled scrapy-selenium settings, kept as a string literal for reference.
"""SELENIUM_DRIVER_NAME = 'chrome'
driver_path = which('/home/val/coding/chromedriver')
SELENIUM_DRIVER_ARGUMENTS=['-headless']"""

# Start a headless Chrome at import time.
# NOTE(review): hard-coded chromedriver path -- confirm it is valid on the
# deployment machine; consider making it configurable.
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome(executable_path='/home/val/coding/chromedriver', options=options)


def fill_data_base(frame):
    # Automatic filling the data base with scraped information
    # (function body continues beyond this chunk)
"""Extract and print the street address, e-mail, and phone number from a
sample contact block."""
from address_parser import Parser
import re

addresss = '387 View Ave apt4 Twin Falls, ID 83301'
info = 'Email: [email protected] Phone: +1 (956) 8574114'

# BUG FIX: regex patterns are now raw strings. In plain strings, '\S', '\d'
# and '\(' are invalid escape sequences -- a SyntaxWarning on modern Python
# and slated to become errors.
email = re.findall(r'\S+@\S+', info)

# Matches 956-857-4114 style, (956) 857-4114 style, or bare 857-4114.
ok = re.findall(
    r'\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}'
    r'|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}'
    r'|\d{3}[-\.\s]??\d{4}',
    info)

parser = Parser()
adr = parser.parse(addresss)

print(f'{adr.number.number} {adr.road.direction} {adr.road.name} {adr.road.suffix}')
print(f'{adr.text}')
print(f'{email[0]}')
print(f'{ok[0]}')
def get_pubmed_data(self, query, searched_zipcode, date, maximum_number_of_value=3):
    """Query PubMed and collect per-article author/affiliation data into a CSV.

    query: PubMed query string.
    searched_zipcode: if it is all digits, results are filtered to articles
        with at least one author whose affiliation matches that zipcode.
    date: '%Y/%m/%d' string used only to name the output CSV file.
    maximum_number_of_value: cap on the number of PubMed results fetched.

    Writes PubMedData_From_<date>.csv when any rows were collected.
    """
    # Column accumulator; one list entry per accepted article.
    csv_data = {
        "affiliation": [],
        "number_of_authors": [],
        "authors_name": [],
        "authors_institute": [],
        "authors_address": [],
        "authors_zipcode": [],
        "paper_title": [],
        "publication_date": [],
        "journal": []
    }

    pubmed = PubMed(tool="MyTool", email="*****@*****.**")
    parser = Parser()

    results = pubmed.query(query, max_results=maximum_number_of_value)

    # A purely numeric search term is treated as a zipcode filter.
    is_queried_by_zipcode = searched_zipcode.isdecimal()
    if is_queried_by_zipcode:
        searched_zipcode = int(searched_zipcode)

    for article in results:
        jsonData = json.loads(article.toJSON())

        authors_list = jsonData['authors']

        # Per-article "||"-joined accumulator strings, one segment per author.
        authors_name = ""
        authors_institute = ""
        authors_affiliation = ""
        authors_address = ""
        authors_zipcode = ""

        num_authors = len(authors_list) or 0

        counted_matched = 0
        if is_queried_by_zipcode:
            # NOTE(review): method name has a typo ("authprs") -- defined
            # elsewhere in this class; renaming would need both sites.
            counted_matched = self.has_match_zipcode_of_authprs(
                authors_list, searched_zipcode)

        # Accept the article unless a zipcode filter is active and no
        # author matched it.
        if (not is_queried_by_zipcode) or (is_queried_by_zipcode and counted_matched > 0):
            for index in range(0, num_authors):
                affiliation = authors_list[index][
                    "affiliation"] or "<NOT_AVAILABLE>"

                zipcode = str(self.get_address_with_zipcode(affiliation))
                # print(type(zipcode))
                # print(zipcode)

                # NOTE(review): `or` binds after `+`, so "<NOT_AVAILABLE>"
                # is only used when the whole concatenation is falsy (never,
                # since it contains " ") -- presumably not what was intended.
                author_name = authors_list[index][
                    'firstname'] + " " + authors_list[index][
                    "lastname"] or "<NOT_AVAILABLE>"

                author_institute = ""
                author_institute += self.get_organization(
                    affiliation=affiliation) + " "

                authors_affiliation += affiliation
                authors_name += author_name
                authors_institute += author_institute
                authors_address += str(parser.parse(affiliation))
                authors_zipcode += zipcode

                if num_authors != index + 1:
                    # More authors follow: add the field separators.
                    authors_name += "||"
                    authors_institute += "||"
                    authors_affiliation += "||"
                    authors_address += "||"
                    authors_zipcode += "||"
                else:
                    break

        paper_title = jsonData['title'] or "<NOT_AVAILABLE>"
        publication_date = jsonData['publication_date'] or "<NOT_AVAILABLE>"
        journal = jsonData['journal'] or "<NOT_AVAILABLE>"

        # self.is_us is presumably set by one of the helper calls above for
        # US-based affiliations -- TODO confirm against the class definition.
        if self.is_us:
            if not is_queried_by_zipcode or (is_queried_by_zipcode and counted_matched > 0):
                csv_data["authors_name"].append(authors_name)
                csv_data["affiliation"].append(authors_affiliation)
                csv_data["authors_institute"].append(authors_institute)
                csv_data["paper_title"].append(paper_title)
                csv_data["publication_date"].append(publication_date)
                csv_data["journal"].append(journal)
                csv_data["authors_address"].append(authors_address)
                csv_data["number_of_authors"].append(num_authors)
                csv_data["authors_zipcode"].append(authors_zipcode)

            # Reset the per-article US flag for the next iteration.
            # NOTE(review): original indentation was ambiguous here -- this
            # placement (inside `if self.is_us:`) is the most plausible read.
            self.is_us = False

        # if not is_queried_by_zipcode or (is_queried_by_zipcode and counted_matched > 0):
        #
        # df = pd.DataFrame(csv_data)
        #
        # print(df.head())
        # df.to_csv("PubMedData_from.csv", index=False)

    print("Size of csv ", len(csv_data["paper_title"]))

    if len(csv_data["paper_title"]) > 0:
        df = pd.DataFrame(csv_data)
        print(df.head())

        # Build the output filename from the supplied query date.
        datetimeobject = datetime.datetime.strptime(date, '%Y/%m/%d')
        csv_file_name = "PubMedData_From_" + datetimeobject.strftime(
            '%Y_%m_%d') + ".csv"
        print(csv_file_name)

        df.to_csv(csv_file_name, index=False)
def test_address_files(self):
    """Round-trip each fixture address through the parser, write the parsed
    rows to <name>.out.csv, and print a success/failure tally.

    Inputs containing 'block' or '/' are excused from the round-trip check:
    the parser strips 'BLOCK', and '/' marks an intersection.
    """
    import os
    import csv

    parser = Parser()

    success = failure = total = 0

    support_dir = os.path.join(os.path.dirname(__file__), 'support')

    for filename in ["crime_addresses"]:
        f_input = os.path.join(support_dir, filename + '.txt')
        f_output = os.path.join(support_dir, filename + '.out.csv')

        with open(f_output, 'w') as out:
            writer = csv.DictWriter(out, self.header)
            writer.writeheader()

            with open(f_input) as f:
                for line in f:
                    total += 1

                    try:
                        ps = parser.parse(line)
                        if not ps:
                            failure += 1
                            continue
                    except TypeError:
                        # Programming errors should surface, not be tallied.
                        raise
                    except Exception as e:
                        print("ERROR", e)
                        failure += 1
                        continue

                    stripped = line.strip()
                    rendered = str(ps)

                    d = ps.dict
                    d['input'] = stripped
                    d['output'] = rendered

                    # Drop the columns that don't belong in the CSV.
                    row = dict(d.items())
                    for key in ('hash', 'locality', 'text', 'road'):
                        del row[key]
                    writer.writerow(row)

                    mismatch = (stripped != rendered
                                and 'block' not in line.lower()
                                and '/' not in line)
                    if mismatch:
                        failure += 1
                        print('-----')
                        print(stripped)
                        print(ps)
                        print()
                    else:
                        success += 1

    print()
    print("total={} success={} failure={} rate={}".format(
        total, success, failure,
        round((float(failure) / float(total) * 100), 3)))