# Logging stuff.
address_errors = []
# Use these maps to write out one error per source address/source name pair.
source_map = {}          # source_address => [source_names]
source_address_map = {}  # street_address => [source_addresses]

"""MAIN"""

addresses = []
street_addresses_seen = set()
address_tags = []
parser_address_tags = []
address_tag_strings = set()  # Pipe-joined addr/key/value triples
source_addresses = []
links = []  # dicts of address, relationship, address triples
parser = PassyunkParser()
parsed_addresses = {}

if WRITE_OUT:
    print('Dropping indexes...')
    for table in (address_table, address_tag_table, source_address_table):
        table.drop_index('street_address')
    address_link_table.drop_index('address_1')
    address_link_table.drop_index('address_2')

    print('Deleting existing addresses...')
    address_table.delete()
    print('Deleting existing address tags...')
    address_tag_table.delete()
    print('Deleting existing source addresses...')
    source_address_table.delete()
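# Hypothetical illustration (not from this source) of how the dedup set
# above would be used: tags are keyed by a pipe-joined addr/key/value
# triple so the same tag is only written once per address. The variable
# names in this snippet are assumed.
#
#   tag_string = '|'.join([street_address, tag_key, tag_value])
#   if tag_string not in address_tag_strings:
#       address_tag_strings.add(tag_string)
#       address_tags.append(...)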
import sys
import json

from passyunk.parser import PassyunkParser

parser = PassyunkParser()
parsed = parser.parse('253 PORT ROYAL')
# parsed = parser.parse('PHILAREDEVELOPMENTAUTHORITYSOPHILLY')
# print(parsed)
print(json.dumps(parsed, sort_keys=True, indent=2))

# Sample source record kept for reference:
# 53109644,09/09/2016 00:00:00,"ASSIGNMENT OF MORTGAGE","","SHELLPOINT MORTGAGE SERVICING","","",6311,"","","REGENT","ST","",""
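# A minimal sketch of pulling individual components out of the parsed
# result. The keys below are the ones the views elsewhere in this codebase
# rely on (street_address, street, address, unit); the full structure may
# contain more.
components = parsed['components']
print(components['street_address'])      # standardized, unambiguous string
print(components['street']['name'])      # street name, e.g. 'PORT ROYAL'
print(components['address']['low_num'])  # low house number of the range
print(components['unit']['unit_type'])   # None when no unit is present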
import sys
from pprint import pprint

from passyunk.parser import PassyunkParser

p = PassyunkParser()

try:
    a = sys.argv[1]
except IndexError:
    # Exit here instead of falling through to parse an undefined name.
    sys.exit('No address specified')

r = p.parse(a)
pprint(r)
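# Example invocation (the script name here is hypothetical):
#   $ python parse.py "1234 MARKET ST"
#   {'components': {'address': {...}, 'street': {...}, ...}, ...}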
def addresses_view(query):
    """
    Looks up information about the address given in the query. Response is
    an object with the information for the matching address. The object
    includes:

    * A standardized, unambiguous address string
    * Address components
    * OPA #
    * DOR "ID"
    * L&I Key
    * Zoning something or other

    TODO: Give each address a score every time someone accesses it. This
    can be used for semi-intelligent ordering. For example, if I query for
    "440 Broad St", I'll most often mean the school district building.
    However, with default ordering, a building on S Broad with a whole
    bunch of units comes up first. That's annoying. But if 440 N Broad was
    accessed a bunch of times, it should have a higher popularity score
    than any one of those units, and that should help it to the top of the
    list.

    TODO: Allow paginator to use skip/limit semantics instead of or in
    addition to page. Maybe allow one of page or skip but not both.

    TODO: Need a way to only return addresses that have OPA numbers.
    Filters?
    """
    query = query.strip('/')

    all_queries = list(filter(bool, (q.strip() for q in query.split(';'))))
    all_parsed = [PassyunkParser().parse(q) for q in all_queries]

    # Match a set of addresses. Filters will either be loose, where an
    # omission is ignored, or strict, where an omission is treated as an
    # explicit NULL. For example, if the street_predir is omitted, then we
    # should still match all addresses that match the rest of the
    # information; this is a loose filter. However, if we do not provide an
    # address_high, we should assume that we're not looking for a ranged
    # address; this is a strict filter.
    all_addresses = None
    for parsed in all_parsed:
        unit_type = parsed['components']['unit']['unit_type']
        unit_num = parsed['components']['unit']['unit_num']
        high_num = parsed['components']['address']['high_num_full']
        low_num = parsed['components']['address']['low_num']

        loose_filters = NotNoneDict(
            street_name=parsed['components']['street']['name'],
            address_low=(low_num if low_num is not None
                         else parsed['components']['address']['full']),
            address_low_suffix=parsed['components']['address']['addr_suffix'],
            address_low_frac=parsed['components']['address']['fractional'],
            street_predir=parsed['components']['street']['predir'],
            street_postdir=parsed['components']['street']['postdir'],
            street_suffix=parsed['components']['street']['suffix'],
        )
        strict_filters = dict(
            address_high=high_num,
            unit_num=unit_num if unit_num or not unit_type else '',
        )

        addresses = AddressSummary.query\
            .filter_by(**loose_filters, **strict_filters)\
            .filter_by_unit_type(unit_type)\
            .include_child_units(
                'include_units' in request.args,
                is_range=high_num is not None,
                is_unit=unit_type is not None)\
            .exclude_non_opa('opa_only' in request.args)

        if all_addresses is None:
            all_addresses = addresses
        else:
            all_addresses = all_addresses.union(addresses)

    all_addresses = all_addresses.order_by_address()

    paginator = QueryPaginator(all_addresses)

    # Ensure that we have results
    normalized_addresses = [
        parsed['components']['street_address'] for parsed in all_parsed
    ]
    addresses_count = paginator.collection_size
    if addresses_count == 0:
        error = json_error(404, 'Could not find addresses matching query.', {
            'query': query,
            'normalized': normalized_addresses
        })
        return json_response(response=error, status=404)

    # Validate the pagination
    page_num, error = validate_page_param(request, paginator)
    if error:
        return json_response(response=error, status=error['status'])

    # Render the response
    addresses_page = paginator.get_page(page_num)
    serializer = AddressJsonSerializer(
        metadata={
            'query': query,
            'normalized': normalized_addresses
        },
        pagination=paginator.get_page_info(page_num))
    result = serializer.serialize_many(addresses_page)
    return json_response(response=result, status=200)
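# NotNoneDict is not defined in this excerpt. Based on the loose-filter
# comment in addresses_view ("loose, where an omission is ignored"), a
# minimal sketch of the assumed behavior is a dict that silently drops
# None-valued keys, so omitted address components never constrain the
# query:
class NotNoneDict(dict):
    """A dict that discards keys whose value is None."""

    def __init__(self, **kwargs):
        super().__init__((k, v) for k, v in kwargs.items() if v is not None)

    def __setitem__(self, key, value):
        if value is not None:
            super().__setitem__(key, value)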
#########################
# DOR CONDOMINIUM ERROR #
#########################

print("Writing dor_condominium_error table...")
dor_condominium_error_table = etl.fromdb(read_conn, 'select * from dor_condominium_error') \
    .rename({'parcel_id': 'mapref', 'unit_num': 'condounit'}) \
    .tooraclesde(write_dsn, dor_condo_error_table_name)

###############################
# DOR PARCEL ADDRESS ANALYSIS #
###############################

print("Performing dor_parcel address analysis...")

import re

from passyunk.parser import PassyunkParser

street_name_re = re.compile(r'^[A-Z0-9 ]+$')
unit_num_re = re.compile(r'^[A-Z0-9\-]+$')
parser = PassyunkParser(MAX_RANGE=9999999)

print('Reading streets...')
street_rows = etl.fromdb(
    read_conn,
    'select street_full, seg_id, street_code, left_from, left_to, right_from, right_to from street_segment'
)
street_code_map = {}  # street_full => street_code
street_full_map = {}  # street_code => street_full
seg_map = {}          # street_full => [seg rows]
street_headers = street_rows[0]
for street_row in street_rows[1:]:
    street_row = dict(zip(street_headers, street_row))
    street_code = street_row['street_code']
    street_full = street_row['street_full']
    seg_map.setdefault(street_full, [])
    seg_map[street_full].append(street_row)
    street_code_map[street_full] = street_code
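# A minimal sketch (not from this source) of how the maps built above could
# be used: look up the parsed street_full in seg_map, then test the house
# number against each segment's ranges. Column semantics (left_from/left_to
# and right_from/right_to as side-of-street house-number ranges) are assumed
# from the street_segment query; a real matcher would also apply the
# odd/even parity convention for street sides.
def find_seg_id(street_full, addr_num):
    for seg in seg_map.get(street_full, []):
        if seg['left_from'] <= addr_num <= seg['left_to'] \
                or seg['right_from'] <= addr_num <= seg['right_to']:
            return seg['seg_id']
    return None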
def block_view(query):
    """
    Looks up information about the 100-range that the given address falls
    within.

    TODO: Consider matching the segment ID and finding the low and high.
    This would be instead of hardcoding a low of 0 and high of 100. Maybe
    this would go at a new route, like `segment` or `block-face`.
    """
    query = query.strip('/')
    parsed = PassyunkParser().parse(query)
    normalized_address = parsed['components']['street_address']

    # Ensure that we can get a valid address number
    try:
        address_num = int(
            parsed['components']['address']['low_num']
            if parsed['components']['address']['low_num'] is not None
            else parsed['components']['address']['full'])
    except ValueError:
        error = json_error(400, 'No valid block number provided.', {
            'query': query,
            'normalized': normalized_address
        })
        return json_response(response=error, status=400)

    # Match a set of addresses
    block_num = ((address_num // 100) * 100)
    filters = NotNoneDict(
        street_name=parsed['components']['street']['name'],
        street_predir=parsed['components']['street']['predir'],
        street_postdir=parsed['components']['street']['postdir'],
        street_suffix=parsed['components']['street']['suffix'],
    )
    addresses = AddressSummary.query\
        .filter_by(**filters)\
        .filter(AddressSummary.address_low >= block_num)\
        .filter(AddressSummary.address_low < block_num + 100)\
        .exclude_children()\
        .exclude_non_opa('opa_only' in request.args)
    addresses = addresses.order_by_address()

    paginator = QueryPaginator(addresses)

    # Ensure that we have results
    addresses_count = paginator.collection_size
    if addresses_count == 0:
        error = json_error(
            404, 'Could not find any address on a block matching query.', {
                'query': query,
                'normalized': normalized_address
            })
        return json_response(response=error, status=404)

    # Validate the pagination
    page_num, error = validate_page_param(request, paginator)
    if error:
        return json_response(response=error, status=error['status'])

    # Render the response
    block_page = paginator.get_page(page_num)
    serializer = AddressJsonSerializer(
        metadata={
            'query': query,
            'normalized': normalized_address
        },
        pagination=paginator.get_page_info(page_num))
    result = serializer.serialize_many(block_page)
    return json_response(response=result, status=200)
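# Worked example of the block computation above: integer division floors
# the house number to its hundred-block, so any address from 1200 through
# 1299 maps to block 1200 and the query matches address_low in [1200, 1300).
#   (1234 // 100) * 100  ->  1200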
import petl as etl
import cx_Oracle

from passyunk.namestd import StandardName
from passyunk.parser import PassyunkParser
from passyunk.config import get_dsn

parser = PassyunkParser()
outfile = "landmarks.csv"
dsn = get_dsn('gsg')
dbo = cx_Oracle.connect(dsn)


def standardize(tmp):
    tmp = tmp.strip().upper()
    # Name standardization:
    # tmp_list = re.sub('[' + string.punctuation + ']', '', tmp).split()
    tmp_list = tmp.split()
    std = StandardName(tmp_list, False).output  # Don't match on 'the' if first word
    tmp = ' '.join(std)
    return tmp


stmt = '''
with places as (
    select name, address, globalid from namedplaces_polygons
    where address is not null
    and substr(name,1,1) NOT IN ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9')
    and public_ = 'Y'
    union
    select name, address, globalid from namedplaces_points
    where address is not null
    and substr(name,1,1) NOT IN ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9')
    and public_ = 'Y'
)
,
import re
import csv
import os
from collections import OrderedDict

import petl as etl
import cx_Oracle
import boto3

from passyunk.parser import PassyunkParser
from config import get_dsn, get_bucket

month = '2020_03'
parser = PassyunkParser()

# Input locations
loc = r'C:/Projects/etl/data/usps/'
csbyst = '/pa.txt'
zip4 = '/pa'

# Output params
s3_bucket = get_bucket()
dsn = get_dsn('ais')
connection = cx_Oracle.Connection(dsn)
zip4_write_table_name = 'USPS_ZIP4S'
cityzip_write_table_name = 'USPS_CITYZIP'
alias_write_table_name = 'USPS_ALIAS'
address_standardization_report_table_name = 'USPS_ZIP4_ADDRESS_CHECK'
alias_outfile_path = alias_write_table_name.lower() + '.csv'
cityzip_outfile_path = cityzip_write_table_name.lower() + '.csv'
zip4_outfile_path = zip4_write_table_name.lower() + '.csv'
temp_zip4_outfile_path = 'T_' + zip4_outfile_path

#####################################
# These imports are inferred from the names used below; the top of this
# script is not included in the excerpt.
from collections import OrderedDict

from passyunk.parser import PassyunkParser

plates = None
plates_counter = None
ticket_numbers = None
ticket_numbers_counter = None
centroids = None
geocode_stats = {
    'total': 0,
    'success': 0,
    'gps': 0,
    'zip': 0,
    'failed_address': 0,
    'failed_segment': 0,
    'failed_segments': set()
}
passyunk_parser = PassyunkParser()

# Fixed-width layout of the raw ticket file: each field is the slice of the
# line between start_pos and end_pos.
fieldmap = OrderedDict([
    ('anon_ticket_number', {
        'type': 'string',
        'start_pos': 0,
        'end_pos': 11
    }),
    ('issue_datetime', {
        'type': 'datetime',
        'start_pos': 11,
        'end_pos': 26
    }),
    ('state', {
        'type': 'string',
        'start_pos': 26,
        'end_pos': 28
    }),
    ('anon_plate_id', {
        'type': 'string',
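    # Sketch (assumed from the start_pos/end_pos entries, and kept as a
    # comment because the fieldmap above is truncated in this excerpt) of
    # how a raw fixed-width line would be sliced once the fieldmap is
    # complete; type coercion for 'datetime' fields is omitted:
    #
    #   def parse_raw_line(line, fieldmap):
    #       return {name: line[spec['start_pos']:spec['end_pos']].strip()
    #               for name, spec in fieldmap.items()}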