Example #1
0
# Logging stuff.
address_errors = []
# Use these maps to write out one error per source address/source name pair.
source_map = {}  # source_address => [source_names]
source_address_map = {}  # street_address => [source_addresses]
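# A minimal sketch of the intended dedup, assuming errors are keyed on the raw
# source address (source_address, source_name and error_row below are
# illustrative names, not from this script):
#
#     seen = source_map.setdefault(source_address, [])
#     if source_name not in seen:
#         seen.append(source_name)
#         address_errors.append(error_row)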

"""MAIN"""

addresses = []
street_addresses_seen = set()
address_tags = []
parser_address_tags = []
address_tag_strings = set()  # Pipe-joined addr/key/value triples
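# (e.g. an entry might look like '1234 MARKET ST|zip4|19107' - key and value are illustrative)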
source_addresses = []
links = []  # dicts of address, relationship, address triples
parser = PassyunkParser()
parsed_addresses = {}

if WRITE_OUT:
    print('Dropping indexes...')
    for table in (address_table, address_tag_table, source_address_table):
        table.drop_index('street_address')
    address_link_table.drop_index('address_1')
    address_link_table.drop_index('address_2')

    print('Deleting existing addresses...')
    address_table.delete()
    print('Deleting existing address tags...')
    address_tag_table.delete()
    print('Deleting existing source addresses...')
    source_address_table.delete()
Example #2
0
import sys
import json
from passyunk.parser import PassyunkParser

parser = PassyunkParser()

parsed = parser.parse('253 PORT ROYAL')
#parsed = parser.parse('PHILAREDEVELOPMENTAUTHORITYSOPHILLY')
# print(parsed)

print(json.dumps(parsed, sort_keys=True, indent=2))

##53109644,09/09/2016 00:00:00,"ASSIGNMENT OF MORTGAGE","","SHELLPOINT MORTGAGE SERVICING","","",6311,"","","REGENT","ST","",""
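
# For orientation, the parsed result is a nested dict; an abridged, illustrative
# shape (keys taken from how other scripts in this collection use the output,
# values hypothetical) looks roughly like:
#
#     {
#         "components": {
#             "street_address": "253 PORT ROYAL AVE",
#             "address": {"low_num": "253", "high_num_full": None, "full": "253"},
#             "street": {"name": "PORT ROYAL", "predir": None, "postdir": None,
#                        "suffix": "AVE"},
#             "unit": {"unit_type": None, "unit_num": None}
#         },
#         ...
#     }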
Example #3
0
#########################
# DOR CONDOMINIUM ERROR #
#########################
print("Writing dor_condominium_error table...")
dor_condominium_error_table = etl.fromdb(read_conn, 'select * from dor_condominium_error') \
    .rename({'parcel_id': 'mapref', 'unit_num': 'condounit',}) \
    .tooraclesde(write_dsn, dor_condo_error_table_name)
###############################
# DOR PARCEL ADDRESS ANALYSIS #
###############################
print("Performing dor_parcel address analysis...")
import re
from passyunk.parser import PassyunkParser

street_name_re = re.compile('^[A-Z0-9 ]+$')
unit_num_re = re.compile(r'^[A-Z0-9\-]+$')
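# (illustrative matches: street_name_re accepts 'N 3RD ST' but not 'N. 3RD ST';
#  unit_num_re accepts '2-A' but not '2 A')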
parser = PassyunkParser(MAX_RANGE=9999999)

print('Reading streets...')
street_rows = etl.fromdb(
    read_conn,
    'select street_full, seg_id, street_code, left_from, left_to, right_from, right_to from street_segment'
)
street_code_map = {}  # street_full => street_code
street_full_map = {}  # street_code => street_full
seg_map = {}  # street_full => [seg rows]
street_headers = street_rows[0]
for street_row in street_rows[1:]:
    street_row = dict(zip(street_headers, street_row))
    street_code = street_row['street_code']
    street_full = street_row['street_full']
    seg_map.setdefault(street_full, [])
Example #4
0
import sys
from pprint import pprint
from passyunk.parser import PassyunkParser

p = PassyunkParser()
try:
    a = sys.argv[1]
except IndexError:
    print('No address specified')
    sys.exit(1)

r = p.parse(a)
pprint(r)
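
# Usage sketch (the script file name is illustrative):
#     python parse_address.py "1234 MARKET ST"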
Example #5
0
def addresses_view(query):
    """
    Looks up information about the address given in the query. Response is an
    object with the information for the matching address. The object includes:
    * A standardized, unambiguous address string
    * Address components
    * OPA #
    * DOR "ID"
    * L&I Key
    * Zoning something or other

    TODO: Give each address a score every time someone accesses it. This can be
          used for semi-intelligent ordering. For example, if I query for "440
          Broad St", I'll most often mean the school district building. However,
          with default ordering, a building on S Broad with a whole bunch of
          units comes up first. That's annoying. But if 440 N Broad was accessed
          a bunch of times, it should have a higher popularity score than any
          one of those units, and that should help it to the top of the list.

    TODO: Allow paginator to use skip/limit semantics instead of or in addition
          to page. Maybe allow one of page or skip but not both.

    TODO: Need a way to only return addresses that have OPA numbers. Filters?

    """
    query = query.strip('/')

    all_queries = list(filter(bool, (q.strip() for q in query.split(';'))))
    all_parsed = [PassyunkParser().parse(q) for q in all_queries]

    # Match a set of addresses. Filters will either be loose, where an omission
    # is ignored, or strict, where an omission is treated as an explicit NULL.
    # For example, if the street_predir is omitted, then we should still match
    # all addresses that match the rest of the information; this is a loose
    # filter. However, if we do not provide an address_high, we should assume
    # that we're not looking for a ranged address; this is a strict filter.
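    # NotNoneDict is not shown in this snippet; a minimal sketch consistent with
    # the loose behavior described above (an assumption, not the actual
    # implementation) is a dict that silently drops keys whose value is None:
    #
    #     class NotNoneDict(dict):
    #         def __init__(self, **kwargs):
    #             super().__init__({k: v for k, v in kwargs.items() if v is not None})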
    all_addresses = None

    for parsed in all_parsed:
        unit_type = parsed['components']['unit']['unit_type']
        unit_num = parsed['components']['unit']['unit_num']
        high_num = parsed['components']['address']['high_num_full']
        low_num = parsed['components']['address']['low_num']

        loose_filters = NotNoneDict(
            street_name=parsed['components']['street']['name'],
            address_low=(low_num if low_num is not None
                         else parsed['components']['address']['full']),
            address_low_suffix=parsed['components']['address']['addr_suffix'],
            address_low_frac=parsed['components']['address']['fractional'],
            street_predir=parsed['components']['street']['predir'],
            street_postdir=parsed['components']['street']['postdir'],
            street_suffix=parsed['components']['street']['suffix'],
        )
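        # Strict filters: unit_num passes through as-is, except when a unit type
        # was parsed without a number, in which case only rows with a blank
        # unit_num should match.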
        strict_filters = dict(
            address_high=high_num,
            unit_num=unit_num if unit_num or not unit_type else '',
        )

        addresses = AddressSummary.query\
            .filter_by(**loose_filters, **strict_filters)\
            .filter_by_unit_type(unit_type)\
            .include_child_units(
                'include_units' in request.args,
                is_range=high_num is not None,
                is_unit=unit_type is not None)\
            .exclude_non_opa('opa_only' in request.args)

        if all_addresses is None:
            all_addresses = addresses
        else:
            all_addresses = all_addresses.union(addresses)

    all_addresses = all_addresses.order_by_address()
    paginator = QueryPaginator(all_addresses)

    # Ensure that we have results
    normalized_addresses = [
        parsed['components']['street_address'] for parsed in all_parsed
    ]
    addresses_count = paginator.collection_size
    if addresses_count == 0:
        error = json_error(404, 'Could not find addresses matching query.', {
            'query': query,
            'normalized': normalized_addresses
        })
        return json_response(response=error, status=404)

    # Validate the pagination
    page_num, error = validate_page_param(request, paginator)
    if error:
        return json_response(response=error, status=error['status'])

    # Render the response
    addresses_page = paginator.get_page(page_num)
    serializer = AddressJsonSerializer(
        metadata={
            'query': query,
            'normalized': normalized_addresses
        },
        pagination=paginator.get_page_info(page_num))
    result = serializer.serialize_many(addresses_page)
    return json_response(response=result, status=200)
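
# Illustrative request against this view (the route path and address values are
# hypothetical; the query-string flags mirror what the code above reads from
# request.args):
#
#     GET /addresses/1234 market st;500 chestnut st?opa_only&include_units
#
# Semicolons separate multiple addresses, 'opa_only' drops addresses without OPA
# numbers, and 'include_units' pulls in child units.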
Example #6
0
#########################
# DOR CONDOMINIUM ERROR #
#########################
print("Writing dor_condominium_error table...")
dor_condominium_error_table = etl.fromdb(read_conn, 'select * from dor_condominium_error') \
    .rename({'parcel_id': 'mapref', 'unit_num': 'condounit',}) \
    .tooraclesde(write_dsn, dor_condo_error_table_name)
###############################
# DOR PARCEL ADDRESS ANALYSIS #
###############################
print("Performing dor_parcel address analysis...")
import re
from passyunk.parser import PassyunkParser

street_name_re = re.compile('^[A-Z0-9 ]+$')
unit_num_re = re.compile(r'^[A-Z0-9\-]+$')
parser = PassyunkParser(MAX_RANGE=9999999)

print('Reading streets...')
street_rows = etl.fromdb(read_conn,
                         'select street_full, seg_id, street_code, left_from, left_to, right_from, right_to from street_segment')
street_code_map = {}  # street_full => street_code
street_full_map = {}  # street_code => street_full
seg_map = {}  # street_full => [seg rows]
street_headers = street_rows[0]
for street_row in street_rows[1:]:
    street_row = dict(zip(street_headers, street_row))
    street_code = street_row['street_code']
    street_full = street_row['street_full']
    seg_map.setdefault(street_full, [])
    seg_map[street_full].append(street_row)
    street_code_map[street_full] = street_code
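# After this loop, seg_map maps a full street name to the list of its segment
# row dicts and street_code_map maps the same name to its street code;
# street_full_map, declared above, would presumably be filled with the reverse
# mapping further on.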
Example #7
0
def block_view(query):
    """
    Looks up information about the 100-range that the given address falls
    within.

    TODO: Consider matching the segment ID and finding the low and high. This
          would be instead of hardcoding a low of 0 and high of 100. Maybe this
          would go at a new route, like `segment` or `block-face`.
    """
    query = query.strip('/')

    parsed = PassyunkParser().parse(query)
    normalized_address = parsed['components']['street_address']

    # Ensure that we can get a valid address number
    try:
        address_num = int(
            parsed['components']['address']['low_num']
            if parsed['components']['address']['low_num'] is not None else
            parsed['components']['address']['full'])
    except ValueError:
        error = json_error(400, 'No valid block number provided.', {
            'query': query,
            'normalized': normalized_address
        })
        return json_response(response=error, status=400)

    # Match a set of addresses
    block_num = ((address_num // 100) * 100)
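    # e.g. 440 // 100 * 100 == 400, so '440 N BROAD ST' is reported as part of
    # the 400 block.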
    filters = NotNoneDict(
        street_name=parsed['components']['street']['name'],
        street_predir=parsed['components']['street']['predir'],
        street_postdir=parsed['components']['street']['postdir'],
        street_suffix=parsed['components']['street']['suffix'],
    )
    addresses = AddressSummary.query\
        .filter_by(**filters)\
        .filter(AddressSummary.address_low >= block_num)\
        .filter(AddressSummary.address_low < block_num + 100)\
        .exclude_children()\
        .exclude_non_opa('opa_only' in request.args)

    addresses = addresses.order_by_address()
    paginator = QueryPaginator(addresses)

    # Ensure that we have results
    addresses_count = paginator.collection_size
    if addresses_count == 0:
        error = json_error(
            404, 'Could not find any address on a block matching query.', {
                'query': query,
                'normalized': normalized_address
            })
        return json_response(response=error, status=404)

    # Validate the pagination
    page_num, error = validate_page_param(request, paginator)
    if error:
        return json_response(response=error, status=error['status'])

    # Render the response
    block_page = paginator.get_page(page_num)
    serializer = AddressJsonSerializer(
        metadata={
            'query': query,
            'normalized': normalized_address
        },
        pagination=paginator.get_page_info(page_num))
    result = serializer.serialize_many(block_page)
    return json_response(response=result, status=200)
Example #8
0
import petl as etl
import cx_Oracle
from passyunk.namestd import StandardName
from passyunk.parser import PassyunkParser
from passyunk.config import get_dsn

parser = PassyunkParser()
outfile = "landmarks.csv"
dsn = get_dsn('gsg')
dbo = cx_Oracle.connect(dsn)


def standardize(tmp):
    tmp = tmp.strip().upper()
    # Name standardization:
    # tmp_list = re.sub('[' + string.punctuation + ']', '', tmp).split()
    tmp_list = tmp.split()
    std = StandardName(tmp_list, False).output
    # Don't match on 'the' if first word
    tmp = ' '.join(std)
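    # (strip/upper above plus split/join here also collapse runs of internal whitespace)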
    return tmp


stmt = '''
with places as 
(
select name, address, globalid from namedplaces_polygons where address is not null and substr(name,1,1) NOT IN ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9') and public_ = 'Y'
union
select name, address, globalid from namedplaces_points where address is not null and substr(name,1,1) NOT IN ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9') and public_ = 'Y'
)
,
Example #9
0
import re
import csv
import os
from collections import OrderedDict
import petl as etl
import cx_Oracle
import boto3
from passyunk.parser import PassyunkParser
from config import get_dsn, get_bucket

month = '2018_09'
parser = PassyunkParser()

# Input locations
loc = r'C:/Projects/etl/data/usps/'
csbyst = '/pa.txt'
zip4 = '/pa'

# Output params
s3_bucket = get_bucket()
dsn = get_dsn('ais')
connection = cx_Oracle.Connection(dsn)
zip4_write_table_name = 'USPS_ZIP4S'
cityzip_write_table_name = 'USPS_CITYZIP'
alias_write_table_name = 'USPS_ALIAS'
address_standardization_report_table_name = 'USPS_ZIP4_ADDRESS_CHECK'
alias_outfile_path = alias_write_table_name.lower() + '.csv'
cityzip_outfile_path = cityzip_write_table_name.lower() + '.csv'
zip4_outfile_path = zip4_write_table_name.lower() + '.csv'
temp_zip4_outfile_path = 'T_' + zip4_outfile_path
#####################################
Example #10
0
import re
import csv
import os
from collections import OrderedDict
import petl as etl
import cx_Oracle
import boto3
from passyunk.parser import PassyunkParser
from config import get_dsn, get_bucket

month = '2020_03'
parser = PassyunkParser()

# Input locations
loc = r'C:/Projects/etl/data/usps/'
csbyst = '/pa.txt'
zip4 = '/pa'

# Output params
s3_bucket = get_bucket()
dsn = get_dsn('ais')
connection = cx_Oracle.Connection(dsn)
zip4_write_table_name = 'USPS_ZIP4S'
cityzip_write_table_name = 'USPS_CITYZIP'
alias_write_table_name = 'USPS_ALIAS'
address_standardization_report_table_name = 'USPS_ZIP4_ADDRESS_CHECK'
alias_outfile_path = alias_write_table_name.lower() + '.csv'
cityzip_outfile_path = cityzip_write_table_name.lower() + '.csv'
zip4_outfile_path = zip4_write_table_name.lower() + '.csv'
temp_zip4_outfile_path = 'T_' + zip4_outfile_path
#####################################
plates = None
plates_counter = None
ticket_numbers = None
ticket_numbers_counter = None
centroids = None
geocode_stats = {
    'total': 0,
    'success': 0,
    'gps': 0,
    'zip': 0,
    'failed_address': 0,
    'failed_segment': 0,
    'failed_segments': set()
}

passyunk_parser = PassyunkParser()
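
# fieldmap (below) describes a fixed-width record layout: each field is read by
# slicing the raw line between start_pos and end_pos. A minimal sketch of how it
# would presumably be applied (parse_line is a hypothetical helper, not from
# this script):
#
#     def parse_line(line, fieldmap):
#         return {name: line[spec['start_pos']:spec['end_pos']].strip()
#                 for name, spec in fieldmap.items()}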

fieldmap = OrderedDict([('anon_ticket_number', {
    'type': 'string',
    'start_pos': 0,
    'end_pos': 11
}), ('issue_datetime', {
    'type': 'datetime',
    'start_pos': 11,
    'end_pos': 26
}), ('state', {
    'type': 'string',
    'start_pos': 26,
    'end_pos': 28
}), ('anon_plate_id', {
    'type': 'string',