Exemple #1
0
def _normalize_address_str(address_val):
    """
    Normalize the address to conform to short abbreviations.

    If an invalid address_val is provided, None is returned.

    If a valid address is provided, a normalized version is returned.
    """

    # if this string is empty the regular expression in the sa wont
    # like it, and fail, so leave returning nothing
    if not address_val:
        return None

    # now parse the address into number, street name and street type
    parser = StreetAddressParser()
    addr = parser.parse(str(address_val))  # TODO: should probably use unicode()
    normalized_address = ''

    if not addr:
        return None

    if 'house' in addr and addr['house'] is not None:
        normalized_address = addr['house'].lstrip("0") #some addresses have leading zeros, strip them here

    if 'street_name' in addr and addr['street_name'] is not None:
        normalized_address = normalized_address + ' ' + addr['street_name']

    if 'street_type' in addr and addr['street_type'] is not None:
        normalized_address = normalized_address + ' ' + addr['street_type']

    formatter = StreetAddressFormatter()
    normalized_address = formatter.abbrev_street_avenue_etc(normalized_address)
 
    return normalized_address.lower().strip()
Exemple #2
0
 def PreProcessing_records(dt):
    '''
    Clean and normalize the ``Address`` column of ``dt`` in place.

    Steps, in order:
      1. upper-case every address and strip trailing whitespace;
      2. run the three custom formatters (number + TH fix, abbreviation
         fix, symbol removal), stripping trailing whitespace after each;
      3. abbreviate directions and street types with StreetAddressFormatter;
      4. upper-case again.

    dt is expected to expose an ``Address`` column (it is read both as
    ``dt.Address`` and ``dt['Address']``).

    Returns the same ``dt`` object with the rewritten column.
    '''
    dt.Address = dt.Address.apply(lambda x: str(x).upper())    # uppercase
    dt.Address = dt.Address.str.rstrip()     # remove trailing whitespace

    # Custom formatters, applied one after another to the whole column:
    #   USAddressFormatter_TH     - correct wrong combination of number + TH
    #   USAddressFormatter_Abbr   - replace wrong abbreviation
    #   USAddressFormatter_Symbol - remove useless symbols
    custom_formatters = (
        USAddressFormatter_TH,
        USAddressFormatter_Abbr,
        USAddressFormatter_Symbol,
    )
    for custom_formatter in custom_formatters:
        formatted = np.vectorize(custom_formatter)(np.asarray(dt.Address))
        dt['Address'] = formatted.tolist()
        dt['Address'] = dt.Address.str.rstrip()

    # Abbreviate with the 'streetaddress' formatter. The result has to be
    # assigned back to the column: rebinding the loop variable while
    # iterating over dt.Address would leave the DataFrame unchanged.
    abbr_formatter = StreetAddressFormatter()
    dt['Address'] = dt.Address.apply(
        lambda item: abbr_formatter.abbrev_street_avenue_etc(
            abbr_formatter.abbrev_direction(item)))

    dt.Address = dt.Address.apply(lambda x: str(x).upper())  # uppercase again

    return dt
    
    
    
    
    
    
    
    boundary_list = []
class TestStreetAddress(unittest.TestCase):
    """Street-type abbreviation checks on a parsed address."""

    def setUp(self):
        # A new parser/formatter pair is attached to the test instance.
        self.addr_parser = StreetAddressParser()
        self.addr_formatter = StreetAddressFormatter()

    def test_success_abbrev_street_avenue_etc(self):
        # 'Street' in the full street of the parsed address must come
        # back abbreviated as 'St'.
        parsed = self.addr_parser.parse('221B Baker Street')
        full_street = parsed['street_full']
        abbreviated = self.addr_formatter.abbrev_street_avenue_etc(full_street)
        eq_(abbreviated, 'Baker St')
class TestStreetAddress(unittest.TestCase):
    """Exercises StreetAddressFormatter on output of StreetAddressParser."""

    def setUp(self):
        # Parser and formatter are stored on the test instance.
        self.addr_parser = StreetAddressParser()
        self.addr_formatter = StreetAddressFormatter()

    def test_success_abbrev_street_avenue_etc(self):
        # Parse first, then abbreviate only the 'street_full' component.
        street_full = self.addr_parser.parse('221B Baker Street')['street_full']
        expected = 'Baker St'
        eq_(self.addr_formatter.abbrev_street_avenue_etc(street_full), expected)
Exemple #5
0
def _normalize_address_str(address_val):
    """
    Normalize the address to conform to short abbreviations.

    If an invalid address_val is provided, None is returned.

    If a valid address is provided, a normalized version is returned.
    """

    # if this string is empty the regular expression in the sa wont
    # like it, and fail, so leave returning nothing
    if not address_val:
        return None

    # now parse the address into number, street name and street type
    addr = usaddress.tag(str(address_val))[0]  # TODO: should probably use unicode()
    normalized_address = ''

    if not addr:
        return None

    if 'AddressNumber' in addr and addr['AddressNumber'] is not None:
        normalized_address = addr['AddressNumber'].lstrip("0")  # some addresses have leading zeros, strip them here

    if 'StreetNamePreDirectional' in addr and addr['StreetNamePreDirectional'] is not None:
        normalized_address = normalized_address + ' ' + _normalize_address_direction(addr['StreetNamePreDirectional'])

    if 'StreetName' in addr and addr['StreetName'] is not None:
        normalized_address = normalized_address + ' ' + addr['StreetName']

    if 'StreetNamePostType' in addr and addr['StreetNamePostType'] is not None:
        # remove any periods from abbreviations
        normalized_address = normalized_address + ' ' + _normalize_address_post_type(addr['StreetNamePostType'])

    if 'StreetNamePostDirectional' in addr and addr['StreetNamePostDirectional'] is not None:
        normalized_address = normalized_address + ' ' + _normalize_address_direction(addr['StreetNamePostDirectional'])

    formatter = StreetAddressFormatter()
    normalized_address = formatter.abbrev_street_avenue_etc(normalized_address)

    return normalized_address.lower().strip()
Exemple #6
0
def _normalize_address_str(address_val):
    """
    Normalize the address to conform to short abbreviations.

    If an invalid address_val is provided, None is returned.

    If a valid address is provided, a normalized version is returned.
    """

    # if this string is empty the regular expression in the sa wont
    # like it, and fail, so leave returning nothing
    if not address_val:
        return None

    # now parse the address into number, street name and street type
    parser = StreetAddressParser()
    addr = parser.parse(
        str(address_val))  # TODO: should probably use unicode()
    normalized_address = ''

    if not addr:
        return None

    if 'house' in addr and addr['house'] is not None:
        normalized_address = addr['house'].lstrip(
            "0")  # some addresses have leading zeros, strip them here

    if 'street_name' in addr and addr['street_name'] is not None:
        normalized_address = normalized_address + ' ' + addr['street_name']

    if 'street_type' in addr and addr['street_type'] is not None:
        normalized_address = normalized_address + ' ' + addr['street_type']

    formatter = StreetAddressFormatter()
    normalized_address = formatter.abbrev_street_avenue_etc(normalized_address)

    return normalized_address.lower().strip()
Exemple #7
0
def normalize_address_str(address_val, address_val_2, postal_code, extra_data):
    """
    Normalize the address to conform to short abbreviations.

    The address is tagged with ``usaddress``. The recognised components
    are normalized, joined with single spaces, abbreviated, suffixed with
    the postal code and lower-cased. Selected components are also written
    into ``extra_data``.

    Args:
        address_val: Primary address line. ``bytes`` are decoded as UTF-8
            and any other non-string value is converted with ``str()``.
        address_val_2: Optional second address line. It is appended to
            ``address_val`` before tagging unless it is falsy or contains
            the substring ``'lot'``.
        postal_code: Appended to the normalized address when the address
            could be tagged. ``None`` is skipped; other values are
            converted with ``str()``.
        extra_data: Dict updated in place. When tagging succeeds the keys
            'StreetNumber', 'StreetName', 'StreetNamePreDirectional',
            'StreetSuffix', 'StreetDirSuffix' and 'UnitNumber' are reset
            to '' and then filled from the tagged components
            ('StreetDirPrefix' is set only when a pre-directional exists).

    Returns:
        None if ``address_val`` is falsy. Otherwise a tuple
        ``(normalized_address, extra_data)``. If ``usaddress`` raises
        RepeatedLabelError or a UnicodeEncodeError occurs, the normalized
        address is just the cleaned first address line, lower-cased and
        stripped, and ``extra_data`` is returned unmodified.
    """
    # An empty value makes the parsing regular expression fail, so there
    # is nothing to do but return early.
    if not address_val:
        return None

    # Byte strings are decoded; anything else that is not text is
    # converted to text.
    if isinstance(address_val, bytes):
        address_val = address_val.decode('utf-8')
    elif not isinstance(address_val, str):
        address_val = str(address_val)

    # Remove odd characters that we come across: U+FFFD and its UTF-8
    # byte sequence (EF BF BD) decoded one byte per character.
    for odd_chars in ('\xef\xbf\xbd', '\uFFFD'):
        address_val = address_val.replace(odd_chars, '')

    # Remove lots, they are not part of a real address. re.split always
    # returns at least one element, so index 0 is safe. The pattern is a
    # raw string so that ``\s`` is not an invalid string escape.
    address_val = re.split(r',*\s[lL]ot\s', address_val)[0]

    # Tag the address into number, street name and street type.
    try:
        # CornerOf is mapped onto AddressNumber.
        if address_val_2 and ('lot' not in address_val_2):
            to_tag = str(address_val + ' ' + address_val_2)
        else:
            to_tag = str(address_val)
        addr = usaddress.tag(to_tag,
                             tag_mapping={'CornerOf': 'AddressNumber'})[0]
    except (usaddress.RepeatedLabelError, UnicodeEncodeError):
        # Either usaddress can't parse this at all, or there is some kind
        # of odd character issue that is not handled yet.
        return str(address_val).lower().strip(), extra_data

    # Address can be parsed, so let's format it.
    normalized_address = ''
    street_name = ''
    # NOTE(review): 'StreetNamePreDirectional' is reset here but the
    # pre-directional is stored under 'StreetDirPrefix' below -- confirm
    # which key callers read.
    for key in ('StreetNumber', 'StreetName', 'StreetNamePreDirectional',
                'StreetSuffix', 'StreetDirSuffix', 'UnitNumber'):
        extra_data[key] = ''

    number = addr.get('AddressNumber')
    if number is not None:
        normalized_address = _normalize_address_number(number)

    # The number suffix is glued to the number without a space.
    number_suffix = addr.get('AddressNumberSuffix')
    if number_suffix is not None:
        normalized_address = normalized_address + number_suffix

    extra_data['StreetNumber'] = normalized_address

    pre_directional = addr.get('StreetNamePreDirectional')
    if pre_directional is not None:
        direction = _normalize_address_direction(pre_directional)
        normalized_address = normalized_address + ' ' + direction
        extra_data['StreetDirPrefix'] = direction

    # Pre-modifier and pre-type are part of both the full address and the
    # street name reported in extra_data.
    for tag in ('StreetNamePreModifier', 'StreetNamePreType'):
        value = addr.get(tag)
        if value is not None:
            normalized_address = normalized_address + ' ' + value
            street_name = street_name + value + ' '

    name = addr.get('StreetName')
    if name is not None:
        normalized_address = normalized_address + ' ' + name
        street_name = street_name + name

    post_type = addr.get('StreetNamePostType')
    if post_type is not None:
        # remove any periods from abbreviations
        suffix = _normalize_address_post_type(post_type)
        normalized_address = normalized_address + ' ' + suffix
        # The suffix is a street type, so it gets the post-type
        # normalizer (the direction normalizer was applied here before).
        extra_data['StreetSuffix'] = suffix

    post_directional = addr.get('StreetNamePostDirectional')
    if post_directional is not None:
        direction = _normalize_address_direction(post_directional)
        normalized_address = normalized_address + ' ' + direction
        extra_data['StreetDirSuffix'] = direction

    subaddress_type = addr.get('SubaddressType')
    if subaddress_type is not None:
        normalized_address = (normalized_address + ' ' +
                              _normalize_secondary_address(subaddress_type))

    subaddress_id = addr.get('SubaddressIdentifier')
    if subaddress_id is not None:
        normalized_address = (normalized_address + ' ' +
                              _normalize_address_number(subaddress_id))

    occupancy_type = addr.get('OccupancyType')
    if occupancy_type is not None:
        normalized_address = (normalized_address + ' ' +
                              _normalize_secondary_address(occupancy_type))

    occupancy_id = addr.get('OccupancyIdentifier')
    if occupancy_id is not None:
        unit_number = _normalize_address_number(occupancy_id)
        normalized_address = normalized_address + ' ' + unit_number
        extra_data['UnitNumber'] = unit_number

    formatter = StreetAddressFormatter()
    normalized_address = formatter.abbrev_street_avenue_etc(
        normalized_address)
    # A missing postal code must not break normalization.
    if postal_code is not None:
        normalized_address = normalized_address + ' ' + str(postal_code)
    extra_data['StreetName'] = formatter.abbrev_street_avenue_etc(street_name)

    return normalized_address.lower().strip(), extra_data
Exemple #8
0
import pickle
import requests

#Imports for ParseAddress
import usaddress
from streetaddress import StreetAddressFormatter
from nltk.tag.stanford import StanfordNERTagger as Tagger
from geopy.geocoders import GoogleV3, Nominatim
import nltk
import geopy

# Module-level Stanford NER tagger (Tagger is nltk's StanfordNERTagger).
# First argument is the 3-class English CRF classifier, second the
# stanford-ner jar; both are given as relative paths.
tagger = Tagger(
    'stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
    'stanford-ner-2015-12-09/stanford-ner.jar')

# Module-level street address formatter instance.
addr_formatter = StreetAddressFormatter()


#a queue for storing processes
class Queue:
    """Minimal first-in, first-out container backed by a plain list."""

    def __init__(self):
        # Items are kept in arrival order; index 0 is the oldest.
        self.internal_list = []

    def put(self, data):
        """Add ``data`` at the tail of the queue."""
        self.internal_list.append(data)

    def get(self):
        """Remove and return the oldest item, or None if the queue is empty."""
        if not self.internal_list:
            return None
        return self.internal_list.pop(0)
Exemple #9
0
def normalize_address_str(address_val):
    """
    Return the short, lower-cased form of an address.

    The value is tagged with usaddress (CornerOf is mapped onto
    AddressNumber). Address number, pre-directional, street name, post
    type, post-directional, occupancy type and occupancy identifier are
    joined with single spaces in that order and abbreviated.

    Returns None when address_val is falsy. When usaddress raises
    RepeatedLabelError or a UnicodeEncodeError occurs, the cleaned input
    is returned lower-cased and stripped instead.
    """
    # An empty value makes the parsing regular expression fail, so there
    # is nothing to do but return early.
    if not address_val:
        return None

    address_val = unicode(address_val).encode('utf-8')

    # Strip odd characters that we come across.
    replacements = {
        '\xef\xbf\xbd': '',
        '\uFFFD': '',
    }
    for old, new in replacements.items():
        address_val = address_val.replace(old, new)

    # Tag the address into number, street name and street type.
    try:
        # CornerOf is mapped onto AddressNumber.
        tagged = usaddress.tag(str(address_val),
                               tag_mapping={'CornerOf': 'AddressNumber'})[0]
    except (usaddress.RepeatedLabelError, UnicodeEncodeError):
        # Either usaddress can't parse this at all, or there is some kind
        # of odd character issue that is not handled yet.
        return str(address_val).lower().strip()

    # Address can be parsed, so let's format it.
    number = tagged.get('AddressNumber')
    result = '' if number is None else _normalize_address_number(number)

    # (tag, normalizer) pairs in output order; None means "use as is".
    ordered_parts = (
        ('StreetNamePreDirectional', _normalize_address_direction),
        ('StreetName', None),
        # removes any periods from abbreviations
        ('StreetNamePostType', _normalize_address_post_type),
        ('StreetNamePostDirectional', _normalize_address_direction),
        ('OccupancyType', None),
        ('OccupancyIdentifier', None),
    )
    for tag, normalizer in ordered_parts:
        value = tagged.get(tag)
        if value is None:
            continue
        if normalizer is not None:
            value = normalizer(value)
        result = result + ' ' + value

    result = StreetAddressFormatter().abbrev_street_avenue_etc(result)
    return result.lower().strip()
 def setUp(self):
     """Attach a new StreetAddressParser and StreetAddressFormatter to the instance."""
     self.addr_parser = StreetAddressParser()
     self.addr_formatter = StreetAddressFormatter()
        2081 N. Webb Rd
        1515 West 22nd Street
        2029 Stierlin Court
        P.O. Box 33170
        The Landmark @ One Market, Suite 200
        One Market, Suite 200
        One Market
        One Union Square
        One Union Square, Apt 22-C
        186 Avenue A
        10 Avenue of America
        25 West St
        """.split("\n")

    addr_parser = StreetAddressParser()
    addr_formatter = StreetAddressFormatter()

    if opts.addr:
        lst = [opts.addr]
    else:
        lst = map(str.strip, tests)

    for t in lst:
        if t:
            print '"%s"' % t
            logging.info('addr_str: ' + unicode(t))
            addr = addr_parser.parse(t)

            if addr['street_full'] is not None:
                street = addr_formatter.append_TH_to_street(
                    addr['street_full'])
        2081 N. Webb Rd
        1515 West 22nd Street
        2029 Stierlin Court
        P.O. Box 33170
        The Landmark @ One Market, Suite 200
        One Market, Suite 200
        One Market
        One Union Square
        One Union Square, Apt 22-C
        186 Avenue A
        10 Avenue of America
        25 West St
        """.split("\n")

    addr_parser = StreetAddressParser()
    addr_formatter = StreetAddressFormatter()

    if opts.addr:
        lst = [opts.addr]
    else:
        lst = map(str.strip,tests)

    for t in lst:
        if t:
            print '"%s"' % t
            logging.info('addr_str: ' + unicode(t))
            addr = addr_parser.parse(t)

            if addr['street_full'] is not None:
                street = addr_formatter.append_TH_to_street(addr['street_full'])
                logging.info('After append_TH_to_street: ' + street)
Exemple #13
0
def normalize_address_str(address_val):
    """
    Return the short, lower-cased form of an address.

    The value is tagged with usaddress. Address number, pre-directional,
    street name, post type and post-directional are joined with single
    spaces in that order and abbreviated.

    Returns None when address_val is falsy. When usaddress raises
    RepeatedLabelError or a UnicodeEncodeError occurs, the cleaned input
    is returned lower-cased and stripped instead.
    """
    # The parsing regular expression does not cope with an empty value,
    # so give up before calling it.
    if not address_val:
        return None

    address_val = unicode(address_val).encode('utf-8')

    # Strip odd characters that we come across.
    replacements = {
        '\xef\xbf\xbd': '',
        '\uFFFD': '',
    }
    for old, new in replacements.items():
        address_val = address_val.replace(old, new)

    # Tag the address into number, street name and street type.
    try:
        tagged = usaddress.tag(str(address_val))[0]
    except usaddress.RepeatedLabelError:
        # usaddress can't parse this at all
        normalized_address = str(address_val)
    except UnicodeEncodeError:
        # Some kind of odd character issue that we aren't handling yet.
        normalized_address = str(address_val)
    else:
        # Address can be parsed, so let's format it. The first piece is
        # the normalized number, or '' when there is no number.
        number = tagged.get('AddressNumber')
        pieces = ['' if number is None else _normalize_address_number(number)]

        pre_directional = tagged.get('StreetNamePreDirectional')
        if pre_directional is not None:
            pieces.append(_normalize_address_direction(pre_directional))

        street = tagged.get('StreetName')
        if street is not None:
            pieces.append(street)

        post_type = tagged.get('StreetNamePostType')
        if post_type is not None:
            # remove any periods from abbreviations
            pieces.append(_normalize_address_post_type(post_type))

        post_directional = tagged.get('StreetNamePostDirectional')
        if post_directional is not None:
            pieces.append(_normalize_address_direction(post_directional))

        normalized_address = StreetAddressFormatter().abbrev_street_avenue_etc(
            ' '.join(pieces))

    return normalized_address.lower().strip()
 def setUp(self):
     """Attach a new StreetAddressParser and StreetAddressFormatter to the instance."""
     self.addr_parser = StreetAddressParser()
     self.addr_formatter = StreetAddressFormatter()