def _normalize_address_str(address_val): """ Normalize the address to conform to short abbreviations. If an invalid address_val is provided, None is returned. If a valid address is provided, a normalized version is returned. """ # if this string is empty the regular expression in the sa wont # like it, and fail, so leave returning nothing if not address_val: return None # now parse the address into number, street name and street type parser = StreetAddressParser() addr = parser.parse(str(address_val)) # TODO: should probably use unicode() normalized_address = '' if not addr: return None if 'house' in addr and addr['house'] is not None: normalized_address = addr['house'].lstrip("0") #some addresses have leading zeros, strip them here if 'street_name' in addr and addr['street_name'] is not None: normalized_address = normalized_address + ' ' + addr['street_name'] if 'street_type' in addr and addr['street_type'] is not None: normalized_address = normalized_address + ' ' + addr['street_type'] formatter = StreetAddressFormatter() normalized_address = formatter.abbrev_street_avenue_etc(normalized_address) return normalized_address.lower().strip()
def PreProcessing_records(dt):
    """
    Return *dt* with its ``Address`` column rewritten in a standard format.

    Every address is converted to an upper-cased string and then passed,
    in order, through the project formatters ``USAddressFormatter_TH``,
    ``USAddressFormatter_Abbr`` and ``USAddressFormatter_Symbol`` (trailing
    whitespace is removed after each step), followed by
    ``StreetAddressFormatter.abbrev_direction`` and
    ``StreetAddressFormatter.abbrev_street_avenue_etc``.  The result is
    upper-cased once more before being stored.

    :param dt: table with an ``Address`` column (presumably a pandas
        DataFrame -- confirm against callers).  It is modified in place.
    :return: the same ``dt`` object.
    """
    abbr_formatter = StreetAddressFormatter()  # load python package 'Formatter'
    custom_steps = (
        USAddressFormatter_TH,      # correct wrong combination of number + TH
        USAddressFormatter_Abbr,    # replace wrong abbreviation
        USAddressFormatter_Symbol,  # remove useless symbols
    )

    def _standardise(raw):
        # One address at a time: plain Python copes with an empty column,
        # whereas np.vectorize refuses size-0 input unless otypes is set.
        text = str(raw).upper().rstrip()
        for step in custom_steps:
            text = str(step(text)).rstrip()
        # The abbreviation results must be kept; previously they were
        # assigned to a loop variable and thrown away.
        text = abbr_formatter.abbrev_direction(text)
        text = abbr_formatter.abbrev_street_avenue_etc(text)
        # The formatter may emit mixed case, so upper-case again.
        return str(text).upper()

    dt['Address'] = [_standardise(value) for value in dt['Address']]
    return dt


boundary_list = []
class TestStreetAddress(unittest.TestCase):
    """Exercises StreetAddressParser together with StreetAddressFormatter."""

    def setUp(self):
        # Each test gets its own parser and formatter instance.
        self.addr_parser = StreetAddressParser()
        self.addr_formatter = StreetAddressFormatter()

    def test_success_abbrev_street_avenue_etc(self):
        # The full street part of the parsed address should come back
        # with 'Street' shortened to 'St'.
        parsed = self.addr_parser.parse('221B Baker Street')
        abbreviated = self.addr_formatter.abbrev_street_avenue_etc(
            parsed['street_full'])
        eq_(abbreviated, 'Baker St')
def _normalize_address_str(address_val): """ Normalize the address to conform to short abbreviations. If an invalid address_val is provided, None is returned. If a valid address is provided, a normalized version is returned. """ # if this string is empty the regular expression in the sa wont # like it, and fail, so leave returning nothing if not address_val: return None # now parse the address into number, street name and street type addr = usaddress.tag(str(address_val))[0] # TODO: should probably use unicode() normalized_address = '' if not addr: return None if 'AddressNumber' in addr and addr['AddressNumber'] is not None: normalized_address = addr['AddressNumber'].lstrip("0") # some addresses have leading zeros, strip them here if 'StreetNamePreDirectional' in addr and addr['StreetNamePreDirectional'] is not None: normalized_address = normalized_address + ' ' + _normalize_address_direction(addr['StreetNamePreDirectional']) if 'StreetName' in addr and addr['StreetName'] is not None: normalized_address = normalized_address + ' ' + addr['StreetName'] if 'StreetNamePostType' in addr and addr['StreetNamePostType'] is not None: # remove any periods from abbreviations normalized_address = normalized_address + ' ' + _normalize_address_post_type(addr['StreetNamePostType']) if 'StreetNamePostDirectional' in addr and addr['StreetNamePostDirectional'] is not None: normalized_address = normalized_address + ' ' + _normalize_address_direction(addr['StreetNamePostDirectional']) formatter = StreetAddressFormatter() normalized_address = formatter.abbrev_street_avenue_etc(normalized_address) return normalized_address.lower().strip()
def _normalize_address_str(address_val): """ Normalize the address to conform to short abbreviations. If an invalid address_val is provided, None is returned. If a valid address is provided, a normalized version is returned. """ # if this string is empty the regular expression in the sa wont # like it, and fail, so leave returning nothing if not address_val: return None # now parse the address into number, street name and street type parser = StreetAddressParser() addr = parser.parse( str(address_val)) # TODO: should probably use unicode() normalized_address = '' if not addr: return None if 'house' in addr and addr['house'] is not None: normalized_address = addr['house'].lstrip( "0") # some addresses have leading zeros, strip them here if 'street_name' in addr and addr['street_name'] is not None: normalized_address = normalized_address + ' ' + addr['street_name'] if 'street_type' in addr and addr['street_type'] is not None: normalized_address = normalized_address + ' ' + addr['street_type'] formatter = StreetAddressFormatter() normalized_address = formatter.abbrev_street_avenue_etc(normalized_address) return normalized_address.lower().strip()
def normalize_address_str(address_val, address_val_2, postal_code, extra_data):
    """
    Normalize the address to conform to short abbreviations and record
    its individual components in ``extra_data``.

    :param address_val: first address line; bytes are decoded as UTF-8 and
        any other non-string value is converted with ``str()``.
    :param address_val_2: optional second address line.  It is appended to
        ``address_val`` before tagging unless it is falsy or contains the
        substring ``'lot'``.
    :param postal_code: appended to the normalized address when the
        address could be tagged; a falsy value is skipped.
    :param extra_data: dict updated in place.  When the address can be
        tagged, the keys ``StreetNumber``, ``StreetName``,
        ``StreetNamePreDirectional``, ``StreetDirPrefix``, ``StreetSuffix``,
        ``StreetDirSuffix`` and ``UnitNumber`` are reset to ``''`` and then
        filled from the tags that are present.
    :return: None when ``address_val`` is falsy; otherwise the tuple
        ``(normalized_address, extra_data)``.  NOTE: callers that unpack
        the result must handle the None case themselves.
    """
    # if this string is empty the regular expression in the sa wont
    # like it, and fail, so leave returning nothing
    if not address_val:
        return None

    # if this is a byte string, then convert to a string-string
    if isinstance(address_val, bytes):
        address_val = address_val.decode('utf-8')
    elif not isinstance(address_val, str):
        address_val = str(address_val)

    # Do some string replacements to remove odd characters that we come across
    replacements = {
        '\xef\xbf\xbd': '',
        '\uFFFD': '',
    }
    for k, v in replacements.items():
        address_val = address_val.replace(k, v)

    # Remove lots, they are not part of a real address.  re.split always
    # returns at least one element, so the first one can be taken directly.
    address_val = re.split(r',*\s[lL]ot\s', address_val)[0]

    # now parse the address into number, street name and street type
    try:
        if address_val_2 and ('lot' not in address_val_2):
            to_tag = address_val + ' ' + address_val_2
        else:
            to_tag = address_val
        # Add in the mapping of CornerOf to the AddressNumber.
        addr = usaddress.tag(str(to_tag),
                             tag_mapping={'CornerOf': 'AddressNumber'})[0]
    except usaddress.RepeatedLabelError:
        # usaddress can't parse this at all
        normalized_address = str(address_val)
    except UnicodeEncodeError:
        # Some kind of odd character issue that we are not handling yet.
        normalized_address = str(address_val)
    else:
        # Address can be parsed, so let's format it.
        normalized_address = ''
        street_name = ''
        # Reset every component key this function may write, so values
        # from an earlier call cannot leak through.  StreetDirPrefix is
        # the key actually filled below; StreetNamePreDirectional is kept
        # for callers that already read it.
        for key in ('StreetNumber', 'StreetName', 'StreetNamePreDirectional',
                    'StreetDirPrefix', 'StreetSuffix', 'StreetDirSuffix',
                    'UnitNumber'):
            extra_data[key] = ''

        number = addr.get('AddressNumber')
        if number is not None:
            normalized_address = _normalize_address_number(number)

        number_suffix = addr.get('AddressNumberSuffix')
        if number_suffix is not None:
            normalized_address = normalized_address + number_suffix
        extra_data['StreetNumber'] = normalized_address

        pre_direction = addr.get('StreetNamePreDirectional')
        if pre_direction is not None:
            direction = _normalize_address_direction(pre_direction)
            normalized_address = normalized_address + ' ' + direction
            extra_data['StreetDirPrefix'] = direction

        # Pre-modifier and pre-type belong to the street name as well as
        # to the full address.
        for label in ('StreetNamePreModifier', 'StreetNamePreType'):
            value = addr.get(label)
            if value is not None:
                normalized_address = normalized_address + ' ' + value
                street_name = street_name + value + ' '

        name = addr.get('StreetName')
        if name is not None:
            normalized_address = normalized_address + ' ' + name
            street_name = street_name + name

        post_type = addr.get('StreetNamePostType')
        if post_type is not None:
            # remove any periods from abbreviations; the suffix stored in
            # extra_data uses the same post-type normalizer as the address.
            suffix = _normalize_address_post_type(post_type)
            normalized_address = normalized_address + ' ' + suffix
            extra_data['StreetSuffix'] = suffix

        post_direction = addr.get('StreetNamePostDirectional')
        if post_direction is not None:
            direction = _normalize_address_direction(post_direction)
            normalized_address = normalized_address + ' ' + direction
            extra_data['StreetDirSuffix'] = direction

        subaddress_type = addr.get('SubaddressType')
        if subaddress_type is not None:
            normalized_address = (normalized_address + ' ' +
                                  _normalize_secondary_address(subaddress_type))

        subaddress_id = addr.get('SubaddressIdentifier')
        if subaddress_id is not None:
            normalized_address = (normalized_address + ' ' +
                                  _normalize_address_number(subaddress_id))

        occupancy_type = addr.get('OccupancyType')
        if occupancy_type is not None:
            normalized_address = (normalized_address + ' ' +
                                  _normalize_secondary_address(occupancy_type))

        occupancy_id = addr.get('OccupancyIdentifier')
        if occupancy_id is not None:
            unit = _normalize_address_number(occupancy_id)
            normalized_address = normalized_address + ' ' + unit
            extra_data['UnitNumber'] = unit

        formatter = StreetAddressFormatter()
        normalized_address = formatter.abbrev_street_avenue_etc(
            normalized_address)
        # A missing postal code used to raise TypeError on concatenation.
        if postal_code:
            normalized_address = normalized_address + ' ' + str(postal_code)

        street_name = formatter.abbrev_street_avenue_etc(street_name)
        extra_data['StreetName'] = street_name

    return normalized_address.lower().strip(), extra_data
import pickle
import requests

#Imports for ParseAddress
import usaddress
from streetaddress import StreetAddressFormatter
from nltk.tag.stanford import StanfordNERTagger as Tagger
from geopy.geocoders import GoogleV3, Nominatim
import nltk
import geopy

tagger = Tagger(
    'stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
    'stanford-ner-2015-12-09/stanford-ner.jar')
addr_formatter = StreetAddressFormatter()


# A first-in, first-out queue for storing processes.
class Queue:
    """List-backed FIFO: `put` appends, `get` removes from the front."""

    def __init__(self):
        # Oldest item sits at index 0.
        self.internal_list = []

    def put(self, data):
        """Add `data` at the back of the queue."""
        self.internal_list.append(data)

    def get(self):
        """Remove and return the oldest item, or None when the queue is empty."""
        if not self.internal_list:
            return None
        return self.internal_list.pop(0)
def normalize_address_str(address_val):
    """
    Produce a lower-cased, abbreviated form of an address.

    A falsy value yields None.  Otherwise the value is tagged with
    usaddress (CornerOf is mapped onto AddressNumber).  When tagging
    succeeds the recognised parts are re-joined with single spaces and
    passed through StreetAddressFormatter.abbrev_street_avenue_etc; when
    usaddress raises RepeatedLabelError, or a UnicodeEncodeError occurs,
    the cleaned input text is used as-is.  In both cases the result is
    lower-cased and stripped.
    """
    # An empty string upsets the regular expressions further down, so
    # return nothing straight away.
    if not address_val:
        return None

    address_val = unicode(address_val).encode('utf-8')

    # Strip out odd characters that turn up in the data.
    odd_characters = {
        '\xef\xbf\xbd': '',
        '\uFFFD': '',
    }
    for bad, substitute in odd_characters.items():
        address_val = address_val.replace(bad, substitute)

    # Split the address into number, street name, street type, ...
    try:
        # CornerOf is treated as the AddressNumber.
        tagged = usaddress.tag(str(address_val),
                               tag_mapping={'CornerOf': 'AddressNumber'})[0]
    except (usaddress.RepeatedLabelError, UnicodeEncodeError):
        # Either usaddress can't parse this at all, or there is some odd
        # character issue that is not handled yet: keep the text as given.
        result = str(address_val)
    else:
        # Tagging worked; assemble the parts in a fixed order.  A None
        # converter means the tagged text is used unchanged.
        layout = (
            ('StreetNamePreDirectional', _normalize_address_direction),
            ('StreetName', None),
            # the post-type converter removes periods from abbreviations
            ('StreetNamePostType', _normalize_address_post_type),
            ('StreetNamePostDirectional', _normalize_address_direction),
            ('OccupancyType', None),
            ('OccupancyIdentifier', None),
        )

        number = tagged.get('AddressNumber')
        pieces = ['' if number is None else _normalize_address_number(number)]
        for label, convert in layout:
            value = tagged.get(label)
            if value is None:
                continue
            pieces.append(value if convert is None else convert(value))

        result = StreetAddressFormatter().abbrev_street_avenue_etc(
            ' '.join(pieces))

    return result.lower().strip()
def setUp(self):
    """Attach a StreetAddressParser and a StreetAddressFormatter to the instance."""
    parser = StreetAddressParser()
    self.addr_parser = parser
    formatter = StreetAddressFormatter()
    self.addr_formatter = formatter
2081 N. Webb Rd 1515 West 22nd Street 2029 Stierlin Court P.O. Box 33170 The Landmark @ One Market, Suite 200 One Market, Suite 200 One Market One Union Square One Union Square, Apt 22-C 186 Avenue A 10 Avenue of America 25 West St """.split("\n") addr_parser = StreetAddressParser() addr_formatter = StreetAddressFormatter() if opts.addr: lst = [opts.addr] else: lst = map(str.strip, tests) for t in lst: if t: print '"%s"' % t logging.info('addr_str: ' + unicode(t)) addr = addr_parser.parse(t) if addr['street_full'] is not None: street = addr_formatter.append_TH_to_street( addr['street_full'])
2081 N. Webb Rd 1515 West 22nd Street 2029 Stierlin Court P.O. Box 33170 The Landmark @ One Market, Suite 200 One Market, Suite 200 One Market One Union Square One Union Square, Apt 22-C 186 Avenue A 10 Avenue of America 25 West St """.split("\n") addr_parser = StreetAddressParser() addr_formatter = StreetAddressFormatter() if opts.addr: lst = [opts.addr] else: lst = map(str.strip,tests) for t in lst: if t: print '"%s"' % t logging.info('addr_str: ' + unicode(t)) addr = addr_parser.parse(t) if addr['street_full'] is not None: street = addr_formatter.append_TH_to_street(addr['street_full']) logging.info('After append_TH_to_street: ' + street)
def normalize_address_str(address_val):
    """
    Return a lower-cased, abbreviated version of an address.

    None is returned for a falsy value.  The value is tagged with
    usaddress; on success the number, directionals, street name and post
    type are re-joined with single spaces and passed through
    StreetAddressFormatter.abbrev_street_avenue_etc.  If usaddress raises
    RepeatedLabelError, or a UnicodeEncodeError occurs, the cleaned input
    text is used instead.  The result is always lower-cased and stripped.
    """
    # The regular expressions used later fail on an empty string, so give
    # up early and return nothing.
    if not address_val:
        return None

    address_val = unicode(address_val).encode('utf-8')

    # Remove odd characters that we come across in the data.
    to_remove = {
        '\xef\xbf\xbd': '',
        '\uFFFD': '',
    }
    for needle, filler in to_remove.items():
        address_val = address_val.replace(needle, filler)

    # Parse the address into number, street name and street type.
    try:
        tags = usaddress.tag(str(address_val))[0]
    except (usaddress.RepeatedLabelError, UnicodeEncodeError):
        # usaddress can't parse this at all, or the text contains an odd
        # character that isn't handled yet: fall back to the text itself.
        text = str(address_val)
    else:
        # The address could be tagged, so format it piece by piece.
        text = ''

        number = tags.get('AddressNumber')
        if number is not None:
            text = _normalize_address_number(number)

        leading_direction = tags.get('StreetNamePreDirectional')
        if leading_direction is not None:
            text += ' ' + _normalize_address_direction(leading_direction)

        street = tags.get('StreetName')
        if street is not None:
            text += ' ' + street

        street_type = tags.get('StreetNamePostType')
        if street_type is not None:
            # remove any periods from abbreviations
            text += ' ' + _normalize_address_post_type(street_type)

        trailing_direction = tags.get('StreetNamePostDirectional')
        if trailing_direction is not None:
            text += ' ' + _normalize_address_direction(trailing_direction)

        abbreviator = StreetAddressFormatter()
        text = abbreviator.abbrev_street_avenue_etc(text)

    return text.lower().strip()