def compare_addresses(address1, address2):
    """Return the tagged components that two addresses have in common.

    Both addresses are tagged with ``usaddress.tag``; the result is a dict
    holding every (label, value) pair that is identical in both taggings.
    ``None`` is returned when usaddress raises ``RepeatedLabelError`` for
    either address.
    """
    try:
        first_tags = usaddress.tag(address1)[0]
        second_tags = usaddress.tag(address2)[0]
        common_pairs = set(first_tags.items()).intersection(second_tags.items())
        return dict(common_pairs)
    except RepeatedLabelError:
        # Untaggable input: there is nothing to compare.
        return None
def parse(self, address):
    """Parse ``address`` into its components with usaddress.

    See https://github.com/datamade/usaddress.

    Returns:
        A ``(address_components, address_type)`` tuple exactly as produced
        by ``usaddress.tag``: the mapping of labels to address parts and the
        detected address type string.

    Raises:
        usaddress.RepeatedLabelError: propagated unchanged when the address
            cannot be tagged.
    """
    # The original discarded the result of tag() and then returned two
    # names that were never assigned, which raised NameError on every call.
    address_components, address_type = usaddress.tag(address)
    # NOTE(review): the original TODO mentions serializing the result for
    # the GET response; that step is not implemented here.
    return address_components, address_type
def get_address(self, text_block, key):
    """Pull PO box, city/state/zip and street details out of lines of text.

    Findings are stored in ``self.pat_dic`` under keys prefixed with ``key``:
    ``<key>_po_box``, ``<key>_address``, ``<key>_PlaceName``,
    ``<key>_StateName``, ``<key>_ZipCode`` and ``<key>_street``.  Nothing is
    returned.

    Args:
        text_block: sequence of text lines to scan.
        key: prefix for the entries written to ``self.pat_dic``.
    """
    block_string = ' '.join(text_block).lower()

    # Capture the digits themselves: splitting the whole match on whitespace
    # returned "box123" for input written without a space ("po box123").
    po_box = re.search(r'po box\s*(\d+)', block_string)
    if po_box is not None:
        self.pat_dic[key + '_po_box'] = po_box.group(1)

    add_pattern = re.compile(
        r'([A-Z,a-z,0-9][^!\-:;,]+)[,|\s]+([A-Z,a-z][^.!\-:;]+?)\s*(\d{5})'
    )
    addresses = [add_pattern.findall(line.lower()) for line in text_block]
    print(addresses)

    # Pass 1: "<something>, <place> <zip>" candidates found by the regex.
    for matches in addresses:
        if not matches:
            continue
        candidate = ' '.join(matches[0]).replace('.', '')
        try:
            tags = usaddress.tag(candidate)[0]
            if ('PlaceName' in tags and 'StateName' in tags
                    and tags['StateName'].upper() in US_STATES):
                self.pat_dic[key + '_address'] = candidate
                self.pat_dic[key + '_PlaceName'] = tags['PlaceName']
                self.pat_dic[key + '_StateName'] = tags['StateName']
                self.pat_dic[key + '_ZipCode'] = tags['ZipCode']
        except Exception:
            # Best effort: report and keep scanning.  ``Exception`` (rather
            # than a bare except) lets KeyboardInterrupt/SystemExit through.
            print("Unexpected error:", sys.exc_info()[0])

    # Pass 2: whole lines that usaddress recognises as a plain street address.
    for line in text_block:
        if not line:
            continue
        lowered = line.lower()
        try:
            tags, address_type = usaddress.tag(lowered)
            is_plain_street = (
                address_type == 'Street Address'
                and 'StreetName' in tags
                and 'AddressNumber' in tags
                and 'SubaddressType' not in tags
                and 'Recipient' not in tags
            )
            if is_plain_street and tags['AddressNumber'].isdigit():
                print(tags)
                self.pat_dic[key + '_street'] = lowered
        except Exception:
            print("Unexpected error:", sys.exc_info()[0])
def _replace_whole_words(text, fixes):
    """Replace every whole-word occurrence of a key of ``fixes`` by its value."""
    if not fixes:
        # An empty alternation would match the empty string at every word
        # boundary and then fail the lookup.
        return text
    pattern = re.compile(
        r'\b(' + '|'.join(re.escape(word) for word in fixes) + r')\b')
    return pattern.sub(lambda match: fixes[match.group()], text)


def audit_address(address):
    """Checks for valid street types and other address issues.

    ``address`` is a dict that may hold ``'housenumber'`` and ``'street'``;
    it is updated in place and also returned.

    * The house number is reduced to the ``AddressNumber`` usaddress finds in
      it; a directional found there is prepended to the street.
    * The street is tagged; when that succeeds, a missing house number is
      filled from it and the street is rebuilt from its pre-directional,
      name and post-type components (dropping the street number).
    * Abbreviations listed in ``STREET_FIXES`` and ``DIRECTION_FIXES`` are
      then expanded as whole words.
    """
    if 'housenumber' in address:
        number_tags = usaddress.tag(address['housenumber'])[0]
        if 'AddressNumber' in number_tags:
            address['housenumber'] = number_tags['AddressNumber']
        if 'street' in address and 'StreetNamePreDirectional' in number_tags:
            address['street'] = '{} {}'.format(
                number_tags['StreetNamePreDirectional'], address['street'])

    if 'street' in address:
        street = address['street']
        try:
            street_tags = usaddress.tag(street)[0]
        except Exception:
            # Nothing to parse, just ignore it
            street_tags = None

        # The original guarded this section with "'addresscalc' in locals()",
        # a name that is never assigned, so the section could never run.  An
        # explicit sentinel expresses "the street was tagged successfully".
        if street_tags is not None:
            if 'housenumber' not in address and 'AddressNumber' in street_tags:
                address['housenumber'] = street_tags['AddressNumber']
            # Remove Street Number from Name
            kept = [
                street_tags.get(label, '')
                for label in ('StreetNamePreDirectional', 'StreetName',
                              'StreetNamePostType')
            ]
            rebuilt = ' '.join(part for part in kept if part)
            if rebuilt:
                # Keep the raw street when none of the components was found.
                street = rebuilt

        # Check that any common abbreviations are spelled out
        street = _replace_whole_words(street, STREET_FIXES)
        # Update directional abbreviations to full words
        street = _replace_whole_words(street, DIRECTION_FIXES)
        address['street'] = street
    return address
def format_address_data(address_data, county_name):
    """Build the address dict for a Missouri election authority.

    The location name defaults to one derived from ``county_name`` (board of
    elections for the listed counties and cities, a dedicated name for
    St. Charles, "County Election Office" otherwise).  ``address_data`` is
    tagged with usaddress using ``electionsaver.addressSchemaMapping`` and
    any of ``aptNumber``, ``streetNumberName``, ``locationName`` and
    ``poBox`` found in the result are copied over; a parsed ``locationName``
    replaces the derived one.

    Returns:
        dict with ``locationName`` plus whichever optional fields were parsed.
    """
    mapping = electionsaver.addressSchemaMapping
    boe_county = ("Clay", "Jackson", "Platte", "St. Louis")
    boe_city = ("Kansas City", "St. Louis City")

    if county_name in boe_county:
        location_name = f"{county_name} County Board of Elections"
    elif county_name in boe_city:
        location_name = f"{county_name} Board of Elections"
    elif county_name == "St. Charles":
        # Fixed typo: the original literal read "Country".
        location_name = "St. Charles County Election Authority"
    else:
        location_name = f"{county_name} County Election Office"

    parsed_data_dict = usaddress.tag(address_data, tag_mapping=mapping)[0]

    final_address = {"locationName": location_name}
    for field in ("aptNumber", "streetNumberName", "locationName", "poBox"):
        if field in parsed_data_dict:
            final_address[field] = parsed_data_dict[field]
    return final_address
def ZipCode(x):
    """Return the ZIP code usaddress finds in ``x``, or ``None``.

    ``None`` is returned both when no ``ZipCode`` component is tagged and
    when tagging fails for any reason (this helper is deliberately
    best-effort).
    """
    try:
        tagged, _ = usaddress.tag(x)
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit.  Ordinary tagging failures still yield None.
        return None
    return tagged.get('ZipCode')
def addresscheck(df, usadd, wrongaddress):
    """Sort the rows of ``df`` into parseable and problematic addresses.

    For each row the address columns are joined into one string and tagged
    with usaddress:

    * rows that raise ``RepeatedLabelError`` are appended to ``wrongaddress``;
    * rows whose address or company matches one of the excluded markers are
      skipped entirely;
    * of the rest, rows tagged ``'Ambiguous'`` go to ``wrongaddress`` and all
      others to ``usadd``.

    ``usadd`` and ``wrongaddress`` are filled in place via ``.loc``; nothing
    is returned.

    NOTE: this block is Python 2 code (``unicode``).
    """
    excluded_markers = ('14261', '14260', 'Clement', 'Goodyear')
    address_columns = ('Address 1', 'Address 2', 'City', 'State', 'Zipcode')
    i = 0
    j = 0
    for _, row in df.iterrows():
        company = unicode(row['Company']).encode('utf-8')
        addressstring = ' '.join(
            unicode(row[column]).encode('utf-8') for column in address_columns)

        try:
            _, address_type = usaddress.tag(addressstring)
        except usaddress.RepeatedLabelError:
            # ``row`` is used instead of ``df.iloc[index]``: the index label
            # is only a valid position for a default RangeIndex.
            wrongaddress.loc[j] = row
            j += 1
            # The original fell through here and went on to inspect a parse
            # result that was unbound (first row) or left over from the
            # previous row, which could record this row a second time.
            continue

        excluded = (
            'University at Buffalo' in company
            or any(marker in addressstring for marker in excluded_markers)
        )
        if excluded:
            continue

        if address_type != 'Ambiguous':
            usadd.loc[i] = row
            i += 1
        else:
            wrongaddress.loc[j] = row
            j += 1
def _check_address(address: Any, must_contain: Tuple[str, ...], clean: bool) -> Any:
    """
    Tag an address with usaddress and check it for the required parts.

    Parameters
    ----------
    address
        address value to be cleaned
    must_contain
        A tuple containing parts of the address that must be included
        for the address to be successfully cleaned
    clean
        If True, a tuple (tagged address or None, status) is returned, where
        status is "null", "unknown" or "success".
        If False, the function returns True/False to be used by the
        validate address function.
    """

    def _outcome(tagged: Any, status: str) -> Any:
        # One place decides between the "clean" tuple and the boolean verdict.
        if clean:
            return tagged, status
        return status == "success"

    if address in NULL_VALUES:
        return _outcome(None, "null")

    # Parentheses and periods are removed before tagging.
    text = re.sub(r"[().]", "", str(address))
    try:
        tagged_address, _ = usaddress.tag(text, TAG_MAPPING)
    except usaddress.RepeatedLabelError:
        return _outcome(None, "unknown")

    if _check_status(tagged_address, must_contain):
        return _outcome(tagged_address, "success")
    return _outcome(tagged_address, "unknown")
def format_address_data(address, county_name):
    """Build the address dict for a Texas county election office.

    A handful of counties get their address replaced or patched before
    parsing.  The address is then tagged with usaddress using
    ``electionsaver.addressSchemaMapping`` and every recognised field is
    copied into the result next to ``"state": "Texas"``.
    """
    mapping = electionsaver.addressSchemaMapping

    # Counties whose source address is replaced wholesale.
    replacements = {
        "Knox": "100 W Cedar St, Benjamin, TX 79505",
        "Live Oak": "301 E Houston St George West, TX 78022",
        "Parker": "1112 Santa Fe Drive Weatherford, TX 76086",
        "Borden": "117 Wasson Rd, Gail, TX 79738",
    }
    if county_name in replacements:
        address = replacements[county_name]
    elif county_name == "Kleberg":
        address = address + " 78364"
    elif county_name == "Stephens":
        address = address.replace("Courthouse", "")

    parsed = usaddress.tag(address, tag_mapping=mapping)[0]

    result = {"state": "Texas"}
    copied_fields = (
        "streetNumberName",
        "city",
        "zipCode",
        "poBox",
        "locationName",
        "aptNumber",
    )
    for field in copied_fields:
        if field in parsed:
            result[field] = parsed[field]
    return result
def format_address_data(address_data, county_name):
    """Build the address dict for a Maine municipality.

    Two places get their raw address patched first.  The address is tagged
    with usaddress using ``electionsaver.addressSchemaMapping``.  When both
    ``city`` and ``zipCode`` were parsed the result starts from city, state
    ("Maine") and zip code; otherwise an error line is printed and the result
    starts empty.  The optional fields are copied over in either case.
    """
    if county_name == "Swans Island":
        address_data = address_data.replace("Swan'S", "Swans")
    if county_name == "Jackson":
        address_data = "730 Moosehead Trail Hwy PO Box 393 Jackson, ME 04921"

    mapping = electionsaver.addressSchemaMapping
    parsed_data_dict = usaddress.tag(address_data, tag_mapping=mapping)[0]

    if "city" in parsed_data_dict and "zipCode" in parsed_data_dict:
        final_address = {
            "city": parsed_data_dict["city"],
            "state": "Maine",
            "zipCode": parsed_data_dict["zipCode"],
        }
    else:
        print(f"Error with data {parsed_data_dict}")
        final_address = {}

    for field in ("streetNumberName", "locationName", "aptNumber", "poBox"):
        if field in parsed_data_dict:
            final_address[field] = parsed_data_dict[field]
    return final_address
def format_address_data(address_data, county_name):
    """Build the address dict for a Utah county election office.

    Daggett and San Juan use hard-coded addresses; every other county uses
    ``address_data`` as given.  The address is tagged with usaddress using
    ``electionsaver.addressSchemaMapping``.  The result starts with a
    location name derived from ``county_name`` and receives whichever of
    the optional fields were parsed.
    """
    mapping = electionsaver.addressSchemaMapping

    fixed_addresses = {
        # https://www.daggettcounty.org/16/ClerkTreasurer
        'Daggett': "95 North 1st West, P.O. Box 400",
        # http://sanjuancounty.org/index.php/clerkauditor/
        'San Juan': "117 South Main, P.O. Box 338",
    }
    address_data = fixed_addresses.get(county_name, address_data)

    parsed = usaddress.tag(address_data, tag_mapping=mapping)[0]

    result = {"locationName": f"{county_name} County Election Office"}
    for field in ("aptNumber", "streetNumberName", "locationName", "poBox"):
        if field in parsed:
            result[field] = parsed[field]
    return result
def extract_mailing_address(self, input_dict):
    """Build the mailing-address fields from the MAILADD1..4 columns.

    Args:
        input_dict: row mapping with ``MAILADD1`` .. ``MAILADD4`` strings.

    Returns:
        A dict with ``MAIL_ADDRESS_LINE1``, ``MAIL_ADDRESS_LINE2``,
        ``MAIL_CITY``, ``MAIL_ZIP_CODE``, ``MAIL_STATE`` and ``MAIL_COUNTRY``
        or an empty dict when there is no mailing address, when usaddress
        reports it as ambiguous, or when it cannot be tagged.  The warning
        messages printed in those cases say the caller falls back to the
        residential address.
    """
    if not input_dict['MAILADD1'].strip():
        return {}

    try:
        tagged_address, address_type = usaddress.tag(' '.join([
            input_dict['MAILADD1'],
            input_dict['MAILADD2'],
            input_dict['MAILADD3'],
            input_dict['MAILADD4'],
        ]))

        if address_type == 'Ambiguous':
            # The arguments must be a tuple: the original applied ``%`` to
            # ``address_type`` alone, so this line raised TypeError ("not
            # enough arguments for format string") for every ambiguous
            # address instead of printing the warning.
            print("Warn - %s: Ambiguous mailing address falling back to residential (%s)"
                  % (address_type, input_dict['MAILADD1']))
            return {}

        if not tagged_address:
            return {}

        return {
            'MAIL_ADDRESS_LINE1': self.construct_mail_address_1(
                tagged_address,
                address_type,
            ),
            'MAIL_ADDRESS_LINE2': self.construct_mail_address_2(tagged_address),
            'MAIL_CITY': tagged_address.get('PlaceName', ""),
            'MAIL_ZIP_CODE': tagged_address.get('ZipCode', ""),
            'MAIL_STATE': tagged_address.get('StateName', ""),
            'MAIL_COUNTRY': "",
        }
    except usaddress.RepeatedLabelError as e:
        print('Warn: Can\'t parse mailing address. Falling back to residential (%s)' % e.parsed_string)
        return {}
def populate_address(self, sosrec):
    """Split ``sosrec['Address']`` into address fields and store them on self.

    ``<br/>`` markers are turned into spaces and the text is tagged with
    usaddress.  Unit, city, state and zip come from their dedicated labels;
    every other component is appended, in order, to the street line
    (``addr``).  The state defaults to "KANSAS".  The resulting payload is
    passed to ``self.update``.
    """
    flat_address = sosrec['Address'].replace('<br/>', ' ')
    tagged = usaddress.tag(flat_address)[0]

    payload = {
        'addr': "",
        'unit': "",
        'city': "",
        'state': "KANSAS",
        'zip': ""
    }
    # Labels copied straight into a payload field.
    direct_fields = {
        'OccupancyIdentifier': 'unit',
        'PlaceName': 'city',
        'ZipCode': 'zip',
    }

    for label, text in tagged.items():
        if label in direct_fields:
            payload[direct_fields[label]] = text
        elif label == 'StateName' and text:
            payload['state'] = text
        elif payload['addr']:
            payload['addr'] = payload['addr'] + ' ' + text
        elif text == "No information available":
            # Placeholder text must not become the street line.
            payload['addr'] = ""
        else:
            payload['addr'] = text

    self.update(payload)
def format_address_data(address_data, town_name):
    """Build the address dict for a New Hampshire town election office.

    ``address_data`` is tagged with usaddress using
    ``electionsaver.addressSchemaMapping``.  City, street, PO box and
    apartment values are title-cased; the location name falls back to
    "<town_name> City Election Office" (title-cased) when none was parsed.

    Raises:
        WalkTheVoteError: when the address cannot be tagged; the original
            exception is chained as the cause.
    """
    mapping = electionsaver.addressSchemaMapping

    # Edge cases
    if address_data == "20 PARK ST GORHAM":
        address_data = "20 PARK ST GORHAM 03581"

    try:
        parsed_data_dict = usaddress.tag(address_data, tag_mapping=mapping)[0]
    except Exception as e:
        # Report the input that failed.  The original interpolated the parse
        # result, which is always an empty dict when tagging raised.
        raise WalkTheVoteError(
            f"Error with data for {town_name} town, data is {address_data}"
        ) from e

    final_address = {"state": "NH"}
    if "city" in parsed_data_dict:
        final_address["city"] = parsed_data_dict["city"].title()
    if "zipCode" in parsed_data_dict:
        final_address["zipCode"] = parsed_data_dict["zipCode"]
    if "streetNumberName" in parsed_data_dict:
        final_address["streetNumberName"] = parsed_data_dict[
            "streetNumberName"].title()
    if "poBox" in parsed_data_dict:
        final_address["poBox"] = parsed_data_dict["poBox"].title()
    final_address["locationName"] = parsed_data_dict.get(
        "locationName", f"{town_name} City Election Office".title())
    if "aptNumber" in parsed_data_dict:
        final_address["aptNumber"] = parsed_data_dict["aptNumber"].title()
    return final_address
def parse_address(self, input_str):
    """Tag ``input_str`` with usaddress and return a flat address dict.

    Returns:
        dict with ``city``, ``country``, ``state``, ``street`` and ``zip``.
        ``street`` is "<number> <name>[ <post type>]", prefixed by
        "<occupancy type> <occupancy id>, " when both are present, and is
        ``None`` when either the address number or the street name is
        missing.  Missing components are ``None``.
    """
    address = usaddress.tag(input_str)
    # print() with a single argument behaves the same on Python 2 and 3; the
    # original print statements are a syntax error on Python 3.
    print(json.dumps(address, indent=4))
    print(type(address))

    components = address[0]
    number = components.get('AddressNumber')
    name = components.get('StreetName')
    post_type = components.get('StreetNamePostType')

    # compose street
    street = ''
    if 'OccupancyType' in components and 'OccupancyIdentifier' in components:
        street = '%s %s, ' % (components.get('OccupancyType'),
                              components.get('OccupancyIdentifier'))

    if not number or not name:
        street = None
    elif not post_type:
        street = street + '%s %s' % (number, name)
    else:
        street = street + '%s %s %s' % (number, name, post_type)

    return {
        'city': components.get('PlaceName'),
        'country': components.get('CountryName'),
        'state': components.get('StateName'),
        'street': street,
        'zip': components.get('ZipCode')
    }
def geo_parser(location, gmaps_json):
    """Turn the first result of a places/geocoding response into a DataFrame.

    Args:
        location: the raw name that was looked up; stored as ``Raw_Name``.
        gmaps_json: decoded JSON response containing a ``"results"`` list.

    Returns:
        A one-row DataFrame with columns ``Raw_Name``, ``Name``,
        ``Latitude``, ``Longitude``, ``City`` and ``State``, or ``None``
        (after printing the response) when the lookup raises IndexError,
        e.g. because the result list is empty.
    """
    # parse json response
    try:
        results = gmaps_json["results"][0]
        std_name = results['name']
        print(std_name)
        lat = results['geometry']['location']['lat']
        lng = results['geometry']['location']['lng']
        std_address = results['formatted_address']

        # parse address
        try:
            parsed_address = usaddress.tag(std_address)
            city = parsed_address[0]['PlaceName']
            state = parsed_address[0]['StateName']
        except (usaddress.RepeatedLabelError, KeyError):
            # traverse parsed address list if the tagger fails
            city_tokens = []
            state_tokens = []
            for addr_tup in usaddress.parse(std_address):
                print(addr_tup)
                if addr_tup[1] == 'PlaceName':
                    city_tokens.append(addr_tup[0])
                if addr_tup[1] == 'StateName':
                    state_tokens.append(addr_tup[0])
            # Strip the separators that cling to raw tokens.  The original
            # trimmed only the city, leaving the state with a leading space.
            city = ' '.join(city_tokens).strip(' ,;')
            state = ' '.join(state_tokens).strip(' ,;')
            print(city)

        return pd.DataFrame(
            [[location, std_name, lat, lng, city, state]],
            columns=['Raw_Name', 'Name', 'Latitude', 'Longitude', 'City', 'State'])
    except IndexError:
        print(gmaps_json)
        # The original built an empty DataFrame here and then discarded it;
        # the None return is kept because callers may test for it.
        return None
def usaddress_tag(self, address_str):
    """
    Tag an address and classify it as a PO Box or a Street Address.

    The type reported by usaddress is ignored (it almost always comes back
    as "Ambiguous"); instead an address containing a USPSBoxID is a
    'PO Box' and anything else is a 'Street Address'.

    Input:
        address_str: string of address from input file
    Output:
        (tagged parts dict, address type), or (None, None) when usaddress
        cannot tag the address.
    """
    try:
        tagged_parts, _ = usaddress.tag(address_str)
    except usaddress.RepeatedLabelError:
        # Returning None lets the caller set VALIDATION_STATUS
        # appropriately; the address has to be fixed manually later.
        return None, None

    address_kind = 'PO Box' if 'USPSBoxID' in tagged_parts else 'Street Address'
    return tagged_parts, address_kind
def parse(self, address):
    """Tag ``address`` and return its components under simplified labels.

    Every street-level usaddress label is folded into ``address1``; the
    building name becomes ``address2``; recipient, city, state, zip code and
    country get their own keys.  Only the tagged mapping is returned; the
    address type is dropped.
    """
    street_level_labels = (
        'AddressNumber', 'AddressNumberPrefix', 'AddressNumberSuffix',
        'StreetName', 'StreetNamePreDirectional', 'StreetNamePreModifier',
        'StreetNamePreType', 'StreetNamePostDirectional',
        'StreetNamePostModifier', 'StreetNamePostType',
        'CornerOf', 'IntersectionSeparator', 'LandmarkName',
        'USPSBoxGroupID', 'USPSBoxGroupType', 'USPSBoxID', 'USPSBoxType',
        'OccupancyType', 'OccupancyIdentifier',
        'SubaddressIdentifier', 'SubaddressType',
    )
    label_map = dict.fromkeys(street_level_labels, 'address1')
    label_map.update({
        'Recipient': 'recipient',
        'BuildingName': 'address2',
        'PlaceName': 'city',
        'StateName': 'state',
        'ZipCode': 'zip_code',
        'CountryName': 'country',
    })

    tagged, _ = usaddress.tag(address, tag_mapping=label_map)
    return tagged
def clean_street(address1, address2="", *, zipcode=None, strip_occupancy=False):
    """
    Clean street address.

    If a zipcode is passed in, we double check to make sure the address was
    parsed correctly. If strip_occupancy is true, remove components that
    don't affect geocoding, like floor, apartment, unit, etc.

    Returns the re-formatted street for 'Street Address' and 'PO Box' results.
    Ambiguous or unparseable input is returned as the two lines joined by a
    space, and a line 1 starting with "c/o " returns ``address2`` untouched.

    NOTE: `usaddress` was trained with full addresses, not just street
    address.
    """
    # matchers that rely on knowing address lines
    # Strip "care of" lines
    if re.match(r"c/o ", address1, re.IGNORECASE):
        return address2

    # concatenate line 1 and line 2
    address = "{} {}".format(address1, address2).strip()
    if not address:
        # XXX Should this be an exception?
        return address

    # Add arbitrary city/state/zip to get `usaddress` to parse the address
    # as just the street address and not a full address
    address_to_parse = ("{}, Austin, TX {}".format(address, zipcode)
                        if zipcode else address)
    try:
        addr, type_ = usaddress.tag(address_to_parse)
    except usaddress.RepeatedLabelError:
        logger.warning("Unparseable address: {}".format(address_to_parse))
        return address

    if zipcode:
        addr.pop("PlaceName", None)
        addr.pop("StateName", None)
        guessed_zip = addr.pop("ZipCode", None)
        # Plain comparisons instead of ``assert``: assertions are removed
        # under ``python -O``, which silently disabled the mismatch warning.
        if guessed_zip is None:
            logger.warning("Expected zipcode %s but found none", zipcode)
        elif guessed_zip != zipcode:
            logger.warning("Guessed the wrong zipcode {} != {}".format(
                guessed_zip, zipcode))

    if strip_occupancy:
        addr.pop("OccupancyType", None)
        addr.pop("OccupancyIdentifier", None)

    if type_ in ("Street Address", "PO Box"):
        return " ".join(
            component_format(label, value) for label, value in addr.items())

    logger.warning("Ambiguous address: {}".format(address), extra=addr)
    return address
def format_address_data(address_data, county_name):
    """Build the address dict for a South Carolina county election office.

    ``address_data`` is tagged with usaddress using
    ``electionsaver.addressSchemaMapping``.  City, street, PO box and
    apartment values are title-cased; the location name falls back to
    "<county_name> County Board of Voter Registration & Elections"
    (title-cased) when none was parsed.

    Raises:
        WalkTheVoteError: when the address cannot be tagged; the original
            exception is chained as the cause.
    """
    mapping = electionsaver.addressSchemaMapping

    try:
        parsed_data_dict = usaddress.tag(address_data, tag_mapping=mapping)[0]
    except Exception as e:
        # Report the failing input and call the county a county.  The
        # original interpolated the parse result, which is always an empty
        # dict when tagging raised.
        raise WalkTheVoteError(
            f"Error with data for {county_name} county, data is {address_data}"
        ) from e

    final_address = {"state": "SC"}
    if "city" in parsed_data_dict:
        final_address["city"] = parsed_data_dict["city"].title()
    if "zipCode" in parsed_data_dict:
        final_address["zipCode"] = parsed_data_dict["zipCode"]
    if "streetNumberName" in parsed_data_dict:
        final_address["streetNumberName"] = parsed_data_dict[
            "streetNumberName"].title()
    if "poBox" in parsed_data_dict:
        final_address["poBox"] = parsed_data_dict["poBox"].title()
    final_address["locationName"] = parsed_data_dict.get(
        "locationName",
        f"{county_name} County Board of Voter Registration & Elections".title())
    if "aptNumber" in parsed_data_dict:
        final_address["aptNumber"] = parsed_data_dict["aptNumber"].title()
    return final_address
def test_api_parse_succeeds(client):
    """Check that usaddress returns a tuple for a simple street address.

    TODO: Finish this test. Send a request to the API (the ``client``
    fixture is not used yet) and confirm that the data comes back in the
    appropriate format.
    """
    result = usaddress.tag('123 main st chicago il')
    assert type(result) is tuple
def search_by_address(address):
    """
    search_by_address(address) -> [address_candidates]

    Returns an array of properties matching the given address, best score
    first.  Addresses that usaddress does not classify as a 'Street Address'
    yield an empty list without contacting the service.

    param: address (String): The address associated with the property
    """
    tagged, address_type = usaddress.tag(address)
    if address_type != 'Street Address':
        return []

    # Join only the components that are present: joining empty placeholders
    # left double or leading spaces in the street (e.g. "123  Main St" when
    # there is no pre-directional).
    street_labels = ('AddressNumber', 'StreetNamePreDirectional',
                     'StreetName', 'StreetNamePostType')
    street = ' '.join(
        tagged[label] for label in street_labels if tagged.get(label))

    payload = (('Street', street),
               ('City', tagged.get('PlaceName', '')),
               ('State', tagged.get('StateName', '')),
               ('ZIP', tagged.get('ZipCode', '')),
               ('f', 'json'))

    # TODO: asynchronous?
    r = requests.post(BASE_URL, params=payload)
    # Sort by 'score'
    return sorted(r.json()['candidates'], key=_sort_by_score, reverse=True)
def usaddress_tag(self, address_str):
    """
    Tag an address and classify it as a PO Box or a Street Address.

    The type reported by usaddress is ignored (it almost always comes back
    as "Ambiguous"); instead an address containing a USPSBoxID is a
    'PO Box' and anything else is a 'Street Address'.

    When usaddress raises RepeatedLabelError the components are rebuilt from
    the token-level parse carried by the exception: consecutive tokens with
    the same label form one component, and when a label occurs more than
    once the last occurrence wins.

    Input:
        address_str: string of address from input file
    Output:
        (dictionary containing tagged parts of the address, address type)
    """
    try:
        usaddress_dict, _ = usaddress.tag(address_str)
    except usaddress.RepeatedLabelError as e:
        # The original comprehension kept only the LAST TOKEN per label, so a
        # multi-word component such as "New York" was cut down to "York".
        # there is probably a better rule than "last occurrence wins"
        components = {}
        previous_label = None
        for token, label in e.parsed_string:
            if label == previous_label:
                components[label] += ' ' + token
            else:
                components[label] = token
            previous_label = label
        # Trim the separators usaddress.tag strips from its own components.
        usaddress_dict = {
            label: text.strip(' ,;') for label, text in components.items()
        }

    # if contains a PO Box ID consider it a PO Box, else a Street Address
    usaddress_type = 'PO Box' if 'USPSBoxID' in usaddress_dict else 'Street Address'
    return usaddress_dict, usaddress_type
def format_street_number(street_number_name, county_name):
    """Build the street-level address dict for a California county office.

    The text is tagged with usaddress using
    ``electionsaver.addressSchemaMapping`` (Glenn first has ", 2nd Street"
    removed).  San Francisco always gets a fixed City Hall address; every
    other county gets a default location name plus whichever of the
    apartment, street and location fields were parsed.
    """
    mapping = electionsaver.addressSchemaMapping
    if county_name == "Glenn":
        street_number_name = street_number_name.replace(", 2nd Street", "")

    # Tagging happens before the San Francisco override, as in the original,
    # so a tagging error still surfaces for that county.
    parsed = usaddress.tag(street_number_name, tag_mapping=mapping)[0]

    if county_name == "San Francisco":
        return {
            "streetNumberName": "1 Dr. Carlton B Goodlett Place",
            "locationName": "City Hall",
            "aptNumber": "Room 48",
        }

    result = {"locationName": f"{county_name} County Election Office"}
    for field in ("aptNumber", "streetNumberName", "locationName"):
        if field in parsed:
            result[field] = parsed[field]
    return result
def format_address_data(address, county_name):
    """Build the address dict for an Ohio county election office.

    ``address`` is tagged with usaddress using
    ``electionsaver.addressSchemaMapping``.  The result always carries
    ``"state": "Ohio"`` and the parsed ``zipCode`` (a KeyError is raised when
    no zip code was parsed) plus whichever optional fields were found, with
    hard-coded corrections for Vinton and Brown counties.

    NOTE(review): this function was recovered from source whose indentation
    was lost.  The nesting of the county-specific overrides under the
    ``locationName`` and ``aptNumber`` checks is a reconstruction; confirm it
    against the upstream file.
    """
    mapping = electionsaver.addressSchemaMapping
    parsed_data_dict = usaddress.tag(address, tag_mapping=mapping)[0]
    final_address = {
        "state": "Ohio",
        "zipCode": parsed_data_dict["zipCode"],
    }
    if "streetNumberName" in parsed_data_dict:
        final_address["streetNumberName"] = parsed_data_dict[
            "streetNumberName"]
    else:
        # Fallback street lines for the counties whose street was not parsed.
        if county_name == "Vinton":
            final_address["streetNumberName"] = "31935 OH-93"
        if county_name == "Brown":
            final_address["streetNumberName"] = "800 Mt. Orab Pike"
    if "city" in parsed_data_dict:
        final_address["city"] = parsed_data_dict["city"]
    if "poBox" in parsed_data_dict:
        final_address["poBox"] = parsed_data_dict["poBox"]
    if "locationName" in parsed_data_dict:
        final_address["locationName"] = parsed_data_dict["locationName"]
        # The parsed location name is replaced for these two counties.
        if county_name == "Vinton":
            final_address["locationName"] = "Community Building"
        if county_name == "Brown":
            final_address["locationName"] = "Administrative Building"
    if "aptNumber" in parsed_data_dict:
        final_address["aptNumber"] = parsed_data_dict["aptNumber"]
        # Vinton's parsed apartment number is discarded again.
        if county_name == "Vinton":
            final_address.pop("aptNumber")
    return final_address
def standardize(address, code = "a"):
    """Return ``address`` as one standardized, space-separated string.

    The address is upper-cased and tagged with usaddress using
    ``label_mappings``, punctuation is stripped from every component except
    the house number (``HN``), replacements are applied through ``clean``
    and, unless ``code`` is "n", a coded value from ``code_dict`` is added
    under ``<label>C`` for every component that has one.

    code: "a" appends the coded values, "r" replaces the originals with
    them, "n" adds none.  Any other value raises ``InputError``.
    """
    if code not in ("a", "r", "n"):
        raise InputError("code must be a (append), r (replace), or n (none)")

    # make case insensitive, apply usaddress parsing
    tagged, _ = usaddress.tag(str(address).upper(), label_mappings)

    # remove punctuation from results (not removed beforehand, as punctuation
    # can affect parsing)
    punctuation_table = str.maketrans('', '', string.punctuation)
    stripped = {}
    for label, words in tagged.items():
        if label == 'HN':
            stripped[label] = words
        else:
            stripped[label] = words.translate(punctuation_table).strip()

    # apply replacements
    substituted = clean(stripped, processing_dict)

    # add codes for directions, extensions, etc. if desired
    if code != "n":
        # Iterate over a snapshot: the dict is modified inside the loop.
        for label, word in list(substituted.items()):
            # confirm label is substitutable and substitution is known
            if label in code_dict and word in code_dict.get(label):
                substituted[label + "C"] = code_dict[label].get(word)
                # remove original value if requested
                if code == "r":
                    substituted.pop(label)

    # add concatenated WSN
    ws_parts = [substituted[name] for name in ("WSDESC1", "WSID1")
                if name in substituted]
    if len(ws_parts) == 1:
        substituted["WS"] = ws_parts[0]
    elif ws_parts:
        substituted["WS"] = " ".join(ws_parts)

    return ' '.join(str(value) for value in substituted.values())
def appendMailingAddress(outrow, row):
    """Fill the MAIL_* fields of ``outrow`` from the MAILADD1..4 columns.

    The four mailing lines of ``row`` are joined and tagged with usaddress.
    A non-empty tagging is handed to
    ``PrepareUtils.appendMailingAddressFromTaggedFields``.  When the tagging
    is empty or fails with RepeatedLabelError, the mailing fields are
    derived from the address data already present in ``outrow``.
    """
    mailing_lines = [
        row['MAILADD1'], row['MAILADD2'], row['MAILADD3'], row['MAILADD4']
    ]
    try:
        tagged_address, address_type = usaddress.tag(' '.join(mailing_lines))
    except usaddress.RepeatedLabelError:
        print('Can\'t parse mailing address. Falling back to res')
        tagged_address = {}

    if tagged_address:
        PrepareUtils.appendMailingAddressFromTaggedFields(
            outrow, tagged_address, address_type)
        return

    outrow.update({
        'MAIL_ADDRESS_LINE1': PrepareUtils.constructMailAddr1FromOutRow(outrow),
        'MAIL_ADDRESS_LINE2': PrepareUtils.constructMailAddr2FromOutRow(outrow),
        'MAIL_CITY': outrow['PLACE_NAME'],
        'MAIL_STATE': outrow['STATE_NAME'],
        'MAIL_ZIP_CODE': outrow['ZIP_CODE'],
        'MAIL_COUNTRY': 'USA'
    })
def getparts(self):
    """Tag ``self.prepped`` and return the tagged parts.

    ``self._type`` is set to the address type reported by usaddress, or to
    "Questionable" when tagging fails with RepeatedLabelError.  In that
    case the returned mapping is built from the exception's token-level
    parse.
    """
    try:
        parts, address_kind = usaddress.tag(self.prepped)
    except usaddress.RepeatedLabelError as err:
        # NOTE(review): OrderedDict(err.parsed_string) is keyed by whatever
        # comes first in each parsed pair - presumably the token, not the
        # label, unlike the mapping returned on success.  Confirm that
        # callers expect this.
        parts = OrderedDict(err.parsed_string)
        address_kind = "Questionable"
    self._type = address_kind
    return parts
def parse_address_string(addr_str):
    # type: (str) -> MutableMapping[str, str]
    """Split an address string into its usaddress components.

    The string is tagged with usaddress.  If usaddress classifies it as
    Ambiguous, or any label of the result is listed in AMBIGUOUS_LABELS,
    an AmbiguousAddressError is raised.  Otherwise the tagged mapping is
    passed through ``handle_abnormal_occupancy`` and returned.

    :param addr_str: str address to be processed.
    :type addr_str: str
    :return: usaddress OrderedDict
    :rtype: MutableMapping
    """
    parsed_addr, address_kind = usaddress.tag(addr_str)

    # The address is parseable, but ambiguity of either kind may result in
    # data corruption, so refuse it.
    if address_kind == 'Ambiguous':
        raise AmbiguousAddressError()
    if any(label in AMBIGUOUS_LABELS for label in parsed_addr):
        raise AmbiguousAddressError()

    return handle_abnormal_occupancy(parsed_addr, addr_str)
def get(self, request):
    """Tag the ``address`` query parameter and return the result.

    Called when the index.html form sends its GET request.

    Returns:
        ``Response`` wrapping the ``usaddress.tag`` result, or a 400
        response with a ``detail`` message when the parameter is missing or
        the address cannot be tagged.
    """
    query = request.GET.get('address')
    if query is None:
        # Tagging None raised an unhandled error (a 500 for the client).
        # NOTE(review): assumes Response accepts ``status=`` as Django REST
        # framework's Response does - confirm against the file's imports.
        return Response(
            {'detail': "Missing required 'address' query parameter."},
            status=400)

    try:
        addressComponents = usaddress.tag(query)
    except usaddress.RepeatedLabelError:
        return Response(
            {'detail': 'Unable to parse this address: a component is repeated.'},
            status=400)

    print(addressComponents)
    return Response(addressComponents)
def extract_mailing_address(self, input_dict):
    """Build the mailing-address fields from the MAIL_ADDR1..3 columns.

    Returns a dict with MAIL_ADDRESS_LINE1, MAIL_ADDRESS_LINE2, MAIL_CITY,
    MAIL_ZIP_CODE, MAIL_STATE and MAIL_COUNTRY.  An empty dict is returned
    when MAIL_ADDR1 is blank, when usaddress reports the address as
    ambiguous (a warning is printed), when nothing was tagged, or when
    tagging fails with RepeatedLabelError (a warning is printed).
    """
    if not input_dict['MAIL_ADDR1'].strip():
        return {}

    try:
        mailing_text = ' '.join(
            input_dict[column]
            for column in ('MAIL_ADDR1', 'MAIL_ADDR2', 'MAIL_ADDR3'))
        tagged_address, address_type = usaddress.tag(mailing_text)

        if address_type == 'Ambiguous':
            print("Warn - {}: Ambiguous mailing address falling back to residential ({})".format(address_type, input_dict['MAIL_ADDR1']))
            return {}
        if not tagged_address:
            return {}

        return {
            'MAIL_ADDRESS_LINE1': self.construct_mail_address_1(
                tagged_address,
                address_type,
            ),
            'MAIL_ADDRESS_LINE2': self.construct_mail_address_2(tagged_address),
            'MAIL_CITY': tagged_address.get('PlaceName', ""),
            'MAIL_ZIP_CODE': tagged_address.get('ZipCode', ""),
            'MAIL_STATE': tagged_address.get('StateName', ""),
            'MAIL_COUNTRY': ""
        }
    except usaddress.RepeatedLabelError as e:
        print('Warn: Can\'t parse mailing address. Falling back to residential ({})'.format(e.parsed_string))
        return {}
def label_parsing(address):
    """Tag ``address`` and return its cleaned components as a dict.

    The address is upper-cased and tagged using ``label_mappings``.
    Punctuation is stripped from every component except the house number
    (``HN``).  A house number made of two numeric parts around a separator
    is additionally split into ``HN1``, ``HNSEP`` and ``HN2``.  ``WS`` is
    added as the concatenation of ``WSDESC1`` and ``WSID1`` when either is
    present.

    Returns ``{'ERROR': address}`` when tagging fails with
    RepeatedLabelError.
    """
    try:
        # make case insensitive, apply usaddress parsing
        tagged, _ = tag(address.upper(), label_mappings)
    except RepeatedLabelError:
        # exit if parsing fails
        return {'ERROR' : address}

    # remove punctuation from results (not removed beforehand, as punctuation
    # can affect parsing)
    punctuation_table = str.maketrans('', '', string.punctuation)
    stripped = {}
    for label, words in tagged.items():
        if label == 'HN':
            stripped[label] = words
        else:
            stripped[label] = words.translate(punctuation_table).strip()

    if 'HN' in stripped:
        house_number = stripped['HN']
        # Everything that is not a digit is treated as the separator.
        separator = "".join(ch for ch in house_number if not ch.isnumeric())
        if separator:
            pieces = house_number.split(separator)
            if len(pieces) == 2:
                stripped["HN1"] = pieces[0]
                stripped["HNSEP"] = separator
                stripped["HN2"] = pieces[1]

    # add concatenated WSN
    ws_parts = [stripped[name] for name in ("WSDESC1", "WSID1")
                if name in stripped]
    if len(ws_parts) == 1:
        stripped["WS"] = ws_parts[0]
    elif ws_parts:
        stripped["WS"] = " ".join(ws_parts)

    return stripped
def process_row(row, commit=False):
    """Create a CSSCase from one input row when it holds a street address.

    Columns read from ``row``: 1 (address number), 2 (street name),
    5 (description), 6 (resolution) and 9 (closed flag).  The number and
    street are joined and tagged with usaddress; for a 'Street Address' the
    tagged number (converted to int) and street name (upper-cased) are used
    in ``CSSCase.objects.get_or_create``.

    NOTE(review): this function was recovered from source whose indentation
    was lost; the nesting below is a reconstruction and must be confirmed
    against the upstream file.  As reconstructed:
      * ``closed_status``/``active_status`` are only assigned when ``commit``
        is true, yet they are used unconditionally further down, so a call
        with ``commit=False`` that reaches get_or_create raises NameError;
      * ``tagged[0].get('AddressNumber')`` may be None, and ``int(None)``
        raises TypeError, which ``except ValueError`` does not catch.
    """
    address_number = row[1]
    street_name = row[2]
    description = row[5].strip()
    resolution = row[6].strip()
    closed = row[9]
    if commit:
        closed_status, _ = CaseStatus.objects.get_or_create(name='Closed')
        active_status, _ = CaseStatus.objects.get_or_create(name='Active')
    address = "{} {}".format(address_number, street_name)
    tagged = usaddress.tag(address)
    address_type = tagged[1]
    if address_type == 'Street Address':
        address_number = tagged[0].get('AddressNumber')
        street_name = tagged[0].get('StreetName')
        try:
            address_number = int(address_number)
        except ValueError:
            # malformed address_number
            return
        if address_number and street_name:
            CSSCase.objects.get_or_create(
                description=description,
                resolution=resolution,
                # and/or idiom: closed_status when ``closed`` is truthy,
                # otherwise active_status.
                status=closed and closed_status or active_status,
                address_number=address_number,
                street_name=street_name.upper()
            )
def formatAddressData(address, countyName):
    """Build the address dict for an Illinois county election office.

    ``address`` is tagged with usaddress using
    ``electionsaver.addressSchemaMapping``.  The result always carries
    ``"state": "Illinois"`` and the parsed ``zipCode`` (a KeyError is raised
    when no zip code was parsed) plus whichever optional fields were found,
    with hard-coded values for Cumberland and Mason counties.

    NOTE(review): this function was recovered from source whose indentation
    was lost.  Whether the Mason/Cumberland overrides for city and PO box
    sit at function level (as written here) or inside the preceding ``if``
    is a reconstruction; confirm against the upstream file.
    """
    mapping = electionsaver.addressSchemaMapping
    parsedDataDict = usaddress.tag(address, tag_mapping=mapping)[0]
    finalAddress = {
        "state": "Illinois",
        "zipCode": parsedDataDict["zipCode"],
    }
    if "streetNumberName" in parsedDataDict:
        finalAddress["streetNumberName"] = parsedDataDict["streetNumberName"].title()
    else:
        # Fallback street lines for the counties whose street was not parsed.
        if countyName == "Cumberland":
            finalAddress["streetNumberName"] = "140 COURTHOUSE SQUARE".title()
        if countyName == "Mason":
            finalAddress["streetNumberName"] = "100 NORTH BROADWAY".title()
    if "city" in parsedDataDict:
        finalAddress["city"] = parsedDataDict["city"].title()
    if countyName == "Mason":
        finalAddress["city"] = "HAVANA".title()
    if "poBox" in parsedDataDict:
        finalAddress["poBox"] = parsedDataDict["poBox"]
    if countyName == "Cumberland":
        finalAddress["poBox"] = 'PO BOX 146'.title()
    if countyName == "Mason":
        finalAddress["poBox"] = "PO BOX 77".title()
    if "locationName" in parsedDataDict:
        finalAddress["locationName"] = parsedDataDict["locationName"]
    if "aptNumber" in parsedDataDict:
        finalAddress["aptNumber"] = parsedDataDict["aptNumber"].title()
    return finalAddress
def lookup_geo(g, ady, verbose=False):
    """Geocode the address string ``ady`` through ``g`` and return its schema.

    The address is tagged with usaddress; the house number, the joined
    ``StreetName*`` components and a borough derived from the place name are
    passed to ``g.address``.  The fields of the returned mapping are wrapped
    in a ``RefLocation`` whose ``schema_object()`` is returned.

    Args:
        g: geocoder object exposing ``address(number, street, borough)``.
        ady: free-form address string.
        verbose: print the intermediate values when true.
    """
    # print() with a single argument behaves the same on Python 2 and 3; the
    # original print statements are a syntax error on Python 3.
    if verbose:
        print('Lookup_geo:\n\t%s' % ady)

    tagged = usaddress.tag(ady)
    tags = tagged[0]
    addressNumber = tags.get('AddressNumber', '')
    streetName = ' '.join(
        value for label, value in tags.items() if label.startswith('StreetName'))

    # Map the place name onto a borough; the first matching rule wins.
    # NOTE(review): 'ny' is a plain substring test, so any place name that
    # merely contains "ny" is treated as Manhattan.
    borough = tags.get('PlaceName', '').lower()
    borough_rules = (
        (('ny', 'manhattan'), 'manhattan'),
        (('queens',), 'queens'),
        (('brooklyn',), 'brooklyn'),
        (('bronx',), 'bronx'),
        (('staten island',), 'staten island'),
    )
    for needles, name in borough_rules:
        if any(needle in borough for needle in needles):
            borough = name
            break

    if verbose:
        # Reuse the tagging from above instead of tagging a second time.
        print(tagged)
        print('adNumber: %s\t\tstName: %s\t\tBorough:%s\n\n' % (
            addressNumber, streetName, borough))

    dic = g.address(addressNumber, streetName, borough)
    zipcode = dic.get('zipCode', '')
    streetAddress = '%s %s' % (dic.get('houseNumber', ''),
                               dic.get('firstStreetNameNormalized', ''))
    borough = dic.get('firstBoroughName', '')
    longitude = dic.get('longitude', '')
    latitude = dic.get('latitude', '')

    place = RefLocation(streetAddress, borough, zipcode, latitude, longitude)
    return place.schema_object()
def searchParse(query, filters):
    """Redirect a search query to the report page or the listings page.

    The query is converted to a string, capitalised word by word and tagged
    with usaddress.  A 'Street Address' goes to ``.report``; anything else
    goes to ``.listings`` together with ``filters``.
    """
    query = string.capwords(str(query))
    # tag() returns the parsed components and the address type.
    _, address_kind = usaddress.tag(query)

    if address_kind != 'Street Address':
        return redirect(url_for('.listings', address=query, filters=filters))
    return redirect(url_for('.report', address=query))
def tag_address(address):
    """Return ``usaddress.tag(address)``, or None when tagging fails.

    ``usaddress.RepeatedLabelError`` and ``UnicodeEncodeError`` are swallowed
    and reported to the caller as a ``None`` result.
    """
    try:
        return usaddress.tag(address)
    except (usaddress.RepeatedLabelError, UnicodeEncodeError):
        return None
def get_region(self):
    """Set ``self.region`` from the place name of ``self.incd_address``.

    Nothing happens when ``incd_address`` is empty or when the tagged
    address has no ``PlaceName``. When usaddress cannot tag the address
    (``RepeatedLabelError``), ``self.region`` is set to ``None``.
    Otherwise the region is the text after the last comma of the place
    name, stripped of surrounding whitespace.
    """
    if not self.incd_address:
        return
    try:
        tag = usaddress.tag(self.incd_address)[0]
    except usaddress.RepeatedLabelError:
        self.region = None
        return
    # `in` replaces dict.has_key(), which does not exist on Python 3.
    if 'PlaceName' in tag:
        # str.split always yields at least one element, so [-1] is safe.
        self.region = tag['PlaceName'].split(",")[-1].strip()
def generateListing(query):
    """Return the ``Listing`` stored for ``query``, creating it if needed.

    An existing row with ``raw_add == query`` is returned as is. Otherwise
    the query is tagged with usaddress, a ``Listing`` plus its ``Tax``,
    ``School``, ``Crime``, ``Geo`` and ``Image`` rows are built from values
    derived from ``getHash(query)``, everything is added to the session and
    committed, and the new listing is returned.
    """
    existing = Listing.query.filter_by(raw_add=query).first()
    if existing:
        return existing

    # not currently in db, note not robust
    add_Dict = usaddress.tag(query)[0]

    state = add_Dict.get("StateName", "Unknown")
    city = add_Dict.get("PlaceName", "Unknown")
    zipcode = add_Dict.get("ZipCode", "Unknown")

    # A missing house number leaves a leading space, as before.
    street_address = add_Dict.get("AddressNumber", "")
    for label in ("StreetName", "StreetNamePostType"):
        if label in add_Dict:
            street_address = street_address + " " + add_Dict[label]

    hashed = getHash(query)
    lis = Listing(
        raw_add=query,
        street_address=street_address,
        state=state,
        city=city,
        zipcode=zipcode,
        area=getArea(hashed),
        price=getPrice(hashed),
        bedrooms=getRm(hashed),
        bathrooms=(getRm(hashed) - 1),
        realtor=getRealtor(hashed),
        seller=getSeller(hashed),
        school_district=getSchoolDis(hashed),
    )
    db.session.add(lis)

    # populate tabs
    tax = Tax(rate=getTaxRate(hashed), listing=lis)
    schools = School(
        elementarySchool=getElemSchool(hashed),
        highSchool=getHighSchool(hashed),
        listing=lis,
    )
    crim = Crime(
        rate=getCrimeRate(hashed),
        most_frequent_crime=getFrequentCrime(hashed),
        listing=lis,
    )
    geo = Geo(
        most_frequent_incident=getGeoIncident(hashed),
        most_recent_incident=getDate(hashed),
        listing=lis,
    )
    img = Image(path=getHouseImage(hashed), listing=lis)

    for row in (img, tax, schools, crim, geo):
        db.session.add(row)
    db.session.commit()
    return lis
def _parse(address_string):
    """Parse an address string into atx address parts.

    The string is passed through ``_sanitize`` and ``_pre_hack``, tagged
    with usaddress, then through ``_translate_to_atx`` and ``_post_hack``,
    whose result is returned (described as a list by the original author).

    :raises TypeError: with ``Messages.str_req`` when the argument's type
        is not exactly ``str``
    """
    if type(address_string) is not str:
        raise TypeError(Messages.str_req)

    prepared = _pre_hack(_sanitize(address_string))
    tagged = usaddress.tag(prepared)
    return _post_hack(_translate_to_atx(tagged))
def addresscheck(df):
    """Sort the rows of ``df`` into four module-level result frames.

    Columns 1, 2, 4 and 5 of each row are joined with spaces and tagged
    with usaddress. Rows whose address type is not ``'Ambiguous'`` go to
    ``buffadd`` when the text mentions 'University at Buffalo' and to
    ``usadd`` otherwise; ambiguous rows go to ``wrongbuff`` or
    ``wrongaddress`` by the same test. Each row is appended at the end of
    the chosen frame.
    """
    for index, row in df.iterrows():
        addressstring = row[1] + ' ' + row[2] + ' ' + row[4] + ' ' + row[5]
        tagged = usaddress.tag(addressstring)
        on_campus = 'University at Buffalo' in addressstring

        if tagged[1] != 'Ambiguous':
            target = buffadd if on_campus else usadd
        else:
            target = wrongbuff if on_campus else wrongaddress
        target.loc[len(target)] = df.iloc[index]
def parse_with_usaddress_tag(self, addr_str):
    """Parse an address string using usaddress's ``tag()`` function.

    :return: one ``{'code': ..., 'value': ...}`` dict per tagged component,
        where ``code`` is looked up in ``self.standard_part_mapping``
    :raises AddressParserError: when usaddress raises ``RepeatedLabelError``
    """
    try:
        components = usaddress.tag(addr_str)[0]
    except usaddress.RepeatedLabelError:
        # FIXME: Add richer logging here with contents of `rle` or chain exception w/ Python 3
        # FIXME: Shouldn't leak details of 'tag' method since it not longer a param
        raise AddressParserError("Could not parse address '{}' with 'tag' method".format(addr_str))

    addr_parts = []
    for label, value in components.items():
        addr_parts.append({'code': self.standard_part_mapping[label], 'value': value})
    return addr_parts
def listings(address, filters):
    """Render the listings page for a free-form location query.

    The query is tagged with usaddress. A zip code, when present, selects
    listings by zip code; otherwise a city plus state selects by city and
    upper-cased state. When neither is available (or the query cannot be
    tagged) a message is flashed and the page is rendered with no listings.
    The selected listings are passed through ``apply_filters``.

    :param address: raw location query
    :param filters: filter state applied to the result and echoed to the
        template
    :return: the rendered ``listings.html`` template
    """
    try:
        parsed = usaddress.tag(address)[0]
    except usaddress.RepeatedLabelError:
        # Treat an untaggable query like one without zip/city/state.
        parsed = {}

    results = []
    if 'ZipCode' in parsed:
        results = list(
            Listing.query.filter_by(zipcode=parsed['ZipCode'])
            .order_by(Listing.timestamp.desc()))
    # Both keys must be tested explicitly: `'a' and 'b' in d` only checks 'b'.
    elif 'PlaceName' in parsed and 'StateName' in parsed:
        state = parsed['StateName'].upper()
        city = parsed['PlaceName']
        if 'StreetNamePreDirectional' in parsed and 'StreetName' in parsed:
            # Fold the street components back into the city name.
            city = (parsed['StreetNamePreDirectional'] + " "
                    + parsed['StreetName'] + " " + city)
        results = list(
            Listing.query.filter_by(city=city, state=state)
            .order_by(Listing.timestamp.desc()))
    else:
        flash('Please enter a valid address including either a zipcode or city name and state.')

    results = apply_filters(results, filters)
    return render_template(
        'listings.html',
        act_filters=filters,
        query=address,
        lots=False,
        address=address,
        favs=False,
        search=g.search,
        searchbar=True,
        listings=results,
        count=len(results),
    )
def rend(self):
    """Split ``self.addr`` on ';' into a two-item tuple.

    Returns, in order of checks:

    * ``(self.addr, '')`` when there is no ';'
    * ``(first, second)`` when the part after the first ';' is non-empty
    * when that part is empty, the first part is tagged with usaddress:
      ``(first, None)`` if ``PlaceName`` or ``StateName`` is missing,
      ``('', first)`` if both are present, and ``(first, '')`` if
      usaddress raises ``RepeatedLabelError``

    Note the ``None`` in the second case above is kept as is for callers.
    """
    if ';' not in self.addr:
        return self.addr, ''

    temp = self.addr.split(';')
    if temp[1] != '':
        return temp[0], temp[1]

    try:
        # Tag once and reuse the result for both key checks.
        tags = usaddress.tag(temp[0])[0]
    except usaddress.RepeatedLabelError:
        return temp[0], ''
    if 'PlaceName' not in tags or 'StateName' not in tags:
        return temp[0], None
    return '', temp[0]
def addresscheck(df, usadd, wrongaddress):
    """Sort the rows of ``df`` into ``usadd`` and ``wrongaddress``.

    The 'Address 1', 'Address 2', 'City', 'State' and 'Zipcode' columns of
    each row are joined with spaces and tagged with usaddress:

    * rows usaddress cannot tag (``RepeatedLabelError``) are appended to
      ``wrongaddress``
    * rows mentioning '14261', '14260', 'Clement' or 'Goodyear' are
      otherwise skipped
    * remaining rows go to ``usadd``, or to ``wrongaddress`` when the
      address type is ``'Ambiguous'``

    Uses the Python 2 ``unicode`` builtin, as the original did.

    :param df: input frame iterated with ``iterrows``
    :param usadd: frame receiving accepted rows (appended in place)
    :param wrongaddress: frame receiving rejected rows (appended in place)
    """
    columns = ('Address 1', 'Address 2', 'City', 'State', 'Zipcode')
    skip_markers = ('14261', '14260', 'Clement', 'Goodyear')

    for index, row in df.iterrows():
        addressstring = u' '.join(unicode(row[col]) for col in columns)
        try:
            tagged = usaddress.tag(addressstring)
        except usaddress.RepeatedLabelError:
            wrongaddress.loc[len(wrongaddress)] = df.iloc[index]
            # Without this the code below would read an undefined or stale
            # parse result and could file the row a second time.
            continue

        if any(marker in addressstring for marker in skip_markers):
            continue
        target = usadd if tagged[1] != 'Ambiguous' else wrongaddress
        target.loc[len(target)] = df.iloc[index]
def clean_street(address1, address2='', zipcode=None):
    """Clean street address.

    :param address1: first address line
    :param address2: second address line
    :param zipcode: optional zip code; when given it is appended (with a
        placeholder city/state) so usaddress tags the text as a street
        address rather than a full address
    :return: ``address2`` when line 1 is a "c/o" line; otherwise the
        components re-formatted through ``component_format`` when usaddress
        classifies the text as 'Street Address' or 'PO Box'; otherwise the
        stripped concatenation of both lines
    """
    # matchers that rely on knowing address lines
    # Strip "care of" lines
    if re.match(r'c/o ', address1, re.IGNORECASE):
        return address2

    # concatenate line 1 and line 2
    address = '{} {}'.format(address1, address2).strip()
    if not address:
        return address

    # Add arbitrary city/state/zip to get `usaddress` to parse the address
    # as just the street address and not a full address
    address_to_parse = (
        '{}, Austin, TX {}'.format(address, zipcode) if zipcode else address)
    try:
        addr, type_ = usaddress.tag(address_to_parse)
    except usaddress.RepeatedLabelError:
        logger.warning('Unparseable address: {}'.format(address_to_parse))
        return address

    if zipcode:
        # Drop the placeholder city/state and the zip we appended above.
        addr.pop('PlaceName', None)
        addr.pop('StateName', None)
        guessed_zip = addr.pop('ZipCode', None)
        if guessed_zip is None:
            # Report the actual zip code, not the literal word 'zipcode'.
            logger.warning(
                'Expected a zipcode {} but found none'.format(zipcode))
        elif guessed_zip != zipcode:
            # Explicit comparison: an assert would vanish under `python -O`.
            logger.warning('Guessed the wrong zipcode {} != {}'
                           .format(guessed_zip, zipcode))

    if type_ in ('Street Address', 'PO Box'):
        return ' '.join(
            [component_format(label, value) for label, value in addr.items()])

    logger.warning('Ambiguous address: {}'.format(address), extra=addr)
    return address
def create_address_url(raw_address_text):
    """Build '+'-separated URL fragments from a raw address.

    The address is tagged with usaddress. The street fragment joins the
    present components among AddressNumber, StreetName, StreetNamePostType,
    OccupancyType and OccupancyIdentifier; the city/state/zip fragment is
    ``PlaceName + '%2C ' + StateName + ' ' + ZipCode`` with missing parts
    empty. Spaces become '+' in both.

    :return: ``(street_fragment, citystatezip_fragment, walkscore_fragment)``
        where the last is the first two joined by a comma
    """
    components = usaddress.tag(raw_address_text)[0]

    street_labels = (
        'AddressNumber',
        'StreetName',
        'StreetNamePostType',
        'OccupancyType',
        'OccupancyIdentifier',
    )
    street_parts = [
        components[label]
        for label in street_labels
        if components.get(label) is not None
    ]
    address_url_encode = ' '.join(street_parts).replace(' ', '+').strip()

    citystatezip_string = (
        components.get('PlaceName', '')
        + '%2C ' + components.get('StateName', '')
        + ' ' + components.get('ZipCode', '')
    )
    citystatezip_url_encode = citystatezip_string.strip().replace(' ', '+')

    address_for_walkscore = address_url_encode + "," + citystatezip_url_encode
    return address_url_encode, citystatezip_url_encode, address_for_walkscore
def _normalize_address_str(address_val): """ Normalize the address to conform to short abbreviations. If an invalid address_val is provided, None is returned. If a valid address is provided, a normalized version is returned. """ # if this string is empty the regular expression in the sa wont # like it, and fail, so leave returning nothing if not address_val: return None # now parse the address into number, street name and street type addr = usaddress.tag(str(address_val))[0] # TODO: should probably use unicode() normalized_address = '' if not addr: return None if 'AddressNumber' in addr and addr['AddressNumber'] is not None: normalized_address = addr['AddressNumber'].lstrip("0") # some addresses have leading zeros, strip them here if 'StreetNamePreDirectional' in addr and addr['StreetNamePreDirectional'] is not None: normalized_address = normalized_address + ' ' + _normalize_address_direction(addr['StreetNamePreDirectional']) if 'StreetName' in addr and addr['StreetName'] is not None: normalized_address = normalized_address + ' ' + addr['StreetName'] if 'StreetNamePostType' in addr and addr['StreetNamePostType'] is not None: # remove any periods from abbreviations normalized_address = normalized_address + ' ' + _normalize_address_post_type(addr['StreetNamePostType']) if 'StreetNamePostDirectional' in addr and addr['StreetNamePostDirectional'] is not None: normalized_address = normalized_address + ' ' + _normalize_address_direction(addr['StreetNamePostDirectional']) formatter = StreetAddressFormatter() normalized_address = formatter.abbrev_street_avenue_etc(normalized_address) return normalized_address.lower().strip()
def __parse(self, d):
    """Fill address component fields of ``d`` from ``d['address']``.

    The address (periods replaced by spaces, upper-cased) is tagged with
    usaddress. Without a ``StreetName`` nothing is written. Otherwise
    ``street_name`` is set and, depending on which components were tagged,
    ``house_number``, ``pre_direction``, ``street_type``, ``suf_direction``
    and ``unit``. Components that fail validation (non-numeric house
    number, direction not in ``self.__directions``, type not in
    ``street_abbrs``) are folded into ``street_name`` instead.

    :raises Exception: when tagging fails for any reason
    """
    from usaddress import tag

    try:
        # Note that we replace . with space
        addr = tag(d['address'].replace('.', ' ').upper())[0]
    except Exception:
        raise Exception('Unable to parse address %s' % (d['address'],))

    if 'StreetName' not in addr:
        return

    d['street_name'] = addr['StreetName'].replace(' ', '')

    if 'AddressNumber' in addr:
        d['house_number'] = StrLib.extract_numeric(addr['AddressNumber'])
        if not d['house_number'].isnumeric():
            # Not a usable number: keep the raw text as part of the street.
            d['street_name'] = '%s %s' % (addr['AddressNumber'], d['street_name'])
            d['house_number'] = ''

    if 'StreetNamePreType' in addr:
        d['street_name'] = '%s%s' % (addr['StreetNamePreType'], d['street_name'])

    if 'StreetNamePreDirectional' in addr:
        pre_direction = addr['StreetNamePreDirectional'].replace('.', '')
        d['pre_direction'] = pre_direction
        if pre_direction not in self.__directions:
            d['street_name'] = '%s %s' % (pre_direction, d['street_name'])
            d['pre_direction'] = ''

    if 'StreetNamePostType' in addr:
        street_type = addr['StreetNamePostType'].replace('.', '')
        d['street_type'] = street_type
        known_type = (street_type in street_abbrs
                      or street_type in street_abbrs.values())
        if not known_type:
            d['street_name'] = '%s%s' % (d['street_name'], street_type)
            d['street_type'] = ''

    if 'StreetNamePostDirectional' in addr:
        suf_direction = addr['StreetNamePostDirectional'].replace('.', '')
        d['suf_direction'] = suf_direction
        if suf_direction not in self.__directions:
            d['street_name'] = '%s %s' % (d['street_name'], suf_direction)
            d['suf_direction'] = None

    if 'OccupancyIdentifier' in addr:
        d['unit'] = addr['OccupancyIdentifier']
def main(**kwargs):
    """The main function of the script.

    Performs most of the address parsing work and all output. Double
    quotes are removed from every input line, anything from the comma that
    follows a run of five digits onward is dropped, and the rest is tagged
    with usaddress. Tagged addresses are written to the output, one per
    line, via ``parsed_address_to_human``; lines usaddress rejects
    (``RepeatedLabelError``) are echoed to stderr with a final count.
    Both files are closed.

    :param input: a text file object that will be read from. Should contain
                  address-like data, one address per line
    :param output: a text file object where parsed output will be written.
                   Parsed output will be similar to CSV data
    :param remove_post_zip: documented as a switch for the post-zip
                   stripping, but not read here: the stripping is always
                   applied
    :type input: text file object in read mode
    :type output: text file object in write mode
    :type remove_post_zip: bool
    :return: 0
    """
    source = kwargs['input']
    try:
        lines = [line.replace('"', '').strip() for line in source]
    finally:
        source.close()
    print('{} lines in input'.format(len(lines)))

    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    county = re.compile(r'(\d{5}),.*')
    parsed = []
    errored = []
    for line in lines:
        try:
            parsed.append(usaddress.tag(county.sub(r'\1', line)))
        except usaddress.RepeatedLabelError:
            errored.append(line)

    sink = kwargs['output']
    try:
        for address in parsed:
            sink.write(','.join(parsed_address_to_human(address)) + '\n')
    finally:
        sink.close()

    for error in errored:
        sys.stderr.write('{}\n'.format(error))
    if errored:
        sys.stderr.write('{} lines unable to be parsed\n'.format(len(errored)))
    return 0
def __init__(self, address_str):
    """Parse ``address_str`` with usaddress and store its components.

    Required components (``ValueError`` when missing): AddressNumber,
    StreetName, StreetNamePostType, PlaceName, StateName.
    Optional components (``None`` when missing): OccupancyIdentifier,
    OccupancyType, StreetNamePreDirectional, ZipCode.

    :param address_str: raw address text, kept as ``self.address_str``
    :raises ValueError: when a required component was not tagged
    """
    # Retaining the raw string
    self.address_str = address_str

    # The tag method will try to be a little smarter:
    # it will merge consecutive components, strip commas, & return an address type
    # expected output: (OrderedDict([('AddressNumber', u'123'), ('StreetName', u'Main'),
    # ('StreetNamePostType', u'St.'), ('OccupancyType', u'Suite'), ('OccupancyIdentifier', u'100'),
    # ('PlaceName', u'Chicago'), ('StateName', u'IL')]), 'Street Address')
    # tag() returns a (components, address_type) tuple; only the components
    # mapping supports key lookups.
    address_dict, _address_type = usaddress.tag(address_str)

    def required(label):
        # `raise` is a statement and cannot sit inside a conditional
        # expression, so the check lives in this helper.
        if label not in address_dict:
            raise ValueError('Address %s not provided' % label)
        return address_dict[label]

    self.addressNumber = required('AddressNumber')
    self.streetName = required('StreetName')
    self.streetNamePostType = required('StreetNamePostType')
    self.placeName = required('PlaceName')
    self.stateName = required('StateName')

    # Optional values
    self.occupancyIdentifier = address_dict.get('OccupancyIdentifier')
    self.occupancyType = address_dict.get('OccupancyType')
    self.streetNamePreDirectional = address_dict.get('StreetNamePreDirectional')
    self.zipCode = address_dict.get('ZipCode')
# Split a free-form region string into a dict with 'City', 'State' and 'Zipcode' keys
# (all initialised to ''), optionally adding 'PO'.
# Steps visible in the code:
#   * a None region returns the blank dict; otherwise runs of whitespace are collapsed;
#   * if usaddress tags both USPSBoxType and USPSBoxID, "<type> <id>" is removed from the
#     region and stored under 'PO' (a RepeatedLabelError is ignored);
#   * a "dublin <1-2 digits>" match sets City/Zipcode and hands off to self.international;
#   * a US-style zip (5 digits, optional 4-digit extension) hands off to self.domestic;
#   * otherwise each pattern in regex_base is tried in order; the first match becomes
#     'Zipcode' and is replaced by ',' in the region before self.international is called,
#     and with no match the region goes to self.domestic.
# The region is printed on every regex_base iteration (looks like leftover debug output).
# NOTE(review): the definition is collapsed onto two lines, so the exact nesting of the
# trailing if/else branches is inferred from statement order -- confirm against the
# original indentation before restructuring.
def region_parse(self, region): # Scans for international zip codes, using the regex forms stored in the array below. regex_base = ['NSW \d{4}','\W(GIR|[A-Z]\d[A-Z\d]??|[A-Z]{2}\d[A-Z\d]??)[ ]??(\d[A-Z]{2})\W', '\W((?:0[1-46-9]\d{3})|(?:[1-357-9]\d{4})|(?:[4][0-24-9]\d{3})|(?:[6][013-9]\d{3}))\W', '\W([ABCEGHJKLMNPRSTVXY]\d[ABCEGHJKLMNPRSTVWXYZ])\ {0,1}(\d[ABCEGHJKLMNPRSTVWXYZ]\d)\W', '\W(F-)?((2[A|B])|[0-9]{2})[0-9]{3}\W', '\W(V-|I-)?[0-9]{5}\W', '\W[^\W\d_]{2}-\d{4}\W', '\W\d{6}\W', '\W(0[289][0-9]{2})|([1345689][0-9]{3})|(2[0-8][0-9]{2})|(290[0-9])|(291[0-4])|(7[0-4][0-9]{2})|(7[8-9][0-9]{2})\W', '\W[1-9][0-9]{3}\s?([a-zA-Z]{2})?\W', '\W([1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3}\W', '\W([D-d][K-k])?( |-)?[1-9]{1}[0-9]{3}\W', '\W(s-|S-){0,1}[0-9]{3}\s?[0-9]{2}\W', '\W[1-9]{1}[0-9]{3}\W', '\W[^\W\d_]{2}\d \d[^\W\d_]{2}\W', '\W[^\W\d_]{2}/d[^\W\d_] /d[^\W\d_]{2}\W', '\W\d{3}-\d{4}\W', '\W\d{3}-\d{3}\W', '\W[^\W\d_]{2}\d \d[^\W\d_]\W', '\W[^\W\d_]\d{6}\W', '\W[^\d_]{2}\d\W', '\W[^\W\d_]{2}-\d{4}\W', '\W[^\W\d_]{2}\d \d[^\W\d_]{2}\W', '\W[^\W\d_]{3} \d{4}\W', '\W[^\W\d_]-\d{4}\W', '\W[^\W\d_]\d{2} [^\W\d_]{3}\W'] # Initialize this with blank values but valid keys, so I can just break out of this method while returning the # appropriate data structure if I need to. filtered_region = {'City': '', 'State': '', 'Zipcode': ''} # Initialized as an empty variable so that the method can also use it as a boolean if it is never called zippy = None # Must go before basic parsing, or else blank spaces will throw it off. if region is None: return filtered_region region = ' '.join(region.split()) # Scanning for PO numbers in the latter part of the address. If one is found, the same procedure as # the zipcodes is followed. 
try: holder = usaddress.tag(region) if 'USPSBoxType' in holder[0] and 'USPSBoxID' in holder[0]: po = "%s %s" % (holder[0]['USPSBoxType'], holder[0]['USPSBoxID']) region = region.replace(po, '') filtered_region['PO'] = po except usaddress.RepeatedLabelError: pass dublin = re.search("dublin \d(\d)?", region.lower()) if dublin: dubb = dublin.group(0) dub = dubb.split(' ') filtered_region['City'] = dub[0].capitalize() filtered_region['Zipcode'] = dub[1] region = ((region.lower()).replace(dubb, ',')).capitalize() return self.international(region, filtered_region) result = re.search("\W\d{5}([\-]?\d{4})?\W", " %s " % region) if not result: # Finds the international zipcodes, and then strips them from the region string for reg in regex_base: print(region) region = ' '.join(region.split()) results = re.search(reg, " %s " % region) if results: zippy = (results.group(0)).strip() filtered_region['Zipcode'] = zippy # print(zippy) region = region.replace(zippy, ',') break if not zippy: # Back to American Parsing. These addresses that make it down here should all be domestic. filtered_region = self.domestic(region, filtered_region) return filtered_region else: # American Parsing. These addresses that make it down here should all be domestic. filtered_region = self.domestic(region, filtered_region) return filtered_region # International Parsing. Special cases for them. if zippy: return self.international(region, filtered_region) return filtered_region
# Command-line helper: tag the address given as the first argument with
# usaddress and print the (components, address_type) result as JSON.
import json
import sys

import usaddress

addr = sys.argv[1]
addr_parsed = usaddress.tag(addr)
print(json.dumps(addr_parsed))
def domestic(region, filtered_region):
    """Fill ``filtered_region`` from a region string using usaddress.

    A region without a space or comma is stored as 'City' untouched.
    Otherwise the region is tagged and 'Street', 'State', 'Zipcode' and
    'City' are written (missing components become ''), plus 'PO',
    'Country', 'Recipient' and 'LandmarkName' when the matching components
    were tagged. When usaddress raises ``RepeatedLabelError`` the dict is
    returned unchanged.

    :param region: region text
    :param filtered_region: dict updated in place
    :return: ``filtered_region``
    """
    region = region.strip()
    if ' ' not in region and ',' not in region:
        filtered_region['City'] = region
        return filtered_region

    # Because Washington DC has some odd formatting and weird state issues, it
    # gets a special case that is recognised with fuzzy string matching.
    citi = None
    if fuzz.partial_ratio(region, 'Washington D.C') > 90:
        citi = 'Washington, D.C.'
        for token in ('Washington', 'DC', 'D.C.'):
            region = region.replace(token, '')

    # The meat of the US address parsing. If the parsing returns an error,
    # nothing is filled in and it will end up in the problem file stack.
    try:
        clean = usaddress.tag(region.strip(','))
    except usaddress.RepeatedLabelError:
        return filtered_region
    components = clean[0]

    # Street addresses returned by the parser may contain some or none of
    # these elements; absent ones contribute an empty string.
    street_labels = (
        'AddressNumber',
        'StreetNamePreDirectional',
        'StreetNamePreModifier',
        'StreetName',
        'StreetNamePostType',
        'StreetNamePostDirectional',
    )
    street = ' '.join(components.get(label, '') for label in street_labels)
    filtered_region['Street'] = ' '.join(street.split())

    filtered_region['State'] = components.get('StateName', '')
    filtered_region['Zipcode'] = components.get('ZipCode', '')

    # Enters Washington DC
    filtered_region['City'] = citi if citi else components.get('PlaceName', '')

    # Adds on the PO box stuff into the dictionary.
    if 'USPSBoxType' in components and 'USPSBoxID' in components:
        filtered_region['PO'] = "%s %s" % (components['USPSBoxType'], components['USPSBoxID'])

    for label, key in (('CountryName', 'Country'),
                       ('Recipient', 'Recipient'),
                       ('LandmarkName', 'LandmarkName')):
        if label in components:
            filtered_region[key] = components[label]
    return filtered_region
# Script fragment (Python 2: uses the `print` statement).
# Reads rows from 'uscleaned.csv', removes every 'US' substring from column 4, prints it and
# tags it with usaddress. PlaceName, StateName, ZipCode and Country are read from the tagged
# components with '' as the fallback for a missing key, after which PlaceName and StateName
# are set to None on the components mapping.
# NOTE(review): the snippet is truncated -- the handler of the outer `try` and whatever fills
# new_rows / problem_rows are not visible here.
import usaddress from usaddress import RepeatedLabelError import csv readfile = open('uscleaned.csv', 'rb') reader = csv.reader(readfile) new_rows = [] problem_rows = [] for row in reader: row[4] = row[4].replace('US', '') print row[4] try: address = usaddress.tag(row[4])[0] try: city = address['PlaceName'] except KeyError: city = '' try: state = address['StateName'] except KeyError: state = '' try: zipcode = address['ZipCode'] except KeyError: zipcode = '' try: country = address['Country'] except KeyError: country = '' address['PlaceName'] = None address['StateName'] = None
def tagger(self, field):
    """Tag ``field`` with ``usaddress.tag`` and return its result unchanged."""
    tagged = usaddress.tag(field)
    return tagged
def test_broadway(self):
    """Smoke test: tagging this Broadway intersection address must not raise."""
    address = '1775 Broadway And 57th, Newyork NY'
    usaddress.tag(address)
# Returns a 2-tuple (address_info, atty_info):
#   * atty_info always has the keys 'email', 'fax' and 'phone' (default '');
#   * address_info is {} when `c` is empty, when usaddress raises RepeatedLabelError or
#     UnicodeEncodeError, or when the address type is 'Ambiguous' or a CountryName was tagged;
#     otherwise it is the tagged address (component labels collapsed through `mapping`) after
#     normalize_us_state / normalize_address_info, with 'lookup_key' added.
# Lines are classified in order: email (validate_email), fax ('Fax:' prefix), phone
# (phone_digits_re), a first line containing '&' (becomes fallback_name), and finally any
# line with at least one letter is kept as part of the address.
# NOTE(review): the definition is collapsed onto three lines, so whether each `continue` in
# the fax/phone handling sits inside or after its `if m:` cannot be seen here -- confirm
# against the original indentation before restructuring.
def normalize_attorney_contact(c, fallback_name=''): """Normalize the contact string for an attorney. Attorney contact strings are newline separated addresses like: Landye Bennett Blumstein LLP 701 West Eighth Avenue, Suite 1200 Anchorage, AK 99501 907-276-5152 Email: attorney@example.com We need to pull off email and phone numbers, and then our address parser should work nicely. """ atty_info = { 'email': '', 'fax': '', 'phone': '', } if not c: return {}, atty_info address_lines = [] lines = c.split('\n') for i, line in enumerate(lines): line = re.sub('Email:\s*', '', line).strip() line = re.sub('pro se', '', line, flags=re.I) if not line: continue try: validate_email(line) except ValidationError: # Not an email address, press on. pass else: # An email address. atty_info['email'] = line continue # Perhaps a phone/fax number? clean_line = re.sub(r'(\(|\)|\\|/|\s+)', '', line) if clean_line.startswith('Fax:'): clean_line = re.sub('Fax:', '', clean_line) m = phone_digits_re.search(clean_line) if m: atty_info['fax'] = normalize_us_phone_number(clean_line) continue else: m = phone_digits_re.search(clean_line) if m: atty_info['phone'] = normalize_us_phone_number(clean_line) continue # First line containing an ampersand? These are usually law firm names. if u'&' in line and i == 0: fallback_name = line continue has_chars = re.search('[a-zA-Z]', line) if has_chars: # Not email, phone, fax, and has at least one char. 
address_lines.append(line) mapping = { 'Recipient': 'name', 'AddressNumber': 'address1', 'AddressNumberPrefix': 'address1', 'AddressNumberSuffix': 'address1', 'StreetName': 'address1', 'StreetNamePreDirectional': 'address1', 'StreetNamePreModifier': 'address1', 'StreetNamePreType': 'address1', 'StreetNamePostDirectional': 'address1', 'StreetNamePostModifier': 'address1', 'StreetNamePostType': 'address1', # When corner addresses are given, you have two streets in an address 'SecondStreetName': 'address1', 'SecondStreetNamePreDirectional': 'address1', 'SecondStreetNamePreModifier': 'address1', 'SecondStreetNamePreType': 'address1', 'SecondStreetNamePostDirectional': 'address1', 'SecondStreetNamePostModifier': 'address1', 'SecondStreetNamePostType': 'address1', 'CornerOf': 'address1', 'IntersectionSeparator': 'address1', 'LandmarkName': 'address1', 'USPSBoxGroupID': 'address1', 'USPSBoxGroupType': 'address1', 'USPSBoxID': 'address1', 'USPSBoxType': 'address1', 'BuildingName': 'address2', 'OccupancyType': 'address2', 'OccupancyIdentifier': 'address2', 'SubaddressIdentifier': 'address2', 'SubaddressType': 'address2', 'PlaceName': 'city', 'StateName': 'state', 'ZipCode': 'zip_code', 'ZipPlus4': 'zip_code', } try: address_info, address_type = usaddress.tag( u', '.join(address_lines), tag_mapping=mapping, ) except (usaddress.RepeatedLabelError, UnicodeEncodeError): # See https://github.com/datamade/probableparsing/issues/2 for why we # catch the UnicodeEncodeError. Oy. logger.warn("Unable to parse address (RepeatedLabelError): %s" % ', '.join(c.split('\n'))) return {}, atty_info # We don't want this getting through to the database layer. Pop it. 
address_info.pop('NotAddress', None) if any([address_type == 'Ambiguous', 'CountryName' in address_info]): logger.warn("Unable to parse address (Ambiguous address type): %s" % ', '.join(c.split('\n'))) return {}, atty_info if address_info.get('name') is None and fallback_name: address_info['name'] = fallback_name if address_info.get('state'): address_info['state'] = normalize_us_state(address_info['state']) address_info = normalize_address_info(dict(address_info)) address_info['lookup_key'] = make_address_lookup_key(address_info) return address_info, atty_info