def normalize_us_phone_number(phone):
    """Tidy up phone numbers so they're nice.

    Parentheses and whitespace are removed from ``phone`` and the result is
    searched with the module-level ``phone_digits_re`` pattern.

    :param phone: The phone number string to tidy up.
    :return: ``'(g1) g2-g3'`` built from the first three groups captured by
        ``phone_digits_re``, or an empty string when the pattern does not
        match.
    """
    # Raw string: ``\(``, ``\)`` and ``\s`` are regex escapes, not string
    # escapes, and non-raw spellings trigger invalid-escape warnings.
    phone = re.sub(r'(\(|\)|\s+)', '', phone)
    m = phone_digits_re.search(phone)
    if m:
        return '(%s) %s-%s' % (m.group(1), m.group(2), m.group(3))
    return ''
def normalize_us_phone_number(phone):
    """Tidy up phone numbers so they're nice.

    Strips parentheses and whitespace from ``phone``, then looks for a match
    of the module-level ``phone_digits_re`` pattern in what remains.

    :param phone: The phone number string to tidy up.
    :return: The first three captured groups laid out as ``(g1) g2-g3``, or
        an empty string when nothing matches.
    """
    stripped = re.sub(r"(\(|\)|\s+)", "", phone)
    match = phone_digits_re.search(stripped)
    if not match:
        return ""
    first, second, third = match.group(1), match.group(2), match.group(3)
    return f"({first}) {second}-{third}"
def normalize_us_phone_number(phone):
    """Tidy up phone numbers so they're nice.

    Parentheses and whitespace are dropped from ``phone`` before it is
    searched with the module-level ``phone_digits_re`` pattern.

    :param phone: The phone number string to tidy up.
    :return: The first three groups captured by ``phone_digits_re`` formatted
        as ``'(g1) g2-g3'``; an empty string if the pattern does not match.
    """
    # The pattern must be a raw string so the backslashes reach the regex
    # engine untouched instead of being read as (invalid) string escapes.
    phone = re.sub(r'(\(|\)|\s+)', '', phone)
    m = phone_digits_re.search(phone)
    if m:
        return '(%s) %s-%s' % (m.group(1), m.group(2), m.group(3))
    return ''
def to_python(self, value):
    """Convert ``value`` to its normalized text form.

    Values found in ``EMPTY_VALUES`` become an empty string. Anything else is
    passed through ``smart_unicode``; when ``phone_digits_re`` matches the
    result, the captured groups are joined with hyphens, otherwise the
    converted text is returned as-is.
    """
    if value in EMPTY_VALUES:
        return u''
    text = smart_unicode(value)
    found = phone_digits_re.search(text)
    if found is None:
        # Nothing phone-like in the text; hand it back untouched.
        return text
    return u'-'.join(found.groups())
def normalize_attorney_contact(c, fallback_name=''):
    """Normalize the contact string for an attorney.

    Attorney contact strings are newline separated addresses like:

        Landye Bennett Blumstein LLP
        701 West Eighth Avenue, Suite 1200
        Anchorage, AK 99501
        907-276-5152
        Email: attorney@example.com

    We need to pull off email and phone numbers, and then our address parser
    should work nicely.

    :param c: The raw, newline separated contact string.
    :param fallback_name: Name to use when the address parser does not
        produce one. It is replaced by the first line of ``c`` when that line
        contains an ampersand.
    :return: A tuple of ``(address_info, atty_info)``. ``atty_info`` always
        has the keys ``email``, ``fax`` and ``phone``. ``address_info`` is an
        empty dict when ``c`` is empty, when the address cannot be parsed, or
        when it is ambiguous or includes a country name; otherwise it is the
        normalized address dict with an added ``lookup_key``.
    """
    atty_info = {
        'email': '',
        'fax': '',
        'phone': '',
    }
    if not c:
        return {}, atty_info

    address_lines = []
    lines = c.split('\n')
    for i, line in enumerate(lines):
        # Raw string so ``\s`` is a regex escape, not an invalid str escape.
        line = re.sub(r'Email:\s*', '', line).strip()
        line = re.sub('pro se', '', line, flags=re.I)
        if not line:
            continue
        try:
            validate_email(line)
        except ValidationError:
            # Not an email address, press on.
            pass
        else:
            # An email address.
            atty_info['email'] = line
            continue

        # Perhaps a phone/fax number?
        clean_line = re.sub(r'(\(|\)|\\|/|\s+)', '', line)
        if clean_line.startswith('Fax:'):
            clean_line = re.sub('Fax:', '', clean_line)
            if phone_digits_re.search(clean_line):
                atty_info['fax'] = clean_line
                continue
        elif phone_digits_re.search(clean_line):
            atty_info['phone'] = clean_line
            continue

        # First line containing an ampersand? These are usually law firm names.
        if u'&' in line and i == 0:
            fallback_name = line
            continue

        if re.search('[a-zA-Z]', line):
            # Not email, phone, fax, and has at least one char.
            address_lines.append(line)

    mapping = {
        'Recipient': 'name',
        'AddressNumber': 'address1',
        'AddressNumberPrefix': 'address1',
        'AddressNumberSuffix': 'address1',
        'StreetName': 'address1',
        'StreetNamePreDirectional': 'address1',
        'StreetNamePreModifier': 'address1',
        'StreetNamePreType': 'address1',
        'StreetNamePostDirectional': 'address1',
        'StreetNamePostModifier': 'address1',
        'StreetNamePostType': 'address1',
        # When corner addresses are given, you have two streets in an address
        'SecondStreetName': 'address1',
        'SecondStreetNamePreDirectional': 'address1',
        'SecondStreetNamePreModifier': 'address1',
        'SecondStreetNamePreType': 'address1',
        'SecondStreetNamePostDirectional': 'address1',
        'SecondStreetNamePostModifier': 'address1',
        'SecondStreetNamePostType': 'address1',
        'CornerOf': 'address1',
        'IntersectionSeparator': 'address1',
        'LandmarkName': 'address1',
        'USPSBoxGroupID': 'address1',
        'USPSBoxGroupType': 'address1',
        'USPSBoxID': 'address1',
        'USPSBoxType': 'address1',
        'BuildingName': 'address2',
        'OccupancyType': 'address2',
        'OccupancyIdentifier': 'address2',
        'SubaddressIdentifier': 'address2',
        'SubaddressType': 'address2',
        'PlaceName': 'city',
        'StateName': 'state',
        'ZipCode': 'zip_code',
        'ZipPlus4': 'zip_code',
    }
    try:
        address_info, address_type = usaddress.tag(
            ', '.join(address_lines),
            tag_mapping=mapping,
        )
    except usaddress.RepeatedLabelError:
        # ``Logger.warn`` is deprecated; ``warning`` with lazy args renders
        # the same message.
        logger.warning("Unable to parse address (RepeatedLabelError): %s",
                       ', '.join(c.split('\n')))
        return {}, atty_info

    # We don't want this getting through to the database layer. Pop it.
    address_info.pop('NotAddress', None)

    if address_type == 'Ambiguous' or 'CountryName' in address_info:
        logger.warning("Unable to parse address (Ambiguous address type): %s",
                       ', '.join(c.split('\n')))
        return {}, atty_info

    if address_info.get('name') is None and fallback_name:
        address_info['name'] = fallback_name
    if address_info.get('state'):
        address_info['state'] = normalize_us_state(address_info['state'])

    address_info = normalize_address_info(dict(address_info))
    address_info['lookup_key'] = make_address_lookup_key(address_info)
    return address_info, atty_info
def normalize_attorney_contact(c, fallback_name=""):
    """Normalize the contact string for an attorney.

    Attorney contact strings are newline separated addresses like:

        Landye Bennett Blumstein LLP
        701 West Eighth Avenue, Suite 1200
        Anchorage, AK 99501
        907-276-5152
        Email: attorney@example.com

    We need to pull off email and phone numbers, and then our address parser
    should work nicely.

    :param c: The raw, newline separated contact string.
    :param fallback_name: Name to use when the address parser does not
        produce one. It is replaced by the first line of ``c`` when that line
        contains an ampersand.
    :return: A tuple of ``(address_info, atty_info)``. ``atty_info`` always
        has the keys ``email``, ``fax`` and ``phone``; phone and fax values
        are passed through ``normalize_us_phone_number``. ``address_info`` is
        an empty dict when ``c`` is empty, when the address cannot be parsed,
        or when it is ambiguous or includes a country name; otherwise it is
        the normalized address dict with an added ``lookup_key``.
    """
    atty_info = {
        "email": "",
        "fax": "",
        "phone": "",
    }
    if not c:
        return {}, atty_info

    address_lines = []
    lines = c.split("\n")
    for i, line in enumerate(lines):
        # Raw string so ``\s`` is a regex escape, not an invalid str escape.
        line = re.sub(r"Email:\s*", "", line).strip()
        line = re.sub("pro se", "", line, flags=re.I)
        if not line:
            continue
        try:
            validate_email(line)
        except ValidationError:
            # Not an email address, press on.
            pass
        else:
            # An email address.
            atty_info["email"] = line
            continue

        # Perhaps a phone/fax number?
        clean_line = re.sub(r"(\(|\)|\\|/|\s+)", "", line)
        if clean_line.startswith("Fax:"):
            clean_line = re.sub("Fax:", "", clean_line)
            if phone_digits_re.search(clean_line):
                atty_info["fax"] = normalize_us_phone_number(clean_line)
                continue
        elif phone_digits_re.search(clean_line):
            atty_info["phone"] = normalize_us_phone_number(clean_line)
            continue

        # First line containing an ampersand? These are usually law firm names.
        if u"&" in line and i == 0:
            fallback_name = line
            continue

        if re.search("[a-zA-Z]", line):
            # Not email, phone, fax, and has at least one char.
            address_lines.append(line)

    mapping = {
        "Recipient": "name",
        "AddressNumber": "address1",
        "AddressNumberPrefix": "address1",
        "AddressNumberSuffix": "address1",
        "StreetName": "address1",
        "StreetNamePreDirectional": "address1",
        "StreetNamePreModifier": "address1",
        "StreetNamePreType": "address1",
        "StreetNamePostDirectional": "address1",
        "StreetNamePostModifier": "address1",
        "StreetNamePostType": "address1",
        # When corner addresses are given, you have two streets in an address
        "SecondStreetName": "address1",
        "SecondStreetNamePreDirectional": "address1",
        "SecondStreetNamePreModifier": "address1",
        "SecondStreetNamePreType": "address1",
        "SecondStreetNamePostDirectional": "address1",
        "SecondStreetNamePostModifier": "address1",
        "SecondStreetNamePostType": "address1",
        "CornerOf": "address1",
        "IntersectionSeparator": "address1",
        "LandmarkName": "address1",
        "USPSBoxGroupID": "address1",
        "USPSBoxGroupType": "address1",
        "USPSBoxID": "address1",
        "USPSBoxType": "address1",
        "BuildingName": "address2",
        "OccupancyType": "address2",
        "OccupancyIdentifier": "address2",
        "SubaddressIdentifier": "address2",
        "SubaddressType": "address2",
        "PlaceName": "city",
        "StateName": "state",
        "ZipCode": "zip_code",
        "ZipPlus4": "zip_code",
    }
    try:
        address_info, address_type = usaddress.tag(
            u", ".join(address_lines), tag_mapping=mapping
        )
    except (usaddress.RepeatedLabelError, UnicodeEncodeError) as exc:
        # See https://github.com/datamade/probableparsing/issues/2 for why we
        # catch the UnicodeEncodeError. Oy.
        # Report the class that was actually raised so a UnicodeEncodeError
        # is not mislabeled as a RepeatedLabelError in the logs.
        logger.warning(
            "Unable to parse address (%s): %s",
            type(exc).__name__,
            ", ".join(c.split("\n")),
        )
        return {}, atty_info

    # We don't want this getting through to the database layer. Pop it.
    address_info.pop("NotAddress", None)

    if address_type == "Ambiguous" or "CountryName" in address_info:
        logger.warning(
            "Unable to parse address (Ambiguous address type): %s",
            ", ".join(c.split("\n")),
        )
        return {}, atty_info

    if address_info.get("name") is None and fallback_name:
        address_info["name"] = fallback_name
    if address_info.get("state"):
        address_info["state"] = normalize_us_state(address_info["state"])

    address_info = normalize_address_info(dict(address_info))
    address_info["lookup_key"] = make_address_lookup_key(address_info)
    return address_info, atty_info
def normalize_attorney_contact(c, fallback_name=''):
    """Normalize the contact string for an attorney.

    Attorney contact strings are newline separated addresses like:

        Landye Bennett Blumstein LLP
        701 West Eighth Avenue, Suite 1200
        Anchorage, AK 99501
        907-276-5152
        Email: attorney@example.com

    We need to pull off email and phone numbers, and then our address parser
    should work nicely.

    :param c: The raw, newline separated contact string.
    :param fallback_name: Name to use when the address parser does not
        produce one. It is replaced by the first line of ``c`` when that line
        contains an ampersand.
    :return: A tuple of ``(address_info, atty_info)``. ``atty_info`` always
        has the keys ``email``, ``fax`` and ``phone``; phone and fax values
        are passed through ``normalize_us_phone_number``. ``address_info`` is
        an empty dict when ``c`` is empty, when the address cannot be parsed,
        or when it is ambiguous or includes a country name; otherwise it is
        the normalized address dict with an added ``lookup_key``.
    """
    atty_info = {
        'email': '',
        'fax': '',
        'phone': '',
    }
    if not c:
        return {}, atty_info

    address_lines = []
    lines = c.split('\n')
    for i, line in enumerate(lines):
        # Raw string so ``\s`` is a regex escape, not an invalid str escape.
        line = re.sub(r'Email:\s*', '', line).strip()
        line = re.sub('pro se', '', line, flags=re.I)
        if not line:
            continue
        try:
            validate_email(line)
        except ValidationError:
            # Not an email address, press on.
            pass
        else:
            # An email address.
            atty_info['email'] = line
            continue

        # Perhaps a phone/fax number?
        clean_line = re.sub(r'(\(|\)|\\|/|\s+)', '', line)
        if clean_line.startswith('Fax:'):
            clean_line = re.sub('Fax:', '', clean_line)
            if phone_digits_re.search(clean_line):
                atty_info['fax'] = normalize_us_phone_number(clean_line)
                continue
        elif phone_digits_re.search(clean_line):
            atty_info['phone'] = normalize_us_phone_number(clean_line)
            continue

        # First line containing an ampersand? These are usually law firm names.
        if u'&' in line and i == 0:
            fallback_name = line
            continue

        if re.search('[a-zA-Z]', line):
            # Not email, phone, fax, and has at least one char.
            address_lines.append(line)

    mapping = {
        'Recipient': 'name',
        'AddressNumber': 'address1',
        'AddressNumberPrefix': 'address1',
        'AddressNumberSuffix': 'address1',
        'StreetName': 'address1',
        'StreetNamePreDirectional': 'address1',
        'StreetNamePreModifier': 'address1',
        'StreetNamePreType': 'address1',
        'StreetNamePostDirectional': 'address1',
        'StreetNamePostModifier': 'address1',
        'StreetNamePostType': 'address1',
        # When corner addresses are given, you have two streets in an address
        'SecondStreetName': 'address1',
        'SecondStreetNamePreDirectional': 'address1',
        'SecondStreetNamePreModifier': 'address1',
        'SecondStreetNamePreType': 'address1',
        'SecondStreetNamePostDirectional': 'address1',
        'SecondStreetNamePostModifier': 'address1',
        'SecondStreetNamePostType': 'address1',
        'CornerOf': 'address1',
        'IntersectionSeparator': 'address1',
        'LandmarkName': 'address1',
        'USPSBoxGroupID': 'address1',
        'USPSBoxGroupType': 'address1',
        'USPSBoxID': 'address1',
        'USPSBoxType': 'address1',
        'BuildingName': 'address2',
        'OccupancyType': 'address2',
        'OccupancyIdentifier': 'address2',
        'SubaddressIdentifier': 'address2',
        'SubaddressType': 'address2',
        'PlaceName': 'city',
        'StateName': 'state',
        'ZipCode': 'zip_code',
        'ZipPlus4': 'zip_code',
    }
    try:
        address_info, address_type = usaddress.tag(
            u', '.join(address_lines),
            tag_mapping=mapping,
        )
    except (usaddress.RepeatedLabelError, UnicodeEncodeError) as exc:
        # See https://github.com/datamade/probableparsing/issues/2 for why we
        # catch the UnicodeEncodeError. Oy.
        # ``Logger.warn`` is deprecated, so use ``warning``; report the class
        # that was actually raised so a UnicodeEncodeError is not mislabeled.
        logger.warning("Unable to parse address (%s): %s",
                       type(exc).__name__, ', '.join(c.split('\n')))
        return {}, atty_info

    # We don't want this getting through to the database layer. Pop it.
    address_info.pop('NotAddress', None)

    if address_type == 'Ambiguous' or 'CountryName' in address_info:
        logger.warning("Unable to parse address (Ambiguous address type): %s",
                       ', '.join(c.split('\n')))
        return {}, atty_info

    if address_info.get('name') is None and fallback_name:
        address_info['name'] = fallback_name
    if address_info.get('state'):
        address_info['state'] = normalize_us_state(address_info['state'])

    address_info = normalize_address_info(dict(address_info))
    address_info['lookup_key'] = make_address_lookup_key(address_info)
    return address_info, atty_info