def parse_name(self, name_raw):
    """
    Parse a raw name, e.g. "Corbato, F.J.", into last, first, and middle name.

    >>> p=Person(name_raw='Corbato, F.J.')
    >>> p.last, p.first, p.middle
    ('Corbató', 'F', 'J')

    :param name_raw: raw name string
    :return: tuple ``(last, first, middle)``
    """
    # Fix names like "Verzuh M., F.", where the middle initial comes right after
    # the last name -> it should be "Verzuh, F. M."
    # Raw strings keep the regex escapes from being read as string escapes.
    match = re.match(r'([A-Z][a-z]+) ([A-Z])\., ([A-Z][a-z]*)\.*', name_raw)
    if match:
        last, middle, first = match.groups()
        name_raw = f'{last}, {first}. {middle}.'

    name = HumanName(name_raw)

    # If first and middle initials have periods but no space between them,
    # separate them, e.g. "R.K. Teague" -> first "R", middle "K".
    if re.match(r'[a-zA-Z]\.[a-zA-Z]\.', name.first):
        name.middle = name.first[2]
        name.first = name.first[0]

    name.last = name.last.capitalize()
    name.first = name.first.strip('.').capitalize()
    name.middle = name.middle.strip('.').capitalize()

    # Known misspellings of last names and their corrected form.
    last_name_replacements = [
        ('Corbato', 'Corbató'),
        ('Corbatò', 'Corbató'),
        ('Verguh', 'Verzuh'),
    ]
    for wrong, right in last_name_replacements:
        name.last = name.last.replace(wrong, right)

    return name.last, name.first, name.middle
def sort_contributor(self, c: ParsedSegment, default_type=None):
    '''
    File a contributor under its agent type in ``self.contributors``.

    The type is ``'person'`` when the parsed last name is already a known
    person; otherwise it is the non-person type that already holds an entry
    with the same name under the same initials, else ``default_type``.
    If no type results, the user is asked via ``multi_choice``.
    '''
    parsed = HumanName(c.name)
    abbreviation = ''.join(rgx.abbr.findall(c.name))
    agent_type = default_type

    if parsed.last in self.contributors['person']:
        agent_type = 'person'
    else:
        # No break: the last matching non-person type wins.
        for kind, records in self.contributors.items():
            if kind == 'person' or abbreviation not in records:
                continue
            if any(entry.name == c.name for entry in records[abbreviation]):
                agent_type = kind

    if agent_type is None:
        choice = multi_choice(
            'What type of contributor is "{0}"?'.format(c.name),
            self.contributors.keys())
        agent_type = list(self.contributors.keys())[choice]

    if agent_type != 'person':
        # Non-person agents are keyed directly by their initials.
        bucket = self.contributors[agent_type]
        bucket[abbreviation] = bucket.get(abbreviation, []) + [c]
        return

    # Persons: make sure both last and first name are non-empty.
    if parsed.last == '':
        parsed.last = parsed.first
        parsed.first = '?'
    if parsed.first == '':
        parsed.first = '?'
    c.name = parsed

    # Persons are stored as {last name: {first initial: [segments]}}.
    people = self.contributors[agent_type]
    by_family = people.get(parsed.last, {})
    first_initial = parsed.first[0]
    by_family[first_initial] = by_family.get(first_initial, []) + [c]
    people[parsed.last] = by_family
def normalize_name(first_name, last_name):
    """Return ``(first, last)`` after running HumanName capitalization on both."""
    parsed = HumanName()
    parsed.first, parsed.last = first_name, last_name
    parsed.capitalize()
    return parsed.first, parsed.last
def test_assignment_to_attribute(self):
    """Each name attribute can be overwritten and reads back the assigned value."""
    parsed = HumanName("John A. Kenneth Doe, Jr.")
    assignments = (
        ("last", "de la Vega"),
        ("title", "test"),
        ("first", "test"),
        ("middle", "test"),
        ("suffix", "test"),
    )
    for attribute, value in assignments:
        setattr(parsed, attribute, value)
        self.m(getattr(parsed, attribute), value, parsed)
def test_assign_list_to_attribute(self):
    """Assigning a list to a name attribute reads back as a space-joined string."""
    parsed = HumanName("John A. Kenneth Doe, Jr.")
    cases = (
        ("title", ["test1", "test2"], "test1 test2"),
        ("first", ["test3", "test4"], "test3 test4"),
        ("middle", ["test5", "test6", "test7"], "test5 test6 test7"),
        ("last", ["test8", "test9", "test10"], "test8 test9 test10"),
        ("suffix", ["test"], "test"),
    )
    for attribute, pieces, expected in cases:
        setattr(parsed, attribute, pieces)
        assert getattr(parsed, attribute) == expected
def _massage_measure_donor_name(self, name_string):
    """
    Return a cleaned display string for a measure donor's name.

    The name is parsed with HumanName, first and last names are title-cased,
    and a middle name is title-cased and given exactly one trailing period.
    A few known mis-parsed names are corrected by hand.

    :param name_string: raw donor name
    :return: "First [Middle] Last [Suffix]" with only the parts that are set
    """
    name = HumanName(name_string)
    name.first = name.first.title()
    name.last = name.last.title()
    if name.middle:
        name.middle = name.middle.replace(".", "")
        name.middle = "%s." % (name.middle.title())

    # Hand corrections for names the parser gets wrong. The comparison uses
    # HumanName's own equality against a string.
    if name == "JR. Munger CHARLES T.":
        name.first = "Charles"
        name.middle = "T."
        name.last = "Munger"
        name.suffix = "Jr."
    if name == "M. Quinn. Delaney":
        name.first = "M."
        name.middle = "Quinn"
        name.last = "Delaney"
        name.suffix = None
    if name == "Robert Alan. Eustace":
        name.first = "Robert"
        name.middle = "Alan"
        name.last = "Eustace"
        name.suffix = None
    if name == "Susie Tompkins. Buell":
        name.first = "Susie"
        name.middle = "Tompkins"
        name.last = "Buell"
        name.suffix = None

    # This has to be one if/elif chain: as a separate `if`, the middle-only
    # branch would overwrite the middle+suffix form and drop the suffix.
    if name.middle and name.suffix:
        output = "%s %s %s %s" % (name.first, name.middle, name.last, name.suffix)
    elif name.middle:
        output = "%s %s %s" % (name.first, name.middle, name.last)
    elif name.suffix:
        output = "%s %s %s" % (name.first, name.last, name.suffix)
    else:
        output = "%s %s" % (name.first, name.last)
    return output
def from_parts(cls, first=None, last=None, middle=None, suffix=None, title=None):
    """Build a ParsedName from individually supplied name components."""
    human_name = HumanName()
    # Assigned in the same order as before: first, middle, last, suffix, title.
    components = (
        ("first", first),
        ("middle", middle),
        ("last", last),
        ("suffix", suffix),
        ("title", title),
    )
    for attribute, value in components:
        setattr(human_name, attribute, value)
    return ParsedName(human_name)
def clean_names(dirty_names):
    """
    Return a "First Last" string for every raw name in ``dirty_names``.

    All ASCII punctuation is stripped before parsing. When the parser finds
    no first name, the parsed title is used in its place.
    """
    from nameparser import HumanName
    import string

    strip_punctuation = str.maketrans('', '', string.punctuation)

    def _first_last(raw):
        parsed = HumanName(raw.translate(strip_punctuation))
        if not parsed.first:
            parsed.first = parsed.title
        return parsed.first + ' ' + parsed.last

    return [_first_last(raw) for raw in dirty_names]
def _massage_payload(self, payload): for k, v in payload.items(): if pd.isnull(v) or not v: # Replace nan or None with empty string. payload[k] = "" # Ensure names aren't all caps or all lowercase. if payload.get("firstname") and payload.get("lastname"): name = HumanName() name.first = payload["firstname"] name.last = payload["lastname"] name.capitalize() payload["firstname"] = name.first payload["lastname"] = name.last
def test_assignment_to_attribute(self):
    """String assignment round-trips; nested lists and dicts raise TypeError."""
    parsed = HumanName("John A. Kenneth Doe, Jr.")
    assignments = (
        ("last", "de la Vega"),
        ("title", "test"),
        ("first", "test"),
        ("middle", "test"),
        ("suffix", "test"),
    )
    for attribute, value in assignments:
        setattr(parsed, attribute, value)
        assert getattr(parsed, attribute) == value

    for unsupported in ([["test"]], {"test": "test"}):
        with pytest.raises(TypeError):
            parsed.suffix = unsupported
def HumanNameFmXML(self, ell):
    """
    Build a HumanName from the child elements of ``ell``.

    Children tagged First, Middle, Last, Title, Suffix or NickName set the
    matching attribute from their text; any other tag is ignored.
    """
    attr_for_tag = {
        'First': 'first',
        'Middle': 'middle',
        'Last': 'last',
        'Title': 'title',
        'Suffix': 'suffix',
        'NickName': 'nickname',
    }
    hn = HumanName()
    for child in ell:
        attribute = attr_for_tag.get(child.tag)
        if attribute is not None:
            setattr(hn, attribute, child.text)
    return hn
def initContact(contactId: str):
    """
    Register a contact as active and cache its e-mail and display name.

    The contact record is fetched with ``dir.getContact``. Its title and name
    fields are joined (missing ones skipped), '@' characters are removed, and
    the result is lower-cased and re-capitalized through HumanName. A first
    name made up only of dotted initials (e.g. "j.r.") is upper-cased.

    :param contactId: identifier of the contact; must not already be active
    :raises AssertionError: if ``contactId`` is already in ``activeContacts``
    """
    assert contactId not in activeContacts
    activeContacts.add(contactId)
    contact = dir.getContact(contactId)
    contactsToEmails[contactId] = contact['email']

    name_fields = ('title_before_name', 'first_name', 'last_name',
                   'title_after_name')
    name = " ".join(filter(None, [contact.get(field) for field in name_fields]))
    name = name.translate(str.maketrans('', '', '@'))
    name = HumanName(name.lower().strip())
    name.capitalize()
    # Raw string so that \w and \. reach the regex engine unchanged.
    if re.search(r'^(\w\.)+$', name.first):
        name.first = name.first.upper()
    contactsToNames[contactId] = str(name)
def person_name_from_xml(self, ell):
    '''
    Create a person name (HumanName) from an XML element.

    Child elements tagged First, Middle, Last, Title, Suffix and NickName
    populate the matching attribute with their text; other tags are skipped.
    '''
    tag_to_field = (
        ('First', 'first'),
        ('Middle', 'middle'),
        ('Last', 'last'),
        ('Title', 'title'),
        ('Suffix', 'suffix'),
        ('NickName', 'nickname'),
    )
    field_by_tag = dict(tag_to_field)
    person = HumanName()
    for child in ell:
        if child.tag in field_by_tag:
            setattr(person, field_by_tag[child.tag], child.text)
    return person
# Parse the fetched page and collect {display name: year} from the table rows.
page = url.read()
soup = BeautifulSoup(page, 'html.parser')
row_box = soup.find_all('tr', attrs={'role': 'row'})

names = {}
for i in row_box:
    # Rows without a <td> cell carry no name.
    if i.td is None:
        continue

    name = i.td.find('a').text
    year = i.find('td', attrs={'role': 'rowheader'})
    # Rows without a year cell are recorded with year "0".
    year = year.text if year is not None else "0"

    # Process the name, adding dots to single-letter initials if needed.
    name = HumanName(name)
    if len(name.first) == 1:
        name.first = name.first + "."
    # Truthiness test instead of `is not ""`: identity comparison against a
    # string literal is not a reliable emptiness check.
    if name.middle:
        if len(name.middle) == 1:
            name.middle += '.'
        names[name.first + ' ' + name.middle + ' ' + name.last] = year
    else:
        names[name.first + ' ' + name.last] = year

# Write out a csv.
with open('acm-fellows.csv', 'w', newline='') as csvfile:
    fieldnames = ['name', 'year']
    wr = csv.DictWriter(csvfile, fieldnames=fieldnames)
    wr.writeheader()
    for n in names:
        wr.writerow({'name': n, 'year': names[n]})
# let nameparser parse parsed = HumanName( name ) # look at how that turned out: print( "Parsed HumanName for " + name + ":" ) print( Person.HumanName_to_str( parsed ) ) # now, make a second HumanName instance. manual = HumanName() # look at how that turned out: print( "Empty HumanName?:" ) print( Person.HumanName_to_str( manual ) ) # override parsed values with correct name parts manual.first = "Van" manual.last = "Conway" # look at how that turned out: print( "after manual configuration:" ) print( Person.HumanName_to_str( manual ) ) # now, try some lookups # let the lookup parse the name. test1 = Person.look_up_person_from_name( name ) print( "test1 = " + str( test1 ) ) # pass in manually configured HumanName test2 = Person.look_up_person_from_name( name, manual ) print( "test2 = " + str( test2 ) )
# Watch the comment stream on our subreddit of choice and, for every comment
# containing KEYPHRASE, look up the requested player's stats.
for comment in subreddit.stream.comments(skip_existing=True):
    if KEYPHRASE in comment.body:
        # We want to format our response neatly, this is just the formatting
        # convention Reddit uses to create tables.
        tableBase = "GP|PTS|REB|AST|STL|BLK|TOV|3PM|FG%|FT%\n:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--|:--\n"
        # N represents the number of games to include in our averages.
        # N = 0 will default to pulling averages for the entire season so far.
        N = 0
        # Get rid of the keyphrase, leaving us with the player's name and
        # optionally an N value.
        player = comment.body.replace(KEYPHRASE, '')
        player = HumanName(player.translate(str.maketrans('', '', "!?.,'-")))

        # If the user entered a number first, it was parsed as the first name
        # and the real first name ended up in the middle slot.
        if player.first.isdigit():
            N = player.first
            player.first = player.middle

        if player.suffix == '':
            # Player did not enter a suffix (Jr, III etc)
            playerID = findPlayer(obj, player.first, player.last)
        else:
            playerID = findPlayer(obj, player.first,
                                  player.last + " " + player.suffix)

        if playerID is not None:
            URL = 'https://stats.nba.com/stats/playerdashboardbylastngames/?measureType=Base&perMode=PerGame&plusMinus=N&paceAdjust=N&rank=N&leagueId=00&season=2019-20&seasonType=Regular+Season&poRound=0&playerId=' + str(
                playerID
            ) + '&outcome=&location=&month=0&seasonSegment=&dateFrom=&dateTo=&opponentTeamId=0&vsConference=&vsDivision=&gameSegment=&period=0&shotClockRange=&lastNGames=' + str(
                N)
            r = requests.get(url=URL, headers=request_headers)
            data = r.json()
def to_HumanName( self ):

    '''
    Build a nameparser HumanName for the name stored in this instance.

    Source of the name, in order of preference:
    1. the individual name parts, when got_name_parts() reports True;
    2. the full name string, when set and non-empty;
    3. the lookup name, when set and non-empty.

    Returns the HumanName instance, or None when no name is present at all.
    '''

    # Read every value up front, in the same order as before.
    name_prefix = self.get( self.PROP_NAME_NAME_PREFIX, None )
    first_name = self.get( self.PROP_NAME_FIRST_NAME, None )
    middle_name = self.get( self.PROP_NAME_MIDDLE_NAME, None )
    last_name = self.get( self.PROP_NAME_LAST_NAME, None )
    name_suffix = self.get( self.PROP_NAME_NAME_SUFFIX, None )
    nickname = self.get( self.PROP_NAME_NICKNAME, None )
    full_name_string = self.get( self.PROP_NAME_FULL_NAME_STRING, None )
    lookup_name = self.get_lookup_name()
    has_name_parts = self.got_name_parts()

    # Equality with True (not plain truthiness) is kept on purpose.
    if has_name_parts == True:  # noqa: E712

        # Populate only the parts that actually hold a value.
        human_name = HumanName()
        part_values = (
            ( "title", name_prefix ),
            ( "first", first_name ),
            ( "middle", middle_name ),
            ( "last", last_name ),
            ( "suffix", name_suffix ),
            ( "nickname", nickname ),
        )
        for attribute, value in part_values:
            if value:
                setattr( human_name, attribute, value )
        return human_name

    # No parts: fall back to the full name string, then the lookup name.
    for candidate in ( full_name_string, lookup_name ):
        if candidate is not None and candidate != "":
            return HumanName( candidate )

    # No names present at all.
    return None

#-- END method to_HumanName() --#


#-- END class PersonDetails --#
def parse_persname(persname, auth="", source=""):
    """
    Parse a personal-name string into a dictionary of name components.

    Dates are split off with ``extract_birth_death_dates``, the remainder is
    parsed with HumanName, and a series of hand-written corrections fixes
    titles, suffixes and known problem records.

    :param persname: raw personal name, possibly with birth/death dates
    :param auth: authority id stored under "authority_id"
    :param source: source stored under "source"
    :return: dict of name fields; fields with empty values are left out
    """
    name, birth_date, death_date = extract_birth_death_dates(persname)
    birth_date, death_date = validate_dates(birth_date, death_date)
    dates_string = make_date_string(birth_date, death_date)
    name = HumanName(name)

    titles = ["sir", "mr", "mrs", "baron", "dame", "madame", "viscount", "conte"]
    numbers = ["II", "III"]
    title = name.title
    suffix = name.suffix
    number = u""

    # check if the suffix should actually be a title
    if not title and any(suffix.lower().strip(". ") == known_title
                         for known_title in titles):
        title = suffix.capitalize()
        if "mr" in title.lower() and not title.endswith("."):
            title += "."
        suffix = u""

    # extract numbers from the suffix
    if suffix in numbers:
        number = suffix
        suffix = u""

    # special cases cleanup
    if name.title == u"Royal":
        name.title = ""
        title = ""
        name.middle = name.first if not name.middle else "{} {}".format(u"Royal", name.middle)
        name.first = u"Royal"

    if name.title == u"Queen of Great":
        title = name.title + u" Britain"
        name.first = u""

    if name.title == u"Lama":
        title = u"Dalai Lama XIV"
        name.first = u""
        name.middle = u""

    if name.title == u"Marquis":
        title = u""
        name.first = u"Marquis"
        name.middle = u"W."

    if suffix == u"1941":
        birth_date = suffix
        suffix = u""

    if suffix in [u"18", u"b."]:
        suffix = u""

    if suffix == u"Jr":
        suffix += u"."

    if ", fl. 17th cent" in suffix:
        suffix = u"sieur de"
        dates_string = u"fl. 17th cent"

    rest_of_name = u"{0} {1}".format(name.first, name.middle).rstrip()
    if rest_of_name == u"Christella D. Personal journey through South Africa. 1991":
        rest_of_name = u"Christella D."

    # People with single-part names (like Keewaydinoquay) are mis-assigned.
    # Have to fix those
    primary_name = name.last
    if rest_of_name and not primary_name:
        primary_name = rest_of_name
        rest_of_name = ""

    # create the parsed name dictionary
    name_parsed = {u"title": unicode(title),
                   u"primary_name": unicode(primary_name),
                   u"rest_of_name": rest_of_name,
                   u"suffix": unicode(suffix),
                   u"fuller_form": unicode(name.nickname),
                   u"numbers": unicode(number),
                   u"birth_date": unicode(birth_date),
                   u"death_date": unicode(death_date),
                   u"date_string": unicode(dates_string),
                   u"authority_id": unicode(auth),
                   u"source": unicode(source),
                   u"name_order": u"inverted",
                   u"sort_name_auto_generate": True}

    # Remove empty fields. A filtered copy is built instead of deleting keys
    # while iterating, which fails when items() is a live view.
    return {key: value for key, value in name_parsed.items() if value}
def namer(field):
    """Reduce a raw name field to a ``(first, last)`` tuple.

    ``field`` is either a single string or a tuple of strings (joined with
    ", "). The text is upper-cased, parsed with HumanName and then passed
    through a series of hand-tuned corrections. Names containing
    'ANONYMOUS' skip the corrections and are parsed as they are.

    NOTE(review): joining and upper-casing the result of ``.encode()`` as
    text presumes Python 2 ``str`` semantics - confirm the target interpreter.
    The short tags such as ``6A`` or ``4A-C`` look like references to an
    external list of test cases - TODO confirm.
    """
    #pre: build one upper-cased string without tabs/newlines; characters that
    # cannot be encoded as ASCII are dropped
    if type(field) == tuple:
        w_name = re.sub(
            '[\t\r\n]', '',
            ", ".join([x.encode('ascii', 'ignore') for x in field])).upper()
    else:
        w_name = re.sub('[\t\r\n]', '', field.encode('ascii', 'ignore')).upper()

    if 'ANONYMOUS' not in w_name:
        # keep the part before the first ';', or the part after it when the
        # text contains ' FORMER '
        if ' FORMER ' not in w_name:
            w_name = re.split(";", w_name)[0]
        else:
            w_name = re.split(";", w_name)[1]

        # remove a space directly after or before one of ` ' / +
        w_name = re.sub("(?<=[`'/+]) | (?=['`/+])", '', w_name)  #6A, 4A-C

        out = HumanName(w_name)
        # drop a leading single-letter initial ("X " or "X. ") from the middle
        out.middle = re.sub("^[A-Z] |^[A-Z]\. ", '', out.middle)
        if " " in out.last:
            out.last = re.sub("^[A-Z] |^[A-Z]\. ", '', out.last)
        # first name is nothing but an initial and a middle name exists:
        # promote the middle name to first name
        if re.sub("^[A-Z]\.|^[A-Z]", '', out.first) == '' and len(out.middle) != 0:
            out.first, out.middle = out.middle, ""
        else:
            out.first = re.sub("^[A-Z] |^[A-Z]\. ", '', out.first)

        #post: corrections applied to the parsed parts

        if out.middle.startswith("FOR ") or out.middle.startswith("- "):  #7A, 1B, 3E
            out.middle = ""

        # cut the last name at " FOR "
        if " FOR " in out.last:
            out.last = re.sub(" FOR .*", '', out.last)

        # no last name but a title was parsed: re-parse a multi-word first
        # name, otherwise treat the first name as the last name
        if len(out.last) == 0 and len(out.title) != 0:  #9A
            if " " in out.first:
                out = HumanName(out.first)
            else:
                out.first, out.last = "", out.first

        # two people joined with AND / & inside the middle name: the text
        # before the separator becomes the last name
        if " AND " in out.middle or " & " in out.middle:
            out.last = re.split("( AND )|( & )", out.middle)[0]
            out.middle = ""
        if "AND" in out.last or "&" in out.last:
            if out.last.startswith("AND ") or out.last.startswith("& "):  #3F
                out.last = HumanName(out.last).last
            elif " AND " in out.last or " & " in out.last:
                out.last = re.sub("( AND ).*|( & ).*", '', out.last)
        # keep only the text before the first AND, &, / or + in the first
        # name, and before the first / in the last name
        out.first = re.split("( AND )|&|/|\+", out.first)[0]
        out.last = re.split("/", out.last)[0].strip()
        # first name has exactly one capital letter and the last name has
        # several words: take the first two words of the last name instead
        if len(re.sub("[^A-Z]", '', out.first)) == 1 and " " in out.last:
            out.first = out.last.split(" ")[0]
            out.last = out.last.split(" ")[1]
        out.capitalize()
        first, last = out.first, out.last
        if len(out.middle) > 0:
            # a middle name that is only an initial is dropped; otherwise it
            # is appended to the first name (without a space around a hyphen)
            if re.sub("^[A-Z]\.|^[A-Z]", '', out.middle) == '':
                out.middle = ""
            elif first.endswith("-") or out.middle.startswith("-"):
                first += out.middle
            else:
                first += " %s" % out.middle  #8A-B
        # a suffix is appended to the last name
        if len(out.suffix) > 0:
            last += " %s" % out.suffix  #2A
        return (first, last)
    else:
        # anonymous entries: parse as-is, no corrections
        name = HumanName(w_name)
        return (name.first, name.last)
def parse_raw_name(name_raw: str, count: int, extract_orgs=True) -> (str, str, str, Counter):
    """
    Parses a (usually messy) raw name and returns first, middle, last names
    and a Counter of extracted positions.

    extract_orgs tries to extract organizations from the name. It defaults to
    True; only set it to False to be able to check if a name is valid (this
    prevents an infinite loop because, by default, extracting organizations is
    part of the initialization of a person).

    :param name_raw: str
    :param count: int, amount added to the Counter for every extracted position
    :param extract_orgs: bool
    :return: str, str, str, Counter (first name, middle name, last name,
             positions Counter)
    """
    name_raw = Person.remove_privlog_info(name_raw)
    # remove JR, SR, or III if it follows this pattern: 'Chumney-RD-Jr'
    name_raw = Person.remove_jr_sr_iii(name_raw)

    # position is often attached with a dash,
    # e.g. 'BAKER, T E - NATIONAL ASSOCIATION OF ATTORNEYS'
    if name_raw.find(" - ") > -1 and len(name_raw.split(' - ')) == 2:
        name_raw, extracted_position = name_raw.split(" - ")
        extracted_positions = [extracted_position.strip()]
    else:
        extracted_positions = []

    # extract positions in parens e.g. Henson, A (Chadbourne & Park)
    paren_positions = re.findall(r'\([^(]+\)', name_raw)
    for position in paren_positions:
        extracted_positions.append(position.strip(',#() '))
        name_raw = name_raw.replace(position, '')

    # Search for known raw_org strings in name_raw, extract them as positions if necessary
    if extract_orgs:
        name_raw, new_positions = Person.extract_raw_org_names_from_name(name_raw)
        extracted_positions += new_positions

    # delete any leftover hashtags
    name_raw = name_raw.strip(' #')

    # Delete dashes between last name and initials.
    # The length checks keep short or empty leftovers (e.g. after an
    # organization was extracted) from raising IndexError.
    # DUNN-W -> Dunn W
    if len(name_raw) >= 2 and name_raw[-2] == '-':
        name_raw = name_raw[:-2] + " " + name_raw[-1:]
    # DUNN-WL -> DUNN WL
    if len(name_raw) > 2 and name_raw[-3] == '-':
        name_raw = name_raw[:-3] + " " + name_raw[-2:]

    # Parse current string using HumanName
    name = HumanName(name_raw)

    # e.g. Dunn W -> parsed as last name W. -> switch first/last
    if len(name.last) <= 2 < len(name.first):
        name.first, name.last = name.last, name.first

    # remove periods from initials
    if len(name.first) == 2 and name.first[1] == '.':
        name.first = name.first[0]
    if len(name.middle) == 2 and name.middle[1] == '.':
        name.middle = name.middle[0]

    # If first name is length 2 (Teague, CE), the two letters are most likely initials.
    if len(name.middle) == 0 and len(name.first) == 2:
        name.middle = name.first[1].upper()
        name.first = name.first[0].upper()

    # If first and middle initials have periods but not spaces -> separate, e.g. "R.K. Teague"
    if re.match(r'[a-zA-Z]\.[a-zA-Z]\.', name.first):
        name.middle = name.first[2]
        name.first = name.first[0]

    name.last = name.last.capitalize()
    name.first = name.first.capitalize()
    name.middle = name.middle.capitalize()

    # if multiple names are passed, they often end up in the middle name
    # e.g. 'Holtzman, A., Murray, J. , Henson, A. -> only allow one comma or set to empty
    if name.middle.count(',') > 1:
        name.middle = ''

    # a long suffix with several periods is treated as noise and dropped
    if len(name.suffix) > 20 and name.suffix.count('.') > 2:
        name.suffix = ''

    # any remaining suffix is kept as a position
    if name.suffix:
        extracted_positions.append(name.suffix)

    # map organization names to clean official names (if they are in the dict) using
    # RAW_ORG_TO_CLEAN_ORG_DICT; entries mapped to '@skip@' are dropped
    clean_orgs = []
    for raw_org in extracted_positions:
        if raw_org in RAW_ORG_TO_CLEAN_ORG_DICT:
            clean_org = RAW_ORG_TO_CLEAN_ORG_DICT[raw_org]
            if clean_org != '@skip@':
                clean_orgs.append(clean_org)
        else:
            clean_orgs.append(raw_org)
    extracted_positions = clean_orgs

    # convert mapped positions into a counter (periods removed, upper-cased)
    result_positions = Counter()
    for position in extracted_positions:
        cleaned = re.sub(r'\.', '', position)
        result_positions[cleaned.upper()] += count

    return name.first, name.middle, name.last, result_positions
def human_to_csl(name):
    """Convert HumanName to CSL-formatted JSON.

    Note: when a HumanName instance is passed in, it is modified in place
    (the middle name is appended to ``first``, and the 'Eisenhower, I' fix
    below may move parts around).

    Args:
        name : HumanName or str / unicode
    Returns:
        CSL-formatted JSON

    Examples:
    >>> csl = human_to_csl('Rafael Nadal')
    >>> csl == {'given' : 'Rafael', 'family' : 'Nadal'}
    True
    >>> csl = human_to_csl(HumanName('Rafael Nadal'))
    >>> csl == {'given' : 'Rafael', 'family' : 'Nadal'}
    True
    >>> csl = human_to_csl(HumanName('George HW de Bush'))
    >>> csl == {'given' : 'George H. W.', 'family' : 'de Bush'}
    True
    >>> csl = human_to_csl('Eisenhower, I')
    >>> csl == {'given' : 'I.', 'family' : 'Eisenhower'}
    True
    >>> csl = human_to_csl('Eisenhower, V')
    >>> csl == {'given' : 'V.', 'family' : 'Eisenhower'}
    True

    """
    # Optionally convert to nameparser.HumanName
    if not isinstance(name, HumanName):
        name = HumanName(name)

    # Fix: nameparser treats HumanName('Eisenhower, I') as
    # {first : 'Eisenhower', suffix : 'I'}
    # (raw string so that \. reaches the regex engine unchanged)
    if re.search(r'^[IV]\.*$', name.suffix):
        name.last = name.first
        name.first = name.suffix
        name.suffix = ''

    # Initialize CSL data
    csl_data = {}

    # Append middle name to first
    if name.middle:
        name.first += ' ' + name.middle

    # Iterate over lookup fields
    for lookup in human_to_csl_map:

        # Get source field and conversion function (identity by default)
        spec = human_to_csl_map[lookup]
        field = spec['field']
        fun = spec.get('fun', I)

        # Get field from name; skip if empty
        value = getattr(name, field)
        if not value:
            continue

        # Apply function and save to CSL data
        csl_data[lookup] = fun(value)

    # Return CSL data
    return csl_data
def parse_persname(persname, auth="", source=""):
    """
    Parse a personal-name string into a dictionary of name components.

    Dates are split off with ``extract_birth_death_dates``, the remainder is
    parsed with HumanName, and a series of hand-written corrections fixes
    titles, suffixes and known problem records.

    :param persname: raw personal name, possibly with birth/death dates
    :param auth: authority id stored under "authority_id"
    :param source: source stored under "source"
    :return: dict of name fields; fields with empty values are left out
    """
    name, birth_date, death_date = extract_birth_death_dates(persname)
    birth_date, death_date = validate_dates(birth_date, death_date)
    dates_string = make_date_string(birth_date, death_date)
    name = HumanName(name)

    titles = [
        "sir", "mr", "mrs", "baron", "dame", "madame", "viscount", "conte"
    ]
    numbers = ["II", "III"]
    title = name.title
    suffix = name.suffix
    number = u""

    # check if the suffix should actually be a title
    if not title and any(suffix.lower().strip(". ") == known_title
                         for known_title in titles):
        title = suffix.capitalize()
        if "mr" in title.lower() and not title.endswith("."):
            title += "."
        suffix = u""

    # extract numbers from the suffix
    if suffix in numbers:
        number = suffix
        suffix = u""

    # special cases cleanup
    if name.title == u"Royal":
        name.title = ""
        title = ""
        name.middle = name.first if not name.middle else "{} {}".format(
            u"Royal", name.middle)
        name.first = u"Royal"

    if name.title == u"Queen of Great":
        title = name.title + u" Britain"
        name.first = u""

    if name.title == u"Lama":
        title = u"Dalai Lama XIV"
        name.first = u""
        name.middle = u""

    if name.title == u"Marquis":
        title = u""
        name.first = u"Marquis"
        name.middle = u"W."

    if suffix == u"1941":
        birth_date = suffix
        suffix = u""

    if suffix in [u"18", u"b."]:
        suffix = u""

    if suffix == u"Jr":
        suffix += u"."

    if ", fl. 17th cent" in suffix:
        suffix = u"sieur de"
        dates_string = u"fl. 17th cent"

    rest_of_name = u"{0} {1}".format(name.first, name.middle).rstrip()
    if rest_of_name == u"Christella D. Personal journey through South Africa. 1991":
        rest_of_name = u"Christella D."

    # People with single-part names (like Keewaydinoquay) are mis-assigned.
    # Have to fix those
    primary_name = name.last
    if rest_of_name and not primary_name:
        primary_name = rest_of_name
        rest_of_name = ""

    # create the parsed name dictionary
    name_parsed = {
        u"title": unicode(title),
        u"primary_name": unicode(primary_name),
        u"rest_of_name": rest_of_name,
        u"suffix": unicode(suffix),
        u"fuller_form": unicode(name.nickname),
        u"numbers": unicode(number),
        u"birth_date": unicode(birth_date),
        u"death_date": unicode(death_date),
        u"date_string": unicode(dates_string),
        u"authority_id": unicode(auth),
        u"source": unicode(source),
        u"name_order": u"inverted",
        u"sort_name_auto_generate": True
    }

    # Remove empty fields. A filtered copy is built instead of deleting keys
    # while iterating, which fails when items() is a live view.
    return {key: value for key, value in name_parsed.items() if value}
def namer(field):
    """Reduce a raw name field to a ``(first, last)`` tuple.

    ``field`` is either a single string or a tuple of strings (joined with
    ", "). The text is upper-cased, parsed with HumanName and then passed
    through a series of hand-tuned corrections. Names containing
    'ANONYMOUS' skip the corrections and are parsed as they are.

    NOTE(review): joining and upper-casing the result of ``.encode()`` as
    text presumes Python 2 ``str`` semantics - confirm the target interpreter.
    The short tags such as ``6A`` or ``4A-C`` look like references to an
    external list of test cases - TODO confirm.
    """
    #pre: build one upper-cased string without tabs/newlines; characters that
    # cannot be encoded as ASCII are dropped
    if type(field) == tuple:
        w_name = re.sub('[\t\r\n]', '',
                        ", ".join([x.encode('ascii', 'ignore')
                                   for x in field])).upper()
    else:
        w_name = re.sub('[\t\r\n]', '', field.encode('ascii', 'ignore')).upper()

    if 'ANONYMOUS' not in w_name:
        # keep the part before the first ';', or the part after it when the
        # text contains ' FORMER '
        if ' FORMER ' not in w_name:
            w_name = re.split(";", w_name)[0]
        else:
            w_name = re.split(";", w_name)[1]

        # remove a space directly after or before one of ` ' / +
        w_name = re.sub("(?<=[`'/+]) | (?=['`/+])", '', w_name)  #6A, 4A-C

        out = HumanName(w_name)
        # drop a leading single-letter initial ("X " or "X. ") from the middle
        out.middle = re.sub("^[A-Z] |^[A-Z]\. ", '', out.middle)
        if " " in out.last:
            out.last = re.sub("^[A-Z] |^[A-Z]\. ", '', out.last)
        # first name is nothing but an initial and a middle name exists:
        # promote the middle name to first name
        if re.sub("^[A-Z]\.|^[A-Z]", '', out.first) == '' and len(out.middle) != 0:
            out.first, out.middle = out.middle, ""
        else:
            out.first = re.sub("^[A-Z] |^[A-Z]\. ", '', out.first)

        #post: corrections applied to the parsed parts

        if out.middle.startswith("FOR ") or out.middle.startswith("- "):  #7A, 1B, 3E
            out.middle = ""

        # cut the last name at " FOR "
        if " FOR " in out.last:
            out.last = re.sub(" FOR .*", '', out.last)

        # no last name but a title was parsed: re-parse a multi-word first
        # name, otherwise treat the first name as the last name
        if len(out.last) == 0 and len(out.title) != 0:  #9A
            if " " in out.first:
                out = HumanName(out.first)
            else:
                out.first, out.last = "", out.first

        # two people joined with AND / & inside the middle name: the text
        # before the separator becomes the last name
        if " AND " in out.middle or " & " in out.middle:
            out.last = re.split("( AND )|( & )", out.middle)[0]
            out.middle = ""
        if "AND" in out.last or "&" in out.last:
            if out.last.startswith("AND ") or out.last.startswith("& "):  #3F
                out.last = HumanName(out.last).last
            elif " AND " in out.last or " & " in out.last:
                out.last = re.sub("( AND ).*|( & ).*", '', out.last)
        # keep only the text before the first AND, &, / or + in the first
        # name, and before the first / in the last name
        out.first = re.split("( AND )|&|/|\+", out.first)[0]
        out.last = re.split("/", out.last)[0].strip()
        # first name has exactly one capital letter and the last name has
        # several words: take the first two words of the last name instead
        if len(re.sub("[^A-Z]", '', out.first)) == 1 and " " in out.last:
            out.first = out.last.split(" ")[0]
            out.last = out.last.split(" ")[1]
        out.capitalize()
        first, last = out.first, out.last
        if len(out.middle) > 0:
            # a middle name that is only an initial is dropped; otherwise it
            # is appended to the first name (without a space around a hyphen)
            if re.sub("^[A-Z]\.|^[A-Z]", '', out.middle) == '':
                out.middle = ""
            elif first.endswith("-") or out.middle.startswith("-"):
                first += out.middle
            else:
                first += " %s" % out.middle  #8A-B
        # a suffix is appended to the last name
        if len(out.suffix) > 0:
            last += " %s" % out.suffix  #2A
        return (first, last)
    else:
        # anonymous entries: parse as-is, no corrections
        name = HumanName(w_name)
        return (name.first, name.last)
def normalize_author_name(author):
    """Normalize an author name to the form "Last, First Middle, Suffix".

    Single-letter first and middle names get a trailing dot; two initials are
    written without a space between them. A suffix made only of roman-numeral
    characters is upper-cased, any other suffix gets a trailing dot.

    :param author: author name
    :type author: string

    :return name: the normalized name of the author
    """
    roman_numeral_suffixes = [
        u'v', u'vi', u'vii', u'viii', u'ix', u'x', u'xii', u'xiii', u'xiv',
        u'xv'
    ]
    titles = [
        u'Dr', u'Prof', u'Professor', u'Sir', u'Editor', u'Ed', u'Mr', u'Mrs',
        u'Ms', u'Chair', u'Co-Chair', u'Chairs', u'co-Chairs'
    ]
    roman_characters = frozenset(
        (u'M', u'D', u'C', u'L', u'X', u'V', u'I', u'(', u')'))

    # Parser configuration: replace the stock titles with the list above and
    # register the roman numerals as suffixes.
    constants = Constants()
    constants.titles.remove(*constants.titles).add(*titles)
    constants.suffix_not_acronyms.add(*roman_numeral_suffixes)

    def _is_initial(part):
        """A part is an initial when it is one character long or has a dot."""
        return u'.' in part or len(part) == 1

    def _dot_initial(part):
        """Append a dot to an initial that does not have one yet."""
        if u'.' in part or not _is_initial(part):
            return part
        return part + u'.'

    def _dot_suffix(part):
        """Append a dot to a suffix that does not contain one."""
        return part if u'.' in part else part + u'.'

    def _only_roman_characters(part):
        """True when every character is a roman-numeral letter or a paren."""
        return set(part.upper()) <= roman_characters

    parsed = HumanName(author, constants=constants)
    parsed.first = _dot_initial(parsed.first)
    parsed.middle = _dot_initial(parsed.middle)

    both_initials = _is_initial(parsed.first) and _is_initial(parsed.middle)
    template = (u'{first_name}{middle_name}' if both_initials
                else u'{first_name} {middle_name}')
    given_names = template.format(
        first_name=parsed.first,
        middle_name=parsed.middle,
    )

    raw_suffix = parsed.suffix
    if _only_roman_characters(raw_suffix):
        suffix = raw_suffix.upper()
    else:
        suffix = _dot_suffix(raw_suffix)

    pieces = [parsed.last, given_names.strip(), suffix]
    return u', '.join(piece for piece in pieces if piece)
def clean_name(dirty_name):
    """
    Return "First Last" for a raw name.

    All ASCII punctuation is stripped before parsing. When the parser finds
    no first name, the parsed title is used in its place.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    parsed = HumanName(dirty_name.translate(punctuation_table))
    if not parsed.first:
        parsed.first = parsed.title
    return ' '.join((parsed.first, parsed.last))
def parse_raw_name(name_raw: str) -> (str, str, str, set):
    """
    Parses a (usually messy) raw name and returns first, middle, last names and a set of
    extracted positions

    :param name_raw: str
    :return: str, str, str, set

    Parses name and returns as human name
    >>> n = Person('TEAGUE CE JR')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Teague', 'C', 'E', 'JR')

    >>> n = Person('teague, ce jr')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Teague', 'C', 'E', 'JR')

    >>> n = Person('Teague, Claude Edward, Jr., Ph.D. ')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Teague', 'Claude', 'Edward', 'JR., PH.D.')

    >>> n = Person('Teague, J - BAT')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Teague', 'J', '', 'BAT')

    >>> n = Person('BAKER, T E - NATIONAL ASSOCIATION OF ATTORNEYS GENERAL')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Baker', 'T', 'E', 'NATIONAL ASSOCIATION OF ATTORNEYS GENERAL')

    >>> n = Person('BAKER-cj')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Baker', 'C', 'J', '')

    JR and SR are by default recognized as titles -> turn off through CONSTANTS.
    >>> n = Person('Baker, JR')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Baker', 'J', 'R', '')

    >>> n = Person('DUNN WL #')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Dunn', 'W', 'L', '')

    >>> n = Person('Dunn, W. L.')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Dunn', 'W', 'L', '')

    >>> n = Person('TEMKO SL, COVINGTON AND BURLING')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Temko', 'S', 'L', 'COVINGTON AND BURLING')

    >>> n = Person('Temko, Stanley L [Privlog:] TEMKO,SL')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Temko', 'Stanley', 'L', '')

    >>> n = Person('Temko-SL, Covington & Burling')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Temko', 'S', 'L', 'COVINGTON & BURLING')

    >>> n = Person('HENSON, A. (AMERICAN SENIOR VICE PRESIDENT AND GENERAL COUNSEL)')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Henson', 'A', '', 'AMERICAN SENIOR VICE PRESIDENT AND GENERAL COUNSEL')

    >>> n = Person('HENSON, A. (CHADBOURNE, PARKE, WHITESIDE & WOLFF, AMERICAN OUTSIDE COUNSEL) (HANDWRITTEN NOTES)')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Henson', 'A', '', 'CHADBOURNE, PARKE, WHITESIDE & WOLFF, AMERICAN OUTSIDE COUNSEL HANDWRITTEN NOTES')

    >>> n = Person('Holtzman, A., Murray, J. , Henson, A. , Pepples, E. , Stevens, A. , Witt, S.')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Holtzman', 'A', '', '')

    >>> n = Person('Holtz, Jacob, Jacob & Medinger')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Holtz', 'Jacob', '', 'JACOB & MEDINGER')

    # This one breaks. But I don't think it can be avoided.
    >>> n = Person('Holtz, Jacob Alexander, Jacob & Medinger')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Holtz', '', '', 'JACOB ALEXANDER, JACOB & MEDINGER')

    >>> n = Person('PROCTOR DF, JOHNS HOPKINS SCHOOL OF HYGIENE')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Proctor', 'D', 'F', 'JOHNS HOPKINS SCHOOL OF HYGIENE')

    """
    # remove privlog info, e.g. 'Temko, Stanley L [Privlog:] TEMKO,SL'. It confuses
    # the name parser. Everything from the marker on is cut off; a marker at the
    # very start of the string is left untouched.
    privlog_id = name_raw.find('[Privlog:]')
    if privlog_id > 0:
        name_raw = name_raw[:privlog_id]

    # position is often attached with a dash, e.g. 'BAKER, T E - NATIONAL ASSOCIATION OF ATTORNEYS'
    if name_raw.find(" - ") > -1 and len(name_raw.split(' - ')) == 2:
        name_raw, extracted_position = name_raw.split(" - ")
        extracted_positions = {extracted_position.strip()}
    else:
        extracted_positions = set()

    # extract positions in parens e.g. Henson, A (Chadbourne & Park)
    paren_positions = re.findall(r'\([^(]+\)', name_raw)
    for position in paren_positions:
        extracted_positions.add(position.strip(',#() '))
        name_raw = name_raw.replace(position, '')

    institution_regexes = [
        # TI/CTR
        r'[,#] Tobacco Inst.+$',
        r'[\(\,\#] ?SAB Exec.*$',

        # American Tobacco
        r'[(,#] ?American .+$',
        r'[\(\,\#] ?Amer Brands.*$',
        r'[,#] American Tob',
        r'[,#] Atco.*$',

        # PM
        r'[\(\,\#] ?Philip Morris.*$',

        # RJR
        r'[\(\,\#] ?RJR.*$',

        # LAW FIRMS
        r'[\(\,\#] ?Arnold &.*$',
        r'[\(\,\#] ?Chadbourne.*$',
        r'[,#] COVINGTON [AB&]*.+$',
        r'[,#] Foster [&A]*.+$',
        r'[,#] JACOB [A&]*.+$',

        r'[\(\,\#] ?Philip Morris.*$',

        # Universities
        # match a ( or , or # at the beginning, then some characters that
        # aren't (,# until the end of the string
        r'[\(\,\#][^\(\,\#]+ Univ\b.*$',
        # Univ is fine if it appears at the end of a string (don't want to match in the
        # middle of a string, e.g "Universal"
        r'[\(\,\#][^\(\,\#]+ School\b.*$',

        # Organizations
        r'[\(\,\#][^\(\,\#]+ Federal Trade Commission.*$',
    ]
    # every institution found is stored as a position and cut off the name
    for institution in institution_regexes:
        extracted_institution = re.search(institution, name_raw, re.IGNORECASE)
        if extracted_institution:
            extracted_positions.add(extracted_institution.group().strip(',#() '))
            name_raw = name_raw[:name_raw.find(extracted_institution.group())]

    # remove #
    name_raw = name_raw.strip("#").strip()

    # Replace a dash between last name and initials by a space
    # (DUNN-W -> DUNN W, DUNN-WL -> DUNN WL). The length checks keep short or
    # empty leftovers (e.g. after an institution was cut off) from raising
    # IndexError.
    if len(name_raw) >= 2 and name_raw[-2] == '-':
        name_raw = name_raw[:-2] + " " + name_raw[-1:]
    if len(name_raw) > 2 and name_raw[-3] == '-':
        name_raw = name_raw[:-3] + " " + name_raw[-2:]

    name = HumanName(name_raw)

    # e.g. Dunn W -> parsed as last name W. -> switch first/last
    if len(name.last) <= 2 and len(name.first) > 2:
        name.first, name.last = name.last, name.first

    # remove periods from initials
    if len(name.first) == 2 and name.first[1] == '.':
        name.first = name.first[0]
    if len(name.middle) == 2 and name.middle[1] == '.':
        name.middle = name.middle[0]

    # If first name is length 2 (Teague, CE), the two letters are most likely initials.
    if len(name.first) == 2:
        name.middle = name.first[1].upper()
        name.first = name.first[0].upper()

    # If first and middle initials have periods but not spaces -> separate, e.g. "R.K. Teague"
    if re.match(r'[a-zA-Z]\.[a-zA-Z]\.', name.first):
        name.middle = name.first[2]
        name.first = name.first[0]

    name.last = name.last.capitalize()
    name.first = name.first.capitalize()
    name.middle = name.middle.capitalize()

    # if multiple names are passed, they often end up in the middle name
    # e.g. 'Holtzman, A., Murray, J. , Henson, A. -> only allow one comma or set to empty
    if name.middle.count(',') > 1:
        name.middle = ''

    # a long suffix with several periods is treated as noise and dropped
    if len(name.suffix) > 20 and name.suffix.count('.') > 2:
        name.suffix = ''

    # any remaining suffix is kept as a position
    if name.suffix:
        extracted_positions.add(name.suffix)

    return name.first, name.middle, name.last, extracted_positions
return potential_names def match(name1, name2): n1, n2 = HumanName(name1), HumanName(name2) return (any( u(x) == u(y) for x in get_potential_names(n1) for y in get_potential_names(n2))) with open('ap_candidates.csv') as f: reader = csv.DictReader(f) ap_candidates = [row for row in reader] for row in ap_candidates: n = HumanName() n.first = row['first_name'] n.middle = row['middle_name'] n.last = row['last_name'] n.suffix = row['suffix'] row['name'] = str(n) with open('ap_historical_ids.csv') as f: reader = csv.DictReader(f) ap_candidates2 = [row for row in reader] def find(name): for row in ap_candidates: if match(name, row['name']): # print(f'found match for {name} with', row['name']) return int(row['pol_id'])