def parse_name(self, name_raw):
    """
    Parses a raw_name, e.g. "Corbato, F.J." into last, first, and middle name

    >>> p=Person(name_raw='Corbato, F.J.')
    >>> p.last, p.first, p.middle
    ('Corbató', 'F', 'J')

    :param name_raw: raw name string, e.g. "Corbato, F.J." or "Verzuh M., F."
    :return: tuple ``(last, first, middle)`` of capitalized name parts
    """

    # fix names like "Verzuh M., F.", where the middle name comes after the last name
    # -> it should be Verzuh, F. M.
    # (raw string: "\." in a plain literal is an invalid escape sequence)
    match = re.match(r'([A-Z][a-z]+) ([A-Z])\., ([A-Z][a-z]*)\.*', name_raw)
    if match:
        last, middle, first = match.groups()
        name_raw = f'{last}, {first}. {middle}.'

    name = HumanName(name_raw)

    # If first and middle initials have periods but not spaces -> separate, e.g. "R.K. Teague"
    if re.match(r'[a-zA-Z]\.[a-zA-Z]\.', name.first):
        name.middle = name.first[2]
        name.first = name.first[0]

    name.last = name.last.capitalize()
    name.first = name.first.strip('.').capitalize()
    name.middle = name.middle.strip('.').capitalize()

    # known misspellings / missing accents in last names
    last_name_replacements = [('Corbato', 'Corbató'), ('Corbatò', 'Corbató'),
                              ('Verguh', 'Verzuh')]
    for wrong, right in last_name_replacements:
        name.last = name.last.replace(wrong, right)

    return name.last, name.first, name.middle
def normalize_name(first_name, last_name):
    """Return ``(first, last)`` after applying HumanName's capitalization."""
    parsed = HumanName()
    parsed.first, parsed.last = first_name, last_name
    parsed.capitalize()
    return parsed.first, parsed.last
def sort_contributor(self, c: ParsedSegment, default_type=None):
    '''
    File a contributor in ``self.contributors`` under its agent type.

    People are stored as ``contributors['person'][last][first_initial]``;
    every other type is stored as ``contributors[type][initials]``. If the
    type cannot be derived from existing records or ``default_type``, the
    user is asked through ``multi_choice``.
    '''
    parsed = HumanName(c.name)
    abbreviation = ''.join(rgx.abbr.findall(c.name))
    agent_type = default_type

    if parsed.last in self.contributors['person']:
        agent_type = 'person'
    else:
        # look for an identical name already recorded under a non-person type
        for kind, records in self.contributors.items():
            if kind == 'person' or abbreviation not in records:
                continue
            if any([known.name == c.name for known in records[abbreviation]]):
                agent_type = kind

    if agent_type is None:
        choice = multi_choice(
            'What type of contributor is "{0}"?'.format(c.name),
            self.contributors.keys())
        agent_type = list(self.contributors.keys())[choice]

    if agent_type != 'person':
        bucket = self.contributors[agent_type]
        bucket[abbreviation] = bucket.get(abbreviation, []) + [c]
        return

    # a single-word name is treated as a last name with unknown first name
    if parsed.last == '':
        parsed.last = parsed.first
        parsed.first = '?'
    if parsed.first == '':
        parsed.first = '?'
    c.name = parsed

    people = self.contributors[agent_type]
    by_initial = people.get(parsed.last, {})
    first_initial = parsed.first[0]
    by_initial[first_initial] = by_initial.get(first_initial, []) + [c]
    people[parsed.last] = by_initial
def lookup_judge_by_last_name(
    last_name: str,
    court_id: str,
    event_date: Optional[date] = None,
) -> Optional[Person]:
    """Find a judge from a last name, a court and an optional date.

    Wraps the last name in an otherwise empty HumanName and delegates to
    ``lookup_judge_by_full_name``.
    """
    judge_name = HumanName()
    judge_name.last = last_name
    return lookup_judge_by_full_name(judge_name, court_id, event_date)
def test_assignment_to_attribute(self):
    # Each name part can be overwritten with a plain string and read back.
    hn = HumanName("John A. Kenneth Doe, Jr.")
    assignments = (
        ("last", "de la Vega"),
        ("title", "test"),
        ("first", "test"),
        ("middle", "test"),
        ("suffix", "test"),
    )
    for attribute, value in assignments:
        setattr(hn, attribute, value)
        self.m(getattr(hn, attribute), value, hn)
def test_assign_list_to_attribute(self):
    # Assigning a list of strings to a name part reads back space-joined.
    hn = HumanName("John A. Kenneth Doe, Jr.")
    cases = (
        ("title", ["test1", "test2"], "test1 test2"),
        ("first", ["test3", "test4"], "test3 test4"),
        ("middle", ["test5", "test6", "test7"], "test5 test6 test7"),
        ("last", ["test8", "test9", "test10"], "test8 test9 test10"),
        ("suffix", ["test"], "test"),
    )
    for attribute, pieces, joined in cases:
        setattr(hn, attribute, pieces)
        assert getattr(hn, attribute) == joined
def _massage_measure_donor_name(self, name_string):
    """
    Normalize a donor name string into "First [Middle.] Last [Suffix]".

    The name is parsed with HumanName, first/last are title-cased, and a
    middle name (if any) is title-cased with a single trailing period. A
    few names that do not parse correctly are patched by hand.

    :param name_string: raw donor name
    :return: the cleaned-up display name as a string
    """
    name = HumanName(name_string)
    name.first = name.first.title()
    name.last = name.last.title()
    if name.middle:
        name.middle = name.middle.replace(".", "")
        name.middle = "%s." % (name.middle.title())

    # Hand-written corrections; the comparison is against the string form
    # of the parsed name after the normalization above.
    if name == "JR. Munger CHARLES T.":
        name.first = "Charles"
        name.middle = "T."
        name.last = "Munger"
        name.suffix = "Jr."
    if name == "M. Quinn. Delaney":
        name.first = "M."
        name.middle = "Quinn"
        name.last = "Delaney"
        name.suffix = None
    if name == "Robert Alan. Eustace":
        name.first = "Robert"
        name.middle = "Alan"
        name.last = "Eustace"
        name.suffix = None
    if name == "Susie Tompkins. Buell":
        name.first = "Susie"
        name.middle = "Tompkins"
        name.last = "Buell"
        name.suffix = None

    # This must be one if/elif chain: previously the second test was a plain
    # `if`, which overwrote the middle+suffix result and dropped the suffix.
    if name.middle and name.suffix:
        output = "%s %s %s %s" % (name.first, name.middle, name.last, name.suffix)
    elif name.middle:
        output = "%s %s %s" % (name.first, name.middle, name.last)
    elif name.suffix:
        output = "%s %s %s" % (name.first, name.last, name.suffix)
    else:
        output = "%s %s" % (name.first, name.last)
    return output
def from_parts(cls, first=None, last=None, middle=None, suffix=None, title=None):
    # Build an empty HumanName, fill in the given parts, and wrap it.
    parts = HumanName()
    assignments = (
        ("first", first),
        ("middle", middle),
        ("last", last),
        ("suffix", suffix),
        ("title", title),
    )
    for attribute, value in assignments:
        setattr(parts, attribute, value)
    return ParsedName(parts)
def lookup_judges_by_last_name_list(
    last_names: List[str],
    court_id: str,
    event_date: Optional[date] = None,
) -> List[Person]:
    """Resolve several last names to judges for one court and date.

    Each last name is looked up through ``lookup_judge_by_full_name``;
    names with no result are left out, so the returned list can be shorter
    than ``last_names``.
    """
    matches = []
    for surname in last_names:
        query = HumanName()
        query.last = surname
        judge = lookup_judge_by_full_name(query, court_id, event_date)
        if judge is None:
            continue
        matches.append(judge)
    return matches
def _massage_payload(self, payload):
    # Blank out every value that is null (per pandas) or falsy.
    blank_keys = [key for key, value in payload.items()
                  if pd.isnull(value) or not value]
    for key in blank_keys:
        payload[key] = ""

    # Only fix capitalization when both name fields are present.
    if not (payload.get("firstname") and payload.get("lastname")):
        return

    # Make sure names are neither all caps nor all lowercase.
    person = HumanName()
    person.first = payload["firstname"]
    person.last = payload["lastname"]
    person.capitalize()
    payload["firstname"] = person.first
    payload["lastname"] = person.last
def test_assignment_to_attribute(self):
    # Plain strings can be assigned to every name part and read back.
    hn = HumanName("John A. Kenneth Doe, Jr.")
    accepted = (
        ("last", "de la Vega"),
        ("title", "test"),
        ("first", "test"),
        ("middle", "test"),
        ("suffix", "test"),
    )
    for attribute, value in accepted:
        setattr(hn, attribute, value)
        assert getattr(hn, attribute) == value

    # Nested lists and dicts are rejected with TypeError.
    for rejected in ([["test"]], {"test": "test"}):
        with pytest.raises(TypeError):
            hn.suffix = rejected
def HumanNameFmXML(self, ell):
    # Build a HumanName from child elements; tags not listed are ignored.
    tag_to_attribute = {
        'First': 'first',
        'Middle': 'middle',
        'Last': 'last',
        'Title': 'title',
        'Suffix': 'suffix',
        'NickName': 'nickname',
    }
    hn = HumanName()
    for el in ell:
        attribute = tag_to_attribute.get(el.tag)
        if attribute is not None:
            setattr(hn, attribute, el.text)
    return hn
def person_name_from_xml(self, ell):
    '''Create a person name (HumanName) from the children of an XML element.'''
    # Each recognised tag maps to the HumanName attribute of the same name
    # in lower case; any other tag is skipped.
    recognised_tags = ('First', 'Middle', 'Last', 'Title', 'Suffix', 'NickName')
    hname = HumanName()
    for elm in ell:
        if elm.tag in recognised_tags:
            setattr(hname, elm.tag.lower(), elm.text)
    return hname
def parse_raw_name(name_raw: str) -> (str, str, str, set):
    """
    Parses a (usually messy) raw name and returns first, middle, last names and a set of
    extracted positions

    :param name_raw: str
    :return: str, str, str, set (first name, middle name, last name, positions)

    Parses name and returns as human name
    >>> n = Person('TEAGUE CE JR')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Teague', 'C', 'E', 'JR')

    >>> n = Person('teague, ce jr')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Teague', 'C', 'E', 'JR')

    >>> n = Person('Teague, Claude Edward, Jr., Ph.D. ')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Teague', 'Claude', 'Edward', 'JR., PH.D.')

    >>> n = Person('Teague, J - BAT')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Teague', 'J', '', 'BAT')

    >>> n = Person('BAKER, T E - NATIONAL ASSOCIATION OF ATTORNEYS GENERAL')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Baker', 'T', 'E', 'NATIONAL ASSOCIATION OF ATTORNEYS GENERAL')

    >>> n = Person('BAKER-cj')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Baker', 'C', 'J', '')

    JR and SR are by default recognized as titles -> turn off through CONSTANTS.
    >>> n = Person('Baker, JR')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Baker', 'J', 'R', '')

    >>> n = Person('DUNN WL #')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Dunn', 'W', 'L', '')

    >>> n = Person('Dunn, W. L.')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Dunn', 'W', 'L', '')

    >>> n = Person('TEMKO SL, COVINGTON AND BURLING')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Temko', 'S', 'L', 'COVINGTON AND BURLING')

    >>> n = Person('Temko, Stanley L [Privlog:] TEMKO,SL')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Temko', 'Stanley', 'L', '')

    >>> n = Person('Temko-SL, Covington & Burling')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Temko', 'S', 'L', 'COVINGTON & BURLING')

    >>> n = Person('HENSON, A. (AMERICAN SENIOR VICE PRESIDENT AND GENERAL COUNSEL)')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Henson', 'A', '', 'AMERICAN SENIOR VICE PRESIDENT AND GENERAL COUNSEL')

    >>> n = Person('HENSON, A. (CHADBOURNE, PARKE, WHITESIDE & WOLFF, AMERICAN OUTSIDE COUNSEL) (HANDWRITTEN NOTES)')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Henson', 'A', '', 'CHADBOURNE, PARKE, WHITESIDE & WOLFF, AMERICAN OUTSIDE COUNSEL HANDWRITTEN NOTES')

    >>> n = Person('Holtzman, A., Murray, J. , Henson, A. , Pepples, E. , Stevens, A. , Witt, S.')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Holtzman', 'A', '', '')

    >>> n = Person('Holtz, Jacob, Jacob & Medinger')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Holtz', 'Jacob', '', 'JACOB & MEDINGER')

    # This one breaks. But I don't think it can be avoided.
    >>> n = Person('Holtz, Jacob Alexander, Jacob & Medinger')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Holtz', '', '', 'JACOB ALEXANDER, JACOB & MEDINGER')

    >>> n = Person('PROCTOR DF, JOHNS HOPKINS SCHOOL OF HYGIENE')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Proctor', 'D', 'F', 'JOHNS HOPKINS SCHOOL OF HYGIENE')
    """

    # remove privlog info, e.g. 'Temko, Stanley L [Privlog:] TEMKO,SL'. It confuses
    # the name parser. Everything from the marker onwards is dropped; a marker at the
    # very start of the string is left untouched (as before).
    privlog_id = name_raw.find('[Privlog:]')
    if privlog_id > 0:
        name_raw = name_raw[:privlog_id]

    # position is often attached with a dash, e.g. 'BAKER, T E - NATIONAL ASSOCIATION OF ATTORNEYS'
    # only handled when there is exactly one ' - ' separator
    dash_parts = name_raw.split(' - ')
    if len(dash_parts) == 2:
        name_raw, extracted_position = dash_parts
        extracted_positions = {extracted_position.strip()}
    else:
        extracted_positions = set()

    # extract positions in parens e.g. Henson, A (Chadbourne & Park)
    paren_positions = re.findall(r'\([^(]+\)', name_raw)
    for position in paren_positions:
        extracted_positions.add(position.strip(',#() '))
        name_raw = name_raw.replace(position, '')

    institution_regexes = [

        # TI/CTR
        r'[,#] Tobacco Inst.+$',
        r'[\(\,\#] ?SAB Exec.*$',

        # American Tobacco
        r'[(,#] ?American .+$',
        r'[\(\,\#] ?Amer Brands.*$',
        r'[,#] American Tob',
        r'[,#] Atco.*$',

        # PM
        r'[\(\,\#] ?Philip Morris.*$',

        # RJR
        r'[\(\,\#] ?RJR.*$',

        # LAW FIRMS
        r'[\(\,\#] ?Arnold &.*$',
        r'[\(\,\#] ?Chadbourne.*$',
        r'[,#] COVINGTON [AB&]*.+$',
        r'[,#] Foster [&A]*.+$',
        r'[,#] JACOB [A&]*.+$',

        r'[\(\,\#] ?Philip Morris.*$',

        # Universities
        # match a ( or , or # at the beginning, then some characters that
        # aren't (,# until the end of the string
        r'[\(\,\#][^\(\,\#]+ Univ\b.*$',
        # Univ is fine if it appears at the end of a string (don't want to match in the
        # middle of a string, e.g "Universal"
        r'[\(\,\#][^\(\,\#]+ School\b.*$',

        # Organizations
        r'[\(\,\#][^\(\,\#]+ Federal Trade Commission.*$',
    ]
    for institution in institution_regexes:
        extracted_institution = re.search(institution, name_raw, re.IGNORECASE)
        if extracted_institution:
            extracted_positions.add(extracted_institution.group().strip(',#() '))
            # keep only the text in front of the institution
            name_raw = name_raw[:extracted_institution.start()]

    # remove #
    name_raw = name_raw.strip("#").strip()

    # Dashes between last name and initials: 'DUNN-W' -> 'DUNN W', 'DUNN-WL' -> 'DUNN WL'.
    # The length checks keep very short leftovers (e.g. '' or 'A') from raising IndexError.
    if len(name_raw) >= 2 and name_raw[-2] == '-':
        name_raw = name_raw[:-2] + " " + name_raw[-1:]
    if len(name_raw) > 2 and name_raw[-3] == '-':
        name_raw = name_raw[:-3] + " " + name_raw[-2:]

    name = HumanName(name_raw)

    # e.g. Dunn W -> parsed as last name W. -> switch first/last
    if len(name.last) <= 2 and len(name.first) > 2:
        name.first, name.last = name.last, name.first

    # remove periods from initials
    if len(name.first) == 2 and name.first[1] == '.':
        name.first = name.first[0]
    if len(name.middle) == 2 and name.middle[1] == '.':
        name.middle = name.middle[0]

    # If first name is length 2 (Teague, CE), the two letters are most likely initials.
    if len(name.first) == 2:
        name.middle = name.first[1].upper()
        name.first = name.first[0].upper()

    # If first and middle initials have periods but not spaces -> separate, e.g. "R.K. Teague"
    if re.match(r'[a-zA-Z]\.[a-zA-Z]\.', name.first):
        name.middle = name.first[2]
        name.first = name.first[0]

    name.last = name.last.capitalize()
    name.first = name.first.capitalize()
    name.middle = name.middle.capitalize()

    # if multiple names are passed, they often end up in the middle name
    # e.g. 'Holtzman, A., Murray, J. , Henson, A. -> only allow one comma or set to empty
    if name.middle.count(',') > 1:
        name.middle = ''

    # a very long suffix with several periods is treated as parsing debris
    if len(name.suffix) > 20 and name.suffix.count('.') > 2:
        name.suffix = ''

    if name.suffix:
        extracted_positions.add(name.suffix)

    return name.first, name.middle, name.last, extracted_positions
parsed = HumanName( name ) # look at how that turned out: print( "Parsed HumanName for " + name + ":" ) print( Person.HumanName_to_str( parsed ) ) # now, make a second HumanName instance. manual = HumanName() # look at how that turned out: print( "Empty HumanName?:" ) print( Person.HumanName_to_str( manual ) ) # override parsed values with correct name parts manual.first = "Van" manual.last = "Conway" # look at how that turned out: print( "after manual configuration:" ) print( Person.HumanName_to_str( manual ) ) # now, try some lookups # let the lookup parse the name. test1 = Person.look_up_person_from_name( name ) print( "test1 = " + str( test1 ) ) # pass in manually configured HumanName test2 = Person.look_up_person_from_name( name, manual ) print( "test2 = " + str( test2 ) )
def import_player_data(team, roster_soup):
    """
    Import roster and shift data for one team in one game.

    First, every player in the roster table of ``roster_soup`` is matched to
    (or created as) a ``Player`` and linked to ``team`` through a
    ``PlayerGame``. Then the team's shift report is fetched with
    ``get_soup`` and one ``ShiftGame`` row is created per shift. Finally all
    goalie shifts are deleted.

    :param team: TeamGame-like object exposing ``is_home()``, ``is_away()``,
        ``home``/``away`` related managers and ``team``
    :param roster_soup: BeautifulSoup document of the game roster report
    :return: True
    :raises Exception: if ``team`` is neither the home nor the away team
    """
    #Determines if the team is the home or away team for the game
    if team.is_home():
        homeaway = "home"
        game_id = team.home.all()[0].game_id
    elif team.is_away():
        homeaway = "away"
        game_id = team.away.all()[0].game_id
    else:
        # Format the message *before* building the exception; applying "%" to
        # the Exception object itself raised a TypeError instead.
        raise Exception("TeamGame object (%s) has no home or away status" % team)

    ##GET ROSTER
    #Indicates the list index of "roster" from which to begin iterating
    START_ROSTER = 3

    #Indices of player data within each player's 'roster' tags
    POSITION_INDEX = 3
    NAME_INDEX = 5
    NUMBER_INDEX = 1

    #number is a value to determine which index of the BeautifulSoup object to extract data from
    if homeaway == "home":
        number = 1
    else:
        number = 0

    #Sets 'roster' to a BeautifulSoup object which contains the data for a team's roster
    roster = roster_soup.find_all("td", text="#")[number].parent.parent

    #Iterates over the 'roster', skipping every other item (null tags)
    for i in range(START_ROSTER, len(roster), 2):
        position = roster.contents[i].contents[POSITION_INDEX].text.encode(
            'utf-8')
        name = roster.contents[i].contents[NAME_INDEX].text.encode('utf-8')
        hname = HumanName(name)
        first_name = hname.first
        last_name = hname.last
        number = roster.contents[i].contents[NUMBER_INDEX].text.encode('utf-8')

        #Checks if player exists or if multiple players with this name.
        #If player does not exist, add to Player database and PlayerGame database.
        #If player exists, add to PlayerGame database.
        try:
            p = Player.objects.get(last_name=last_name,
                                   team=team.team,
                                   number=number)
            PlayerGame.objects.create(player=p, team=team)
        except Player.MultipleObjectsReturned:
            #Check if multiple players in the league or if player has been traded
            # (single-argument print call behaves the same on Python 2 and 3)
            print("Multiple players returned")
        except Player.DoesNotExist:
            #Check if new player or data error
            p = Player.objects.create(first_name=first_name,
                                      last_name=last_name,
                                      position=position,
                                      number=number,
                                      team=team.team)
            PlayerGame.objects.create(player=p, team=team)
            #print "Added - " + p.first_name + " " + p.last_name + " (" + p.position + ") " + team.team.initials

    ###GET SHIFTS
    if homeaway == "home":
        shift_soup = get_soup(game_id, 'shifts_home')
    else:
        shift_soup = get_soup(game_id, 'shifts_away')

    #Get list of players from BeautifulSoup object
    players = shift_soup.find_all('td', class_='playerHeading')
    for player in players:
        playerName = HumanName(player.text)

        #Strip out player's number from name
        # NOTE(review): assumes the parsed last name looks like "<number> <name>";
        # a value without a space raises IndexError below - confirm against the data.
        split_string = playerName.last.split(" ", 1)
        number = split_string[0]
        playerName.last = split_string[1]
        playerObj = Player.objects.get(last_name=playerName.last,
                                       team=team.team,
                                       number=number)

        #Look up the PlayerGame created during the roster pass
        playerGameObj = PlayerGame.objects.get(player=playerObj, team=team)

        #Parse the individual player BeautifulSoup object for shift data
        shift = player.parent.next_sibling.next_sibling.next_sibling.next_sibling
        shift_list = shift.find_all("td")

        #If 6 items in shift_list object, it matches the structure of valid shift data
        while len(shift_list) == 6:
            period = shift_list[1].text
            if period == "OT":
                period = 4
            else:
                period = int(period)
            timeOn = deleteAfter(shift_list[2].text, '/')
            timeOn = convertToSecs(timeOn, period)
            timeOff = deleteAfter(shift_list[3].text, '/')
            timeOff = convertToSecs(timeOff, period)

            #this is to prevent potential source data issues where the end of a period is mistakenly set to 0:00
            if timeOff == (period - 1) * 1200 and timeOn > (period - 1) * 1200:
                timeOff = period * 1200
            ShiftGame.objects.create(playergame=playerGameObj,
                                     start_time=timeOn,
                                     end_time=timeOff)

            #Get next shift for this player
            shift = shift.next_sibling.next_sibling
            shift_list = shift.find_all("td")

    #don't need to include goalies in line combinations
    ShiftGame.objects.filter(playergame__player__position='G').delete()
    return True
def import_player_data(team, roster_soup):
    """
    Import roster and shift data for one team in one game.

    First, every player in the roster table of ``roster_soup`` is matched to
    (or created as) a ``Player`` and linked to ``team`` through a
    ``PlayerGame``. Then the team's shift report is fetched with
    ``get_soup`` and one ``ShiftGame`` row is created per shift. Finally all
    goalie shifts are deleted.

    :param team: TeamGame-like object exposing ``is_home()``, ``is_away()``,
        ``home``/``away`` related managers and ``team``
    :param roster_soup: BeautifulSoup document of the game roster report
    :return: True
    :raises Exception: if ``team`` is neither the home nor the away team
    """
    # Determines if the team is the home or away team for the game
    if team.is_home():
        homeaway = "home"
        game_id = team.home.all()[0].game_id
    elif team.is_away():
        homeaway = "away"
        game_id = team.away.all()[0].game_id
    else:
        # Format the message *before* building the exception; applying "%" to
        # the Exception object itself raised a TypeError instead.
        raise Exception("TeamGame object (%s) has no home or away status" % team)

    ##GET ROSTER
    # Indicates the list index of "roster" from which to begin iterating
    START_ROSTER = 3

    # Indices of player data within each player's 'roster' tags
    POSITION_INDEX = 3
    NAME_INDEX = 5
    NUMBER_INDEX = 1

    # number is a value to determine which index of the BeautifulSoup object to extract data from
    if homeaway == "home":
        number = 1
    else:
        number = 0

    # Sets 'roster' to a BeautifulSoup object which contains the data for a team's roster
    roster = roster_soup.find_all("td", text="#")[number].parent.parent

    # Iterates over the 'roster', skipping every other item (null tags)
    for i in range(START_ROSTER, len(roster), 2):
        position = roster.contents[i].contents[POSITION_INDEX].text.encode("utf-8")
        name = roster.contents[i].contents[NAME_INDEX].text.encode("utf-8")
        hname = HumanName(name)
        first_name = hname.first
        last_name = hname.last
        number = roster.contents[i].contents[NUMBER_INDEX].text.encode("utf-8")

        # Checks if player exists or if multiple players with this name.
        # If player does not exist, add to Player database and PlayerGame database.
        # If player exists, add to PlayerGame database.
        try:
            p = Player.objects.get(last_name=last_name, team=team.team, number=number)
            PlayerGame.objects.create(player=p, team=team)
        except Player.MultipleObjectsReturned:
            # Check if multiple players in the league or if player has been traded
            # (single-argument print call behaves the same on Python 2 and 3)
            print("Multiple players returned")
        except Player.DoesNotExist:
            # Check if new player or data error
            p = Player.objects.create(
                first_name=first_name, last_name=last_name, position=position, number=number, team=team.team
            )
            PlayerGame.objects.create(player=p, team=team)
            # print "Added - " + p.first_name + " " + p.last_name + " (" + p.position + ") " + team.team.initials

    ###GET SHIFTS
    if homeaway == "home":
        shift_soup = get_soup(game_id, "shifts_home")
    else:
        shift_soup = get_soup(game_id, "shifts_away")

    # Get list of players from BeautifulSoup object
    players = shift_soup.find_all("td", class_="playerHeading")
    for player in players:
        playerName = HumanName(player.text)

        # Strip out player's number from name
        # NOTE(review): assumes the parsed last name looks like "<number> <name>";
        # a value without a space raises IndexError below - confirm against the data.
        split_string = playerName.last.split(" ", 1)
        number = split_string[0]
        playerName.last = split_string[1]
        playerObj = Player.objects.get(last_name=playerName.last, team=team.team, number=number)

        # Look up the PlayerGame created during the roster pass
        playerGameObj = PlayerGame.objects.get(player=playerObj, team=team)

        # Parse the individual player BeautifulSoup object for shift data
        shift = player.parent.next_sibling.next_sibling.next_sibling.next_sibling
        shift_list = shift.find_all("td")

        # If 6 items in shift_list object, it matches the structure of valid shift data
        while len(shift_list) == 6:
            period = shift_list[1].text
            if period == "OT":
                period = 4
            else:
                period = int(period)
            timeOn = deleteAfter(shift_list[2].text, "/")
            timeOn = convertToSecs(timeOn, period)
            timeOff = deleteAfter(shift_list[3].text, "/")
            timeOff = convertToSecs(timeOff, period)

            # this is to prevent potential source data issues where the end of a period is mistakenly set to 0:00
            if timeOff == (period - 1) * 1200 and timeOn > (period - 1) * 1200:
                timeOff = period * 1200
            ShiftGame.objects.create(playergame=playerGameObj, start_time=timeOn, end_time=timeOff)

            # Get next shift for this player
            shift = shift.next_sibling.next_sibling
            shift_list = shift.find_all("td")

    # don't need to include goalies in line combinations
    ShiftGame.objects.filter(playergame__player__position="G").delete()
    return True
def to_HumanName( self ):

    '''
    Builds a nameparser HumanName from the name properties stored in this
        instance and returns it.

    Sources, in order of preference:
    - the individual name parts, when got_name_parts() reports them present;
    - the full name string;
    - the lookup name.
    Returns None when none of these is available.

    preconditions: None.
    postconditions: None.
    '''

    # read every name value up front, in a fixed order.
    name_prefix = self.get( self.PROP_NAME_NAME_PREFIX, None )
    first_name = self.get( self.PROP_NAME_FIRST_NAME, None )
    middle_name = self.get( self.PROP_NAME_MIDDLE_NAME, None )
    last_name = self.get( self.PROP_NAME_LAST_NAME, None )
    name_suffix = self.get( self.PROP_NAME_NAME_SUFFIX, None )
    nickname = self.get( self.PROP_NAME_NICKNAME, None )
    full_name_string = self.get( self.PROP_NAME_FULL_NAME_STRING, None )
    lookup_name = self.get_lookup_name()

    # name parts win over any pre-built string.
    if ( self.got_name_parts() == True ):

        human_name = HumanName()

        # HumanName attribute --> stored value; empty values are skipped.
        part_values = (
            ( "title", name_prefix ),
            ( "first", first_name ),
            ( "middle", middle_name ),
            ( "last", last_name ),
            ( "suffix", name_suffix ),
            ( "nickname", nickname ),
        )
        for attribute_name, part_value in part_values:

            if ( part_value ):

                setattr( human_name, attribute_name, part_value )

            #-- END check to see if this part is set. --#

        #-- END loop over name parts. --#

        return human_name

    #-- END check to see if we have name parts. --#

    # no parts - fall back to the full name string...
    if ( ( full_name_string is not None ) and ( full_name_string != "" ) ):

        return HumanName( full_name_string )

    #-- END check to see if full name string. --#

    # ...then to the lookup name.
    if ( ( lookup_name is not None ) and ( lookup_name != "" ) ):

        return HumanName( lookup_name )

    #-- END check to see if lookup name. --#

    # no names present at all.
    return None

#-- END method to_HumanName() --#

#-- END class PersonDetails --#
def namer(field):
    """Reduce a raw name field to a ``(first, last)`` tuple.

    ``field`` is either a single string or a tuple of strings (joined with
    ", "). The text is ASCII-encoded (dropping other characters), stripped
    of tabs/newlines and upper-cased before parsing with HumanName. Unless
    the text contains 'ANONYMOUS', a series of regex clean-ups is applied
    and the result is re-capitalized; a remaining middle name is appended
    to the first name and a suffix to the last name.

    NOTE(review): ``.encode(...)`` followed by ``str`` operations looks like
    it relies on Python 2 string semantics - confirm before running on 3.
    The trailing tags such as "6A" or "3F" appear to label the example
    cases each rule was written for; their definitions are not visible here.
    """
    #pre: build one upper-cased ASCII string without tabs / line breaks
    if type(field) == tuple:
        w_name = re.sub(
            '[\t\r\n]', '',
            ", ".join([x.encode('ascii', 'ignore') for x in field])).upper()
    else:
        w_name = re.sub('[\t\r\n]', '', field.encode('ascii',
                                                     'ignore')).upper()
    if 'ANONYMOUS' not in w_name:
        # keep the part before the first ';', or the second part when the
        # text mentions ' FORMER '
        if ' FORMER ' not in w_name:
            w_name = re.split(";", w_name)[0]
        else:
            w_name = re.split(";", w_name)[1]
        # drop a space directly after or before one of ` ' / +
        w_name = re.sub("(?<=[`'/+]) | (?=['`/+])", '', w_name)  #6A, 4A-C

        out = HumanName(w_name)
        # strip a leading single-letter initial ("A " or "A. ")
        out.middle = re.sub("^[A-Z] |^[A-Z]\. ", '', out.middle)
        if " " in out.last:
            out.last = re.sub("^[A-Z] |^[A-Z]\. ", '', out.last)
        # first name is only an initial: promote the middle name instead
        if re.sub("^[A-Z]\.|^[A-Z]", '', out.first) == '' and len(
                out.middle) != 0:
            out.first, out.middle = out.middle, ""
        else:
            out.first = re.sub("^[A-Z] |^[A-Z]\. ", '', out.first)

        #post
        if out.middle.startswith("FOR ") or out.middle.startswith(
                "- "):  #7A, 1B, 3E
            out.middle = ""
        if " FOR " in out.last:
            out.last = re.sub(" FOR .*", '', out.last)

        # a title but no last name: re-parse the first name, or use it as
        # the last name when it is a single word
        if len(out.last) == 0 and len(out.title) != 0:  #9A
            if " " in out.first:
                out = HumanName(out.first)
            else:
                out.first, out.last = "", out.first

        # "X AND Y" / "X & Y": keep only what precedes the conjunction
        if " AND " in out.middle or " & " in out.middle:
            out.last = re.split("( AND )|( & )", out.middle)[0]
            out.middle = ""
        if "AND" in out.last or "&" in out.last:
            if out.last.startswith("AND ") or out.last.startswith(
                    "& "):  #3F
                out.last = HumanName(out.last).last
            elif " AND " in out.last or " & " in out.last:
                out.last = re.sub("( AND ).*|( & ).*", '', out.last)
        out.first = re.split("( AND )|&|/|\+", out.first)[0]
        out.last = re.split("/", out.last)[0].strip()
        # first name has a single capital letter and the last name has
        # several words: split the last name into first / last
        if len(re.sub("[^A-Z]", '', out.first)) == 1 and " " in out.last:
            out.first = out.last.split(" ")[0]
            out.last = out.last.split(" ")[1]
        out.capitalize()
        first, last = out.first, out.last
        if len(out.middle) > 0:
            # a bare middle initial is dropped; anything longer is appended
            # to the first name (without a space around a hyphen)
            if re.sub("^[A-Z]\.|^[A-Z]", '', out.middle) == '':
                out.middle = ""
            elif first.endswith("-") or out.middle.startswith("-"):
                first += out.middle
            else:
                first += " %s" % out.middle  #8A-B
        if len(out.suffix) > 0:
            last += " %s" % out.suffix  #2A
        return (first, last)
    else:
        # 'ANONYMOUS' entries: plain HumanName parse, no clean-up
        name = HumanName(w_name)
        return (name.first, name.last)
def parse_raw_name(name_raw: str, count: int, extract_orgs=True) -> (str, str, str, Counter):
    """
    Parses a (usually messy) raw name and returns
    first, middle, last names and a Counter of extracted positions

    extract_orgs tries to extract organizations from name. defaults to True. only set to False
    to be able to check if a name is valid (it prevents an infinite loop because by default,
    extracting organizations is part of the initialization of a person

    :param name_raw: str
    :param count: int, weight added to every extracted position in the Counter
    :param extract_orgs: bool
    :return: str, str, str, Counter (first name, middle name, last name, positions Counter)
    """
    name_raw = Person.remove_privlog_info(name_raw)
    # remove JR, SR, or III if it follows this pattern: 'Chumney-RD-Jr'
    name_raw = Person.remove_jr_sr_iii(name_raw)

    # position is often attached with a dash,
    # e.g. 'BAKER, T E - NATIONAL ASSOCIATION OF ATTORNEYS'
    # only handled when there is exactly one ' - ' separator
    dash_parts = name_raw.split(' - ')
    if len(dash_parts) == 2:
        name_raw, extracted_position = dash_parts
        extracted_positions = [extracted_position.strip()]
    else:
        extracted_positions = []

    # extract positions in parens e.g. Henson, A (Chadbourne & Park)
    paren_positions = re.findall(r'\([^(]+\)', name_raw)
    for position in paren_positions:
        extracted_positions.append(position.strip(',#() '))
        name_raw = name_raw.replace(position, '')

    # Search for known raw_org strings in name_raw, extract them as positions if necessary
    if extract_orgs:
        name_raw, new_positions = Person.extract_raw_org_names_from_name(name_raw)
        extracted_positions += new_positions

    # delete any leftover hashtags
    name_raw = name_raw.strip(' #')

    # Delete dashes between last name and initials
    # DUNN-W -> Dunn W
    # (length check: a leftover shorter than two characters, e.g. '' or 'A',
    # used to raise IndexError here)
    if len(name_raw) >= 2 and name_raw[-2] == '-':
        name_raw = name_raw[:-2] + " " + name_raw[-1:]
    # DUNN-WL -> DUNN WL
    if len(name_raw) > 2 and name_raw[-3] == '-':
        name_raw = name_raw[:-3] + " " + name_raw[-2:]

    # Parse current string using HumanName
    name = HumanName(name_raw)

    # e.g. Dunn W -> parsed as last name W. -> switch first/last
    if len(name.last) <= 2 < len(name.first):
        name.first, name.last = name.last, name.first

    # remove periods from initials
    if len(name.first) == 2 and name.first[1] == '.':
        name.first = name.first[0]
    if len(name.middle) == 2 and name.middle[1] == '.':
        name.middle = name.middle[0]

    # If first name is length 2 (Teague, CE), the two letters are most likely initials.
    if len(name.middle) == 0 and len(name.first) == 2:
        name.middle = name.first[1].upper()
        name.first = name.first[0].upper()

    # If first and middle initials have periods but not spaces -> separate, e.g. "R.K. Teague"
    if re.match(r'[a-zA-Z]\.[a-zA-Z]\.', name.first):
        name.middle = name.first[2]
        name.first = name.first[0]

    name.last = name.last.capitalize()
    name.first = name.first.capitalize()
    name.middle = name.middle.capitalize()

    # if multiple names are passed, they often end up in the middle name
    # e.g. 'Holtzman, A., Murray, J. , Henson, A. -> only allow one comma or set to empty
    if name.middle.count(',') > 1:
        name.middle = ''

    # a very long suffix with several periods is treated as parsing debris
    if len(name.suffix) > 20 and name.suffix.count('.') > 2:
        name.suffix = ''

    if name.suffix:
        extracted_positions.append(name.suffix)

    # map organization names to clean official names (if they are in the dict) using
    # RAW_ORG_TO_CLEAN_ORG_DICT; entries mapped to '@skip@' are dropped
    clean_orgs = []
    for raw_org in extracted_positions:
        if raw_org in RAW_ORG_TO_CLEAN_ORG_DICT:
            clean_org = RAW_ORG_TO_CLEAN_ORG_DICT[raw_org]
            if clean_org != '@skip@':
                clean_orgs.append(clean_org)
        else:
            clean_orgs.append(raw_org)
    extracted_positions = clean_orgs

    # convert mapped positions into a counter (periods removed, upper-cased keys)
    result_positions = Counter()
    for position in extracted_positions:
        cleaned = re.sub(r'\.', '', position)
        result_positions[cleaned.upper()] += count

    return name.first, name.middle, name.last, result_positions
def human_to_csl(name):
    """Convert HumanName to CSL-formatted JSON.

    Note: when a HumanName instance is passed in, it is modified in place
    (the middle name is appended to ``first`` and a lone "I"/"V" suffix is
    moved into ``first``).

    Args:
        name : HumanName or str / unicode
    Returns:
        CSL-formatted JSON (dict); fields with empty values are left out

    Examples:
    >>> csl = human_to_csl('Rafael Nadal')
    >>> csl == {'given' : 'Rafael', 'family' : 'Nadal'}
    True
    >>> csl = human_to_csl(HumanName('Rafael Nadal'))
    >>> csl == {'given' : 'Rafael', 'family' : 'Nadal'}
    True
    >>> csl = human_to_csl(HumanName('George HW de Bush'))
    >>> csl == {'given' : 'George H. W.', 'family' : 'de Bush'}
    True
    >>> csl = human_to_csl('Eisenhower, I')
    >>> csl == {'given' : 'I.', 'family' : 'Eisenhower'}
    True
    >>> csl = human_to_csl('Eisenhower, V')
    >>> csl == {'given' : 'V.', 'family' : 'Eisenhower'}
    True

    """
    # Optionally convert to nameparser.HumanName
    if not isinstance(name, HumanName):
        name = HumanName(name)

    # Fix: nameparser treats HumanName('Eisenhower, I') as
    # {first : 'Eisenhower', suffix : 'I'}
    # (raw string: "\." in a plain literal is an invalid escape sequence)
    if re.search(r'^[IV]\.*$', name.suffix):
        name.last = name.first
        name.first = name.suffix
        name.suffix = ''

    # Initialize CSL data
    csl_data = {}

    # Append middle name to first
    if name.middle:
        name.first += ' ' + name.middle

    # Iterate over lookup fields
    for lookup, spec in human_to_csl_map.items():

        # Source attribute on the name and optional converter
        # (defaults to I when no 'fun' is given)
        field = spec['field']
        fun = spec.get('fun', I)

        # Get field from name
        value = getattr(name, field)

        # Skip if empty
        if not value:
            continue

        # Apply function and save to CSL data
        csl_data[lookup] = fun(value)

    # Return CSL data
    return csl_data
def namer(field):
    """Reduce a raw name field to a ``(first, last)`` tuple.

    ``field`` is either a single string or a tuple of strings (joined with
    ", "). The text is ASCII-encoded (dropping other characters), stripped
    of tabs/newlines and upper-cased before parsing with HumanName. Unless
    the text contains 'ANONYMOUS', a series of regex clean-ups is applied
    and the result is re-capitalized; a remaining middle name is appended
    to the first name and a suffix to the last name.

    NOTE(review): ``.encode(...)`` followed by ``str`` operations looks like
    it relies on Python 2 string semantics - confirm before running on 3.
    The trailing tags such as "6A" or "3F" appear to label the example
    cases each rule was written for; their definitions are not visible here.
    """
    #pre: build one upper-cased ASCII string without tabs / line breaks
    if type(field) == tuple:
        w_name = re.sub('[\t\r\n]', '', ", ".join([x.encode('ascii', 'ignore') for x in field])).upper()
    else:
        w_name = re.sub('[\t\r\n]', '', field.encode('ascii', 'ignore')).upper()
    if 'ANONYMOUS' not in w_name:
        # keep the part before the first ';', or the second part when the
        # text mentions ' FORMER '
        if ' FORMER ' not in w_name:
            w_name = re.split(";", w_name)[0]
        else:
            w_name = re.split(";", w_name)[1]
        # drop a space directly after or before one of ` ' / +
        w_name = re.sub("(?<=[`'/+]) | (?=['`/+])", '', w_name) #6A, 4A-C

        out = HumanName(w_name)
        # strip a leading single-letter initial ("A " or "A. ")
        out.middle = re.sub("^[A-Z] |^[A-Z]\. ", '', out.middle)
        if " " in out.last:
            out.last = re.sub("^[A-Z] |^[A-Z]\. ", '', out.last)
        # first name is only an initial: promote the middle name instead
        if re.sub("^[A-Z]\.|^[A-Z]", '', out.first) == '' and len(out.middle) != 0:
            out.first, out.middle = out.middle, ""
        else:
            out.first = re.sub("^[A-Z] |^[A-Z]\. ", '', out.first)

        #post
        if out.middle.startswith("FOR ") or out.middle.startswith("- "): #7A, 1B, 3E
            out.middle = ""
        if " FOR " in out.last:
            out.last = re.sub(" FOR .*", '', out.last)

        # a title but no last name: re-parse the first name, or use it as
        # the last name when it is a single word
        if len(out.last) == 0 and len(out.title) != 0: #9A
            if " " in out.first:
                out = HumanName(out.first)
            else:
                out.first, out.last = "", out.first

        # "X AND Y" / "X & Y": keep only what precedes the conjunction
        if " AND " in out.middle or " & " in out.middle:
            out.last = re.split("( AND )|( & )", out.middle)[0]
            out.middle = ""
        if "AND" in out.last or "&" in out.last:
            if out.last.startswith("AND ") or out.last.startswith("& "): #3F
                out.last = HumanName(out.last).last
            elif " AND " in out.last or " & " in out.last:
                out.last = re.sub("( AND ).*|( & ).*", '', out.last)
        out.first = re.split("( AND )|&|/|\+", out.first)[0]
        out.last = re.split("/", out.last)[0].strip()
        # first name has a single capital letter and the last name has
        # several words: split the last name into first / last
        if len(re.sub("[^A-Z]", '', out.first)) == 1 and " " in out.last:
            out.first = out.last.split(" ")[0]
            out.last = out.last.split(" ")[1]
        out.capitalize()
        first, last = out.first, out.last
        if len(out.middle) > 0:
            # a bare middle initial is dropped; anything longer is appended
            # to the first name (without a space around a hyphen)
            if re.sub("^[A-Z]\.|^[A-Z]", '', out.middle) == '':
                out.middle = ""
            elif first.endswith("-") or out.middle.startswith("-"):
                first += out.middle
            else:
                first += " %s" % out.middle #8A-B
        if len(out.suffix) > 0:
            last += " %s" % out.suffix #2A
        return (first, last)
    else:
        # 'ANONYMOUS' entries: plain HumanName parse, no clean-up
        name = HumanName(w_name)
        return (name.first, name.last)
def match(name1, name2):
    """Return True if any potential form of ``name1`` equals one of ``name2``.

    Both names are parsed with HumanName; the candidate spellings come from
    ``get_potential_names`` and are compared after passing through ``u``.
    """
    n1, n2 = HumanName(name1), HumanName(name2)
    return any(
        u(x) == u(y)
        for x in get_potential_names(n1)
        for y in get_potential_names(n2))


# Load the AP candidate list. newline='' is what the csv module requires so
# that quoted fields containing line breaks are parsed correctly.
with open('ap_candidates.csv', newline='') as f:
    reader = csv.DictReader(f)
    ap_candidates = list(reader)

# Give every candidate row a single display name built from its name columns.
for row in ap_candidates:
    n = HumanName()
    n.first = row['first_name']
    n.middle = row['middle_name']
    n.last = row['last_name']
    n.suffix = row['suffix']
    row['name'] = str(n)

with open('ap_historical_ids.csv', newline='') as f:
    reader = csv.DictReader(f)
    ap_candidates2 = list(reader)


def find(name):
    # Return the integer pol_id of the first AP candidate whose name matches;
    # falls through without an explicit return when nothing matches.
    for row in ap_candidates:
        if match(name, row['name']):
            # print(f'found match for {name} with', row['name'])
            return int(row['pol_id'])