def parse_name(self, name_raw):
    """
    Parses a raw name, e.g. "Corbato, F.J.", into last, first, and middle name.

    >>> p=Person(name_raw='Corbato, F.J.')
    >>> p.last, p.first, p.middle
    ('Corbató', 'F', 'J')

    :param name_raw: str, the unparsed name
    :return: tuple of str (last name, first name, middle name)
    """

    # fix names like "Verzuh M., F.", where the middle name comes after the last name
    # -> it should be Verzuh, F. M.
    # (raw string: "\." in a plain literal is an invalid escape sequence)
    match = re.match(r'([A-Z][a-z]+) ([A-Z])\., ([A-Z][a-z]*)\.*', name_raw)
    if match:
        last, middle_initial, first = match.groups()
        name_raw = f'{last}, {first}. {middle_initial}.'

    name = HumanName(name_raw)

    # If first and middle initials have periods but not spaces -> separate, e.g. "R.K. Teague"
    if re.match(r'[a-zA-Z]\.[a-zA-Z]\.', name.first):
        name.middle = name.first[2]
        name.first = name.first[0]

    name.last = name.last.capitalize()
    name.first = name.first.strip('.').capitalize()
    name.middle = name.middle.strip('.').capitalize()

    # known misspellings of last names -> canonical spelling
    last_name_replacements = [
        ('Corbato', 'Corbató'),
        ('Corbatò', 'Corbató'),
        ('Verguh', 'Verzuh'),
    ]
    for wrong, right in last_name_replacements:
        name.last = name.last.replace(wrong, right)

    return name.last, name.first, name.middle
def test_formating_removing_pieces_from_name_buckets(self):
    """Emptying name buckets one at a time drops them from the formatted output."""
    parsed = HumanName("Rev John A. Kenneth Doe III (Kenny)")

    parsed.string_format = "{title} {first} {middle} {last} {suffix} '{nickname}'"
    assert u(parsed) == "Rev John A. Kenneth Doe III 'Kenny'"

    parsed.string_format = "{title} {first} {middle} {last} {suffix}"
    assert u(parsed) == "Rev John A. Kenneth Doe III"

    # Blank out one bucket per step and check what is left of the rendering.
    steps = (
        ("middle", "Rev John Doe III"),
        ("suffix", "Rev John Doe"),
        ("title", "John Doe"),
    )
    for bucket, expected in steps:
        setattr(parsed, bucket, "")
        assert u(parsed) == expected
def test_assignment_to_attribute(self):
    """A string assigned to a name attribute is read back unchanged."""
    parsed = HumanName("John A. Kenneth Doe, Jr.")
    assignments = (
        ("last", "de la Vega"),
        ("title", "test"),
        ("first", "test"),
        ("middle", "test"),
        ("suffix", "test"),
    )
    for bucket, value in assignments:
        setattr(parsed, bucket, value)
        self.m(getattr(parsed, bucket), value, parsed)
def test_assign_list_to_attribute(self):
    """A list assigned to a name attribute is read back as a space-joined string."""
    parsed = HumanName("John A. Kenneth Doe, Jr.")
    cases = (
        ("title", ["test1", "test2"], "test1 test2"),
        ("first", ["test3", "test4"], "test3 test4"),
        ("middle", ["test5", "test6", "test7"], "test5 test6 test7"),
        ("last", ["test8", "test9", "test10"], "test8 test9 test10"),
        ("suffix", ["test"], "test"),
    )
    for bucket, pieces, joined in cases:
        setattr(parsed, bucket, pieces)
        assert getattr(parsed, bucket) == joined
def from_parts(cls, first=None, last=None, middle=None, suffix=None, title=None):
    """Build a ParsedName from individually supplied name parts.

    Each part is assigned to the matching attribute of a fresh HumanName,
    which is then wrapped in ParsedName. ``cls`` is not consulted: the
    result is always constructed via ``ParsedName``.
    """
    human_name = HumanName()
    parts = (
        ("first", first),
        ("middle", middle),
        ("last", last),
        ("suffix", suffix),
        ("title", title),
    )
    for attribute, value in parts:
        setattr(human_name, attribute, value)
    return ParsedName(human_name)
def test_assignment_to_attribute(self):
    """Strings round-trip through name attributes; nested lists and dicts raise TypeError."""
    parsed = HumanName("John A. Kenneth Doe, Jr.")

    parsed.last = "de la Vega"
    assert parsed.last == "de la Vega"

    for bucket in ("title", "first", "middle", "suffix"):
        setattr(parsed, bucket, "test")
        assert getattr(parsed, bucket) == "test"

    # Values that are neither a string nor a flat list must be rejected.
    for bad_value in ([["test"]], {"test": "test"}):
        with pytest.raises(TypeError):
            parsed.suffix = bad_value
def HumanNameFmXML(self, ell):
    """Build a HumanName from the child elements of ``ell``.

    Children tagged First, Middle, Last, Title, Suffix or NickName have
    their text copied onto the matching HumanName attribute; children
    with any other tag are ignored.
    """
    tag_to_attribute = (
        ('First', 'first'),
        ('Middle', 'middle'),
        ('Last', 'last'),
        ('Title', 'title'),
        ('Suffix', 'suffix'),
        ('NickName', 'nickname'),
    )
    human_name = HumanName()
    for child in ell:
        attribute = next(
            (attr for tag, attr in tag_to_attribute if child.tag == tag), None)
        if attribute is not None:
            setattr(human_name, attribute, child.text)
    return human_name
def person_name_from_xml(self, ell):
    '''Create a person name from an XML element.

    The text of each recognised child (First, Middle, Last, Title,
    Suffix, NickName) is stored on the corresponding HumanName
    attribute; unrecognised children are skipped.
    '''
    recognised = (
        ('First', 'first'),
        ('Middle', 'middle'),
        ('Last', 'last'),
        ('Title', 'title'),
        ('Suffix', 'suffix'),
        ('NickName', 'nickname'),
    )
    result = HumanName()
    for child in ell:
        for tag, attribute in recognised:
            if child.tag == tag:
                setattr(result, attribute, child.text)
                break
    return result
def _massage_measure_donor_name(self, name_string):
    """
    Normalise a donor name string into "First [Middle.] Last [Suffix]".

    The string is parsed with HumanName, first and last names are
    title-cased, and a non-empty middle name is title-cased with a single
    trailing period. A handful of names that are known to parse badly are
    corrected by hand.

    :param name_string: raw donor name
    :return: the cleaned-up name as a single string
    """
    name = HumanName(name_string)
    name.first = name.first.title()
    name.last = name.last.title()
    if name.middle:
        # collapse any periods, then re-add exactly one at the end
        name.middle = name.middle.replace(".", "")
        name.middle = "%s." % (name.middle.title())

    # Hand-written corrections for names the parser gets wrong. The
    # comparison relies on HumanName's own equality against a string.
    if name == "JR. Munger CHARLES T.":
        name.first = "Charles"
        name.middle = "T."
        name.last = "Munger"
        name.suffix = "Jr."
    if name == "M. Quinn. Delaney":
        name.first = "M."
        name.middle = "Quinn"
        name.last = "Delaney"
        name.suffix = None
    if name == "Robert Alan. Eustace":
        name.first = "Robert"
        name.middle = "Alan"
        name.last = "Eustace"
        name.suffix = None
    if name == "Susie Tompkins. Buell":
        name.first = "Susie"
        name.middle = "Tompkins"
        name.last = "Buell"
        name.suffix = None

    # One chain of mutually exclusive cases. Previously the first test was
    # a separate `if`, so its result was always overwritten and the suffix
    # was lost whenever a middle name was present.
    if name.middle and name.suffix:
        output = "%s %s %s %s" % (name.first, name.middle, name.last, name.suffix)
    elif name.middle:
        output = "%s %s %s" % (name.first, name.middle, name.last)
    elif name.suffix:
        output = "%s %s %s" % (name.first, name.last, name.suffix)
    else:
        output = "%s %s" % (name.first, name.last)
    return output
def parse_raw_name(name_raw: str, count: int, extract_orgs=True) -> (str, str, str, Counter):
    """
    Parses a (usually messy) raw name and returns first, middle, last names and a Counter of
    extracted positions

    extract_orgs tries to extract organizations from name. defaults to True. only set to False
    to be able to check if a name is valid (it prevents an infinite loop because by default,
    extracting organizations is part of the initialization of a person)

    Every extracted position is upper-cased, stripped of periods and added to the returned
    Counter with weight `count`.

    :param name_raw: str
    :param count: int
    :param extract_orgs: bool
    :return: str, str, str, Counter (first name, middle name, last name, positions Counter)
    """
    name_raw = Person.remove_privlog_info(name_raw)
    # remove JR, SR, or III if it follows this pattern: 'Chumney-RD-Jr'
    name_raw = Person.remove_jr_sr_iii(name_raw)

    # position is often attached with a dash,
    # e.g. 'BAKER, T E - NATIONAL ASSOCIATION OF ATTORNEYS'
    # (only handled when there is exactly one ' - ' separator)
    dash_parts = name_raw.split(' - ')
    if len(dash_parts) == 2:
        name_raw, extracted_position = dash_parts
        extracted_positions = [extracted_position.strip()]
    else:
        extracted_positions = []

    # extract positions in parens e.g. Henson, A (Chadbourne & Park)
    paren_positions = re.findall(r'\([^(]+\)', name_raw)
    for position in paren_positions:
        extracted_positions.append(position.strip(',#() '))
        name_raw = name_raw.replace(position, '')

    # Search for known raw_org strings in name_raw, extract them as positions if necessary
    if extract_orgs:
        name_raw, new_positions = Person.extract_raw_org_names_from_name(name_raw)
        extracted_positions += new_positions

    # delete any leftover hashtags
    name_raw = name_raw.strip(' #')

    # Delete dashes between last name and initials
    # DUNN-W -> Dunn W
    # (length guard: after the stripping above the name may be empty or a single
    # character, and indexing [-2] would raise IndexError)
    if len(name_raw) > 1 and name_raw[-2] == '-':
        name_raw = name_raw[:-2] + " " + name_raw[-1:]
    # DUNN-WL -> DUNN WL
    if len(name_raw) > 2 and name_raw[-3] == '-':
        name_raw = name_raw[:-3] + " " + name_raw[-2:]

    # Parse current string using HumanName
    name = HumanName(name_raw)

    # e.g. Dunn W -> parsed as last name W. -> switch first/last
    if len(name.last) <= 2 < len(name.first):
        name.first, name.last = name.last, name.first

    # remove periods from initials
    if len(name.first) == 2 and name.first[1] == '.':
        name.first = name.first[0]
    if len(name.middle) == 2 and name.middle[1] == '.':
        name.middle = name.middle[0]

    # If first name is length 2 (Teague, CE), the two letters are most likely initials.
    if len(name.middle) == 0 and len(name.first) == 2:
        name.middle = name.first[1].upper()
        name.first = name.first[0].upper()

    # If first and middle initials have periods but not spaces -> separate, e.g. "R.K. Teague"
    if re.match(r'[a-zA-Z]\.[a-zA-Z]\.', name.first):
        name.middle = name.first[2]
        name.first = name.first[0]

    name.last = name.last.capitalize()
    name.first = name.first.capitalize()
    name.middle = name.middle.capitalize()

    # if multiple names are passed, they often end up in the middle name
    # e.g. 'Holtzman, A., Murray, J. , Henson, A. -> only allow one comma or set to empty
    if name.middle.count(',') > 1:
        name.middle = ''

    # a very long suffix with many periods is discarded; any other suffix is kept as a position
    if len(name.suffix) > 20 and name.suffix.count('.') > 2:
        name.suffix = ''

    if name.suffix:
        extracted_positions.append(name.suffix)

    # map organization names to clean official names (if they are in the dict) using
    # RAW_ORG_TO_CLEAN_ORG_DICT; entries mapped to '@skip@' are dropped
    clean_orgs = []
    for raw_org in extracted_positions:
        if raw_org in RAW_ORG_TO_CLEAN_ORG_DICT:
            clean_org = RAW_ORG_TO_CLEAN_ORG_DICT[raw_org]
            if clean_org != '@skip@':
                clean_orgs.append(clean_org)
        else:
            clean_orgs.append(raw_org)
    extracted_positions = clean_orgs

    # convert mapped positions into a counter
    result_positions = Counter()
    for position in extracted_positions:
        cleaned = re.sub(r'\.', '', position)
        result_positions[cleaned.upper()] += count

    return name.first, name.middle, name.last, result_positions
def parse_raw_name(name_raw: str) -> (str, str, str, set):
    """
    Parses a (usually messy) raw name and returns first, middle, last names and a set of
    extracted positions

    :param name_raw: str
    :return: str, str, str, set (first name, middle name, last name, extracted positions)

    Parses name and returns as human name
    >>> n = Person('TEAGUE CE JR')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Teague', 'C', 'E', 'JR')

    >>> n = Person('teague, ce jr')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Teague', 'C', 'E', 'JR')

    >>> n = Person('Teague, Claude Edward, Jr., Ph.D. ')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Teague', 'Claude', 'Edward', 'JR., PH.D.')

    >>> n = Person('Teague, J - BAT')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Teague', 'J', '', 'BAT')

    >>> n = Person('BAKER, T E - NATIONAL ASSOCIATION OF ATTORNEYS GENERAL')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Baker', 'T', 'E', 'NATIONAL ASSOCIATION OF ATTORNEYS GENERAL')

    >>> n = Person('BAKER-cj')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Baker', 'C', 'J', '')

    JR and SR are by default recognized as titles -> turn off through CONSTANTS.
    >>> n = Person('Baker, JR')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Baker', 'J', 'R', '')

    >>> n = Person('DUNN WL #')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Dunn', 'W', 'L', '')

    >>> n = Person('Dunn, W. L.')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Dunn', 'W', 'L', '')

    >>> n = Person('TEMKO SL, COVINGTON AND BURLING')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Temko', 'S', 'L', 'COVINGTON AND BURLING')

    >>> n = Person('Temko, Stanley L [Privlog:] TEMKO,SL')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Temko', 'Stanley', 'L', '')

    >>> n = Person('Temko-SL, Covington & Burling')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Temko', 'S', 'L', 'COVINGTON & BURLING')

    >>> n = Person('HENSON, A. (AMERICAN SENIOR VICE PRESIDENT AND GENERAL COUNSEL)')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Henson', 'A', '', 'AMERICAN SENIOR VICE PRESIDENT AND GENERAL COUNSEL')

    >>> n = Person('HENSON, A. (CHADBOURNE, PARKE, WHITESIDE & WOLFF, AMERICAN OUTSIDE COUNSEL) (HANDWRITTEN NOTES)')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Henson', 'A', '', 'CHADBOURNE, PARKE, WHITESIDE & WOLFF, AMERICAN OUTSIDE COUNSEL HANDWRITTEN NOTES')

    >>> n = Person('Holtzman, A.,  Murray, J. ,  Henson, A. ,  Pepples, E. ,  Stevens, A. ,  Witt, S.')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Holtzman', 'A', '', '')

    >>> n = Person('Holtz, Jacob, Jacob & Medinger')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Holtz', 'Jacob', '', 'JACOB & MEDINGER')

    # This one breaks. But I don't think it can be avoided.
    >>> n = Person('Holtz, Jacob Alexander, Jacob & Medinger')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Holtz', '', '', 'JACOB ALEXANDER, JACOB & MEDINGER')

    >>> n = Person('PROCTOR DF, JOHNS HOPKINS SCHOOL OF HYGIENE')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Proctor', 'D', 'F', 'JOHNS HOPKINS SCHOOL OF HYGIENE')

    """

    # remove privlog info, e.g. 'Temko, Stanley L [Privlog:] TEMKO,SL'. It confuses
    # the name parser
    # NOTE(review): when the marker is at index 0 the string is left unchanged, exactly as
    # before (the old `name_raw[0:]` branch was a no-op) -- confirm whether the prefix
    # should be removed in that case.
    privlog_id = name_raw.find('[Privlog:]')
    if privlog_id > 0:
        name_raw = name_raw[:privlog_id]

    # position is often attached with a dash, e.g. 'BAKER, T E - NATIONAL ASSOCIATION OF ATTORNEYS'
    # (only handled when there is exactly one ' - ' separator)
    dash_parts = name_raw.split(' - ')
    if len(dash_parts) == 2:
        name_raw, extracted_position = dash_parts
        extracted_positions = {extracted_position.strip()}
    else:
        extracted_positions = set()

    # extract positions in parens e.g. Henson, A (Chadbourne & Park)
    paren_positions = re.findall(r'\([^(]+\)', name_raw)
    for position in paren_positions:
        extracted_positions.add(position.strip(',#() '))
        name_raw = name_raw.replace(position, '')

    institution_regexes = [

        # TI/CTR
        r'[,#] Tobacco Inst.+$',
        r'[\(\,\#] ?SAB Exec.*$',

        # American Tobacco
        r'[(,#] ?American .+$',
        r'[\(\,\#] ?Amer Brands.*$',
        r'[,#] American Tob',
        r'[,#] Atco.*$',

        # PM
        r'[\(\,\#] ?Philip Morris.*$',

        # RJR
        r'[\(\,\#] ?RJR.*$',

        # LAW FIRMS
        r'[\(\,\#] ?Arnold &.*$',
        r'[\(\,\#] ?Chadbourne.*$',
        r'[,#] COVINGTON [AB&]*.+$',
        r'[,#] Foster [&A]*.+$',
        r'[,#] JACOB [A&]*.+$',

        r'[\(\,\#] ?Philip Morris.*$',

        # Universities
        # match a ( or , or # at the beginning, then some characters that
        # aren't (,# until the end of the string
        r'[\(\,\#][^\(\,\#]+ Univ\b.*$',
        # Univ is fine if it appears at the end of a string (don't want to match in the
        # middle of a string, e.g "Universal"
        r'[\(\,\#][^\(\,\#]+ School\b.*$',

        # Organizations
        r'[\(\,\#][^\(\,\#]+ Federal Trade Commission.*$',

    ]
    # every matching institution is recorded as a position and cut off the name
    for institution in institution_regexes:
        extracted_institution = re.search(institution, name_raw, re.IGNORECASE)
        if extracted_institution:
            extracted_positions.add(extracted_institution.group().strip(',#() '))
            name_raw = name_raw[:name_raw.find(extracted_institution.group())]

    # remove #
    name_raw = name_raw.strip("#").strip()

    # Replace the dash between last name and initials by a space (BAKER-cj -> BAKER cj).
    # Length guard: after the stripping above the name may be empty or a single character,
    # and indexing [-2] would raise IndexError.
    if len(name_raw) > 1 and name_raw[-2] == '-':
        name_raw = name_raw[:-2] + " " + name_raw[-1:]
    if len(name_raw) > 2 and name_raw[-3] == '-':
        name_raw = name_raw[:-3] + " " + name_raw[-2:]

    name = HumanName(name_raw)

    # e.g. Dunn W -> parsed as last name W. -> switch first/last
    if len(name.last) <= 2 and len(name.first) > 2:
        name.first, name.last = name.last, name.first

    # remove periods from initials
    if len(name.first) == 2 and name.first[1] == '.':
        name.first = name.first[0]
    if len(name.middle) == 2 and name.middle[1] == '.':
        name.middle = name.middle[0]

    # If first name is length 2 (Teague, CE), the two letters are most likely initials.
    if len(name.first) == 2:
        name.middle = name.first[1].upper()
        name.first = name.first[0].upper()

    # If first and middle initials have periods but not spaces -> separate, e.g. "R.K. Teague"
    if re.match(r'[a-zA-Z]\.[a-zA-Z]\.', name.first):
        name.middle = name.first[2]
        name.first = name.first[0]

    name.last = name.last.capitalize()
    name.first = name.first.capitalize()
    name.middle = name.middle.capitalize()

    # if multiple names are passed, they often end up in the middle name
    # e.g. 'Holtzman, A.,  Murray, J. ,  Henson, A.  -> only allow one comma or set to empty
    if name.middle.count(',') > 1:
        name.middle = ''

    # a very long suffix with many periods is discarded; any other suffix is kept as a position
    if len(name.suffix) > 20 and name.suffix.count('.') > 2:
        name.suffix = ''

    if name.suffix:
        extracted_positions.add(name.suffix)

    return name.first, name.middle, name.last, extracted_positions
def normalize_author_name(author):
    """Normalize an author name.

    The name is parsed with a HumanName whose recognised titles are
    replaced by a fixed list and whose non-acronym suffixes are extended
    with lowercase roman numerals. The result is assembled as
    ``last, first middle, suffix``, leaving out empty parts.

    :param author: author name
    :type author: string

    :return: the normalized author name
    """
    parser_constants = Constants()
    roman_numeral_suffixes = [
        u'v', u'vi', u'vii', u'viii', u'ix', u'x', u'xii', u'xiii', u'xiv', u'xv'
    ]
    accepted_titles = [
        u'Dr', u'Prof', u'Professor', u'Sir', u'Editor', u'Ed', u'Mr', u'Mrs',
        u'Ms', u'Chair', u'Co-Chair', u'Chairs', u'co-Chairs'
    ]
    # Swap the stock titles for our own list, then register the numerals.
    parser_constants.titles.remove(*parser_constants.titles).add(*accepted_titles)
    parser_constants.suffix_not_acronyms.add(*roman_numeral_suffixes)

    roman_characters = frozenset(
        (u'M', u'D', u'C', u'L', u'X', u'V', u'I', u'(', u')'))

    def _looks_like_initial(part):
        # a lone character, or anything that already carries a period
        return len(part) == 1 or u'.' in part

    def _dot_single_letter(part):
        # only an undotted single character gains a trailing period
        if len(part) == 1 and u'.' not in part:
            return part + u'.'
        return part

    def _dot_suffix(suffix_text):
        if u'.' in suffix_text:
            return suffix_text
        return suffix_text + u'.'

    def _only_roman_characters(suffix_text):
        """True when every character of the suffix is a roman-numeral character."""
        return roman_characters.issuperset(suffix_text.upper())

    parsed = HumanName(author, constants=parser_constants)
    parsed.first = _dot_single_letter(parsed.first)
    parsed.middle = _dot_single_letter(parsed.middle)

    # Two initials are written without a separating space.
    both_initials = _looks_like_initial(parsed.first) and _looks_like_initial(parsed.middle)
    template = u'{first_name}{middle_name}' if both_initials else u'{first_name} {middle_name}'
    given_names = template.format(
        first_name=parsed.first,
        middle_name=parsed.middle,
    )

    if _only_roman_characters(parsed.suffix):
        suffix = parsed.suffix.upper()
    else:
        suffix = _dot_suffix(parsed.suffix)

    pieces = (parsed.last, given_names.strip(), suffix)
    return u', '.join(piece for piece in pieces if piece)
def to_HumanName( self ):

    '''
    Builds a nameparser HumanName() instance from the name property
    values stored in this instance and returns it.

    Order of preference: the individual name parts (when
    got_name_parts() reports True), then the full name string, then
    the lookup name. Returns None if none of those is available.

    preconditions: None.
    postconditions: None.
    '''

    # retrieve values from this instance
    name_prefix = self.get( self.PROP_NAME_NAME_PREFIX, None )
    first_name = self.get( self.PROP_NAME_FIRST_NAME, None )
    middle_name = self.get( self.PROP_NAME_MIDDLE_NAME, None )
    last_name = self.get( self.PROP_NAME_LAST_NAME, None )
    name_suffix = self.get( self.PROP_NAME_NAME_SUFFIX, None )
    nickname = self.get( self.PROP_NAME_NICKNAME, None )
    full_name_string = self.get( self.PROP_NAME_FULL_NAME_STRING, None )
    lookup_name = self.get_lookup_name()
    has_name_parts = self.got_name_parts()

    # got name parts?
    if has_name_parts == True:

        # build human name from name parts, copying only non-empty values.
        human_name = HumanName()
        part_values = (
            ( "title", name_prefix ),
            ( "first", first_name ),
            ( "middle", middle_name ),
            ( "last", last_name ),
            ( "suffix", name_suffix ),
            ( "nickname", nickname ),
        )
        for attribute_name, part_value in part_values:

            if part_value:
                setattr( human_name, attribute_name, part_value )
            #-- END check to see if part has a value. --#

        #-- END loop over name parts. --#

        return human_name

    #-- END check to see if we have name parts. --#

    # got full name string?
    if full_name_string is not None and full_name_string != "":

        # yes. Pass it to HumanName
        return HumanName( full_name_string )

    #-- END check to see if full name string. --#

    # how about lookup name?
    if lookup_name is not None and lookup_name != "":

        # yes. Pass it to HumanName
        return HumanName( lookup_name )

    #-- END check to see if lookup name. --#

    # no names present at all. Return None.
    return None

#-- END method to_HumanName() --#

#-- END class PersonDetails --#
def parse_persname(persname, auth="", source=""):
    """Parse a personal-name string into a dictionary of name components.

    Birth/death dates are split off the name first; the remainder is parsed
    with HumanName and then adjusted by a series of hand-written special
    cases.

    :param persname: the raw personal name, possibly including dates
    :param auth: value stored under "authority_id"
    :param source: value stored under "source"
    :return: dict with the keys title, primary_name, rest_of_name, suffix,
             fuller_form, numbers, birth_date, death_date, date_string,
             authority_id, source, name_order and sort_name_auto_generate;
             keys whose value is empty are omitted
    """
    name, birth_date, death_date = extract_birth_death_dates(persname)
    birth_date, death_date = validate_dates(birth_date, death_date)
    dates_string = make_date_string(birth_date, death_date)
    name = HumanName(name)

    titles = ["sir", "mr", "mrs", "baron", "dame", "madame", "viscount", "conte"]
    numbers = ["II", "III"]
    title = name.title
    suffix = name.suffix
    number = u""

    # check if the suffix should actually be a title
    if not title and any(suffix.lower().strip(". ") == known_title for known_title in titles):
        title = suffix.capitalize()
        if "mr" in title.lower() and not title.endswith("."):
            title += "."
        suffix = u""

    # extract numbers from the suffix
    if suffix in numbers:
        number = suffix
        suffix = u""

    # special cases cleanup
    if name.title == u"Royal":
        name.title = ""
        title = ""
        name.middle = name.first if not name.middle else "{} {}".format(u"Royal", name.middle)
        name.first = u"Royal"

    if name.title == u"Queen of Great":
        title = name.title + u" Britain"
        name.first = u""

    if name.title == u"Lama":
        title = u"Dalai Lama XIV"
        name.first = u""
        name.middle = u""

    if name.title == u"Marquis":
        title = u""
        name.first = u"Marquis"
        name.middle = u"W."

    if suffix == u"1941":
        birth_date = suffix
        suffix = u""

    if suffix in [u"18", u"b."]:
        suffix = u""

    if suffix == u"Jr":
        suffix += u"."

    if ", fl. 17th cent" in suffix:
        suffix = u"sieur de"
        dates_string = u"fl. 17th cent"

    rest_of_name = u"{0} {1}".format(name.first, name.middle).rstrip()
    if rest_of_name == u"Christella D. Personal journey through South Africa. 1991":
        rest_of_name = u"Christella D."

    # People with single-part names (like Keewaydinoquay) are mis-assigned. Have to fix those
    primary_name = name.last
    if rest_of_name and not primary_name:
        primary_name = rest_of_name
        rest_of_name = ""

    # create the parsed name dictionary
    name_parsed = {u"title": unicode(title),
                   u"primary_name": unicode(primary_name),
                   u"rest_of_name": rest_of_name,
                   u"suffix": unicode(suffix),
                   u"fuller_form": unicode(name.nickname),
                   u"numbers": unicode(number),
                   u"birth_date": unicode(birth_date),
                   u"death_date": unicode(death_date),
                   u"date_string": unicode(dates_string),
                   u"authority_id": unicode(auth),
                   u"source": unicode(source),
                   u"name_order": u"inverted",
                   u"sort_name_auto_generate": True}

    # remove empty fields; build a new dict instead of deleting keys while
    # iterating, which fails when items() is a live view
    return {key: value for key, value in name_parsed.items() if value}
def parse_persname(persname, auth="", source=""):
    """Parse a personal-name string into a dictionary of name components.

    Birth/death dates are split off the name first; the remainder is parsed
    with HumanName and then adjusted by a series of hand-written special
    cases.

    :param persname: the raw personal name, possibly including dates
    :param auth: value stored under "authority_id"
    :param source: value stored under "source"
    :return: dict with the keys title, primary_name, rest_of_name, suffix,
             fuller_form, numbers, birth_date, death_date, date_string,
             authority_id, source, name_order and sort_name_auto_generate;
             keys whose value is empty are omitted
    """
    name, birth_date, death_date = extract_birth_death_dates(persname)
    birth_date, death_date = validate_dates(birth_date, death_date)
    dates_string = make_date_string(birth_date, death_date)
    name = HumanName(name)

    titles = [
        "sir", "mr", "mrs", "baron", "dame", "madame", "viscount", "conte"
    ]
    numbers = ["II", "III"]
    title = name.title
    suffix = name.suffix
    number = u""

    # check if the suffix should actually be a title
    suffix_key = suffix.lower().strip(". ")
    if not title and any(suffix_key == known_title for known_title in titles):
        title = suffix.capitalize()
        if "mr" in title.lower() and not title.endswith("."):
            title += "."
        suffix = u""

    # extract numbers from the suffix
    if suffix in numbers:
        number = suffix
        suffix = u""

    # special cases cleanup
    if name.title == u"Royal":
        name.title = ""
        title = ""
        name.middle = name.first if not name.middle else "{} {}".format(
            u"Royal", name.middle)
        name.first = u"Royal"

    if name.title == u"Queen of Great":
        title = name.title + u" Britain"
        name.first = u""

    if name.title == u"Lama":
        title = u"Dalai Lama XIV"
        name.first = u""
        name.middle = u""

    if name.title == u"Marquis":
        title = u""
        name.first = u"Marquis"
        name.middle = u"W."

    if suffix == u"1941":
        birth_date = suffix
        suffix = u""

    if suffix in [u"18", u"b."]:
        suffix = u""

    if suffix == u"Jr":
        suffix += u"."

    if ", fl. 17th cent" in suffix:
        suffix = u"sieur de"
        dates_string = u"fl. 17th cent"

    rest_of_name = u"{0} {1}".format(name.first, name.middle).rstrip()
    if rest_of_name == u"Christella D. Personal journey through South Africa. 1991":
        rest_of_name = u"Christella D."

    # People with single-part names (like Keewaydinoquay) are mis-assigned. Have to fix those
    primary_name = name.last
    if rest_of_name and not primary_name:
        primary_name = rest_of_name
        rest_of_name = ""

    # create the parsed name dictionary
    name_parsed = {
        u"title": unicode(title),
        u"primary_name": unicode(primary_name),
        u"rest_of_name": rest_of_name,
        u"suffix": unicode(suffix),
        u"fuller_form": unicode(name.nickname),
        u"numbers": unicode(number),
        u"birth_date": unicode(birth_date),
        u"death_date": unicode(death_date),
        u"date_string": unicode(dates_string),
        u"authority_id": unicode(auth),
        u"source": unicode(source),
        u"name_order": u"inverted",
        u"sort_name_auto_generate": True
    }

    # remove empty fields; build a new dict instead of deleting keys while
    # iterating, which fails when items() is a live view
    return {key: value for key, value in name_parsed.items() if value}
def namer(field):
    """Reduce a raw name field to a ``(first, last)`` tuple.

    ``field`` is either a tuple of strings (joined with ", ") or a single
    string. It is ASCII-encoded with unencodable characters dropped,
    stripped of tabs and line breaks, and upper-cased before parsing with
    HumanName. Names containing 'ANONYMOUS' are parsed as-is; all others go
    through the regex clean-up below.

    NOTE(review): the result of ``.encode()`` is fed to str regexes, which
    suggests this was written for Python 2 byte strings -- confirm.
    The trailing tags such as ``#6A`` look like references to external test
    cases -- TODO confirm.
    """
    #pre
    if type(field) == tuple:
        w_name = re.sub(
            '[\t\r\n]', '',
            ", ".join([x.encode('ascii', 'ignore') for x in field])).upper()
    else:
        w_name = re.sub('[\t\r\n]', '',
                        field.encode('ascii', 'ignore')).upper()
    if 'ANONYMOUS' not in w_name:
        # keep one ';'-separated segment: the second when ' FORMER ' occurs,
        # otherwise the first
        if ' FORMER ' not in w_name:
            w_name = re.split(";", w_name)[0]
        else:
            w_name = re.split(";", w_name)[1]
        # drop a space that directly follows or precedes one of ` ' / +
        w_name = re.sub("(?<=[`'/+]) | (?=['`/+])", '', w_name)  #6A, 4A-C
        out = HumanName(w_name)
        # strip a leading single-letter initial ("X " or "X. ") from the middle name
        out.middle = re.sub("^[A-Z] |^[A-Z]\. ", '', out.middle)
        if " " in out.last:
            out.last = re.sub("^[A-Z] |^[A-Z]\. ", '', out.last)
        # first name is nothing but an initial and a middle name exists:
        # promote the middle name to first name
        if re.sub("^[A-Z]\.|^[A-Z]", '',
                  out.first) == '' and len(out.middle) != 0:
            out.first, out.middle = out.middle, ""
        else:
            out.first = re.sub("^[A-Z] |^[A-Z]\. ", '', out.first)
        #post
        if out.middle.startswith("FOR ") or out.middle.startswith(
                "- "):  #7A, 1B, 3E
            out.middle = ""
        # cut the last name at " FOR "
        if " FOR " in out.last:
            out.last = re.sub(" FOR .*", '', out.last)
        # no last name but a title was parsed: re-derive the name from the first-name text
        if len(out.last) == 0 and len(out.title) != 0:  #9A
            if " " in out.first:
                out = HumanName(out.first)
            else:
                out.first, out.last = "", out.first
        # middle name joins two names: what precedes the conjunction becomes the last name
        if " AND " in out.middle or " & " in out.middle:
            out.last = re.split("( AND )|( & )", out.middle)[0]
            out.middle = ""
        if "AND" in out.last or "&" in out.last:
            if out.last.startswith("AND ") or out.last.startswith(
                    "& "):  #3F
                out.last = HumanName(out.last).last
            elif " AND " in out.last or " & " in out.last:
                out.last = re.sub("( AND ).*|( & ).*", '', out.last)
        # keep only the part before the first " AND ", "&", "/" or "+" (first)
        # or before the first "/" (last)
        out.first = re.split("( AND )|&|/|\+", out.first)[0]
        out.last = re.split("/", out.last)[0].strip()
        # first name holds a single capital letter and the last name has a
        # space: take the first two words of the last name as first and last
        if len(re.sub("[^A-Z]", '', out.first)) == 1 and " " in out.last:
            out.first = out.last.split(" ")[0]
            out.last = out.last.split(" ")[1]
        out.capitalize()
        first, last = out.first, out.last
        if len(out.middle) > 0:
            # a middle name that is only an initial is dropped; otherwise it
            # is appended to the first name (without a space around a hyphen)
            if re.sub("^[A-Z]\.|^[A-Z]", '', out.middle) == '':
                out.middle = ""
            elif first.endswith("-") or out.middle.startswith("-"):
                first += out.middle
            else:
                first += " %s" % out.middle  #8A-B
        # a suffix is appended to the last name
        if len(out.suffix) > 0:
            last += " %s" % out.suffix  #2A
        return (first, last)
    else:
        name = HumanName(w_name)
        return (name.first, name.last)
def namer(field):
    """Reduce a raw name field to a ``(first, last)`` tuple.

    ``field`` is either a tuple of strings (joined with ", ") or a single
    string. It is ASCII-encoded with unencodable characters dropped,
    stripped of tabs and line breaks, and upper-cased before parsing with
    HumanName. Names containing 'ANONYMOUS' are parsed as-is; all others go
    through the regex clean-up below.

    NOTE(review): the result of ``.encode()`` is fed to str regexes, which
    suggests this was written for Python 2 byte strings -- confirm.
    The trailing tags such as ``#6A`` look like references to external test
    cases -- TODO confirm.
    """
    #pre
    if type(field) == tuple:
        w_name = re.sub('[\t\r\n]', '', ", ".join([x.encode('ascii', 'ignore') for x in field])).upper()
    else:
        w_name = re.sub('[\t\r\n]', '', field.encode('ascii', 'ignore')).upper()
    if 'ANONYMOUS' not in w_name:
        # keep one ';'-separated segment: the second when ' FORMER ' occurs,
        # otherwise the first
        if ' FORMER ' not in w_name:
            w_name = re.split(";", w_name)[0]
        else:
            w_name = re.split(";", w_name)[1]
        # drop a space that directly follows or precedes one of ` ' / +
        w_name = re.sub("(?<=[`'/+]) | (?=['`/+])", '', w_name)  #6A, 4A-C
        out = HumanName(w_name)
        # strip a leading single-letter initial ("X " or "X. ") from the middle name
        out.middle = re.sub("^[A-Z] |^[A-Z]\. ", '', out.middle)
        if " " in out.last:
            out.last = re.sub("^[A-Z] |^[A-Z]\. ", '', out.last)
        # first name is nothing but an initial and a middle name exists:
        # promote the middle name to first name
        if re.sub("^[A-Z]\.|^[A-Z]", '', out.first) == '' and len(out.middle) != 0:
            out.first, out.middle = out.middle, ""
        else:
            out.first = re.sub("^[A-Z] |^[A-Z]\. ", '', out.first)
        #post
        if out.middle.startswith("FOR ") or out.middle.startswith("- "):  #7A, 1B, 3E
            out.middle = ""
        # cut the last name at " FOR "
        if " FOR " in out.last:
            out.last = re.sub(" FOR .*", '', out.last)
        # no last name but a title was parsed: re-derive the name from the first-name text
        if len(out.last) == 0 and len(out.title) != 0:  #9A
            if " " in out.first:
                out = HumanName(out.first)
            else:
                out.first, out.last = "", out.first
        # middle name joins two names: what precedes the conjunction becomes the last name
        if " AND " in out.middle or " & " in out.middle:
            out.last = re.split("( AND )|( & )", out.middle)[0]
            out.middle = ""
        if "AND" in out.last or "&" in out.last:
            if out.last.startswith("AND ") or out.last.startswith("& "):  #3F
                out.last = HumanName(out.last).last
            elif " AND " in out.last or " & " in out.last:
                out.last = re.sub("( AND ).*|( & ).*", '', out.last)
        # keep only the part before the first " AND ", "&", "/" or "+" (first)
        # or before the first "/" (last)
        out.first = re.split("( AND )|&|/|\+", out.first)[0]
        out.last = re.split("/", out.last)[0].strip()
        # first name holds a single capital letter and the last name has a
        # space: take the first two words of the last name as first and last
        if len(re.sub("[^A-Z]", '', out.first)) == 1 and " " in out.last:
            out.first = out.last.split(" ")[0]
            out.last = out.last.split(" ")[1]
        out.capitalize()
        first, last = out.first, out.last
        if len(out.middle) > 0:
            # a middle name that is only an initial is dropped; otherwise it
            # is appended to the first name (without a space around a hyphen)
            if re.sub("^[A-Z]\.|^[A-Z]", '', out.middle) == '':
                out.middle = ""
            elif first.endswith("-") or out.middle.startswith("-"):
                first += out.middle
            else:
                first += " %s" % out.middle  #8A-B
        # a suffix is appended to the last name
        if len(out.suffix) > 0:
            last += " %s" % out.suffix  #2A
        return (first, last)
    else:
        name = HumanName(w_name)
        return (name.first, name.last)
def match(name1, name2):
    """Return True if any potential form of name1 equals any potential form of name2.

    Both arguments are parsed with HumanName; the candidate forms come from
    get_potential_names() and are compared after conversion with u().
    """
    n1, n2 = HumanName(name1), HumanName(name2)
    return any(
        u(x) == u(y)
        for x in get_potential_names(n1)
        for y in get_potential_names(n2))


# Load the AP candidate rows. newline='' is what the csv module asks for so
# that line breaks inside quoted fields are handled by the reader itself.
with open('ap_candidates.csv', newline='') as f:
    reader = csv.DictReader(f)
    ap_candidates = list(reader)

# Give every candidate row a single 'name' string assembled from its parts.
for row in ap_candidates:
    n = HumanName()
    n.first = row['first_name']
    n.middle = row['middle_name']
    n.last = row['last_name']
    n.suffix = row['suffix']
    row['name'] = str(n)

with open('ap_historical_ids.csv', newline='') as f:
    reader = csv.DictReader(f)
    ap_candidates2 = list(reader)


def find(name):
    """Return the integer pol_id of the first AP candidate matching name, or None."""
    for row in ap_candidates:
        if match(name, row['name']):
            # print(f'found match for {name} with', row['name'])
            return int(row['pol_id'])
    return None