def test_assignment_to_attribute(self):
    """Name parts accept plain strings and reject nested lists and dicts."""
    parsed = HumanName("John A. Kenneth Doe, Jr.")

    # Every name part must hand back exactly the string assigned to it.
    assignments = (
        ("last", "de la Vega"),
        ("title", "test"),
        ("first", "test"),
        ("middle", "test"),
        ("suffix", "test"),
    )
    for attribute, value in assignments:
        setattr(parsed, attribute, value)
        assert getattr(parsed, attribute) == value

    # A list of lists and a dict are both expected to raise TypeError.
    for invalid in ([["test"]], {"test": "test"}):
        with pytest.raises(TypeError):
            parsed.suffix = invalid
def test_assignment_to_attribute(self):
    """Each name part returns the string that was just assigned to it."""
    parsed = HumanName("John A. Kenneth Doe, Jr.")

    # (attribute, value) pairs, checked in the same order they are assigned.
    assignments = (
        ("last", "de la Vega"),
        ("title", "test"),
        ("first", "test"),
        ("middle", "test"),
        ("suffix", "test"),
    )
    for attribute, value in assignments:
        setattr(parsed, attribute, value)
        # self.m(actual, expected, name) is the test case's comparison helper.
        self.m(getattr(parsed, attribute), value, parsed)
def test_formating_removing_pieces_from_name_buckets(self):
    """Emptied name parts drop out of the formatted name without stray spaces."""
    parsed = HumanName("Rev John A. Kenneth Doe III (Kenny)")

    parsed.string_format = "{title} {first} {middle} {last} {suffix} '{nickname}'"
    assert u(parsed) == "Rev John A. Kenneth Doe III 'Kenny'"

    parsed.string_format = "{title} {first} {middle} {last} {suffix}"
    assert u(parsed) == "Rev John A. Kenneth Doe III"

    # Blank one part at a time and check the rendering after each removal.
    removals = (
        ("middle", "Rev John Doe III"),
        ("suffix", "Rev John Doe"),
        ("title", "John Doe"),
    )
    for attribute, expected in removals:
        setattr(parsed, attribute, "")
        assert u(parsed) == expected
def test_assign_list_to_attribute(self):
    """A list assigned to a name part reads back as one space-joined string."""
    parsed = HumanName("John A. Kenneth Doe, Jr.")

    # (attribute, list assigned, string expected when read back)
    cases = (
        ("title", ["test1", "test2"], "test1 test2"),
        ("first", ["test3", "test4"], "test3 test4"),
        ("middle", ["test5", "test6", "test7"], "test5 test6 test7"),
        ("last", ["test8", "test9", "test10"], "test8 test9 test10"),
        ("suffix", ["test"], "test"),
    )
    for attribute, pieces, expected in cases:
        setattr(parsed, attribute, pieces)
        assert getattr(parsed, attribute) == expected
def from_parts(cls, first=None, last=None, middle=None, suffix=None, title=None):
    """Build a ParsedName from individually supplied name components.

    An empty HumanName is created, each component is assigned to the
    attribute of the same name, and the result is wrapped in ParsedName.
    Components left at their default are assigned as None.
    """
    assembled = HumanName()
    # Assignment order: first, middle, last, suffix, title.
    components = (
        ("first", first),
        ("middle", middle),
        ("last", last),
        ("suffix", suffix),
        ("title", title),
    )
    for attribute, value in components:
        setattr(assembled, attribute, value)
    return ParsedName(assembled)
def _massage_measure_donor_name(self, name_string):
    """
    Normalize a raw donor name into a "First [Middle] Last [Suffix]" string.

    The name is parsed with HumanName, the first and last names are
    title-cased, and a middle name (if any) has its periods removed, is
    title-cased and gets a single trailing period. A handful of names that
    are known to parse incorrectly are then overwritten with hand-written
    values before the output string is assembled.

    :param name_string: the raw donor name.
    :return: the formatted name; the middle name and suffix are included
        only when they are non-empty.
    """
    name = HumanName(name_string)
    name.first = name.first.title()
    name.last = name.last.title()
    if name.middle:
        name.middle = name.middle.replace(".", "")
        name.middle = "%s." % (name.middle.title())

    # Hand-written corrections, keyed by the (mis)parsed name as it looks
    # after the normalization above: (first, middle, last, suffix).
    corrections = (
        ("JR. Munger CHARLES T.", ("Charles", "T.", "Munger", "Jr.")),
        ("M. Quinn. Delaney", ("M.", "Quinn", "Delaney", None)),
        ("Robert Alan. Eustace", ("Robert", "Alan", "Eustace", None)),
        ("Susie Tompkins. Buell", ("Susie", "Tompkins", "Buell", None)),
    )
    for misparsed, (first, middle, last, suffix) in corrections:
        if name == misparsed:
            name.first = first
            name.middle = middle
            name.last = last
            name.suffix = suffix

    # These branches must be one if/elif chain: previously the first branch
    # was a stand-alone `if`, so its result was always overwritten by the
    # middle-only branch and the suffix was lost for names that have both.
    if name.middle and name.suffix:
        output = "%s %s %s %s" % (name.first, name.middle, name.last, name.suffix)
    elif name.middle:
        output = "%s %s %s" % (name.first, name.middle, name.last)
    elif name.suffix:
        output = "%s %s %s" % (name.first, name.last, name.suffix)
    else:
        output = "%s %s" % (name.first, name.last)
    return output
def HumanNameFmXML(self, ell):
    """Build a HumanName from the child elements of ``ell``.

    Each child whose tag is one of First, Middle, Last, Title, Suffix or
    NickName has its text copied to the matching HumanName attribute.
    Children with any other tag are ignored.
    """
    attribute_for_tag = {
        'First': 'first',
        'Middle': 'middle',
        'Last': 'last',
        'Title': 'title',
        'Suffix': 'suffix',
        'NickName': 'nickname',
    }
    result = HumanName()
    for child in ell:
        attribute = attribute_for_tag.get(child.tag)
        if attribute is not None:
            setattr(result, attribute, child.text)
    return result
def person_name_from_xml(self, ell):
    '''Create a person name from an XML element.

    The text of each recognised child element (First, Middle, Last, Title,
    Suffix, NickName) is stored on the corresponding HumanName attribute;
    unrecognised children are skipped.
    '''
    known_tags = (
        ('First', 'first'),
        ('Middle', 'middle'),
        ('Last', 'last'),
        ('Title', 'title'),
        ('Suffix', 'suffix'),
        ('NickName', 'nickname'),
    )
    person = HumanName()
    for node in ell:
        for tag, attribute in known_tags:
            if node.tag == tag:
                setattr(person, attribute, node.text)
                # Tags are distinct, so at most one entry can match.
                break
    return person
def parse_raw_name(name_raw: str, count: int, extract_orgs=True) -> (str, str, str, Counter):
    """
    Parses a (usually messy) raw name and returns first, middle, last names and a
    Counter of extracted positions.

    extract_orgs tries to extract organizations from name. Defaults to True. Only set it
    to False to be able to check if a name is valid (it prevents an infinite loop because,
    by default, extracting organizations is part of the initialization of a person).

    :param name_raw: str, the raw name to parse
    :param count: int, the amount added to the Counter for every extracted position
    :param extract_orgs: bool, whether known organization names are extracted from the name
    :return: str, str, str, Counter (first name, middle name, last name, positions Counter)
    """
    name_raw = Person.remove_privlog_info(name_raw)
    # remove JR, SR, or III if it follows this pattern: 'Chumney-RD-Jr'
    name_raw = Person.remove_jr_sr_iii(name_raw)

    # position is often attached with a dash,
    # e.g. 'BAKER, T E - NATIONAL ASSOCIATION OF ATTORNEYS'
    # Only split when there is exactly one ' - ' separator.
    dash_parts = name_raw.split(' - ')
    if len(dash_parts) == 2:
        name_raw, extracted_position = dash_parts
        extracted_positions = [extracted_position.strip()]
    else:
        extracted_positions = []

    # extract positions in parens e.g. Henson, A (Chadbourne & Park)
    paren_positions = re.findall(r'\([^(]+\)', name_raw)
    for position in paren_positions:
        extracted_positions.append(position.strip(',#() '))
        name_raw = name_raw.replace(position, '')

    # Search for known raw_org strings in name_raw, extract them as positions if necessary
    if extract_orgs:
        name_raw, new_positions = Person.extract_raw_org_names_from_name(name_raw)
        extracted_positions += new_positions

    # delete any leftover hashtags
    name_raw = name_raw.strip(' #')

    # Delete dashes between last name and initials.
    # The length guard matters: after the stripping above the name can be
    # shorter than two characters, and indexing [-2] would raise IndexError.
    # DUNN-W -> Dunn W
    if len(name_raw) >= 2 and name_raw[-2] == '-':
        name_raw = name_raw[:-2] + " " + name_raw[-1:]
    # DUNN-WL -> DUNN WL
    if len(name_raw) > 2 and name_raw[-3] == '-':
        name_raw = name_raw[:-3] + " " + name_raw[-2:]

    # Parse current string using HumanName
    name = HumanName(name_raw)

    # e.g. Dunn W -> parsed as last name W. -> switch first/last
    if len(name.last) <= 2 < len(name.first):
        name.first, name.last = name.last, name.first

    # remove periods from initials
    if len(name.first) == 2 and name.first[1] == '.':
        name.first = name.first[0]
    if len(name.middle) == 2 and name.middle[1] == '.':
        name.middle = name.middle[0]

    # If first name is length 2 (Teague, CE), the two letters are most likely initials.
    if len(name.middle) == 0 and len(name.first) == 2:
        name.middle = name.first[1].upper()
        name.first = name.first[0].upper()

    # If first and middle initials have periods but not spaces -> separate, e.g. "R.K. Teague"
    if re.match(r'[a-zA-Z]\.[a-zA-Z]\.', name.first):
        name.middle = name.first[2]
        name.first = name.first[0]

    name.last = name.last.capitalize()
    name.first = name.first.capitalize()
    name.middle = name.middle.capitalize()

    # if multiple names are passed, they often end up in the middle name
    # e.g. 'Holtzman, A., Murray, J. , Henson, A. -> only allow one comma or set to empty
    if name.middle.count(',') > 1:
        name.middle = ''

    # A very long suffix with many periods is discarded rather than kept as a position.
    if len(name.suffix) > 20 and name.suffix.count('.') > 2:
        name.suffix = ''

    if name.suffix:
        extracted_positions.append(name.suffix)

    # map organization names to clean official names (if they are in the dict) using
    # RAW_ORG_TO_CLEAN_ORG_DICT; entries mapped to '@skip@' are dropped entirely.
    clean_orgs = []
    for raw_org in extracted_positions:
        if raw_org in RAW_ORG_TO_CLEAN_ORG_DICT:
            clean_org = RAW_ORG_TO_CLEAN_ORG_DICT[raw_org]
            if clean_org != '@skip@':
                clean_orgs.append(clean_org)
        else:
            clean_orgs.append(raw_org)
    extracted_positions = clean_orgs

    # convert mapped positions into a counter (periods removed, upper-cased)
    result_positions = Counter()
    for position in extracted_positions:
        cleaned = position.replace('.', '')
        result_positions[cleaned.upper()] += count

    return name.first, name.middle, name.last, result_positions
def human_to_csl(name):
    """Convert HumanName to CSL-formatted JSON.

    The input is never modified: when a HumanName is passed in, the
    adjustments below are applied to a shallow copy of it.

    Args:
        name : HumanName or str / unicode
    Returns:
        CSL-formatted JSON

    Examples:
    >>> csl = human_to_csl('Rafael Nadal')
    >>> csl == {'given' : 'Rafael', 'family' : 'Nadal'}
    True
    >>> csl = human_to_csl(HumanName('Rafael Nadal'))
    >>> csl == {'given' : 'Rafael', 'family' : 'Nadal'}
    True
    >>> csl = human_to_csl(HumanName('George HW de Bush'))
    >>> csl == {'given' : 'George H. W.', 'family' : 'de Bush'}
    True
    >>> csl = human_to_csl('Eisenhower, I')
    >>> csl == {'given' : 'I.', 'family' : 'Eisenhower'}
    True
    >>> csl = human_to_csl('Eisenhower, V')
    >>> csl == {'given' : 'V.', 'family' : 'Eisenhower'}
    True

    """
    import copy

    if isinstance(name, HumanName):
        # Work on a copy so the caller's object is left untouched; otherwise
        # every repeated call would append the middle name to `first` again.
        name = copy.copy(name)
    else:
        # Optionally convert to nameparser.HumanName
        name = HumanName(name)

    # Fix: nameparser treats HumanName('Eisenhower, I') as
    # {first : 'Eisenhower', suffix : 'I'}
    if re.search(r'^[IV]\.*$', name.suffix):
        name.last = name.first
        name.first = name.suffix
        name.suffix = ''

    # Append middle name to first
    if name.middle:
        name.first += ' ' + name.middle

    # Build the CSL data from the lookup table: each entry names the
    # HumanName field to read and, optionally, a function to apply to it
    # (defaulting to `I`).
    csl_data = {}
    for lookup, spec in human_to_csl_map.items():
        value = getattr(name, spec['field'])
        # Skip if empty
        if not value:
            continue
        fun = spec.get('fun', I)
        csl_data[lookup] = fun(value)

    return csl_data
def parse_raw_name(name_raw: str) -> (str, str, str, set):
    """
    Parses a (usually messy) raw name and returns
    first, middle, last names and a set of extracted positions

    :param name_raw: str
    :return: str, str, str, set (first name, middle name, last name, positions)

    Parses name and returns as human name

    >>> n = Person('TEAGUE CE JR')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Teague', 'C', 'E', 'JR')

    >>> n = Person('teague, ce jr')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Teague', 'C', 'E', 'JR')

    >>> n = Person('Teague, Claude Edward, Jr., Ph.D. ')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Teague', 'Claude', 'Edward', 'JR., PH.D.')

    >>> n = Person('Teague, J - BAT')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Teague', 'J', '', 'BAT')

    >>> n = Person('BAKER, T E - NATIONAL ASSOCIATION OF ATTORNEYS GENERAL')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Baker', 'T', 'E', 'NATIONAL ASSOCIATION OF ATTORNEYS GENERAL')

    >>> n = Person('BAKER-cj')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Baker', 'C', 'J', '')

    JR and SR are by default recognized as titles -> turn off through CONSTANTS.

    >>> n = Person('Baker, JR')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Baker', 'J', 'R', '')

    >>> n = Person('DUNN WL #')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Dunn', 'W', 'L', '')

    >>> n = Person('Dunn, W. L.')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Dunn', 'W', 'L', '')

    >>> n = Person('TEMKO SL, COVINGTON AND BURLING')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Temko', 'S', 'L', 'COVINGTON AND BURLING')

    >>> n = Person('Temko, Stanley L [Privlog:] TEMKO,SL')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Temko', 'Stanley', 'L', '')

    >>> n = Person('Temko-SL, Covington & Burling')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Temko', 'S', 'L', 'COVINGTON & BURLING')

    >>> n = Person('HENSON, A. (AMERICAN SENIOR VICE PRESIDENT AND GENERAL COUNSEL)')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Henson', 'A', '', 'AMERICAN SENIOR VICE PRESIDENT AND GENERAL COUNSEL')

    >>> n = Person('HENSON, A. (CHADBOURNE, PARKE, WHITESIDE & WOLFF, AMERICAN OUTSIDE COUNSEL) (HANDWRITTEN NOTES)')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Henson', 'A', '', 'CHADBOURNE, PARKE, WHITESIDE & WOLFF, AMERICAN OUTSIDE COUNSEL HANDWRITTEN NOTES')

    >>> n = Person('Holtzman, A., Murray, J. , Henson, A. , Pepples, E. , Stevens, A. , Witt, S.')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Holtzman', 'A', '', '')

    >>> n = Person('Holtz, Jacob, Jacob & Medinger')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Holtz', 'Jacob', '', 'JACOB & MEDINGER')

    # This one breaks. But I don't think it can be avoided.

    >>> n = Person('Holtz, Jacob Alexander, Jacob & Medinger')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Holtz', '', '', 'JACOB ALEXANDER, JACOB & MEDINGER')

    >>> n = Person('PROCTOR DF, JOHNS HOPKINS SCHOOL OF HYGIENE')
    >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
    ('Proctor', 'D', 'F', 'JOHNS HOPKINS SCHOOL OF HYGIENE')

    """
    # remove privlog info, e.g. 'Temko, Stanley L [Privlog:] TEMKO,SL'. It confuses
    # the name parser. Everything from the marker onwards is cut off.
    # NOTE(review): a marker at position 0 leaves the string untouched (the
    # original code sliced from index 0, which is a no-op) - confirm that is intended.
    privlog_id = name_raw.find('[Privlog:]')
    if privlog_id > 0:
        name_raw = name_raw[:privlog_id]

    # position is often attached with a dash, e.g. 'BAKER, T E - NATIONAL ASSOCIATION OF ATTORNEYS'
    # Only split when there is exactly one ' - ' separator.
    dash_parts = name_raw.split(' - ')
    if len(dash_parts) == 2:
        name_raw, extracted_position = dash_parts
        extracted_positions = {extracted_position.strip()}
    else:
        extracted_positions = set()

    # extract positions in parens e.g. Henson, A (Chadbourne & Park)
    paren_positions = re.findall(r'\([^(]+\)', name_raw)
    for position in paren_positions:
        extracted_positions.add(position.strip(',#() '))
        name_raw = name_raw.replace(position, '')

    institution_regexes = [
        # TI/CTR
        r'[,#] Tobacco Inst.+$',
        r'[\(\,\#] ?SAB Exec.*$',
        # American Tobacco
        r'[(,#] ?American .+$',
        r'[\(\,\#] ?Amer Brands.*$',
        r'[,#] American Tob',
        r'[,#] Atco.*$',
        # PM
        r'[\(\,\#] ?Philip Morris.*$',
        # RJR
        r'[\(\,\#] ?RJR.*$',
        # LAW FIRMS
        r'[\(\,\#] ?Arnold &.*$',
        r'[\(\,\#] ?Chadbourne.*$',
        r'[,#] COVINGTON [AB&]*.+$',
        r'[,#] Foster [&A]*.+$',
        r'[,#] JACOB [A&]*.+$',
        r'[\(\,\#] ?Philip Morris.*$',
        # Universities
        # match a ( or , or # at the beginning, then some characters that
        # aren't (,# until the end of the string
        r'[\(\,\#][^\(\,\#]+ Univ\b.*$',
        # Univ is fine if it appears at the end of a string (don't want to match in the
        # middle of a string, e.g "Universal"
        r'[\(\,\#][^\(\,\#]+ School\b.*$',
        # Organizations
        r'[\(\,\#][^\(\,\#]+ Federal Trade Commission.*$',
    ]
    # Each matching institution becomes a position and the name is cut off
    # where the matched text begins.
    for institution in institution_regexes:
        extracted_institution = re.search(institution, name_raw, re.IGNORECASE)
        if extracted_institution:
            extracted_positions.add(extracted_institution.group().strip(',#() '))
            name_raw = name_raw[:name_raw.find(extracted_institution.group())]

    # remove #
    name_raw = name_raw.strip("#").strip()

    # Replace the dash between last name and initials with a space
    # (DUNN-W -> DUNN W, DUNN-WL -> DUNN WL).
    # The length guard matters: the extraction above can leave fewer than two
    # characters (even an empty string), and indexing [-2] would raise IndexError.
    if len(name_raw) >= 2 and name_raw[-2] == '-':
        name_raw = name_raw[:-2] + " " + name_raw[-1:]
    if len(name_raw) > 2 and name_raw[-3] == '-':
        name_raw = name_raw[:-3] + " " + name_raw[-2:]

    name = HumanName(name_raw)

    # e.g. Dunn W -> parsed as last name W. -> switch first/last
    if len(name.last) <= 2 and len(name.first) > 2:
        name.first, name.last = name.last, name.first

    # remove periods from initials
    if len(name.first) == 2 and name.first[1] == '.':
        name.first = name.first[0]
    if len(name.middle) == 2 and name.middle[1] == '.':
        name.middle = name.middle[0]

    # If first name is length 2 (Teague, CE), the two letters are most likely initials.
    if len(name.first) == 2:
        name.middle = name.first[1].upper()
        name.first = name.first[0].upper()

    # If first and middle initials have periods but not spaces -> separate, e.g. "R.K. Teague"
    if re.match(r'[a-zA-Z]\.[a-zA-Z]\.', name.first):
        name.middle = name.first[2]
        name.first = name.first[0]

    name.last = name.last.capitalize()
    name.first = name.first.capitalize()
    name.middle = name.middle.capitalize()

    # if multiple names are passed, they often end up in the middle name
    # e.g. 'Holtzman, A., Murray, J. , Henson, A. -> only allow one comma or set to empty
    if name.middle.count(',') > 1:
        name.middle = ''

    # A very long suffix with many periods is discarded rather than kept as a position.
    if len(name.suffix) > 20 and name.suffix.count('.') > 2:
        name.suffix = ''

    if name.suffix:
        extracted_positions.add(name.suffix)

    return name.first, name.middle, name.last, extracted_positions
def to_HumanName( self ):

    '''
    Builds a nameparser HumanName() instance from the name values stored in
        this instance and returns it.

    Sources are tried in this order:
    - the individual name parts, when self.got_name_parts() reports True;
    - the full name string, when it is neither None nor empty;
    - the lookup name, when it is neither None nor empty.
    Returns None when none of these is available.

    preconditions: None.

    postconditions: None.
    '''

    # retrieve every value up front, in a fixed order.
    name_prefix = self.get( self.PROP_NAME_NAME_PREFIX, None )
    first_name = self.get( self.PROP_NAME_FIRST_NAME, None )
    middle_name = self.get( self.PROP_NAME_MIDDLE_NAME, None )
    last_name = self.get( self.PROP_NAME_LAST_NAME, None )
    name_suffix = self.get( self.PROP_NAME_NAME_SUFFIX, None )
    nickname = self.get( self.PROP_NAME_NICKNAME, None )
    full_name_string = self.get( self.PROP_NAME_FULL_NAME_STRING, None )
    lookup_name = self.get_lookup_name()
    has_name_parts = self.got_name_parts()

    # got name parts?  (explicit comparison with True is kept on purpose.)
    if ( has_name_parts == True ):

        # build human name from name parts, copying only non-empty values.
        human_name = HumanName()
        part_values = (
            ( "title", name_prefix ),
            ( "first", first_name ),
            ( "middle", middle_name ),
            ( "last", last_name ),
            ( "suffix", name_suffix ),
            ( "nickname", nickname ),
        )
        for attribute_name, attribute_value in part_values:

            if ( attribute_value ):

                setattr( human_name, attribute_name, attribute_value )

            #-- END check to see if value is populated. --#

        #-- END loop over name parts. --#

        return human_name

    #-- END check to see if we have name parts. --#

    # got full name string?
    if ( ( full_name_string is not None ) and ( full_name_string != "" ) ):

        # yes.  Pass it to HumanName
        return HumanName( full_name_string )

    #-- END check to see if full name string. --#

    # how about lookup name?
    if ( ( lookup_name is not None ) and ( lookup_name != "" ) ):

        # yes.  Pass it to HumanName
        return HumanName( lookup_name )

    #-- END check to see if lookup name. --#

    # no names present at all.  Return None.
    return None

#-- END method to_HumanName() --#

#-- END class PersonDetails --#
def match(name1, name2):
    """Return True when any potential form of name1 equals one of name2.

    Both arguments are parsed with HumanName; the candidate forms come from
    get_potential_names() and are compared after conversion with u().
    """
    n1, n2 = HumanName(name1), HumanName(name2)
    return any(
        u(x) == u(y)
        for x in get_potential_names(n1)
        for y in get_potential_names(n2)
    )


# Load the AP candidate rows. newline='' is what the csv module requires of
# files handed to a reader, so quoted fields containing newlines parse correctly.
with open('ap_candidates.csv', newline='') as f:
    reader = csv.DictReader(f)
    ap_candidates = list(reader)

# Give every candidate row a single formatted 'name' built from its parts.
for row in ap_candidates:
    n = HumanName()
    n.first = row['first_name']
    n.middle = row['middle_name']
    n.last = row['last_name']
    n.suffix = row['suffix']
    row['name'] = str(n)

with open('ap_historical_ids.csv', newline='') as f:
    reader = csv.DictReader(f)
    ap_candidates2 = list(reader)


def find(name):
    """Return the pol_id (as int) of the first candidate matching ``name``.

    Candidates are checked in file order using match(). Returns None when
    no candidate matches.
    """
    for row in ap_candidates:
        if match(name, row['name']):
            return int(row['pol_id'])
    return None