def test_assignment_to_attribute(self):
    """Check that string assignment to each name attribute reads back unchanged.

    Also expects a ``TypeError`` when ``suffix`` is assigned a nested list
    or a dict.
    """
    hn = HumanName("John A. Kenneth Doe, Jr.")

    # Each attribute is set and verified immediately, in this order.
    string_assignments = (
        ("last", "de la Vega"),
        ("title", "test"),
        ("first", "test"),
        ("middle", "test"),
        ("suffix", "test"),
    )
    for attr_name, new_value in string_assignments:
        setattr(hn, attr_name, new_value)
        assert getattr(hn, attr_name) == new_value

    # Values that are neither a string nor a flat list must be rejected.
    for rejected_value in ([["test"]], {"test": "test"}):
        with pytest.raises(TypeError):
            hn.suffix = rejected_value
Example #2
0
 def test_assignment_to_attribute(self):
     """Check that string assignment to each name attribute reads back unchanged.

     Every comparison goes through ``self.m(actual, expected, hn)``.
     """
     hn = HumanName("John A. Kenneth Doe, Jr.")

     # Each attribute is set and verified immediately, in this order.
     string_assignments = (
         ("last", "de la Vega"),
         ("title", "test"),
         ("first", "test"),
         ("middle", "test"),
         ("suffix", "test"),
     )
     for attr_name, new_value in string_assignments:
         setattr(hn, attr_name, new_value)
         self.m(getattr(hn, attr_name), new_value, hn)
 def test_formating_removing_pieces_from_name_buckets(self):
     """Check the rendered name as the format changes and buckets are emptied.

     Each step assigns one attribute on the same ``HumanName`` and then
     compares ``u(hn)`` with the expected rendering; steps are cumulative.
     """
     hn = HumanName("Rev John A. Kenneth Doe III (Kenny)")

     # (attribute to assign, value, expected rendering afterwards)
     steps = (
         ("string_format",
          "{title} {first} {middle} {last} {suffix} '{nickname}'",
          "Rev John A. Kenneth Doe III 'Kenny'"),
         ("string_format",
          "{title} {first} {middle} {last} {suffix}",
          "Rev John A. Kenneth Doe III"),
         ("middle", "", "Rev John Doe III"),
         ("suffix", "", "Rev John Doe"),
         ("title", "", "John Doe"),
     )
     for attr_name, new_value, expected_text in steps:
         setattr(hn, attr_name, new_value)
         assert u(hn) == expected_text
 def test_assign_list_to_attribute(self):
     """Check that a list assigned to a name attribute reads back space-joined."""
     hn = HumanName("John A. Kenneth Doe, Jr.")

     # (attribute, list assigned, string expected when read back)
     list_assignments = (
         ("title", ["test1", "test2"], "test1 test2"),
         ("first", ["test3", "test4"], "test3 test4"),
         ("middle", ["test5", "test6", "test7"], "test5 test6 test7"),
         ("last", ["test8", "test9", "test10"], "test8 test9 test10"),
         ("suffix", ["test"], "test"),
     )
     for attr_name, pieces, expected_text in list_assignments:
         setattr(hn, attr_name, pieces)
         assert getattr(hn, attr_name) == expected_text
Example #5
0
 def from_parts(cls,
                first=None,
                last=None,
                middle=None,
                suffix=None,
                title=None):
     """Build a ``ParsedName`` from already-separated name components.

     An empty ``HumanName`` is created, each component is assigned to the
     attribute of the same name, and the result is wrapped in ``ParsedName``.
     All components default to ``None``.
     """
     name = HumanName()
     # Assignment order is first, middle, last, suffix, title.
     components = (
         ("first", first),
         ("middle", middle),
         ("last", last),
         ("suffix", suffix),
         ("title", title),
     )
     for attr_name, component in components:
         setattr(name, attr_name, component)
     return ParsedName(name)
Example #6
0
 def _massage_measure_donor_name(self, name_string):
     """
     Clean up a raw donor name and return it as one display string.

     The string is parsed with HumanName; first and last names are
     title-cased, and a non-empty middle name has its periods removed, is
     title-cased and gets a single trailing period. A few known badly
     parsed donors are then corrected by hand.

     Returns "first middle last suffix", leaving out the middle name
     and/or the suffix when they are empty.
     """
     name = HumanName(name_string)
     name.first = name.first.title()
     name.last = name.last.title()
     if name.middle:
         name.middle = name.middle.replace(".", "")
         name.middle = "%s." % (name.middle.title())

     # Hand corrections for names the parser gets wrong, checked in order:
     # (parsed name as it compares to a string, (first, middle, last, suffix)).
     corrections = (
         ("JR. Munger CHARLES T.", ("Charles", "T.", "Munger", "Jr.")),
         ("M. Quinn. Delaney", ("M.", "Quinn", "Delaney", None)),
         ("Robert Alan. Eustace", ("Robert", "Alan", "Eustace", None)),
         ("Susie Tompkins. Buell", ("Susie", "Tompkins", "Buell", None)),
     )
     for parsed_as, (first, middle, last, suffix) in corrections:
         if name == parsed_as:
             name.first = first
             name.middle = middle
             name.last = last
             name.suffix = suffix

     # These branches must be mutually exclusive: previously the
     # middle-only branch was a separate `if` and always overwrote the
     # four-part result, silently dropping the suffix.
     if name.middle and name.suffix:
         output = "%s %s %s %s" % (name.first, name.middle, name.last,
                                   name.suffix)
     elif name.middle:
         output = "%s %s %s" % (name.first, name.middle, name.last)
     elif name.suffix:
         output = "%s %s %s" % (name.first, name.last, name.suffix)
     else:
         output = "%s %s" % (name.first, name.last)
     return output
 def _massage_measure_donor_name(self, name_string):
     """
     Clean up a raw donor name and return it as one display string.

     The string is parsed with HumanName; first and last names are
     title-cased, and a non-empty middle name has its periods removed, is
     title-cased and gets a single trailing period. A few known badly
     parsed donors are then corrected by hand.

     Returns "first middle last suffix", leaving out the middle name
     and/or the suffix when they are empty.
     """
     name = HumanName(name_string)
     name.first = name.first.title()
     name.last = name.last.title()
     if name.middle:
         name.middle = name.middle.replace(".", "")
         name.middle = "%s." % (name.middle.title())

     # Hand corrections for names the parser gets wrong, checked in order:
     # (parsed name as it compares to a string, (first, middle, last, suffix)).
     corrections = (
         ("JR. Munger CHARLES T.", ("Charles", "T.", "Munger", "Jr.")),
         ("M. Quinn. Delaney", ("M.", "Quinn", "Delaney", None)),
         ("Robert Alan. Eustace", ("Robert", "Alan", "Eustace", None)),
         ("Susie Tompkins. Buell", ("Susie", "Tompkins", "Buell", None)),
     )
     for parsed_as, (first, middle, last, suffix) in corrections:
         if name == parsed_as:
             name.first = first
             name.middle = middle
             name.last = last
             name.suffix = suffix

     # These branches must be mutually exclusive: previously the
     # middle-only branch was a separate `if` and always overwrote the
     # four-part result, silently dropping the suffix.
     if name.middle and name.suffix:
         output = "%s %s %s %s" % (name.first, name.middle, name.last, name.suffix)
     elif name.middle:
         output = "%s %s %s" % (name.first, name.middle, name.last)
     elif name.suffix:
         output = "%s %s %s" % (name.first, name.last, name.suffix)
     else:
         output = "%s %s" % (name.first, name.last)
     return output
Example #8
0
    def HumanNameFmXML(self, ell):
        """Build a HumanName from the child elements of ``ell``.

        Each child whose ``tag`` is one of First, Middle, Last, Title,
        Suffix or NickName has its ``text`` copied to the matching
        HumanName attribute; children with any other tag are ignored.
        """
        attr_for_tag = {
            'First': 'first',
            'Middle': 'middle',
            'Last': 'last',
            'Title': 'title',
            'Suffix': 'suffix',
            'NickName': 'nickname',
        }

        hn = HumanName()
        for el in ell:
            target_attr = attr_for_tag.get(el.tag)
            if target_attr is None:
                # Unrecognised tag: nothing to copy.
                continue
            setattr(hn, target_attr, el.text)

        return hn
Example #9
0
    def person_name_from_xml(self, ell):
        '''Create a person name (HumanName) from an XML element.

        Iterates over the children of ``ell`` and copies the text of the
        First, Middle, Last, Title, Suffix and NickName children onto the
        corresponding HumanName attributes. Other children are skipped.
        '''
        known_tags = (
            ('First', 'first'),
            ('Middle', 'middle'),
            ('Last', 'last'),
            ('Title', 'title'),
            ('Suffix', 'suffix'),
            ('NickName', 'nickname'),
        )

        hname = HumanName()
        for elm in ell:
            for tag_name, attr_name in known_tags:
                if elm.tag == tag_name:
                    setattr(hname, attr_name, elm.text)
                    # Tags are distinct, so at most one entry can match.
                    break

        return hname
Example #10
0
    def parse_raw_name(name_raw: str, count: int, extract_orgs=True) -> (str, str, str, Counter):
        """
        Parses a (usually messy) raw name and returns
        first, middle, last names and a Counter of extracted positions.

        Every extracted position (dash-attached text, parenthesised text, known
        organizations, and the parsed name suffix) is upper-cased, stripped of
        periods and added to the Counter with weight `count`.

        extract_orgs tries to extract organizations from name. Defaults to True. Only set to
        False to be able to check if a name is valid (it prevents an infinite loop because, by
        default, extracting organizations is part of the initialization of a person).

        :param name_raw: str
        :param count: int
        :param extract_orgs: bool
        :return: str, str, str, Counter (first name, middle name, last name, positions Counter)
        """
        name_raw = Person.remove_privlog_info(name_raw)
        # remove JR, SR, or III if it follows this pattern: 'Chumney-RD-Jr'
        name_raw = Person.remove_jr_sr_iii(name_raw)

        # position is often attached with a dash,
        # e.g. 'BAKER, T E - NATIONAL ASSOCIATION OF ATTORNEYS'
        # Only split when there is exactly one ' - ' separator.
        dash_parts = name_raw.split(' - ')
        if len(dash_parts) == 2:
            name_raw, extracted_position = dash_parts
            extracted_positions = [extracted_position.strip()]
        else:
            extracted_positions = []

        # extract positions in parens e.g. Henson, A (Chadbourne & Park)
        paren_positions = re.findall(r'\([^(]+\)', name_raw)
        for position in paren_positions:
            extracted_positions.append(position.strip(',#() '))
            name_raw = name_raw.replace(position, '')

        # Search for known raw_org strings in name_raw, extract them as positions if necessary
        if extract_orgs:
            name_raw, new_positions = Person.extract_raw_org_names_from_name(name_raw)
            extracted_positions += new_positions

        # delete any leftover hashtags
        name_raw = name_raw.strip(' #')

        # Delete dashes between last name and initials
        # DUNN-W -> Dunn W
        # The length guard keeps an empty or one-character remainder from
        # raising IndexError (the [-3] check below was already guarded).
        if len(name_raw) > 1 and name_raw[-2] == '-':
            name_raw = name_raw[:-2] + " " + name_raw[-1:]
        # DUNN-WL -> DUNN WL
        if len(name_raw) > 2 and name_raw[-3] == '-':
            name_raw = name_raw[:-3] + " " + name_raw[-2:]

        # Parse current string using HumanName
        name = HumanName(name_raw)

        # e.g. Dunn W -> parsed as last name W. -> switch first/last
        if len(name.last) <= 2 < len(name.first):
            name.first, name.last = name.last, name.first

        # remove periods from initials
        if len(name.first) == 2 and name.first[1] == '.':
            name.first = name.first[0]
        if len(name.middle) == 2 and name.middle[1] == '.':
            name.middle = name.middle[0]

        # If first name is length 2 (Teague, CE), the two letters are most likely initials.
        if len(name.middle) == 0 and len(name.first) == 2:
            name.middle = name.first[1].upper()
            name.first = name.first[0].upper()

        # If first and middle initials have periods but not spaces -> separate, e.g. "R.K. Teague"
        if re.match(r'[a-zA-Z]\.[a-zA-Z]\.', name.first):
            name.middle = name.first[2]
            name.first = name.first[0]

        name.last = name.last.capitalize()
        name.first = name.first.capitalize()
        name.middle = name.middle.capitalize()

        # if multiple names are passed, they often end up in the middle name
        # e.g. 'Holtzman, A.,  Murray, J. ,  Henson, A.  -> only allow one comma or set to empty
        if name.middle.count(',') > 1:
            name.middle = ''

        # a long, period-heavy suffix is treated as parser noise and dropped
        if len(name.suffix) > 20 and name.suffix.count('.') > 2:
            name.suffix = ''

        if name.suffix:
            extracted_positions.append(name.suffix)

        # map organization names to clean official names (if they are in the dict) using
        # RAW_ORG_TO_CLEAN_ORG_DICT; entries mapped to '@skip@' are discarded
        clean_orgs = []
        for raw_org in extracted_positions:
            if raw_org in RAW_ORG_TO_CLEAN_ORG_DICT:
                clean_org = RAW_ORG_TO_CLEAN_ORG_DICT[raw_org]
                if clean_org != '@skip@':
                    clean_orgs.append(clean_org)
            else:
                clean_orgs.append(raw_org)
        extracted_positions = clean_orgs

        # convert mapped positions into a counter
        result_positions = Counter()
        for position in extracted_positions:
            cleaned = re.sub(r'\.', '', position)
            result_positions[cleaned.upper()] += count

        return name.first, name.middle, name.last, result_positions
Example #11
0
def human_to_csl(name):
    """Convert HumanName to CSL-formatted JSON.

    Note: when a HumanName instance is passed in, it is modified in place
    (the middle name is appended to ``first``, and a lone I/V suffix is
    moved into ``first``).

    Args:
        name : HumanName or str / unicode
    Returns:
        CSL-formatted JSON (a dict containing only the non-empty fields)

    Examples:
    >>> csl = human_to_csl('Rafael Nadal')
    >>> csl == {'given' : 'Rafael', 'family' : 'Nadal'}
    True
    >>> csl = human_to_csl(HumanName('Rafael Nadal'))
    >>> csl == {'given' : 'Rafael', 'family' : 'Nadal'}
    True
    >>> csl = human_to_csl(HumanName('George HW de Bush'))
    >>> csl == {'given' : 'George H. W.', 'family' : 'de Bush'}
    True
    >>> csl = human_to_csl('Eisenhower, I')
    >>> csl == {'given' : 'I.', 'family' : 'Eisenhower'}
    True
    >>> csl = human_to_csl('Eisenhower, V')
    >>> csl == {'given' : 'V.', 'family' : 'Eisenhower'}
    True
    """
    # Optionally convert to nameparser.HumanName
    if not isinstance(name, HumanName):
        name = HumanName(name)

    # Fix: nameparser treats HumanName('Eisenhower, I') as
    # {first : 'Eisenhower', suffix : 'I'}
    # Raw string: '\.' in a plain literal is an invalid escape sequence.
    if re.search(r'^[IV]\.*$', name.suffix):
        name.last = name.first
        name.first = name.suffix
        name.suffix = ''

    # Initialize CSL data
    csl_data = {}

    # Append middle name to first
    if name.middle:
        name.first += ' ' + name.middle

    # Iterate over lookup fields
    for lookup in human_to_csl_map:

        # Get field and function; `I` is the fallback when no 'fun' is given
        # (presumably an identity function - defined outside this block)
        field = human_to_csl_map[lookup]['field']
        fun = human_to_csl_map[lookup].get('fun', I)

        # Get field from name
        value = getattr(name, field)

        # Skip if empty
        if not value:
            continue

        # Apply function
        value = fun(value)

        # Save to CSL data
        csl_data[lookup] = value

    # Return CSL data
    return csl_data
    def parse_raw_name(name_raw: str) -> (str, str, str, set):
        """
        Parses a (usually messy) raw name and returns
        first, middle, last names and a set of extracted positions

        :param name_raw: str
        :return: str, str, str, set


        Parses name and returns as human name
        >>> n = Person('TEAGUE CE JR')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Teague', 'C', 'E', 'JR')

        >>> n = Person('teague, ce jr')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Teague', 'C', 'E', 'JR')


        >>> n = Person('Teague, Claude Edward, Jr., Ph.D. ')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Teague', 'Claude', 'Edward', 'JR., PH.D.')

        >>> n = Person('Teague, J - BAT')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Teague', 'J', '', 'BAT')

        >>> n = Person('BAKER, T E - NATIONAL ASSOCIATION OF ATTORNEYS GENERAL')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Baker', 'T', 'E', 'NATIONAL ASSOCIATION OF ATTORNEYS GENERAL')

        >>> n = Person('BAKER-cj')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Baker', 'C', 'J', '')

        JR and SR are by default recognized as titles -> turn off through CONSTANTS.
        >>> n = Person('Baker, JR')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Baker', 'J', 'R', '')

        >>> n = Person('DUNN WL #')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Dunn', 'W', 'L', '')

        >>> n = Person('Dunn, W. L.')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Dunn', 'W', 'L', '')

        >>> n = Person('TEMKO SL, COVINGTON AND BURLING')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Temko', 'S', 'L', 'COVINGTON AND BURLING')

        >>> n = Person('Temko, Stanley L [Privlog:] TEMKO,SL')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Temko', 'Stanley', 'L', '')

        >>> n = Person('Temko-SL, Covington & Burling')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Temko', 'S', 'L', 'COVINGTON & BURLING')

        >>> n = Person('HENSON, A. (AMERICAN SENIOR VICE PRESIDENT AND GENERAL COUNSEL)')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Henson', 'A', '', 'AMERICAN SENIOR VICE PRESIDENT AND GENERAL COUNSEL')

        >>> n = Person('HENSON, A. (CHADBOURNE, PARKE, WHITESIDE & WOLFF, AMERICAN OUTSIDE COUNSEL) (HANDWRITTEN NOTES)')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Henson', 'A', '', 'CHADBOURNE, PARKE, WHITESIDE & WOLFF, AMERICAN OUTSIDE COUNSEL HANDWRITTEN NOTES')

        >>> n = Person('Holtzman, A.,  Murray, J. ,  Henson, A. ,  Pepples, E. ,  Stevens, A. ,  Witt, S.')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Holtzman', 'A', '', '')

        >>> n = Person('Holtz, Jacob, Jacob & Medinger')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Holtz', 'Jacob', '', 'JACOB & MEDINGER')

        # This one breaks. But I don't think it can be avoided.
        >>> n = Person('Holtz, Jacob Alexander, Jacob & Medinger')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Holtz', '', '', 'JACOB ALEXANDER, JACOB & MEDINGER')

        >>> n = Person('PROCTOR DF, JOHNS HOPKINS SCHOOL OF HYGIENE')
        >>> n.last, n.first, n.middle, " ".join(n.positions).upper()
        ('Proctor', 'D', 'F', 'JOHNS HOPKINS SCHOOL OF HYGIENE')

        """

        # remove privlog info, e.g. 'Temko, Stanley L [Privlog:] TEMKO,SL'. It confuses
        # the name parser
        # NOTE(review): a string that *starts* with '[Privlog:]' is left untouched,
        # exactly as before (the old `== 0` branch sliced from index 0, a no-op).
        # Confirm whether the marker should be stripped in that case.
        privlog_id = name_raw.find('[Privlog:]')
        if privlog_id > 0:
            name_raw = name_raw[:privlog_id]

        # position is often attached with a dash, e.g. 'BAKER, T E - NATIONAL ASSOCIATION OF ATTORNEYS'
        if name_raw.find(" - ") > -1 and len(name_raw.split(' - ')) == 2:
            name_raw, extracted_position = name_raw.split(" - ")
            extracted_positions = {extracted_position.strip()}
        else:
            extracted_positions = set()

        # extract positions in parens e.g. Henson, A (Chadbourne & Park)
        paren_positions = re.findall(r'\([^(]+\)', name_raw)
        for position in paren_positions:
            extracted_positions.add(position.strip(',#() '))
            name_raw = name_raw.replace(position, '')

        # Patterns for institutions appended to a name; each is tried in order and,
        # on a match, the matched tail is recorded as a position and cut off the name.
        institution_regexes = [

            # TI/CTR
            r'[,#] Tobacco Inst.+$',
            r'[\(\,\#] ?SAB Exec.*$',

            # American Tobacco
            r'[(,#] ?American .+$',
            r'[\(\,\#] ?Amer Brands.*$',
            r'[,#] American Tob',
            r'[,#] Atco.*$',

            # PM
            r'[\(\,\#] ?Philip Morris.*$',

            # RJR
            r'[\(\,\#] ?RJR.*$',

            # LAW FIRMS
            r'[\(\,\#] ?Arnold &.*$',
            r'[\(\,\#] ?Chadbourne.*$',
            r'[,#] COVINGTON [AB&]*.+$',
            r'[,#] Foster [&A]*.+$',
            r'[,#] JACOB [A&]*.+$',

            r'[\(\,\#] ?Philip Morris.*$',

            # Universities
            # match a ( or , or # at the beginning, then some characters that
            # aren't (,# until the end of the string
            r'[\(\,\#][^\(\,\#]+ Univ\b.*$',

            # Univ is fine if it appears at the end of a string (don't want to match in the
            # middle of a string, e.g "Universal"
            r'[\(\,\#][^\(\,\#]+ School\b.*$',

            # Organizations
            r'[\(\,\#][^\(\,\#]+ Federal Trade Commission.*$',

        ]
        for institution in institution_regexes:
            extracted_institution = re.search(institution, name_raw, re.IGNORECASE)
            if extracted_institution:
                extracted_positions.add(extracted_institution.group().strip(',#() '))
                name_raw = name_raw[:name_raw.find(extracted_institution.group())]

        # remove #
        name_raw = name_raw.strip("#").strip()

        # Replace a dash before one or two trailing initials with a space
        # (BAKER-c -> BAKER c, BAKER-cj -> BAKER cj). The length guard on the
        # first check prevents an IndexError when fewer than two characters
        # are left after the clean-up above.
        if len(name_raw) > 1 and name_raw[-2] == '-':
            name_raw = name_raw[:-2] + " " + name_raw[-1:]
        if len(name_raw) > 2 and name_raw[-3] == '-':
            name_raw = name_raw[:-3] + " " + name_raw[-2:]

        name = HumanName(name_raw)

        # e.g. Dunn W -> parsed as last name W. -> switch first/last
        if len(name.last) <= 2 and len(name.first) > 2:
            name.first, name.last = name.last, name.first

        # remove periods from initials
        if len(name.first) == 2 and name.first[1] == '.':
            name.first = name.first[0]
        if len(name.middle) == 2 and name.middle[1] == '.':
            name.middle = name.middle[0]

        # If first name is length 2 (Teague, CE), the two letters are most likely initials.
        if len(name.first) == 2:
            name.middle = name.first[1].upper()
            name.first = name.first[0].upper()

        # If first and middle initials have periods but not spaces -> separate, e.g. "R.K. Teague"
        if re.match(r'[a-zA-Z]\.[a-zA-Z]\.', name.first):
            name.middle = name.first[2]
            name.first = name.first[0]

        name.last = name.last.capitalize()
        name.first = name.first.capitalize()
        name.middle = name.middle.capitalize()

        # if multiple names are passed, they often end up in the middle name
        # e.g. 'Holtzman, A.,  Murray, J. ,  Henson, A.  -> only allow one comma or set to empty
        if name.middle.count(',') > 1:
            name.middle = ''

        # a long, period-heavy suffix is treated as parser noise and dropped
        if len(name.suffix) > 20 and name.suffix.count('.') > 2:
            name.suffix = ''

        if name.suffix:
            extracted_positions.add(name.suffix)

        return name.first, name.middle, name.last, extracted_positions
    def to_HumanName( self ):

        '''
        Builds a nameparser HumanName() instance from the name property
            values stored in this instance and returns it.

        Sources are tried in this order:
        - individual name parts, when self.got_name_parts() reports them;
        - the full name string property;
        - the lookup name from self.get_lookup_name().
        Returns None when none of these is available.

        preconditions: None.
        postconditions: None.
        '''

        # read every name-related value up front, before deciding what to use.
        name_prefix = self.get( self.PROP_NAME_NAME_PREFIX, None )
        first_name = self.get( self.PROP_NAME_FIRST_NAME, None )
        middle_name = self.get( self.PROP_NAME_MIDDLE_NAME, None )
        last_name = self.get( self.PROP_NAME_LAST_NAME, None )
        name_suffix = self.get( self.PROP_NAME_NAME_SUFFIX, None )
        nickname = self.get( self.PROP_NAME_NICKNAME, None )
        full_name_string = self.get( self.PROP_NAME_FULL_NAME_STRING, None )
        lookup_name = self.get_lookup_name()

        # first choice: assemble from the separate name parts.
        if ( self.got_name_parts() == True ):

            human_name = HumanName()

            # only non-empty parts are copied onto the HumanName.
            part_for_attribute = (
                ( "title", name_prefix ),
                ( "first", first_name ),
                ( "middle", middle_name ),
                ( "last", last_name ),
                ( "suffix", name_suffix ),
                ( "nickname", nickname ),
            )
            for attribute_name, part_value in part_for_attribute:

                if ( part_value ):

                    setattr( human_name, attribute_name, part_value )

                #-- END check to see if part has a value. --#

            #-- END loop over name parts --#

            return human_name

        #-- END check to see if we have name parts --#

        # second choice: let HumanName parse the full name string.
        if ( ( full_name_string is not None ) and ( full_name_string != "" ) ):

            return HumanName( full_name_string )

        #-- END check to see if full name string --#

        # third choice: let HumanName parse the lookup name.
        if ( ( lookup_name is not None ) and ( lookup_name != "" ) ):

            return HumanName( lookup_name )

        #-- END check to see if lookup name --#

        # no names present at all.
        return None
        
    #-- END method to_HumanName() --#


#-- END class PersonDetails --#
Example #14
0

def match(name1, name2):
    """Return True if any potential form of ``name1`` equals one of ``name2``.

    Both inputs are parsed with HumanName; every pairing of the candidates
    produced by ``get_potential_names`` is compared after conversion with
    ``u``. Returns False when no pairing is equal.
    """
    parsed_first = HumanName(name1)
    parsed_second = HumanName(name2)
    for candidate_a in get_potential_names(parsed_first):
        # The second name's candidates are regenerated for every outer item.
        for candidate_b in get_potential_names(parsed_second):
            if u(candidate_a) == u(candidate_b):
                return True
    return False


# Load the AP candidate rows and attach a formatted full name to each row
# under the 'name' key. newline='' is what the csv module expects so that
# newlines embedded in quoted fields are parsed correctly.
with open('ap_candidates.csv', newline='') as f:
    reader = csv.DictReader(f)
    ap_candidates = list(reader)
    for row in ap_candidates:
        n = HumanName()
        n.first = row['first_name']
        n.middle = row['middle_name']
        n.last = row['last_name']
        n.suffix = row['suffix']
        row['name'] = str(n)

# Load the historical AP id rows as-is.
with open('ap_historical_ids.csv', newline='') as f:
    reader = csv.DictReader(f)
    ap_candidates2 = list(reader)


def find(name):
    """Return the integer ``pol_id`` of the first AP candidate matching ``name``.

    Rows of the module-level ``ap_candidates`` list are checked in order with
    ``match(name, row['name'])``; the first hit's ``pol_id`` is converted
    with ``int`` and returned.
    """
    # NOTE(review): no return is visible after the loop, so an unmatched name
    # appears to fall through and yield None - confirm against callers.
    for row in ap_candidates:
        if match(name, row['name']):
            # print(f'found match for {name} with', row['name'])
            return int(row['pol_id'])