def test_formating_removing_pieces_from_name_buckets(self):
    """Emptied name buckets must vanish from the formatted string."""
    parsed = HumanName("Rev John A. Kenneth Doe III (Kenny)")

    parsed.string_format = "{title} {first} {middle} {last} {suffix} '{nickname}'"
    assert u(parsed) == "Rev John A. Kenneth Doe III 'Kenny'"

    parsed.string_format = "{title} {first} {middle} {last} {suffix}"
    assert u(parsed) == "Rev John A. Kenneth Doe III"

    # Blank out one bucket at a time and check the cumulative result.
    steps = (
        ("middle", "Rev John Doe III"),
        ("suffix", "Rev John Doe"),
        ("title", "John Doe"),
    )
    for bucket, expected in steps:
        setattr(parsed, bucket, "")
        assert u(parsed) == expected
def capitalize(self):
    """
    Guess the proper capitalization of a name that was entered entirely
    in upper case or entirely in lower case. A name that already mixes
    cases is left untouched.

    **Usage**

    .. doctest:: capitalize

        >>> name = HumanName('bob v. de la macdole-eisenhower phd')
        >>> name.capitalize()
        >>> str(name)
        'Bob V. de la MacDole-Eisenhower Ph.D.'
        >>> # Don't touch good names
        >>> name = HumanName('Shirley Maclaine')
        >>> name.capitalize()
        >>> str(name)
        'Shirley Maclaine'

    """
    rendered = u(self)
    # Mixed-case input: neither all-upper nor all-lower, so leave it alone.
    if rendered != rendered.upper() and rendered != rendered.lower():
        return

    # (member name, delimiter used to split the capitalized text back
    # into that member's list)
    members = (
        ('title', ' '),
        ('first', ' '),
        ('middle', ' '),
        ('last', ' '),
        ('suffix', ', '),
    )
    for member, delimiter in members:
        capitalized = self.cap_piece(getattr(self, member))
        setattr(self, member + '_list', capitalized.split(delimiter))
def capitalize(self):
    """
    Guess the proper capitalization of a name that was entered entirely
    in upper case or entirely in lower case. A name that already mixes
    cases is left untouched.

    **Usage**

    .. doctest:: capitalize

        >>> name = HumanName('bob v. de la macdole-eisenhower phd')
        >>> name.capitalize()
        >>> unicode(name)
        u'Bob V. de la MacDole-Eisenhower Ph.D.'
        >>> # Don't touch good names
        >>> name = HumanName('Shirley Maclaine')
        >>> name.capitalize()
        >>> unicode(name)
        u'Shirley Maclaine'

    """
    rendered = u(self)
    # Mixed-case input: neither all-upper nor all-lower, so leave it alone.
    if rendered != rendered.upper() and rendered != rendered.lower():
        return

    # (member name, delimiter used to split the capitalized text back
    # into that member's list)
    members = (
        ('title', ' '),
        ('first', ' '),
        ('middle', ' '),
        ('last', ' '),
        ('suffix', ', '),
    )
    for member, delimiter in members:
        capitalized = self.cap_piece(getattr(self, member))
        setattr(self, member + '_list', capitalized.split(delimiter))
def capitalize(self, force=False):
    """
    Guess the proper capitalization of a name that was entered entirely
    in upper case or entirely in lower case. Mixed-case names are skipped
    unless `force=True` is passed, in which case capitalization is applied
    regardless.

    :param bool force: force capitalization of strings that include mixed case

    **Usage**

    .. doctest:: capitalize

        >>> name = HumanName('bob v. de la macdole-eisenhower phd')
        >>> name.capitalize()
        >>> str(name)
        'Bob V. de la MacDole-Eisenhower Ph.D.'
        >>> # Don't touch good names
        >>> name = HumanName('Shirley Maclaine')
        >>> name.capitalize()
        >>> str(name)
        'Shirley Maclaine'
        >>> name.capitalize(force=True)
        >>> str(name)
        'Shirley MacLaine'

    """
    rendered = u(self)
    # Proceed only when forced, or when the text is all-upper or all-lower.
    if not (force
            or rendered == rendered.upper()
            or rendered == rendered.lower()):
        return

    # (member name, delimiter used to split the capitalized text back
    # into that member's list); the member name is also handed to
    # cap_piece as its second argument.
    members = (
        ('title', ' '),
        ('first', ' '),
        ('middle', ' '),
        ('last', ' '),
        ('suffix', ', '),
    )
    for member, delimiter in members:
        capitalized = self.cap_piece(getattr(self, member), member)
        setattr(self, member + '_list', capitalized.split(delimiter))
def test_keep_emojis(self):
    """With the emoji regex set to False on the constants, emoji stay in the name."""
    custom = Constants()
    custom.regexes.emoji = False

    parsed = HumanName("∫≜⩕ Smith😊", custom)

    assert parsed.first == "∫≜⩕"
    assert parsed.last == "Smith😊"
    assert u(parsed) == "∫≜⩕ Smith😊"
def capitalize(self, force=False):
    """
    Guess the proper capitalization of a name that was entered entirely
    in upper case or entirely in lower case. Mixed-case names are skipped
    unless `force=True` is passed, in which case capitalization is applied
    regardless.

    :param bool force: force capitalization of strings that include mixed case

    **Usage**

    .. doctest:: capitalize

        >>> name = HumanName('bob v. de la macdole-eisenhower phd')
        >>> name.capitalize()
        >>> str(name)
        'Bob V. de la MacDole-Eisenhower Ph.D.'
        >>> # Don't touch good names
        >>> name = HumanName('Shirley Maclaine')
        >>> name.capitalize()
        >>> str(name)
        'Shirley Maclaine'
        >>> name.capitalize(force=True)
        >>> str(name)
        'Shirley MacLaine'

    """
    rendered = u(self)
    # Proceed only when forced, or when the text is all-upper or all-lower.
    if not (force
            or rendered == rendered.upper()
            or rendered == rendered.lower()):
        return

    # (member name, delimiter used to split the capitalized text back
    # into that member's list)
    members = (
        ('title', ' '),
        ('first', ' '),
        ('middle', ' '),
        ('last', ' '),
        ('suffix', ', '),
    )
    for member, delimiter in members:
        capitalized = self.cap_piece(getattr(self, member))
        setattr(self, member + '_list', capitalized.split(delimiter))
def test_formatting_constants_attribute(self):
    """
    A ``string_format`` set on the shared ``CONSTANTS`` object is used
    when a ``HumanName`` is rendered.

    ``CONSTANTS`` is module-level state, so the original value is restored
    in a ``finally`` clause; otherwise a failing assertion would leave
    "TEST2" in place for every test that runs afterwards.
    """
    from nameparser.config import CONSTANTS
    _orig = CONSTANTS.string_format
    CONSTANTS.string_format = "TEST2"
    try:
        hn = HumanName("Rev John A. Kenneth Doe III (Kenny)")
        assert u(hn) == "TEST2"
    finally:
        # Always undo the global mutation, even when the assertion fails.
        CONSTANTS.string_format = _orig
def test_formating_removing_keys_from_format_string(self):
    """Keys dropped from ``string_format`` must not show up in the output."""
    parsed = HumanName("Rev John A. Kenneth Doe III (Kenny)")

    # Each format string uses fewer keys than the one before it.
    cases = (
        ("{title} {first} {middle} {last} {suffix} '{nickname}'",
         "Rev John A. Kenneth Doe III 'Kenny'"),
        ("{last}, {title} {first} {middle}, {suffix}",
         "Doe, Rev John A. Kenneth, III"),
        ("{last}, {title} {first} {middle}",
         "Doe, Rev John A. Kenneth"),
        ("{last}, {first} {middle}",
         "Doe, John A. Kenneth"),
        ("{last}, {first}",
         "Doe, John"),
        ("{first} {last}",
         "John Doe"),
    )
    for template, expected in cases:
        parsed.string_format = template
        assert u(parsed) == expected
def capitalize(self, force=None):
    """
    Guess the proper capitalization of a name that was entered entirely
    in upper case or entirely in lower case. Mixed-case names are skipped
    by default; pass `force=True` to capitalize them as well.

    :param bool force: Forces capitalization of mixed case strings. This
        parameter overrides rules set within
        :py:class:`~nameparser.config.CONSTANTS`.

    **Usage**

    .. doctest:: capitalize

        >>> name = HumanName('bob v. de la macdole-eisenhower phd')
        >>> name.capitalize()
        >>> str(name)
        'Bob V. de la MacDole-Eisenhower Ph.D.'
        >>> # Don't touch good names
        >>> name = HumanName('Shirley Maclaine')
        >>> name.capitalize()
        >>> str(name)
        'Shirley Maclaine'
        >>> name.capitalize(force=True)
        >>> str(name)
        'Shirley MacLaine'

    """
    rendered = u(self)
    # No explicit choice from the caller: fall back to the configured value.
    if force is None:
        force = self.C.force_mixed_case_capitalization

    # Proceed only when forced, or when the text is all-upper or all-lower.
    if not (force
            or rendered == rendered.upper()
            or rendered == rendered.lower()):
        return

    # (member name, delimiter used to split the capitalized text back
    # into that member's list); the member name is also handed to
    # cap_piece as its second argument.
    members = (
        ('title', ' '),
        ('first', ' '),
        ('middle', ' '),
        ('last', ' '),
        ('suffix', ', '),
    )
    for member, delimiter in members:
        capitalized = self.cap_piece(getattr(self, member), member)
        setattr(self, member + '_list', capitalized.split(delimiter))
def parse_full_name(self):
    """
    The main parse method for the parser. This method is run upon
    assignment to the :py:attr:`full_name` attribute or instantiation.

    Basic flow is to hand off to :py:func:`pre_process` to handle
    nicknames. It then splits on commas and chooses a code path depending
    on the number of commas.

    :py:func:`parse_pieces` then splits those parts on spaces and
    :py:func:`join_on_conjunctions` joins any pieces next to conjunctions.

    All member lists are reset before parsing. ``self.unparsable`` starts
    as ``True`` and is cleared only when ``len(self)`` is non-zero after
    parsing; :py:func:`post_process` runs in either case.
    """
    self.title_list = []
    self.first_list = []
    self.middle_list = []
    self.last_list = []
    self.suffix_list = []
    self.nickname_list = []
    self.unparsable = True

    # make sure we are working with text, decoding with the configured
    # encoding when something else was supplied
    if not isinstance(self._full_name, text_type):
        self._full_name = u(self._full_name, self.ENCODING)

    self.pre_process()

    # collapse multiple spaces
    self._full_name = self.C.regexes.spaces.sub(" ", self._full_name.strip())

    # break up full_name by commas
    parts = [x.strip() for x in self._full_name.split(",")]

    log.debug("full_name: %s", self._full_name)
    log.debug("parts: %s", parts)

    if len(parts) == 1:

        # no commas, title first middle middle middle last suffix

        pieces = self.parse_pieces(parts)

        for i, piece in enumerate(pieces):
            try:
                nxt = pieces[i + 1]
            except IndexError:
                nxt = None

            # title must have a next piece, unless it's just a title
            if self.is_title(piece) and (nxt or len(pieces) == 1):
                self.title_list.append(piece)
                continue
            if not self.first:
                self.first_list.append(piece)
                continue
            # second-to-last piece followed by a suffix: this piece is the
            # last name and the suffix ends the name
            if (i == len(pieces) - 2) and self.is_suffix(nxt):
                self.last_list.append(piece)
                self.suffix_list.append(nxt)
                break
            if not nxt:
                self.last_list.append(piece)
                continue

            self.middle_list.append(piece)
    else:
        if self.is_suffix(parts[1]):

            # suffix comma: title first middle last, suffix [, suffix]

            self.suffix_list += parts[1:]

            pieces = self.parse_pieces(parts[0].split(' '))
            log.debug("pieces: %s", u(pieces))

            for i, piece in enumerate(pieces):
                try:
                    nxt = pieces[i + 1]
                except IndexError:
                    nxt = None

                if self.is_title(piece) and (nxt or len(pieces) == 1):
                    self.title_list.append(piece)
                    continue
                if not self.first:
                    self.first_list.append(piece)
                    continue
                if not nxt:
                    self.last_list.append(piece)
                    continue
                self.middle_list.append(piece)
        else:

            # lastname comma: last, title first middles[,] suffix [,suffix]
            pieces = self.parse_pieces(parts[1].split(' '), 1)

            log.debug("pieces: %s", u(pieces))

            self.last_list.append(parts[0])
            for i, piece in enumerate(pieces):
                try:
                    nxt = pieces[i + 1]
                except IndexError:
                    nxt = None

                if self.is_title(piece) and (nxt or len(pieces) == 1):
                    self.title_list.append(piece)
                    continue
                if not self.first:
                    self.first_list.append(piece)
                    continue
                if self.is_suffix(piece):
                    self.suffix_list.append(piece)
                    continue
                self.middle_list.append(piece)
            # anything after a second comma is treated as suffixes
            try:
                if parts[2]:
                    self.suffix_list += parts[2:]
            except IndexError:
                pass

    # len() can never be negative, so the previous "< 0" test made this
    # branch unreachable and every input was flagged as parsable.
    if len(self) == 0:
        log.info("Unparsable full_name: %s", self._full_name)
    else:
        self.unparsable = False
    self.post_process()
def parse_full_name(self):
    """
    The main parse method for the parser. This method is run upon
    assignment to the :py:attr:`full_name` attribute or instantiation.

    Basic flow is to hand off to :py:func:`pre_process` to handle
    nicknames. It then splits on commas and chooses a code path depending
    on the number of commas.

    :py:func:`parse_pieces` then splits those parts on spaces and
    :py:func:`join_on_conjunctions` joins any pieces next to conjunctions.

    All member lists are reset before parsing. ``self.unparsable`` starts
    as ``True`` and is cleared only when ``len(self)`` is non-zero after
    parsing; :py:func:`post_process` runs in either case.
    """
    self.title_list = []
    self.first_list = []
    self.middle_list = []
    self.last_list = []
    self.suffix_list = []
    self.nickname_list = []
    self.unparsable = True

    self.pre_process()

    self._full_name = self.collapse_whitespace(self._full_name)

    # break up full_name by commas
    parts = [x.strip() for x in self._full_name.split(",")]

    log.debug("full_name: %s", self._full_name)
    log.debug("parts: %s", parts)

    if len(parts) == 1:

        # no commas, title first middle middle middle last suffix
        #            part[0]

        pieces = self.parse_pieces(parts)
        p_len = len(pieces)
        for i, piece in enumerate(pieces):
            try:
                nxt = pieces[i + 1]
            except IndexError:
                nxt = None

            # title must have a next piece, unless it's just a title
            if self.is_title(piece) and (nxt or p_len == 1) and not self.first:
                self.title_list.append(piece)
                continue
            if not self.first:
                self.first_list.append(piece)
                continue
            if self.are_suffixes(pieces[i + 1:]) or (
                # if the next piece is the last piece and a roman numeral
                # but this piece is not an initial
                self.is_roman_numeral(nxt)
                and i == p_len - 2
                and not self.is_an_initial(piece)
            ):
                self.last_list.append(piece)
                self.suffix_list += pieces[i + 1:]
                break
            if not nxt:
                self.last_list.append(piece)
                continue

            self.middle_list.append(piece)
    else:
        # if all the end parts are suffixes and there is more than one piece in
        # the first part. (Suffixes will never appear after last names only, and
        # allows potential first names to be in suffixes, e.g. "Johnson, Bart"
        if self.are_suffixes(parts[1].split(' ')) \
                and len(parts[0].split(' ')) > 1:

            # suffix comma: title first middle last [suffix], suffix [suffix] [, suffix]
            #               parts[0],                         parts[1:...]

            self.suffix_list += parts[1:]
            pieces = self.parse_pieces(parts[0].split(' '))
            log.debug("pieces: %s", u(pieces))
            for i, piece in enumerate(pieces):
                try:
                    nxt = pieces[i + 1]
                except IndexError:
                    nxt = None

                if self.is_title(piece) \
                        and (nxt or len(pieces) == 1) \
                        and not self.first:
                    self.title_list.append(piece)
                    continue
                if not self.first:
                    self.first_list.append(piece)
                    continue
                if self.are_suffixes(pieces[i + 1:]):
                    self.last_list.append(piece)
                    # suffixes found before the comma go ahead of the
                    # ones collected from after it
                    self.suffix_list = pieces[i + 1:] + self.suffix_list
                    break
                if not nxt:
                    self.last_list.append(piece)
                    continue
                self.middle_list.append(piece)
        else:

            # lastname comma: last [suffix], title first middles[,] suffix [,suffix]
            #                 parts[0],      parts[1],              parts[2:...]
            pieces = self.parse_pieces(parts[1].split(' '), 1)

            log.debug("pieces: %s", u(pieces))

            # lastname part may have suffixes in it
            lastname_pieces = self.parse_pieces(parts[0].split(' '), 1)
            for piece in lastname_pieces:
                # the first one is always a last name, even if it looks like
                # a suffix
                if self.is_suffix(piece) and len(self.last_list) > 0:
                    self.suffix_list.append(piece)
                else:
                    self.last_list.append(piece)

            for i, piece in enumerate(pieces):
                try:
                    nxt = pieces[i + 1]
                except IndexError:
                    nxt = None

                if self.is_title(piece) \
                        and (nxt or len(pieces) == 1) \
                        and not self.first:
                    self.title_list.append(piece)
                    continue
                if not self.first:
                    self.first_list.append(piece)
                    continue
                if self.is_suffix(piece):
                    self.suffix_list.append(piece)
                    continue
                self.middle_list.append(piece)
            # anything after a second comma is treated as suffixes
            try:
                if parts[2]:
                    self.suffix_list += parts[2:]
            except IndexError:
                pass

    # len() can never be negative, so the previous "< 0" test made this
    # branch unreachable and every input was flagged as parsable.
    if len(self) == 0:
        log.info("Unparsable: \"%s\" ", self.original)
    else:
        self.unparsable = False
    self.post_process()
def test_formating_of_nicknames_with_parenthesis(self):
    """Parentheses around ``{nickname}`` disappear once the nickname is emptied."""
    parsed = HumanName("Rev John A. Kenneth Doe III (Kenny)")
    parsed.string_format = "{title} {first} {middle} {last} {suffix} ({nickname})"

    with_nickname = u(parsed)
    assert with_nickname == "Rev John A. Kenneth Doe III (Kenny)"

    parsed.nickname = ""
    without_nickname = u(parsed)
    assert without_nickname == "Rev John A. Kenneth Doe III"
def __hash__(self):
    """Hash the lower-cased unicode representation of this instance."""
    normalized = u(self).lower()
    return hash(normalized)
def test_formating_of_nicknames_with_single_quotes(self):
    """Single quotes around ``{nickname}`` disappear once the nickname is emptied."""
    parsed = HumanName("Rev John A. Kenneth Doe III (Kenny)")
    parsed.string_format = "{title} {first} {middle} {last} {suffix} '{nickname}'"

    with_nickname = u(parsed)
    assert with_nickname == "Rev John A. Kenneth Doe III 'Kenny'"

    parsed.nickname = ""
    without_nickname = u(parsed)
    assert without_nickname == "Rev John A. Kenneth Doe III"
def __eq__(self, other):
    """
    Compare case-insensitively: two objects are equal when their
    lower-cased unicode representations match.
    """
    mine = u(self).lower()
    theirs = u(other).lower()
    return mine == theirs
def __ne__(self, other):
    """Return True when the lower-cased unicode representations differ."""
    mine = u(self).lower()
    theirs = u(other).lower()
    return mine != theirs
def parse_full_name(self):
    """
    The main parse method for the parser. This method is run upon
    assignment to the :py:attr:`full_name` attribute or instantiation.

    Basic flow is to hand off to :py:func:`pre_process` to handle
    nicknames. It then splits on commas and chooses a code path depending
    on the number of commas.

    :py:func:`parse_pieces` then splits those parts on spaces and
    :py:func:`join_on_conjunctions` joins any pieces next to conjunctions.

    All member lists are reset before parsing. ``self.unparsable`` starts
    as ``True`` and is cleared only when ``len(self)`` is non-zero after
    parsing; :py:func:`post_process` runs in either case.
    """
    self.title_list = []
    self.first_list = []
    self.middle_list = []
    self.last_list = []
    self.suffix_list = []
    self.nickname_list = []
    self.unparsable = True

    self.pre_process()

    self._full_name = self.collapse_whitespace(self._full_name)

    # break up full_name by commas
    parts = [x.strip() for x in self._full_name.split(",")]

    log.debug("full_name: %s", self._full_name)
    log.debug("parts: %s", parts)

    if len(parts) == 1:

        # no commas, title first middle middle middle last suffix
        #            part[0]

        pieces = self.parse_pieces(parts)
        p_len = len(pieces)
        for i, piece in enumerate(pieces):
            try:
                nxt = pieces[i + 1]
            except IndexError:
                nxt = None

            # title must have a next piece, unless it's just a title
            if self.is_title(piece) \
                    and (nxt or p_len == 1) \
                    and not self.first:
                self.title_list.append(piece)
                continue
            if not self.first:
                # a lone piece alongside a nickname is filed as the last
                # name rather than the first name
                if p_len == 1 and self.nickname:
                    self.last_list.append(piece)
                    continue
                self.first_list.append(piece)
                continue
            if self.are_suffixes(pieces[i + 1:]) or (
                # if the next piece is the last piece and a roman
                # numeral but this piece is not an initial
                self.is_roman_numeral(nxt)
                and i == p_len - 2
                and not self.is_an_initial(piece)
            ):
                self.last_list.append(piece)
                self.suffix_list += pieces[i + 1:]
                break
            if not nxt:
                self.last_list.append(piece)
                continue

            self.middle_list.append(piece)
    else:
        # if all the end parts are suffixes and there is more than one piece
        # in the first part. (Suffixes will never appear after last names
        # only, and allows potential first names to be in suffixes, e.g.
        # "Johnson, Bart"
        if self.are_suffixes(parts[1].split(' ')) \
                and len(parts[0].split(' ')) > 1:

            # suffix comma:
            # title first middle last [suffix], suffix [suffix] [, suffix]
            #               parts[0],          parts[1:...]

            self.suffix_list += parts[1:]
            pieces = self.parse_pieces(parts[0].split(' '))
            log.debug("pieces: %s", u(pieces))
            for i, piece in enumerate(pieces):
                try:
                    nxt = pieces[i + 1]
                except IndexError:
                    nxt = None

                if self.is_title(piece) \
                        and (nxt or len(pieces) == 1) \
                        and not self.first:
                    self.title_list.append(piece)
                    continue
                if not self.first:
                    self.first_list.append(piece)
                    continue
                if self.are_suffixes(pieces[i + 1:]):
                    self.last_list.append(piece)
                    # suffixes found before the comma go ahead of the
                    # ones collected from after it
                    self.suffix_list = pieces[i + 1:] + self.suffix_list
                    break
                if not nxt:
                    self.last_list.append(piece)
                    continue
                self.middle_list.append(piece)
        else:

            # lastname comma:
            # last [suffix], title first middles[,] suffix [,suffix]
            #      parts[0],      parts[1],              parts[2:...]
            pieces = self.parse_pieces(parts[1].split(' '), 1)

            log.debug("pieces: %s", u(pieces))

            # lastname part may have suffixes in it
            lastname_pieces = self.parse_pieces(parts[0].split(' '), 1)
            for piece in lastname_pieces:
                # the first one is always a last name, even if it looks like
                # a suffix
                if self.is_suffix(piece) and len(self.last_list) > 0:
                    self.suffix_list.append(piece)
                else:
                    self.last_list.append(piece)

            for i, piece in enumerate(pieces):
                try:
                    nxt = pieces[i + 1]
                except IndexError:
                    nxt = None

                if self.is_title(piece) \
                        and (nxt or len(pieces) == 1) \
                        and not self.first:
                    self.title_list.append(piece)
                    continue
                if not self.first:
                    self.first_list.append(piece)
                    continue
                if self.is_suffix(piece):
                    self.suffix_list.append(piece)
                    continue
                self.middle_list.append(piece)
            # anything after a second comma is treated as suffixes
            try:
                if parts[2]:
                    self.suffix_list += parts[2:]
            except IndexError:
                pass

    # len() can never be negative, so the previous "< 0" test made this
    # branch unreachable and every input was flagged as parsable.
    if len(self) == 0:
        log.info("Unparsable: \"%s\" ", self.original)
    else:
        self.unparsable = False
    self.post_process()
def test_quote_nickname_formating(self):
    """A quoted ``{nickname}`` renders correctly in two different layouts."""
    parsed = HumanName("Rev John A. Kenneth Doe III (Kenny)")

    cases = (
        ("{title} {first} {middle} {last} {suffix} '{nickname}'",
         "Rev John A. Kenneth Doe III 'Kenny'"),
        ("{last}, {title} {first} {middle}, {suffix} '{nickname}'",
         "Doe, Rev John A. Kenneth, III 'Kenny'"),
    )
    for template, expected in cases:
        parsed.string_format = template
        assert u(parsed) == expected
def test_formating_of_nicknames_with_double_quotes(self):
    """Double quotes around ``{nickname}`` disappear once the nickname is emptied."""
    parsed = HumanName("Rev John A. Kenneth Doe III (Kenny)")
    parsed.string_format = '{title} {first} {middle} {last} {suffix} "{nickname}"'

    with_nickname = u(parsed)
    assert with_nickname == 'Rev John A. Kenneth Doe III "Kenny"'

    parsed.nickname = ""
    without_nickname = u(parsed)
    assert without_nickname == "Rev John A. Kenneth Doe III"
def test_keep_non_emojis(self):
    """The non-emoji symbols stay in the name while the trailing emoji is dropped."""
    parsed = HumanName("∫≜⩕ Smith 😊")

    observed = (parsed.first, parsed.last)
    assert observed[0] == "∫≜⩕"
    assert observed[1] == "Smith"
    assert u(parsed) == "∫≜⩕ Smith"
def test_remove_emojis(self):
    """With default settings a trailing emoji is removed from the parsed name."""
    parsed = HumanName("Sam Smith 😊")

    observed = (parsed.first, parsed.last)
    assert observed[0] == "Sam"
    assert observed[1] == "Smith"
    assert u(parsed) == "Sam Smith"
def test_formating_of_nicknames_in_middle(self):
    """A parenthesized ``{nickname}`` mid-format leaves no residue when emptied."""
    parsed = HumanName("Rev John A. Kenneth Doe III (Kenny)")
    parsed.string_format = "{title} {first} ({nickname}) {middle} {last} {suffix}"

    with_nickname = u(parsed)
    assert with_nickname == "Rev John (Kenny) A. Kenneth Doe III"

    parsed.nickname = ""
    without_nickname = u(parsed)
    assert without_nickname == "Rev John A. Kenneth Doe III"
def test_formatting_init_argument(self):
    """A ``string_format`` passed to the constructor is used for rendering."""
    parsed = HumanName(
        "Rev John A. Kenneth Doe III (Kenny)",
        string_format="TEST1",
    )
    rendered = u(parsed)
    assert rendered == "TEST1"