def join_on_conjunctions(self, pieces, additional_parts_count=0):
    """
    Join conjunctions to surrounding pieces, e.g.:
    ['Mr. and Mrs.'], ['King of the Hill'], ['Jack and Jill'], ['Velasquez y Garcia']

    After the conjunctions are handled, the first piece recognized by
    ``self.is_prefix`` is joined to the pieces that follow it, up to the
    next suffix: ['de la Vega'], ['van Buren'].

    Newly joined pieces are added to ``self.C.conjunctions`` or
    ``self.C.titles`` so that they are recognized as such later on.

    :param list pieces: name pieces strings after split on spaces.
        Conjunction joins modify this list in place.
    :param int additional_parts_count: extra count added to the number of
        pieces when the length thresholds below are checked
    :return: list with piece next to conjunctions merged into one piece
        with spaces in it.
    :rtype: list
    """
    length = len(pieces) + additional_parts_count
    # don't join on conjunctions if there's only 2 parts
    if length < 3:
        return pieces

    # Loop through the conjunctions backwards, starting at the end of the
    # list. The filter walks a reversed *copy*, so the joins performed on
    # ``pieces`` inside the loop do not disturb the iteration.
    for conj in filter(self.is_conjunction, pieces[::-1]):

        rootname_pieces = [p for p in pieces if self.is_rootname(p)]
        total_length = len(rootname_pieces) + additional_parts_count
        if len(conj) == 1 and total_length < 4:
            # if there are only 3 total parts (minus known titles, suffixes
            # and prefixes) and this conjunction is a single letter, prefer
            # treating it as an initial rather than a conjunction.
            # http://code.google.com/p/python-nameparser/issues/detail?id=11
            continue

        try:
            i = pieces.index(conj)
        except ValueError:
            # The conjunction can only be missing once an earlier iteration
            # has merged it into another piece, so ``i`` is bound here and
            # holds the index used by that earlier iteration.
            log.error(
                "Couldn't find '%s' in pieces. i=%s, pieces=%s",
                conj, i, pieces)
            continue

        if i >= len(pieces) - 1:
            # a conjunction in last position has nothing after it to join
            continue

        if i == 0:
            # if this is the first piece and it's a conjunction, join it to
            # the following piece; the result is a title when that piece is
            # a title, otherwise it is remembered as a conjunction
            nxt = pieces[i + 1]
            const = self.C.conjunctions
            if self.is_title(nxt):
                const = self.C.titles
            new_piece = ' '.join(pieces[0:2])
            const.add(new_piece)
            pieces[i] = new_piece
            pieces.pop(i + 1)
            continue

        if self.is_conjunction(pieces[i - 1]):
            # if the piece in front of this one is a conjunction too,
            # add new_piece (this conjunction and the following piece)
            # to the conjunctions constant so that it is recognized
            # as a conjunction in the next loop.
            # e.g. for ["Lord","of","the Universe"], put "the Universe"
            # into the conjunctions constant.
            new_piece = ' '.join(pieces[i:i + 2])
            self.C.conjunctions.add(new_piece)
            pieces[i] = new_piece
            pieces.pop(i + 1)
            continue

        new_piece = ' '.join(pieces[i - 1:i + 2])
        if self.is_title(pieces[i - 1]):
            # if the piece before the conjunction is a title, assume the
            # one after it is too and add the two titles with the
            # conjunction between them to the titles constant so the combo
            # we just created gets parsed as a title.
            # e.g. "Mr. and Mrs." becomes a title.
            self.C.titles.add(new_piece)
        pieces[i - 1] = new_piece
        # drop the conjunction and the piece that followed it
        pieces.pop(i)
        pieces.pop(i)

    # join prefixes to following lastnames: ['de la Vega'], ['van Buren']
    prefixes = [p for p in pieces if self.is_prefix(p)]
    if prefixes:
        i = pieces.index(prefixes[0])
        # join everything after the prefix until the next suffix. Only
        # positions *after* the prefix are candidates: looking the suffix up
        # by value from the start of the list could land at or before the
        # prefix and splice an empty piece into the result.
        suffix_positions = [
            k for k in range(i + 1, len(pieces)) if self.is_suffix(pieces[k])
        ]
        if suffix_positions:
            j = suffix_positions[0]
            new_piece = ' '.join(pieces[i:j])
            pieces = pieces[:i] + [new_piece] + pieces[j:]
        else:
            new_piece = ' '.join(pieces[i:])
            pieces = pieces[:i] + [new_piece]

    log.debug("pieces: %s", pieces)
    return pieces
def parse_full_name(self):
    """
    The main parse method for the parser. This method is run upon
    assignment to the :py:attr:`full_name` attribute or instantiation.

    Basic flow is to hand off to :py:func:`pre_process` to handle
    nicknames. It then splits on commas and chooses a code path depending
    on the number of commas.

    :py:func:`parse_pieces` then splits those parts on spaces and
    :py:func:`join_on_conjunctions` joins any pieces next to conjunctions.

    The results are stored in the ``title_list``, ``first_list``,
    ``middle_list``, ``last_list``, ``suffix_list`` and ``nickname_list``
    attributes, all of which are reset first. ``unparsable`` is set to
    True up front and cleared at the end unless ``len(self)`` is zero.
    :py:func:`post_process` is always called last.
    """

    self.title_list = []
    self.first_list = []
    self.middle_list = []
    self.last_list = []
    self.suffix_list = []
    self.nickname_list = []
    self.unparsable = True

    self.pre_process()

    self._full_name = self.collapse_whitespace(self._full_name)

    # break up full_name by commas
    parts = [x.strip() for x in self._full_name.split(",")]

    log.debug("full_name: %s", self._full_name)
    log.debug("parts: %s", parts)

    if len(parts) == 1:

        # no commas, title first middle middle middle last suffix
        #            part[0]

        pieces = self.parse_pieces(parts)
        p_len = len(pieces)
        for i, piece in enumerate(pieces):
            nxt = pieces[i + 1] if i + 1 < p_len else None

            # title must have a next piece, unless it's just a title
            if self.is_title(piece) and (nxt or p_len == 1) \
                    and not self.first:
                self.title_list.append(piece)
                continue
            if not self.first:
                self.first_list.append(piece)
                continue
            if self.are_suffixes(pieces[i + 1:]) or \
                    (
                        # if the next piece is the last piece and a roman
                        # numeral but this piece is not an initial
                        self.is_roman_numeral(nxt) and i == p_len - 2
                        and not self.is_an_initial(piece)
                    ):
                # everything after this piece is a suffix, so this piece
                # is the last name and the loop is done
                self.last_list.append(piece)
                self.suffix_list += pieces[i + 1:]
                break
            if not nxt:
                self.last_list.append(piece)
                continue

            self.middle_list.append(piece)
    else:
        # if all the end parts are suffixes and there is more than one
        # piece in the first part. (Suffixes will never appear after last
        # names only, and allows potential first names to be in suffixes,
        # e.g. "Johnson, Bart"
        if self.are_suffixes(parts[1].split(' ')) \
                and len(parts[0].split(' ')) > 1:

            # suffix comma:
            # title first middle last [suffix], suffix [suffix] [, suffix]
            #               parts[0],          parts[1:...]

            self.suffix_list += parts[1:]
            pieces = self.parse_pieces(parts[0].split(' '))
            log.debug("pieces: %s", u(pieces))
            for i, piece in enumerate(pieces):
                nxt = pieces[i + 1] if i + 1 < len(pieces) else None

                if self.is_title(piece) and (nxt or len(pieces) == 1) \
                        and not self.first:
                    self.title_list.append(piece)
                    continue
                if not self.first:
                    self.first_list.append(piece)
                    continue
                if self.are_suffixes(pieces[i + 1:]):
                    self.last_list.append(piece)
                    # suffixes found before the comma go in front of the
                    # ones that came after it
                    self.suffix_list = pieces[i + 1:] + self.suffix_list
                    break
                if not nxt:
                    self.last_list.append(piece)
                    continue
                self.middle_list.append(piece)
        else:

            # lastname comma:
            # last [suffix], title first middles[,] suffix [,suffix]
            #      parts[0],      parts[1],              parts[2:...]

            # Both parts are parsed with one additional part counted, and
            # the part after the comma is parsed first.
            pieces = self.parse_pieces(parts[1].split(' '), 1)

            log.debug("pieces: %s", u(pieces))

            # lastname part may have suffixes in it
            lastname_pieces = self.parse_pieces(parts[0].split(' '), 1)
            for piece in lastname_pieces:
                # the first one is always a last name, even if it looks
                # like a suffix
                if self.is_suffix(piece) and len(self.last_list) > 0:
                    self.suffix_list.append(piece)
                else:
                    self.last_list.append(piece)

            for i, piece in enumerate(pieces):
                nxt = pieces[i + 1] if i + 1 < len(pieces) else None

                if self.is_title(piece) and (nxt or len(pieces) == 1) \
                        and not self.first:
                    self.title_list.append(piece)
                    continue
                if not self.first:
                    self.first_list.append(piece)
                    continue
                if self.is_suffix(piece):
                    self.suffix_list.append(piece)
                    continue
                self.middle_list.append(piece)

            # anything after a second comma is treated as suffixes, unless
            # the part right after that comma is empty
            if len(parts) > 2 and parts[2]:
                self.suffix_list += parts[2:]

    # A length is never negative, so the check has to be against zero for
    # the unparsable branch to be reachable at all.
    if len(self) == 0:
        log.info("Unparsable: \"%s\" ", self.original)
    else:
        self.unparsable = False
    self.post_process()
def join_on_conjunctions(self, pieces, additional_parts_count=0):
    """
    Join conjunctions to surrounding pieces. Title- and prefix-aware. e.g.:

        ['Mr.', 'and', 'Mrs.', 'John', 'Doe'] ==>
                        ['Mr. and Mrs.', 'John', 'Doe']

        ['The', 'Secretary', 'of', 'State', 'Hillary', 'Clinton'] ==>
                        ['The Secretary of State', 'Hillary', 'Clinton']

    When joining titles, saves newly formed piece to the instance's titles
    constant so they will be parsed correctly later. E.g. after parsing the
    example names above, 'The Secretary of State' and 'Mr. and Mrs.' would
    be present in the titles constant set.

    :param list pieces: name pieces strings after split on spaces.
        Conjunction joins modify this list in place.
    :param int additional_parts_count: extra count added to the number of
        pieces when the length thresholds below are checked
    :return: list with piece next to conjunctions merged into one piece
        with spaces in it.
    :rtype: list
    """
    length = len(pieces) + additional_parts_count
    # don't join on conjunctions if there's only 2 parts
    if length < 3:
        return pieces

    rootname_pieces = [p for p in pieces if self.is_rootname(p)]
    total_length = len(rootname_pieces) + additional_parts_count

    # find all the conjunctions, join any conjunctions that are next to each
    # other, then join those newly joined conjunctions and any single
    # conjunctions to the piece before and after it
    conj_index = [
        i for i, piece in enumerate(pieces) if self.is_conjunction(piece)
    ]

    contiguous_conj_i = group_contiguous_integers(conj_index)

    delete_i = []
    for i in contiguous_conj_i:
        if isinstance(i, tuple):
            # a (first, last) pair: merge that whole run into its first slot
            new_piece = " ".join(pieces[i[0]:i[1] + 1])
            delete_i += list(range(i[0] + 1, i[1] + 1))
            pieces[i[0]] = new_piece
        else:
            # a bare index: merge the piece with the one right after it
            new_piece = " ".join(pieces[i:i + 2])
            delete_i += [i + 1]
            pieces[i] = new_piece
        # add newly joined conjunctions to constants to be found later
        self.C.conjunctions.add(new_piece)

    for i in reversed(delete_i):
        # delete pieces in reverse order or the index changes on each delete
        del pieces[i]

    if len(pieces) == 1:
        # if there's only one piece left, nothing left to do
        return pieces

    # refresh conjunction index locations
    conj_index = [
        i for i, piece in enumerate(pieces) if self.is_conjunction(piece)
    ]

    # NOTE: entries of conj_index that have not been visited yet are
    # rewritten inside the loop so they keep pointing at the right piece
    # after each join shortens ``pieces``.
    for i in conj_index:
        if len(pieces[i]) == 1 and total_length < 4:
            # if there are only 3 total parts (minus known titles, suffixes
            # and prefixes) and this conjunction is a single letter, prefer
            # treating it as an initial rather than a conjunction.
            # http://code.google.com/p/python-nameparser/issues/detail?id=11
            continue

        if i == 0:
            # leading conjunction: nothing before it, join to the next piece
            new_piece = " ".join(pieces[i:i + 2])
            if self.is_title(pieces[i + 1]):
                # when joining to a title, make new_piece a title too
                self.C.titles.add(new_piece)
            pieces[i] = new_piece
            pieces.pop(i + 1)
            rm_count = 1
        else:
            new_piece = " ".join(pieces[i - 1:i + 2])
            if self.is_title(pieces[i - 1]):
                # when joining to a title, make new_piece a title too
                self.C.titles.add(new_piece)
            pieces[i - 1] = new_piece
            pieces.pop(i)
            rm_count = 2
            try:
                pieces.pop(i)
            except IndexError:
                # the conjunction was the last piece, so there was no
                # following piece to remove
                rm_count = 1

        # subtract the number of removed pieces from the index
        # of all the remaining conjunctions
        for j, val in enumerate(conj_index):
            if val > i:
                conj_index[j] = val - rm_count

    # join prefixes to following lastnames: ['de la Vega'], ['van Buren']
    prefixes = [p for p in pieces if self.is_prefix(p)]
    if prefixes:
        i = pieces.index(prefixes[0])
        # join everything after the prefix until the next suffix. Only
        # positions *after* the prefix are candidates: looking the suffix up
        # by value from the start of the list could land at or before the
        # prefix and splice an empty piece into the result.
        suffix_positions = [
            k for k in range(i + 1, len(pieces)) if self.is_suffix(pieces[k])
        ]
        if suffix_positions:
            j = suffix_positions[0]
            new_piece = ' '.join(pieces[i:j])
            pieces = pieces[:i] + [new_piece] + pieces[j:]
        else:
            new_piece = ' '.join(pieces[i:])
            pieces = pieces[:i] + [new_piece]

    log.debug("pieces: %s", pieces)
    return pieces
def join_on_conjunctions(self, pieces, additional_parts_count=0):
    """
    Join conjunctions to surrounding pieces. Title- and prefix-aware. e.g.:

        ['Mr.', 'and', 'Mrs.', 'John', 'Doe'] ==>
                        ['Mr. and Mrs.', 'John', 'Doe']

        ['The', 'Secretary', 'of', 'State', 'Hillary', 'Clinton'] ==>
                        ['The Secretary of State', 'Hillary', 'Clinton']

    When joining titles, saves newly formed piece to the instance's titles
    constant so they will be parsed correctly later. E.g. after parsing the
    example names above, 'The Secretary of State' and 'Mr. and Mrs.' would
    be present in the titles constant set.

    :param list pieces: name pieces strings after split on spaces.
        Conjunction joins modify this list in place.
    :param int additional_parts_count: extra count added to the number of
        pieces when the length thresholds below are checked
    :return: list with piece next to conjunctions merged into one piece
        with spaces in it.
    :rtype: list
    """
    length = len(pieces) + additional_parts_count
    # don't join on conjunctions if there's only 2 parts
    if length < 3:
        return pieces

    rootname_pieces = [p for p in pieces if self.is_rootname(p)]
    total_length = len(rootname_pieces) + additional_parts_count

    # find all the conjunctions, join any conjunctions that are next to each
    # other, then join those newly joined conjunctions and any single
    # conjunctions to the piece before and after it
    conj_index = [
        i for i, piece in enumerate(pieces) if self.is_conjunction(piece)
    ]

    contiguous_conj_i = group_contiguous_integers(conj_index)

    delete_i = []
    for i in contiguous_conj_i:
        if isinstance(i, tuple):
            # a (first, last) pair: merge that whole run into its first slot
            new_piece = " ".join(pieces[i[0]:i[1] + 1])
            delete_i += list(range(i[0] + 1, i[1] + 1))
            pieces[i[0]] = new_piece
        else:
            # a bare index: merge the piece with the one right after it
            new_piece = " ".join(pieces[i:i + 2])
            delete_i += [i + 1]
            pieces[i] = new_piece
        # add newly joined conjunctions to constants to be found later
        self.C.conjunctions.add(new_piece)

    for i in reversed(delete_i):
        # delete pieces in reverse order or the index changes on each delete
        del pieces[i]

    if len(pieces) == 1:
        # if there's only one piece left, nothing left to do
        return pieces

    # refresh conjunction index locations
    conj_index = [
        i for i, piece in enumerate(pieces) if self.is_conjunction(piece)
    ]

    # NOTE: entries of conj_index that have not been visited yet are
    # rewritten inside the loop so they keep pointing at the right piece
    # after each join shortens ``pieces``.
    for i in conj_index:
        if len(pieces[i]) == 1 and total_length < 4:
            # if there are only 3 total parts (minus known titles, suffixes
            # and prefixes) and this conjunction is a single letter, prefer
            # treating it as an initial rather than a conjunction.
            # http://code.google.com/p/python-nameparser/issues/detail?id=11
            continue

        if i == 0:
            # leading conjunction: nothing before it, join to the next piece
            new_piece = " ".join(pieces[i:i + 2])
            if self.is_title(pieces[i + 1]):
                # when joining to a title, make new_piece a title too
                self.C.titles.add(new_piece)
            pieces[i] = new_piece
            pieces.pop(i + 1)
            rm_count = 1
        else:
            new_piece = " ".join(pieces[i - 1:i + 2])
            if self.is_title(pieces[i - 1]):
                # when joining to a title, make new_piece a title too
                self.C.titles.add(new_piece)
            pieces[i - 1] = new_piece
            pieces.pop(i)
            rm_count = 2
            try:
                pieces.pop(i)
            except IndexError:
                # the conjunction was the last piece, so there was no
                # following piece to remove
                rm_count = 1

        # subtract the number of removed pieces from the index
        # of all the remaining conjunctions
        for j, val in enumerate(conj_index):
            if val > i:
                conj_index[j] = val - rm_count

    # join prefixes to following lastnames: ['de la Vega'], ['van Buren']
    prefixes = [p for p in pieces if self.is_prefix(p)]
    for prefix in prefixes:
        try:
            i = pieces.index(prefix)
        except ValueError:
            # If the prefix is no longer in pieces, it's because it has been
            # combined with the prefix that appears right before (or before
            # that when chained together) in the last loop, so the index of
            # that newly created piece is the same as in the last loop,
            # i==i still, and we want to join it to the next piece.
            pass

        # Candidate stop positions are looked up by index, strictly after
        # ``i``. Looking them up by value from the start of the list would
        # find an earlier duplicate (e.g. the same prefix twice) and splice
        # an empty piece into the result.
        after = range(i + 1, len(pieces))

        # join everything after the prefix until the next prefix or suffix
        prefix_positions = [k for k in after if self.is_prefix(pieces[k])]
        if prefix_positions:
            j = prefix_positions[0]
            if j == i + 1:
                # if there are two prefixes in sequence, join to the
                # following piece
                j += 1
            new_piece = ' '.join(pieces[i:j])
            pieces = pieces[:i] + [new_piece] + pieces[j:]
            continue

        # if there are no more prefixes, look for a suffix to stop at
        suffix_positions = [k for k in after if self.is_suffix(pieces[k])]
        if suffix_positions:
            j = suffix_positions[0]
            new_piece = ' '.join(pieces[i:j])
            pieces = pieces[:i] + [new_piece] + pieces[j:]
        else:
            # if there were no suffixes, nothing to stop at so join all
            # remaining pieces
            new_piece = ' '.join(pieces[i:])
            pieces = pieces[:i] + [new_piece]

    log.debug("pieces: %s", pieces)
    return pieces
def parse_full_name(self):
    """
    The main parse method for the parser. This method is run upon
    assignment to the :py:attr:`full_name` attribute or instantiation.

    Basic flow is to hand off to :py:func:`pre_process` to handle
    nicknames. It then splits on commas and chooses a code path depending
    on the number of commas.

    :py:func:`parse_pieces` then splits those parts on spaces and
    :py:func:`join_on_conjunctions` joins any pieces next to conjunctions.

    The results are stored in the ``title_list``, ``first_list``,
    ``middle_list``, ``last_list``, ``suffix_list`` and ``nickname_list``
    attributes, all of which are reset first. ``unparsable`` is set to
    True up front and cleared at the end unless ``len(self)`` is zero.
    :py:func:`post_process` is always called last.
    """

    self.title_list = []
    self.first_list = []
    self.middle_list = []
    self.last_list = []
    self.suffix_list = []
    self.nickname_list = []
    self.unparsable = True

    self.pre_process()

    self._full_name = self.collapse_whitespace(self._full_name)

    # break up full_name by commas
    parts = [x.strip() for x in self._full_name.split(",")]

    log.debug("full_name: %s", self._full_name)
    log.debug("parts: %s", parts)

    if len(parts) == 1:

        # no commas, title first middle middle middle last suffix
        #            part[0]

        pieces = self.parse_pieces(parts)
        p_len = len(pieces)
        for i, piece in enumerate(pieces):
            nxt = pieces[i + 1] if i + 1 < p_len else None

            # title must have a next piece, unless it's just a title
            if self.is_title(piece) \
                    and (nxt or p_len == 1) \
                    and not self.first:
                self.title_list.append(piece)
                continue
            if not self.first:
                if p_len == 1 and self.nickname:
                    # a single piece alongside a nickname is stored as the
                    # last name rather than the first name
                    self.last_list.append(piece)
                    continue
                self.first_list.append(piece)
                continue
            if self.are_suffixes(pieces[i + 1:]) or \
                    (
                        # if the next piece is the last piece and a roman
                        # numeral but this piece is not an initial
                        self.is_roman_numeral(nxt) and i == p_len - 2
                        and not self.is_an_initial(piece)
                    ):
                # everything after this piece is a suffix, so this piece
                # is the last name and the loop is done
                self.last_list.append(piece)
                self.suffix_list += pieces[i + 1:]
                break
            if not nxt:
                self.last_list.append(piece)
                continue

            self.middle_list.append(piece)
    else:
        # if all the end parts are suffixes and there is more than one piece
        # in the first part. (Suffixes will never appear after last names
        # only, and allows potential first names to be in suffixes, e.g.
        # "Johnson, Bart"
        if self.are_suffixes(parts[1].split(' ')) \
                and len(parts[0].split(' ')) > 1:

            # suffix comma:
            # title first middle last [suffix], suffix [suffix] [, suffix]
            #               parts[0],          parts[1:...]

            self.suffix_list += parts[1:]
            pieces = self.parse_pieces(parts[0].split(' '))
            log.debug("pieces: %s", u(pieces))
            for i, piece in enumerate(pieces):
                nxt = pieces[i + 1] if i + 1 < len(pieces) else None

                if self.is_title(piece) \
                        and (nxt or len(pieces) == 1) \
                        and not self.first:
                    self.title_list.append(piece)
                    continue
                if not self.first:
                    self.first_list.append(piece)
                    continue
                if self.are_suffixes(pieces[i + 1:]):
                    self.last_list.append(piece)
                    # suffixes found before the comma go in front of the
                    # ones that came after it
                    self.suffix_list = pieces[i + 1:] + self.suffix_list
                    break
                if not nxt:
                    self.last_list.append(piece)
                    continue
                self.middle_list.append(piece)
        else:

            # lastname comma:
            # last [suffix], title first middles[,] suffix [,suffix]
            #      parts[0],      parts[1],              parts[2:...]

            # Both parts are parsed with one additional part counted, and
            # the part after the comma is parsed first.
            pieces = self.parse_pieces(parts[1].split(' '), 1)

            log.debug("pieces: %s", u(pieces))

            # lastname part may have suffixes in it
            lastname_pieces = self.parse_pieces(parts[0].split(' '), 1)
            for piece in lastname_pieces:
                # the first one is always a last name, even if it looks like
                # a suffix
                if self.is_suffix(piece) and len(self.last_list) > 0:
                    self.suffix_list.append(piece)
                else:
                    self.last_list.append(piece)

            for i, piece in enumerate(pieces):
                nxt = pieces[i + 1] if i + 1 < len(pieces) else None

                if self.is_title(piece) \
                        and (nxt or len(pieces) == 1) \
                        and not self.first:
                    self.title_list.append(piece)
                    continue
                if not self.first:
                    self.first_list.append(piece)
                    continue
                if self.is_suffix(piece):
                    self.suffix_list.append(piece)
                    continue
                self.middle_list.append(piece)

            # anything after a second comma is treated as suffixes, unless
            # the part right after that comma is empty
            if len(parts) > 2 and parts[2]:
                self.suffix_list += parts[2:]

    # A length is never negative, so the check has to be against zero for
    # the unparsable branch to be reachable at all.
    if len(self) == 0:
        log.info("Unparsable: \"%s\" ", self.original)
    else:
        self.unparsable = False
    self.post_process()
def join_on_conjunctions(self, pieces, additional_parts_count=0):
    """
    Join conjunctions to surrounding pieces, e.g.:
    ['Mr. and Mrs.'], ['King of the Hill'], ['Jack and Jill'], ['Velasquez y Garcia']

    After the conjunctions are handled, the first piece recognized by
    ``self.is_prefix`` is joined to the pieces that follow it, up to the
    next suffix: ['de la Vega'], ['van Buren'].

    Newly joined pieces are added to ``self.C.conjunctions`` or
    ``self.C.titles`` so that they are recognized as such later on.

    :param list pieces: name pieces strings after split on spaces.
        Conjunction joins modify this list in place.
    :param int additional_parts_count: extra count added to the number of
        pieces when the length thresholds below are checked
    :return: list with piece next to conjunctions merged into one piece
        with spaces in it.
    :rtype: list
    """
    length = len(pieces) + additional_parts_count
    # don't join on conjunctions if there's only 2 parts
    if length < 3:
        return pieces

    # Loop through the conjunctions backwards, starting at the end of the
    # list. The filter walks a reversed *copy*, so the joins performed on
    # ``pieces`` inside the loop do not disturb the iteration.
    for conj in filter(self.is_conjunction, pieces[::-1]):

        rootname_pieces = [p for p in pieces if self.is_rootname(p)]
        total_length = len(rootname_pieces) + additional_parts_count
        if len(conj) == 1 and total_length < 4:
            # if there are only 3 total parts (minus known titles, suffixes
            # and prefixes) and this conjunction is a single letter, prefer
            # treating it as an initial rather than a conjunction.
            # http://code.google.com/p/python-nameparser/issues/detail?id=11
            continue

        try:
            i = pieces.index(conj)
        except ValueError:
            # The conjunction can only be missing once an earlier iteration
            # has merged it into another piece, so ``i`` is bound here and
            # holds the index used by that earlier iteration.
            log.error(
                "Couldn't find '%s' in pieces. i=%s, pieces=%s",
                conj, i, pieces)
            continue

        if i >= len(pieces) - 1:
            # a conjunction in last position has nothing after it to join
            continue

        if i == 0:
            # if this is the first piece and it's a conjunction, join it to
            # the following piece; the result is a title when that piece is
            # a title, otherwise it is remembered as a conjunction
            nxt = pieces[i + 1]
            const = self.C.conjunctions
            if self.is_title(nxt):
                const = self.C.titles
            new_piece = ' '.join(pieces[0:2])
            const.add(new_piece)
            pieces[i] = new_piece
            pieces.pop(i + 1)
            continue

        if self.is_conjunction(pieces[i - 1]):
            # if the piece in front of this one is a conjunction too,
            # add new_piece (this conjunction and the following piece)
            # to the conjunctions constant so that it is recognized
            # as a conjunction in the next loop.
            # e.g. for ["Lord","of","the Universe"], put "the Universe"
            # into the conjunctions constant.
            new_piece = ' '.join(pieces[i:i + 2])
            self.C.conjunctions.add(new_piece)
            pieces[i] = new_piece
            pieces.pop(i + 1)
            continue

        new_piece = ' '.join(pieces[i - 1:i + 2])
        if self.is_title(pieces[i - 1]):
            # if the piece before the conjunction is a title, assume the
            # one after it is too and add the two titles with the
            # conjunction between them to the titles constant so the combo
            # we just created gets parsed as a title.
            # e.g. "Mr. and Mrs." becomes a title.
            self.C.titles.add(new_piece)
        pieces[i - 1] = new_piece
        # drop the conjunction and the piece that followed it
        pieces.pop(i)
        pieces.pop(i)

    # join prefixes to following lastnames: ['de la Vega'], ['van Buren']
    prefixes = [p for p in pieces if self.is_prefix(p)]
    if prefixes:
        i = pieces.index(prefixes[0])
        # join everything after the prefix until the next suffix. Only
        # positions *after* the prefix are candidates: looking the suffix up
        # by value from the start of the list could land at or before the
        # prefix and splice an empty piece into the result.
        suffix_positions = [
            k for k in range(i + 1, len(pieces)) if self.is_suffix(pieces[k])
        ]
        if suffix_positions:
            j = suffix_positions[0]
            new_piece = ' '.join(pieces[i:j])
            pieces = pieces[:i] + [new_piece] + pieces[j:]
        else:
            new_piece = ' '.join(pieces[i:])
            pieces = pieces[:i] + [new_piece]

    log.debug("pieces: %s", pieces)
    return pieces