コード例 #1
0
ファイル: parser.py プロジェクト: liormagen/python-nameparser
    def join_on_conjunctions(self, pieces, additional_parts_count=0):
        """
        Join conjunctions to surrounding pieces, e.g.:
        ['Mr. and Mrs.'], ['King of the Hill'], ['Jack and Jill'], ['Velasquez y Garcia']

        Afterwards the first prefix in the list is joined to the pieces that
        follow it, stopping before the next suffix, e.g. ['de la Vega'],
        ['van Buren'].

        Side effects: ``pieces`` is modified in place while conjunctions are
        joined, and newly joined pieces are added to ``self.C.conjunctions``
        or ``self.C.titles`` so that they are recognised later on.

        :param list pieces: name pieces strings after split on spaces
        :param int additional_parts_count: added to the piece counts that the
            length checks below compare against
        :return: list with pieces next to conjunctions merged into one piece
            with spaces in it (may be the list object that was passed in).
        :rtype: list

        """
        length = len(pieces) + additional_parts_count
        # don't join on conjunctions if there's only 2 parts
        if length < 3:
            return pieces

        # Walk the conjunctions from the end of the list backwards. The
        # reversed slice is a snapshot, so mutating ``pieces`` in the loop
        # body does not disturb the iteration.
        for conj in filter(self.is_conjunction, pieces[::-1]):
            rootname_pieces = [p for p in pieces if self.is_rootname(p)]
            total_length = len(rootname_pieces) + additional_parts_count
            if len(conj) == 1 and total_length < 4:
                # if there are only 3 total parts (minus known titles, suffixes and prefixes)
                # and this conjunction is a single letter, prefer treating it as an initial
                # rather than a conjunction.
                # http://code.google.com/p/python-nameparser/issues/detail?id=11
                continue

            try:
                i = pieces.index(conj)
            except ValueError:
                # The conjunction was already merged into a bigger piece by an
                # earlier iteration; ``i`` still holds the index found by the
                # previous successful lookup.
                log.error("Couldn't find '%s' in pieces. i=%s, pieces=%s",
                          conj, i, pieces)
                continue

            if i >= len(pieces) - 1:
                # a conjunction in last position has nothing after it to join
                continue

            if i == 0:
                # the first piece is a conjunction: join it to the next piece
                # and remember the result as a title when the next piece is
                # one, otherwise as a conjunction
                nxt = pieces[i + 1]
                new_piece = ' '.join(pieces[0:2])
                if self.is_title(nxt):
                    self.C.titles.add(new_piece)
                else:
                    self.C.conjunctions.add(new_piece)
                pieces[i] = new_piece
                pieces.pop(i + 1)
                continue

            if self.is_conjunction(pieces[i - 1]):
                # if the piece in front of this one is a conjunction too,
                # add new_piece (this conjunction and the following piece)
                # to the conjunctions constant so that it is recognized
                # as a conjunction in the next loop.
                # e.g. for ["Lord","of","the Universe"], put "the Universe"
                # into the conjunctions constant.
                new_piece = ' '.join(pieces[i:i + 2])
                self.C.conjunctions.add(new_piece)
                pieces[i] = new_piece
                pieces.pop(i + 1)
                continue

            new_piece = ' '.join(pieces[i - 1:i + 2])
            if self.is_title(pieces[i - 1]):
                # if the piece before the conjunction is a title, assume the
                # one after it is too and add the joined piece to the titles
                # constant so the combo we just created gets parsed as a title.
                # e.g. "Mr. and Mrs." becomes a title.
                self.C.titles.add(new_piece)

            # replace [before, conjunction, after] with the joined piece
            pieces[i - 1:i + 2] = [new_piece]

        # join prefixes to following lastnames: ['de la Vega'], ['van Buren']
        first_prefix = next(
            (k for k, piece in enumerate(pieces) if self.is_prefix(piece)),
            None)
        if first_prefix is not None:
            i = first_prefix
            # Join everything after the prefix until the next suffix. The
            # suffix is searched by position strictly after the prefix, so an
            # equal string earlier in the list (or the prefix itself) can
            # never be picked as the stopping point. Without a suffix
            # everything up to the end of the list is joined.
            j = next(
                (k for k in range(i + 1, len(pieces))
                 if self.is_suffix(pieces[k])),
                len(pieces))
            new_piece = ' '.join(pieces[i:j])
            pieces = pieces[:i] + [new_piece] + pieces[j:]

        log.debug("pieces: %s", pieces)
        return pieces
コード例 #2
0
ファイル: parser.py プロジェクト: liormagen/python-nameparser
    def parse_full_name(self):
        """
        The main parse method for the parser. This method is run upon assignment to the
        :py:attr:`full_name` attribute or instantiation.

        Basic flow is to hand off to :py:func:`pre_process` to handle nicknames. It
        then splits on commas and chooses a code path depending on the number of commas.
        :py:func:`parse_pieces` then splits those parts on spaces and
        :py:func:`join_on_conjunctions` joins any pieces next to conjunctions.

        All ``*_list`` attributes are reset before parsing and filled in as
        pieces are classified. ``unparsable`` starts as ``True`` and is only
        cleared when ``len(self)`` is non-zero after parsing; finally
        :py:func:`post_process` is called. Returns nothing.
        """

        self.title_list = []
        self.first_list = []
        self.middle_list = []
        self.last_list = []
        self.suffix_list = []
        self.nickname_list = []
        self.unparsable = True

        self.pre_process()

        self._full_name = self.collapse_whitespace(self._full_name)

        # break up full_name by commas
        parts = [x.strip() for x in self._full_name.split(",")]

        log.debug("full_name: %s", self._full_name)
        log.debug("parts: %s", parts)

        if len(parts) == 1:

            # no commas, title first middle middle middle last suffix
            #            part[0]

            pieces = self.parse_pieces(parts)
            p_len = len(pieces)
            for i, piece in enumerate(pieces):
                nxt = pieces[i + 1] if i + 1 < p_len else None

                # title must have a next piece, unless it's just a title
                if self.is_title(piece) and (nxt or p_len == 1) \
                        and not self.first:
                    self.title_list.append(piece)
                    continue
                if not self.first:
                    self.first_list.append(piece)
                    continue
                if self.are_suffixes(pieces[i + 1:]) or (
                        # if the next piece is the last piece and a roman numeral
                        # but this piece is not an initial
                        self.is_roman_numeral(nxt) and i == p_len - 2
                        and not self.is_an_initial(piece)):
                    self.last_list.append(piece)
                    self.suffix_list += pieces[i + 1:]
                    break
                if not nxt:
                    self.last_list.append(piece)
                    continue

                self.middle_list.append(piece)

        elif self.are_suffixes(parts[1].split(' ')) \
                and len(parts[0].split(' ')) > 1:
            # all the end parts are suffixes and there is more than one piece in
            # the first part. (Suffixes will never appear after last names only, and
            # allows potential first names to be in suffixes, e.g. "Johnson, Bart")

            # suffix comma: title first middle last [suffix], suffix [suffix] [, suffix]
            #               parts[0],                         parts[1:...]

            self.suffix_list += parts[1:]
            pieces = self.parse_pieces(parts[0].split(' '))
            log.debug("pieces: %s", u(pieces))
            p_len = len(pieces)
            for i, piece in enumerate(pieces):
                nxt = pieces[i + 1] if i + 1 < p_len else None

                if self.is_title(piece) and (nxt or p_len == 1) \
                        and not self.first:
                    self.title_list.append(piece)
                    continue
                if not self.first:
                    self.first_list.append(piece)
                    continue
                if self.are_suffixes(pieces[i + 1:]):
                    self.last_list.append(piece)
                    # suffixes found before the comma go in front of the
                    # ones that followed it
                    self.suffix_list = pieces[i + 1:] + self.suffix_list
                    break
                if not nxt:
                    self.last_list.append(piece)
                    continue
                self.middle_list.append(piece)

        else:

            # lastname comma: last [suffix], title first middles[,] suffix [,suffix]
            #                 parts[0],      parts[1],              parts[2:...]
            pieces = self.parse_pieces(parts[1].split(' '), 1)

            log.debug("pieces: %s", u(pieces))

            # lastname part may have suffixes in it
            lastname_pieces = self.parse_pieces(parts[0].split(' '), 1)
            for piece in lastname_pieces:
                # the first one is always a last name, even if it looks like a suffix
                if self.is_suffix(piece) and len(self.last_list) > 0:
                    self.suffix_list.append(piece)
                else:
                    self.last_list.append(piece)

            p_len = len(pieces)
            for i, piece in enumerate(pieces):
                nxt = pieces[i + 1] if i + 1 < p_len else None

                if self.is_title(piece) and (nxt or p_len == 1) \
                        and not self.first:
                    self.title_list.append(piece)
                    continue
                if not self.first:
                    self.first_list.append(piece)
                    continue
                if self.is_suffix(piece):
                    self.suffix_list.append(piece)
                    continue
                self.middle_list.append(piece)

            # anything after a second comma is treated as suffixes, but only
            # when the third part is not empty
            if len(parts) > 2 and parts[2]:
                self.suffix_list += parts[2:]

        # len() can never be negative, so the previous ``< 0`` check made this
        # branch unreachable and ``unparsable`` was always cleared.
        # NOTE(review): assumes __len__ counts the non-empty name members --
        # confirm against the class definition.
        if len(self) == 0:
            log.info("Unparsable: \"%s\" ", self.original)
        else:
            self.unparsable = False
        self.post_process()
コード例 #3
0
    def join_on_conjunctions(self, pieces, additional_parts_count=0):
        """
        Join conjunctions to surrounding pieces. Title- and prefix-aware. e.g.:

            ['Mr.', 'and', 'Mrs.', 'John', 'Doe'] ==>
                            ['Mr. and Mrs.', 'John', 'Doe']

            ['The', 'Secretary', 'of', 'State', 'Hillary', 'Clinton'] ==>
                            ['The Secretary of State', 'Hillary', 'Clinton']

        When joining titles, saves newly formed piece to the instance's titles
        constant so they will be parsed correctly later. E.g. after parsing the
        example names above, 'The Secretary of State' and 'Mr. and Mrs.' would
        be present in the titles constant set. Joined runs of conjunctions are
        likewise added to the conjunctions constant.

        Note that ``pieces`` is modified in place while conjunctions are joined.

        :param list pieces: name pieces strings after split on spaces
        :param int additional_parts_count: added to the piece counts that the
            length checks below compare against
        :return: list with pieces next to conjunctions merged into one piece
            with spaces in it (may be the list object that was passed in).
        :rtype: list

        """
        length = len(pieces) + additional_parts_count
        # don't join on conjunctions if there's only 2 parts
        if length < 3:
            return pieces

        rootname_pieces = [p for p in pieces if self.is_rootname(p)]
        total_length = len(rootname_pieces) + additional_parts_count

        # find all the conjunctions, join any conjunctions that are next to each
        # other, then join those newly joined conjunctions and any single
        # conjunctions to the piece before and after it
        conj_index = [
            i for i, piece in enumerate(pieces) if self.is_conjunction(piece)
        ]

        # NOTE(review): the tuple branch below treats each tuple as
        # (first index, last index) of a run of adjacent conjunctions --
        # confirm against group_contiguous_integers.
        contiguous_conj_i = group_contiguous_integers(conj_index)

        delete_i = []
        for group in contiguous_conj_i:
            if isinstance(group, tuple):
                start, end = group[0], group[1]
                new_piece = " ".join(pieces[start:end + 1])
                delete_i += list(range(start + 1, end + 1))
                pieces[start] = new_piece
            else:
                new_piece = " ".join(pieces[group:group + 2])
                delete_i += [group + 1]
                pieces[group] = new_piece
            # add newly joined conjunctions to constants to be found later
            self.C.conjunctions.add(new_piece)

        for i in reversed(delete_i):
            # delete pieces in reverse order or the index changes on each delete
            del pieces[i]

        if len(pieces) == 1:
            # if there's only one piece left, nothing left to do
            return pieces

        # refresh conjunction index locations
        conj_index = [
            i for i, piece in enumerate(pieces) if self.is_conjunction(piece)
        ]

        # conj_index is updated in place inside the loop; the list iterator
        # reads the current value of each slot, so later iterations see the
        # shifted indexes.
        for i in conj_index:
            if len(pieces[i]) == 1 and total_length < 4:
                # if there are only 3 total parts (minus known titles, suffixes
                # and prefixes) and this conjunction is a single letter, prefer
                # treating it as an initial rather than a conjunction.
                # http://code.google.com/p/python-nameparser/issues/detail?id=11
                continue

            if i == 0:
                # leading conjunction: there is nothing before it, so only
                # the following piece is joined
                new_piece = " ".join(pieces[i:i + 2])
                if self.is_title(pieces[i + 1]):
                    # when joining to a title, make new_piece a title too
                    self.C.titles.add(new_piece)
                pieces[i] = new_piece
                pieces.pop(i + 1)
                rm_count = 1
            else:
                new_piece = " ".join(pieces[i - 1:i + 2])
                if self.is_title(pieces[i - 1]):
                    # when joining to a title, make new_piece a title too
                    self.C.titles.add(new_piece)
                pieces[i - 1] = new_piece
                pieces.pop(i)
                if i < len(pieces):
                    # also drop the piece that followed the conjunction
                    pieces.pop(i)
                    rm_count = 2
                else:
                    # the conjunction was the last piece
                    rm_count = 1

            # subtract the number of removed pieces from the index
            # of all the remaining conjunctions
            for j, val in enumerate(conj_index):
                if val > i:
                    conj_index[j] = val - rm_count

        # join prefixes to following lastnames: ['de la Vega'], ['van Buren']
        first_prefix = next(
            (k for k, piece in enumerate(pieces) if self.is_prefix(piece)),
            None)
        if first_prefix is not None:
            i = first_prefix
            # Join everything after the prefix until the next suffix. The
            # suffix is searched by position strictly after the prefix, so an
            # equal string earlier in the list (or the prefix itself) can
            # never be picked as the stopping point. Without a suffix
            # everything up to the end of the list is joined.
            j = next(
                (k for k in range(i + 1, len(pieces))
                 if self.is_suffix(pieces[k])),
                len(pieces))
            new_piece = ' '.join(pieces[i:j])
            pieces = pieces[:i] + [new_piece] + pieces[j:]

        log.debug("pieces: %s", pieces)
        return pieces
コード例 #4
0
ファイル: parser.py プロジェクト: derek73/python-nameparser
    def join_on_conjunctions(self, pieces, additional_parts_count=0):
        """
        Join conjunctions to surrounding pieces. Title- and prefix-aware. e.g.:

            ['Mr.', 'and', 'Mrs.', 'John', 'Doe'] ==>
                            ['Mr. and Mrs.', 'John', 'Doe']

            ['The', 'Secretary', 'of', 'State', 'Hillary', 'Clinton'] ==>
                            ['The Secretary of State', 'Hillary', 'Clinton']

        When joining titles, saves newly formed piece to the instance's titles
        constant so they will be parsed correctly later. E.g. after parsing the
        example names above, 'The Secretary of State' and 'Mr. and Mrs.' would
        be present in the titles constant set. Joined runs of conjunctions are
        likewise added to the conjunctions constant.

        Note that ``pieces`` is modified in place while conjunctions are joined.

        :param list pieces: name pieces strings after split on spaces
        :param int additional_parts_count: added to the piece counts that the
            length checks below compare against
        :return: list with pieces next to conjunctions merged into one piece
            with spaces in it (may be the list object that was passed in).
        :rtype: list

        """
        length = len(pieces) + additional_parts_count
        # don't join on conjunctions if there's only 2 parts
        if length < 3:
            return pieces

        rootname_pieces = [p for p in pieces if self.is_rootname(p)]
        total_length = len(rootname_pieces) + additional_parts_count

        # find all the conjunctions, join any conjunctions that are next to each
        # other, then join those newly joined conjunctions and any single
        # conjunctions to the piece before and after it
        conj_index = [i for i, piece in enumerate(pieces)
                      if self.is_conjunction(piece)]

        # NOTE(review): the tuple branch below treats each tuple as
        # (first index, last index) of a run of adjacent conjunctions --
        # confirm against group_contiguous_integers.
        contiguous_conj_i = group_contiguous_integers(conj_index)

        delete_i = []
        for group in contiguous_conj_i:
            if isinstance(group, tuple):
                start, end = group[0], group[1]
                new_piece = " ".join(pieces[start:end + 1])
                delete_i += list(range(start + 1, end + 1))
                pieces[start] = new_piece
            else:
                new_piece = " ".join(pieces[group:group + 2])
                delete_i += [group + 1]
                pieces[group] = new_piece
            # add newly joined conjunctions to constants to be found later
            self.C.conjunctions.add(new_piece)

        for i in reversed(delete_i):
            # delete pieces in reverse order or the index changes on each delete
            del pieces[i]

        if len(pieces) == 1:
            # if there's only one piece left, nothing left to do
            return pieces

        # refresh conjunction index locations
        conj_index = [i for i, piece in enumerate(pieces)
                      if self.is_conjunction(piece)]

        # conj_index is updated in place inside the loop; the list iterator
        # reads the current value of each slot, so later iterations see the
        # shifted indexes.
        for i in conj_index:
            if len(pieces[i]) == 1 and total_length < 4:
                # if there are only 3 total parts (minus known titles, suffixes
                # and prefixes) and this conjunction is a single letter, prefer
                # treating it as an initial rather than a conjunction.
                # http://code.google.com/p/python-nameparser/issues/detail?id=11
                continue

            if i == 0:
                # leading conjunction: there is nothing before it, so only
                # the following piece is joined
                new_piece = " ".join(pieces[i:i + 2])
                if self.is_title(pieces[i + 1]):
                    # when joining to a title, make new_piece a title too
                    self.C.titles.add(new_piece)
                pieces[i] = new_piece
                pieces.pop(i + 1)
                rm_count = 1
            else:
                new_piece = " ".join(pieces[i - 1:i + 2])
                if self.is_title(pieces[i - 1]):
                    # when joining to a title, make new_piece a title too
                    self.C.titles.add(new_piece)
                pieces[i - 1] = new_piece
                pieces.pop(i)
                if i < len(pieces):
                    # also drop the piece that followed the conjunction
                    pieces.pop(i)
                    rm_count = 2
                else:
                    # the conjunction was the last piece
                    rm_count = 1

            # subtract the number of removed pieces from the index
            # of all the remaining conjunctions
            for j, val in enumerate(conj_index):
                if val > i:
                    conj_index[j] = val - rm_count

        # join prefixes to following lastnames: ['de la Vega'], ['van Buren']
        prefixes = list(filter(self.is_prefix, pieces))
        for prefix in prefixes:
            try:
                i = pieces.index(prefix)
            except ValueError:
                # If the prefix is no longer in pieces, it's because it has been
                # combined with the prefix that appears right before (or before that when
                # chained together) in the last loop, so the index of that newly created
                # piece is the same as in the last loop, i==i still, and we want to join
                # it to the next piece.
                pass

            # Join everything after the prefix until the next prefix or suffix.
            # Both are searched by position strictly after ``i`` so that an
            # equal string earlier in the list is never picked as the stop.
            tail = range(i + 1, len(pieces))
            j = next((k for k in tail if self.is_prefix(pieces[k])), None)
            if j is not None:
                if j == i + 1:
                    # if there are two prefixes in sequence, join to the following piece
                    j += 1
            else:
                # if there are no more prefixes, look for a suffix to stop at;
                # if there are no suffixes either, join all remaining pieces
                j = next((k for k in tail if self.is_suffix(pieces[k])),
                         len(pieces))
            new_piece = ' '.join(pieces[i:j])
            pieces = pieces[:i] + [new_piece] + pieces[j:]

        log.debug("pieces: %s", pieces)
        return pieces
コード例 #5
0
ファイル: parser.py プロジェクト: derek73/python-nameparser
    def parse_full_name(self):
        """

        The main parse method for the parser. This method is run upon
        assignment to the :py:attr:`full_name` attribute or instantiation.

        Basic flow is to hand off to :py:func:`pre_process` to handle
        nicknames. It then splits on commas and chooses a code path depending
        on the number of commas.

        :py:func:`parse_pieces` then splits those parts on spaces and
        :py:func:`join_on_conjunctions` joins any pieces next to conjunctions.

        All ``*_list`` attributes are reset before parsing and filled in as
        pieces are classified. ``unparsable`` starts as ``True`` and is only
        cleared when ``len(self)`` is non-zero after parsing; finally
        :py:func:`post_process` is called. Returns nothing.
        """

        self.title_list = []
        self.first_list = []
        self.middle_list = []
        self.last_list = []
        self.suffix_list = []
        self.nickname_list = []
        self.unparsable = True

        self.pre_process()

        self._full_name = self.collapse_whitespace(self._full_name)

        # break up full_name by commas
        parts = [x.strip() for x in self._full_name.split(",")]

        log.debug("full_name: %s", self._full_name)
        log.debug("parts: %s", parts)

        if len(parts) == 1:

            # no commas, title first middle middle middle last suffix
            #            part[0]

            pieces = self.parse_pieces(parts)
            p_len = len(pieces)
            for i, piece in enumerate(pieces):
                nxt = pieces[i + 1] if i + 1 < p_len else None

                # title must have a next piece, unless it's just a title
                if self.is_title(piece) \
                        and (nxt or p_len == 1) \
                        and not self.first:
                    self.title_list.append(piece)
                    continue
                if not self.first:
                    if p_len == 1 and self.nickname:
                        # a lone piece next to a nickname is stored as the
                        # last name instead of the first name
                        self.last_list.append(piece)
                        continue
                    self.first_list.append(piece)
                    continue
                if self.are_suffixes(pieces[i + 1:]) or (
                        # if the next piece is the last piece and a roman
                        # numeral but this piece is not an initial
                        self.is_roman_numeral(nxt) and i == p_len - 2
                        and not self.is_an_initial(piece)):
                    self.last_list.append(piece)
                    self.suffix_list += pieces[i + 1:]
                    break
                if not nxt:
                    self.last_list.append(piece)
                    continue

                self.middle_list.append(piece)

        elif self.are_suffixes(parts[1].split(' ')) \
                and len(parts[0].split(' ')) > 1:
            # all the end parts are suffixes and there is more than one piece
            # in the first part. (Suffixes will never appear after last names
            # only, and allows potential first names to be in suffixes, e.g.
            # "Johnson, Bart")

            # suffix comma:
            # title first middle last [suffix], suffix [suffix] [, suffix]
            #               parts[0],          parts[1:...]

            self.suffix_list += parts[1:]
            pieces = self.parse_pieces(parts[0].split(' '))
            log.debug("pieces: %s", u(pieces))
            p_len = len(pieces)
            for i, piece in enumerate(pieces):
                nxt = pieces[i + 1] if i + 1 < p_len else None

                if self.is_title(piece) \
                        and (nxt or p_len == 1) \
                        and not self.first:
                    self.title_list.append(piece)
                    continue
                if not self.first:
                    self.first_list.append(piece)
                    continue
                if self.are_suffixes(pieces[i + 1:]):
                    self.last_list.append(piece)
                    # suffixes found before the comma go in front of the
                    # ones that followed it
                    self.suffix_list = pieces[i + 1:] + self.suffix_list
                    break
                if not nxt:
                    self.last_list.append(piece)
                    continue
                self.middle_list.append(piece)

        else:

            # lastname comma:
            # last [suffix], title first middles[,] suffix [,suffix]
            #      parts[0],      parts[1],              parts[2:...]
            pieces = self.parse_pieces(parts[1].split(' '), 1)

            log.debug("pieces: %s", u(pieces))

            # lastname part may have suffixes in it
            lastname_pieces = self.parse_pieces(parts[0].split(' '), 1)
            for piece in lastname_pieces:
                # the first one is always a last name, even if it looks like
                # a suffix
                if self.is_suffix(piece) and len(self.last_list) > 0:
                    self.suffix_list.append(piece)
                else:
                    self.last_list.append(piece)

            p_len = len(pieces)
            for i, piece in enumerate(pieces):
                nxt = pieces[i + 1] if i + 1 < p_len else None

                if self.is_title(piece) \
                        and (nxt or p_len == 1) \
                        and not self.first:
                    self.title_list.append(piece)
                    continue
                if not self.first:
                    self.first_list.append(piece)
                    continue
                if self.is_suffix(piece):
                    self.suffix_list.append(piece)
                    continue
                self.middle_list.append(piece)

            # anything after a second comma is treated as suffixes, but only
            # when the third part is not empty
            if len(parts) > 2 and parts[2]:
                self.suffix_list += parts[2:]

        # len() can never be negative, so the previous ``< 0`` check made this
        # branch unreachable and ``unparsable`` was always cleared.
        # NOTE(review): assumes __len__ counts the non-empty name members --
        # confirm against the class definition.
        if len(self) == 0:
            log.info("Unparsable: \"%s\" ", self.original)
        else:
            self.unparsable = False
        self.post_process()
コード例 #6
0
ファイル: parser.py プロジェクト: BrettLB/python-nameparser
 def join_on_conjunctions(self, pieces, additional_parts_count=0):
     """
     Join conjunctions to surrounding pieces, e.g.:
     ['Mr. and Mrs.'], ['King of the Hill'], ['Jack and Jill'], ['Velasquez y Garcia']

     Afterwards the first prefix in the list is joined to the pieces that
     follow it, stopping before the next suffix, e.g. ['de la Vega'],
     ['van Buren'].

     Side effects: ``pieces`` is modified in place while conjunctions are
     joined, and newly joined pieces are added to ``self.C.conjunctions``
     or ``self.C.titles`` so that they are recognised later on.

     :param list pieces: name pieces strings after split on spaces
     :param int additional_parts_count: added to the piece counts that the
         length checks below compare against
     :return: list with pieces next to conjunctions merged into one piece
         with spaces in it (may be the list object that was passed in).
     :rtype: list

     """
     length = len(pieces) + additional_parts_count
     # don't join on conjunctions if there's only 2 parts
     if length < 3:
         return pieces

     # Walk the conjunctions from the end of the list backwards. The
     # reversed slice is a snapshot, so mutating ``pieces`` in the loop
     # body does not disturb the iteration.
     for conj in filter(self.is_conjunction, pieces[::-1]):
         rootname_pieces = [p for p in pieces if self.is_rootname(p)]
         total_length = len(rootname_pieces) + additional_parts_count
         if len(conj) == 1 and total_length < 4:
             # if there are only 3 total parts (minus known titles, suffixes and prefixes)
             # and this conjunction is a single letter, prefer treating it as an initial
             # rather than a conjunction.
             # http://code.google.com/p/python-nameparser/issues/detail?id=11
             continue

         try:
             i = pieces.index(conj)
         except ValueError:
             # The conjunction was already merged into a bigger piece by an
             # earlier iteration; ``i`` still holds the index found by the
             # previous successful lookup.
             log.error("Couldn't find '%s' in pieces. i=%s, pieces=%s",
                       conj, i, pieces)
             continue

         if i >= len(pieces) - 1:
             # a conjunction in last position has nothing after it to join
             continue

         if i == 0:
             # the first piece is a conjunction: join it to the next piece
             # and remember the result as a title when the next piece is
             # one, otherwise as a conjunction
             nxt = pieces[i + 1]
             new_piece = ' '.join(pieces[0:2])
             if self.is_title(nxt):
                 self.C.titles.add(new_piece)
             else:
                 self.C.conjunctions.add(new_piece)
             pieces[i] = new_piece
             pieces.pop(i + 1)
             continue

         if self.is_conjunction(pieces[i - 1]):
             # if the piece in front of this one is a conjunction too,
             # add new_piece (this conjunction and the following piece)
             # to the conjunctions constant so that it is recognized
             # as a conjunction in the next loop.
             # e.g. for ["Lord","of","the Universe"], put "the Universe"
             # into the conjunctions constant.
             new_piece = ' '.join(pieces[i:i + 2])
             self.C.conjunctions.add(new_piece)
             pieces[i] = new_piece
             pieces.pop(i + 1)
             continue

         new_piece = ' '.join(pieces[i - 1:i + 2])
         if self.is_title(pieces[i - 1]):
             # if the piece before the conjunction is a title, assume the
             # one after it is too and add the joined piece to the titles
             # constant so the combo we just created gets parsed as a title.
             # e.g. "Mr. and Mrs." becomes a title.
             self.C.titles.add(new_piece)

         # replace [before, conjunction, after] with the joined piece
         pieces[i - 1:i + 2] = [new_piece]

     # join prefixes to following lastnames: ['de la Vega'], ['van Buren']
     first_prefix = next(
         (k for k, piece in enumerate(pieces) if self.is_prefix(piece)),
         None)
     if first_prefix is not None:
         i = first_prefix
         # Join everything after the prefix until the next suffix. The
         # suffix is searched by position strictly after the prefix, so an
         # equal string earlier in the list (or the prefix itself) can
         # never be picked as the stopping point. Without a suffix
         # everything up to the end of the list is joined.
         j = next(
             (k for k in range(i + 1, len(pieces))
              if self.is_suffix(pieces[k])),
             len(pieces))
         new_piece = ' '.join(pieces[i:j])
         pieces = pieces[:i] + [new_piece] + pieces[j:]

     log.debug("pieces: %s", pieces)
     return pieces