Example #1
0
    def extract_name_prefix(last):
        u'''
        Separate the leading word(s) of a surname from the remainder.

        Returns the single-piece list unchanged when the surname cannot be
        split. Otherwise returns ``[prefix, rest]`` where ``prefix`` is the
        first word followed by any immediately following lowercase words, and
        ``rest`` is whatever is left (always at least the final piece).
        '''
        pieces = split_tex_string(last, 1)
        if len(pieces) == 1:
            return pieces

        prefix = pieces[0]
        remainder = pieces[1]

        while True:
            # Peek at the next word of the remainder; it only joins the prefix
            # when it is lowercase and is not the final piece.
            lookahead = split_tex_string(remainder, 1)
            if len(lookahead) <= 1 or not lookahead[0].islower():
                break
            prefix = u' '.join((prefix, lookahead[0]))
            remainder = lookahead[1]

        return [prefix, remainder]
Example #2
0
 def extract_middle_names(first):
     u'''
     Split the forenames once, giving the leading piece and, when present,
     everything after it as a second piece.
     '''
     # NOTE(review): the second argument is presumably a maximum split count;
     # confirm against split_tex_string's definition.
     forenames = split_tex_string(first, 1)
     return forenames
Example #3
0
def tokenize_name(name_str):
    u'''
    Takes a string representing a name and returns a NameResult breaking that
    string into its component parts, as defined in the LaTeX book and BibTeXing.

    The supported formats are thus:

    First von Last
    von Last, First
    von Last, Jr, First

    We try to follow the rules in BibTeXing relatively strictly, meaning that the
    first of these formats can result in unexpected results because it is more
    ambiguous with complex names.

    In the two comma-separated formats the leading word(s) of the surname are
    reported as a prefix (the "von" part) only when they are lowercase, and
    the final surname word is never absorbed into the prefix.

    :param name_str: the name to tokenize; surrounding whitespace is ignored.
    :returns: a NameResult built positionally from: the first forename, the
        remaining forenames, the surname prefix, the surname, and the
        generation suffix. Components that are absent are empty strings.
    :raises ValueError: if the name does not split into one, two or three
        comma-separated parts.
    '''

    def extract_middle_names(first):
        # Split the forenames once: [first word] or [first word, the rest].
        return split_tex_string(first, 1)

    def extract_name_prefix(last):
        # Returns [last] when it cannot be split, otherwise [prefix, rest]:
        # prefix is the first word plus any directly following lowercase
        # words; rest always keeps at least the final piece.
        names = split_tex_string(last, 1)
        if len(names) == 1:
            return names

        result = [names[0]]

        new_names = split_tex_string(names[1], 1)
        while len(new_names) > 1 and new_names[0].islower():
            result[0] = u' '.join((result[0], new_names[0]))
            names = new_names
            new_names = split_tex_string(names[1], 1)

        result.append(names[1])

        return result

    def split_prefix_and_last(lastnames):
        # Turn the output of extract_name_prefix into (prefix, last) for the
        # comma-separated formats. The leading part only counts as a prefix
        # when it is lowercase; otherwise it belongs to the surname itself
        # (e.g. "Smith Jones"). The final part is always kept as the surname,
        # so an all-lowercase surname never ends up with an empty last name.
        if len(lastnames) > 1:
            if lastnames[0].islower():
                return lastnames[0], u' '.join(lastnames[1:])
            return '', u' '.join(lastnames)
        return '', lastnames[0]

    name_str = name_str.strip()

    parts = split_tex_string(name_str, sep=r',[\s~]*')
    if len(parts) == 1:
        # first last
        # reverse the string so split only selects the right-most instance of the token
        try:
            last, first = [part[::-1] for part in split_tex_string(parts[0][::-1], 1)]
        except ValueError:
            # unpacking failed because the split produced one piece:
            # we only have a single name
            return NameResult(
                parts[0],
                '', '', '', ''
            )

        # because of our splitting method, van, von, della, etc. may end up at the end of the first name field
        first_parts = split_tex_string(first)
        if len(first_parts) > 1:
            # Scan the forenames right-to-left (i is the 1-based distance from
            # the end) for the right-most contiguous run of lowercase words;
            # lower_name_index ends up at the left edge of that run.
            lower_name_index = None
            for i, part in enumerate(first_parts[::-1], 1):
                if part.islower():
                    if lower_name_index is None or lower_name_index == i - 1:
                        lower_name_index = i
                    else:
                        # a lowercase word separated from the run by a
                        # non-lowercase one: stop extending
                        break
            if lower_name_index is not None:
                # Everything from the start of the run onwards (including any
                # capitalised words after it) moves into the surname.
                last = u' '.join((
                    u' '.join(first_parts[-lower_name_index:]),
                    last
                ))
                first = u' '.join(first_parts[:-lower_name_index])

        forenames = extract_middle_names(first)
        lastnames = extract_name_prefix(last)
        return NameResult(
            forenames[0] if len(forenames) > 0 else '',
            forenames[1] if len(forenames) > 1 else '',
            lastnames[0] if len(lastnames) > 1 else '',
            lastnames[1] if len(lastnames) > 1 else lastnames[0],
            ''
        )
    elif len(parts) == 2:
        # last, first
        last, first = parts

        # for consistency with spaces being stripped in first last format
        first = u' '.join(split_tex_string(first))
        last = u' '.join(split_tex_string(last))

        forenames = extract_middle_names(first)
        prefix, surname = split_prefix_and_last(extract_name_prefix(last))

        return NameResult(
            forenames[0] if len(forenames) > 0 else '',
            forenames[1] if len(forenames) > 1 else '',
            prefix,
            surname,
            ''
        )
    elif len(parts) == 3:
        # last, generation, first
        last, generation, first = parts
        forenames = extract_middle_names(first)
        # Same prefix rule as the two-part format, so a capitalised leading
        # surname word is not misreported as a prefix.
        prefix, surname = split_prefix_and_last(extract_name_prefix(last))
        return NameResult(
            forenames[0] if len(forenames) > 0 else '',
            forenames[1] if len(forenames) > 1 else '',
            prefix,
            surname,
            generation
        )
    else:
        raise ValueError(u'Unrecognised name format for "{0}"'.format(name_str))