def test_keep_emojis(self):
    """Symbols and emoji stay in the parsed name when ``regexes.emoji`` is False."""
    config = Constants()
    # With the emoji regex switched off, the parts below must come back intact.
    config.regexes.emoji = False
    raw_name = "∫≜⩕ Smith😊"
    parsed = HumanName(raw_name, config)
    assert parsed.first == "∫≜⩕"
    assert parsed.last == "Smith😊"
    assert u(parsed) == raw_name
Exemple #2
0
    def __init__(self, full_name="", constants=CONSTANTS, encoding=ENCODING,
                 string_format=None):
        """Store the parser configuration, then parse ``full_name``.

        :param full_name: name string handed to the ``full_name`` setter
        :param constants: configuration object; a value whose type differs
            from that of the module-level ``CONSTANTS`` is replaced by a
            fresh ``Constants()``
        :param encoding: stored unchanged on ``self.ENCODING``
        :param string_format: format override; ``self.C.string_format`` is
            used when this is falsy
        """
        has_config_type = type(constants) is type(CONSTANTS)
        self.C = constants if has_config_type else Constants()

        self.ENCODING = encoding
        if string_format:
            self.string_format = string_format
        else:
            self.string_format = self.C.string_format
        # Keep this last: assigning full_name triggers the parse.
        self.full_name = full_name
Exemple #3
0
def extract_name(info):
    """Parse ``info`` as a personal name and rebuild it as a plain string.

    'Shri' and 'Smt' are registered as extra titles on a fresh ``Constants``
    before parsing.

    :param info: raw name text handed to ``HumanName``
    :return: tuple ``(fullname, name)`` -- the first, middle (only when
        non-empty) and last parts joined by single spaces, followed by the
        ``HumanName`` instance itself
    """
    from nameparser import HumanName
    from nameparser.config import Constants

    title_config = Constants()
    title_config.titles.add('Shri', 'Smt')
    name = HumanName(info, constants=title_config)

    pieces = [name.first, name.last]
    if name.middle != '':
        # Slot the middle name between first and last.
        pieces.insert(1, name.middle)
    return (' '.join(pieces), name)
Exemple #4
0
def _prepare_nameparser_constants():
    """Build the nameparser ``Constants`` used for every parsed name.

    The stock titles are removed and replaced with our own list, and extra
    roman numerals are registered in ``suffix_not_acronyms``. The same
    configuration applies to all names (i.e. instances).

    :return: the configured ``Constants`` instance
    """
    custom_titles = (
        u'Dr', u'Prof', u'Professor', u'Sir', u'Editor', u'Ed', u'Mr',
        u'Mrs', u'Ms', u'Chair', u'Co-Chair', u'Chairs', u'co-Chairs',
    )
    # NOTE(review): u'xi' is absent from this sequence -- confirm whether
    # that is intentional.
    extra_roman_numerals = (
        u'v', u'vi', u'vii', u'viii', u'ix', u'x',
        u'xii', u'xiii', u'xiv', u'xv',
    )

    config = Constants()
    # Empty the stock title set first, then add ours to whatever remove() returns.
    emptied_titles = config.titles.remove(*config.titles)
    emptied_titles.add(*custom_titles)
    config.suffix_not_acronyms.add(*extra_roman_numerals)
    return config
    def __init__(self,
                 full_name="",
                 constants=CONSTANTS,
                 encoding=ENCODING,
                 string_format=None):
        """Store the parser configuration, then parse ``full_name``.

        :param full_name: name string handed to the ``full_name`` setter
        :param constants: configuration object; a falsy value is replaced by
            a fresh ``Constants()``
        :param encoding: stored unchanged on ``self.ENCODING``
        :param string_format: format override; ``self.C.string_format`` is
            used when this is falsy
        """
        self.C = constants or Constants()
        # Only ever set to True here; when the shared module-level CONSTANTS
        # is in use the attribute is left untouched.
        if self.C is not CONSTANTS:
            self.has_own_config = True

        self.ENCODING = encoding
        if string_format:
            self.string_format = string_format
        else:
            self.string_format = self.C.string_format
        self.full_name = full_name
    def __init__(self,
                 full_name="",
                 constants=CONSTANTS,
                 encoding=DEFAULT_ENCODING,
                 string_format=None,
                 initials_format=None,
                 initials_delimiter=None):
        """Store the parser configuration, then parse ``full_name``.

        :param full_name: name string handed to the ``full_name`` setter
        :param constants: configuration object; a value whose type differs
            from that of the module-level ``CONSTANTS`` is replaced by a
            fresh ``Constants()``
        :param encoding: stored unchanged on ``self.encoding``
        :param string_format: falls back to ``self.C.string_format`` when falsy
        :param initials_format: falls back to ``self.C.initials_format`` when
            falsy
        :param initials_delimiter: falls back to
            ``self.C.initials_delimiter`` when falsy (so an empty string also
            selects the configured default)
        """
        has_config_type = type(constants) is type(CONSTANTS)
        self.C = constants if has_config_type else Constants()
        config = self.C

        self.encoding = encoding
        self.string_format = (
            string_format if string_format else config.string_format
        )
        self.initials_format = (
            initials_format if initials_format else config.initials_format
        )
        self.initials_delimiter = (
            initials_delimiter if initials_delimiter
            else config.initials_delimiter
        )
        # Keep this last: assigning full_name triggers the parse.
        self.full_name = full_name
def split_name_str_2(person_str: str) -> Tuple[str, str, str]:
    """Split a person string into title, forename and surname.

    Parsing is done by ``HumanName`` with a fresh ``Constants`` that has
    extra titles and extra last-name prefixes registered.

    :param person_str: raw name text
    :return: tuple ``(title, forename, surname)`` taken from the parsed
        name's ``title``, ``first`` and ``last`` attributes
    """
    from nameparser.config import Constants

    extra_titles = ("Prof.", "Ing.", "B.Sc.", "h.",  "c.", "e.")
    extra_prefixes = (
        "Baronin", "Baron", "Freiherr", "Frhr.",
        "Fürstin", "Fürst", "Gräfin", "Graf",
        "Prinzessin", "Prinz", "von", "van", "de",
        "vom", "zu",
    )

    name_config = Constants()
    name_config.titles.add(*extra_titles)
    name_config.prefixes.add(*extra_prefixes)

    parsed = HumanName(person_str, constants=name_config)
    return parsed.title, parsed.first, parsed.last
Exemple #8
0
    def __init__(self,
                 full_name="",
                 constants=CONSTANTS,
                 encoding=DEFAULT_ENCODING,
                 string_format=None):
        """Store the parser configuration, then parse ``full_name``.

        :param full_name: name string handed to the ``full_name`` setter
        :param constants: configuration object; a value whose type differs
            from that of the module-level ``CONSTANTS`` is replaced by a
            fresh ``Constants()``
        :param encoding: stored unchanged on ``self.encoding``
        :param string_format: format override; ``self.C.string_format`` is
            used when this is falsy
        """
        has_config_type = type(constants) is type(CONSTANTS)
        self.C = constants if has_config_type else Constants()

        self.encoding = encoding
        if string_format:
            self.string_format = string_format
        else:
            self.string_format = self.C.string_format

        # Collect item [1] of every REGEXES entry whose last item is a
        # string containing 'nickname'.
        nickname_patterns = []
        for entry in REGEXES:
            label = entry[-1]
            if isinstance(label, str) and 'nickname' in label:
                nickname_patterns.append(entry[1])
        self._nickname_regexes = nickname_patterns

        # IMPORTANT: the assignment below must stay the last statement of
        # __init__, because the full_name setter triggers the parse.
        self.full_name = full_name
Exemple #9
0
def normalize_author_name(author):
    """Normalize author name.

    The name is parsed with ``HumanName`` using our own title list and
    additional roman-numeral suffixes, then re-assembled as
    ``last, first middle, suffix`` with empty parts left out.

    :param author: author name
    :type author: string

    :return name: the normalized name of the author
    """
    allowed_titles = (
        u'Dr', u'Prof', u'Professor', u'Sir', u'Editor', u'Ed', u'Mr', u'Mrs',
        u'Ms', u'Chair', u'Co-Chair', u'Chairs', u'co-Chairs',
    )
    # NOTE(review): u'xi' is absent from this sequence -- confirm whether
    # that is intentional.
    extra_suffixes = (
        u'v', u'vi', u'vii', u'viii', u'ix', u'x', u'xii', u'xiii', u'xiv',
        u'xv',
    )
    roman_numeral_chars = frozenset(
        (u'M', u'D', u'C', u'L', u'X', u'V', u'I', u'(', u')')
    )

    parser_config = Constants()
    parser_config.titles.remove(*parser_config.titles).add(*allowed_titles)
    parser_config.suffix_not_acronyms.add(*extra_suffixes)

    def _is_initial(author_name):
        """Tell whether the part is one character long or contains a dot."""
        return len(author_name) == 1 or u'.' in author_name

    def _ensure_dotted_initials(author_name):
        """Append a dot to an initial that does not have one yet."""
        if u'.' in author_name or not _is_initial(author_name):
            return author_name
        return author_name + u'.'

    def _ensure_dotted_suffixes(author_suffix):
        """Append a dot to a suffix that does not contain one."""
        if u'.' in author_suffix:
            return author_suffix
        return author_suffix + u'.'

    def _is_roman_numeral(suffix):
        """Check that the input only contains valid roman numeral characters.

        An empty suffix passes the check as well.
        """
        return set(suffix.upper()) <= roman_numeral_chars

    name = HumanName(author, constants=parser_config)

    name.first = _ensure_dotted_initials(name.first)
    name.middle = _ensure_dotted_initials(name.middle)

    # Two initials are written back to back; anything else is space-separated.
    both_initials = _is_initial(name.first) and _is_initial(name.middle)
    if both_initials:
        template = u'{first_name}{middle_name}'
    else:
        template = u'{first_name} {middle_name}'
    given_names = template.format(
        first_name=name.first,
        middle_name=name.middle,
    )

    raw_suffix = name.suffix
    if _is_roman_numeral(raw_suffix):
        suffix = raw_suffix.upper()
    else:
        suffix = _ensure_dotted_suffixes(raw_suffix)

    # Empty parts are skipped so no dangling separators appear.
    ordered_parts = (name.last, given_names.strip(), suffix)
    return u', '.join(filter(None, ordered_parts))
Exemple #10
0
from nameparser import HumanName as OriginalHumanName
from nameparser.config import Constants

# Module-wide parser configuration: a dedicated Constants instance with the
# emoji regex switched off, so emoji are not stripped from names.
# https://nameparser.readthedocs.io/en/latest/customize.html#don-t-remove-emojis

constants = Constants()
constants.regexes.emoji = False


class HumanName(OriginalHumanName):
    """``nameparser.HumanName`` that always uses this module's ``constants``."""

    def __init__(self, *args, **kwargs):
        # The module-level ``constants`` is always passed by keyword, so a
        # caller that also supplies ``constants`` gets a TypeError for the
        # duplicated argument.
        super().__init__(*args, constants=constants, **kwargs)
Exemple #11
0
    def populateObjectFromHTML(tree):
        """Scrape one person page into a positional property list.

        Layout of the returned list:
          0    parsed name, as the dict returned by ``HumanName(...).as_dict()``
          1-4  text of the 'Email', 'Telefon', 'Fax' and 'Adresse' fields,
               or None when a field could not be found
          5    photo URL, prefixed with the site's base URL
          6    dict mapping accordion / sidebar headings to extracted content

        :param tree: parsed HTML of the page, accessed through ``find`` /
            ``find_all`` -- presumably a BeautifulSoup object, confirm against
            the caller
        :return: the property list described above
        """
        proplist = []
        base_url = "https://www.museumfuernaturkunde.berlin"
        # scrapes the contact info section; each field lands at the index of
        # its name in `arguments`, with "\r\n" replaced by ","
        arguments = ['Name','Email', 'Telefon', 'Fax', 'Adresse']
        for info in arguments:
            try:
                proplist.append(tree.find('div', class_=("views-field views-field-"+info)).find('span', class_="field-content").get_text().replace("\r\n", ","))
            except AttributeError:
                # field not present on the page: keep the slot so the
                # positions stay fixed
                proplist.append(None)

        # scrapes the photo URI (index 5)
        # NOTE(review): unlike the contact fields above, this lookup is not
        # wrapped in try/except -- a page without the image element would
        # presumably raise here; confirm that every page has one.
        proplist.append(base_url + tree.find('div', class_="views-field views-field-img-URL").span.img.get('src'))

        # scrapes the accordion into a {heading: content} dict (index 6)
        accordion = {}
        # get all accordion entries
        for element in tree.find_all('section', class_="ui_segment_accordion"):
            # heading text reduced to word characters, spaces, '.', '(' and
            # ')' before being passed to normalizeTitel
            titel = normalizeTitel(re.sub(r"[^\w .()]", "", element.find('h2', class_="ui_segment_accordion__head").get_text()).strip())
            # get all publications and parse them by <br/>'s
            if titel == "Publikationen":
                accordion[titel] = parseInformation(element, "ui_segment_accordion__content", 'list')
            # search in the "Forschung" entry for a "Forschungsprojekte" entry to extract it
            elif titel == "Forschung":
                # NOTE(review): inside the character class, `)-:` is a range
                # (')' through ':'), not three separate literals -- confirm
                # that this is intended.
                research = [re.sub(r"[^\w .()-:/]", "",li.text.strip(' -')) for li in element.find('div', class_="ui_segment_accordion__content").find_all(['li', 'p', 'h2'])]
                # split the entries at lines matching the project-heading
                # pattern; only the runs of non-matching entries are kept
                groupedResearch = [list(group) for k, group in groupby(research, lambda x: re.match(r".{0,5}(Forschungsprojekt|Projekt)e?:?$", x)) if not k]
                # when there is more than one run, the second run is stored
                # under 'Forschungsprojekte'
                if len(groupedResearch) > 1:
                    if 'Forschungsprojekte' in accordion:
                        accordion['Forschungsprojekte'] += groupedResearch.pop(1)
                    else:
                        accordion['Forschungsprojekte'] = groupedResearch.pop(1)
                # the remaining runs are flattened, dropping empty strings
                if len(groupedResearch) > 0:
                    accordion[titel] = [element for sublist in groupedResearch for element in sublist if element]
            # if nothing matches just get the text of the element; repeated
            # headings are appended to the existing entry
            else:
                if titel in accordion:
                    accordion[titel] += [el for el in [re.sub(r"[^\w .()-:/]", "",li.text.strip(' -')) for li in element.find('div', class_="ui_segment_accordion__content").find_all(['li', 'p'])] if el]
                else:
                    accordion[titel] = [el for el in [re.sub(r"[^\w .()-:/]", "",li.text.strip(' -')) for li in element.find('div', class_="ui_segment_accordion__content").find_all(['li', 'p'])] if el]
        proplist.append(accordion)

        # try to find additional information: every link in the sidebar view
        # is fetched over HTTP and merged into the accordion dict (proplist[6])
        try:
            for link in tree.find('div', class_="view-display-id-single_person_sidebar_view").findAll('a', href=True):
                print(link.text)
                titel = normalizeTitel(link.text.strip())
                if titel == 'Lebenslauf':
                    # the CV is stored under the fixed key 'CV' and replaces
                    # any earlier value
                    print("CV Gefunden")
                    infoTree = BeautifulSoup(requests.get(base_url+link.get('href')).text, 'lxml')
                    proplist[6]['CV'] = parseInformation(infoTree, "faqfield-answer", 'text')
                elif titel == 'Publikationen':
                    # publications are parsed in 'list' mode and appended to
                    # the ones found in the accordion
                    print("Publikation Gefunden")
                    infoTree = BeautifulSoup(requests.get(base_url+link.get('href')).text, 'lxml')
                    if titel in proplist[6]:
                        proplist[6][titel] += parseInformation(infoTree, "faqfield-answer", 'list')
                    else:
                        proplist[6][titel] = parseInformation(infoTree, "faqfield-answer", 'list')
                else:
                    # any other link is parsed in 'text' mode under its own heading
                    print("Etwas anderes Gefunden")
                    infoTree = BeautifulSoup(requests.get(base_url+link.get('href')).text, 'lxml')
                    if titel in proplist[6]:
                        proplist[6][titel] += parseInformation(infoTree, "faqfield-answer", 'text')
                    else:
                        proplist[6][titel] = parseInformation(infoTree, "faqfield-answer", 'text')
        except AttributeError:
            # any AttributeError raised in the block above (e.g. the sidebar
            # div is missing) is reported as "nothing found"
            print('No additional information was found')


        # delete extra headlines: entries for which match() (defined
        # elsewhere) returns a truthy value are dropped
        if 'Forschung' in proplist[6]:
            proplist[6]['Forschung'] = [val for val in proplist[6]['Forschung'] if not match(val)]
        if 'Publikationen' in proplist[6]:
            proplist[6]['Publikationen'] = [val for val in proplist[6]['Publikationen'] if not match(val)]



        # set up the name parser with additional titles
        constants = Constants()
        constants.titles.add('PD', 'Dipl.', 'des.', 'Professor', 'M.Sc.', 'FH')
        # parse the name and normalize weird writing styles for "Ph.D."
        # NOTE(review): proplist[0] is None when the 'Name' field was not
        # found above; re.sub would then raise TypeError -- confirm that the
        # name is always present.
        proplist[0] = HumanName(re.sub(r"Ph\. D\.", "Ph.D." ,proplist[0]), constants=constants).as_dict()
        return proplist
 def test_add_constant_with_explicit_encoding(self):
     """A byte string added with an explicit encoding is found as decoded text."""
     config = Constants()
     latin1_title = b"b\351ck"
     config.titles.add_with_encoding(latin1_title, encoding="latin_1")
     assert "béck" in config.titles
 def test_config_pickle(self):
     """A fresh ``Constants`` instance must be picklable with dill."""
     constants = Constants()
     # dill's pickles() reports an object that does not pickle through its
     # boolean return value rather than by raising, so the result has to be
     # asserted for this test to be able to fail.
     # (self.dill is presumably the dill module set up by the test class.)
     assert self.dill.pickles(constants)