def test_keep_emojis(self):
    """With emoji stripping disabled, emoji characters survive parsing."""
    config = Constants()
    config.regexes.emoji = False  # turn off the emoji-removal regex
    parsed = HumanName("∫≜⩕ Smith😊", config)
    assert parsed.first == "∫≜⩕"
    assert parsed.last == "Smith😊"
    assert u(parsed) == "∫≜⩕ Smith😊"
def __init__(self, full_name="", constants=CONSTANTS, encoding=ENCODING, string_format=None):
    """Initialize the parser with a name string and optional configuration.

    ``constants`` defaults to the shared module-level configuration; any
    value that is not of the same type is replaced by a fresh ``Constants``
    instance.
    """
    self.C = constants
    # Guard against a caller passing None (or anything else that is not a
    # Constants-compatible object): fall back to a private configuration.
    if type(self.C) is not type(CONSTANTS):
        self.C = Constants()
    self.ENCODING = encoding
    if string_format:
        self.string_format = string_format
    else:
        self.string_format = self.C.string_format
    # Assigning full_name runs the parse via the property setter,
    # so this must stay the last statement.
    self.full_name = full_name
def extract_name(info):
    """Parse a free-form name string with Indian honorifics recognized.

    :param info: raw name string to parse
    :return: tuple ``(fullname, name)`` where ``fullname`` is the
        "first [middle] last" string and ``name`` is the parsed
        ``HumanName`` object
    """
    from nameparser.config import Constants
    constants = Constants()
    # Recognize common Indian honorifics as titles, not first names.
    constants.titles.add('Shri', 'Smt')
    from nameparser import HumanName
    name = HumanName(info, constants=constants)
    # Join only the non-empty parts: the original concatenation produced
    # stray/double spaces (e.g. "First  Last" or "First ") whenever the
    # middle, first or last component was empty.
    parts = (name.first, name.middle, name.last)
    fullname = ' '.join(part for part in parts if part)
    return (fullname, name)
def _prepare_nameparser_constants():
    """Prepare nameparser Constants.

    Remove nameparser's titles and use our own and add as suffixes
    the roman numerals.
    Configuration is the same for all names (i.e. instances).
    """
    config = Constants()
    # Replace the library's built-in title list wholesale with ours.
    config.titles.remove(*config.titles).add(
        u'Dr', u'Prof', u'Professor', u'Sir', u'Editor', u'Ed',
        u'Mr', u'Mrs', u'Ms', u'Chair', u'Co-Chair', u'Chairs', u'co-Chairs')
    # Roman-numeral generation suffixes that must not be read as acronyms.
    config.suffix_not_acronyms.add(
        u'v', u'vi', u'vii', u'viii', u'ix', u'x',
        u'xii', u'xiii', u'xiv', u'xv')
    return config
def __init__(self, full_name="", constants=CONSTANTS, encoding=ENCODING, string_format=None):
    """Initialize the parser with a name string and optional configuration.

    :param full_name: the name string to parse (parsing happens in the
        ``full_name`` property setter).
    :param constants: a ``Constants`` configuration; falsy values are
        replaced with a fresh instance.
    :param encoding: encoding used when decoding byte input.
    :param string_format: override for the output format string; defaults
        to the configuration's format.
    """
    # NOTE: a ``global CONSTANTS`` declaration was removed here — the
    # module-level CONSTANTS is only read, never rebound, so the
    # declaration was a no-op.
    self.C = constants
    if not self.C:
        self.C = Constants()
    if self.C is not CONSTANTS:
        # The caller supplied a private configuration; remember that so it
        # is not mistaken for the shared module-level default.
        self.has_own_config = True
    self.ENCODING = encoding
    self.string_format = string_format or self.C.string_format
    # Assigning full_name triggers the parse via the property setter,
    # so this must remain the last statement.
    self.full_name = full_name
def __init__(self, full_name="", constants=CONSTANTS, encoding=DEFAULT_ENCODING, string_format=None, initials_format=None, initials_delimiter=None):
    """Initialize the parser with a name string and formatting options.

    Any formatting option left as ``None`` (or otherwise falsy) falls back
    to the corresponding default on the active ``Constants`` configuration.
    """
    # Use the supplied configuration only when it matches the expected
    # type; otherwise build a private Constants instance.
    self.C = constants if type(constants) is type(CONSTANTS) else Constants()
    self.encoding = encoding
    self.string_format = string_format or self.C.string_format
    self.initials_format = initials_format or self.C.initials_format
    self.initials_delimiter = initials_delimiter or self.C.initials_delimiter
    # The full_name property setter triggers the parse — keep this last.
    self.full_name = full_name
def split_name_str_2(person_str: str) -> Tuple[str, str, str]:
    """Split a (German-style) person string into (title, forename, surname)."""
    from nameparser.config import Constants
    cfg = Constants()
    # Academic titles commonly found in German names.
    cfg.titles.add("Prof.", "Ing.", "B.Sc.", "h.", "c.", "e.")
    # Nobility particles/prefixes that belong with the surname.
    cfg.prefixes.add(
        "Baronin", "Baron", "Freiherr", "Frhr.", "Fürstin", "Fürst",
        "Gräfin", "Graf", "Prinzessin", "Prinz",
        "von", "van", "de", "vom", "zu")
    parsed = HumanName(person_str, constants=cfg)
    return parsed.title, parsed.first, parsed.last
def __init__(self, full_name="", constants=CONSTANTS, encoding=DEFAULT_ENCODING, string_format=None):
    """Initialize the parser and collect the nickname-matching regexes."""
    self.C = constants
    # Replace anything that is not a Constants-compatible object with a
    # fresh private configuration.
    if type(self.C) is not type(CONSTANTS):
        self.C = Constants()
    self.encoding = encoding
    self.string_format = string_format or self.C.string_format
    # Pull out the compiled patterns from REGEXES whose tag (last tuple
    # element) is a string mentioning 'nickname'.
    nickname_patterns = []
    for tpl in REGEXES:
        tag = tpl[-1]
        if isinstance(tag, str) and 'nickname' in tag:
            nickname_patterns.append(tpl[1])
    self._nickname_regexes = nickname_patterns
    # ====================================================================
    # IMPORTANT: assigning full_name triggers the parse via the property
    # setter, so it MUST remain the final statement of __init__.
    # ====================================================================
    self.full_name = full_name
def normalize_author_name(author):
    """Normalize author name.

    Parses *author* with a customized nameparser configuration (own title
    list, roman numerals as non-acronym suffixes) and rebuilds the name as
    ``"Last, Initials, Suffix"``, dotting bare initials.

    :param author: author name
    :type author: string

    :return name: the name of the author normalized
    """
    constants = Constants()
    # Roman-numeral generation suffixes that must not be treated as acronyms.
    roman_numeral_suffixes = [u'v', u'vi', u'vii', u'viii', u'ix', u'x',
                              u'xii', u'xiii', u'xiv', u'xv']
    titles = [u'Dr', u'Prof', u'Professor', u'Sir', u'Editor', u'Ed', u'Mr',
              u'Mrs', u'Ms', u'Chair', u'Co-Chair', u'Chairs', u'co-Chairs']
    # Replace the library's built-in titles wholesale with our own list.
    constants.titles.remove(*constants.titles).add(*titles)
    constants.suffix_not_acronyms.add(*roman_numeral_suffixes)

    def _is_initial(author_name):
        # A single character, or anything already containing a dot,
        # counts as an initial.
        return len(author_name) == 1 or u'.' in author_name

    def _ensure_dotted_initials(author_name):
        # Append a trailing dot to a bare one-letter initial.
        if _is_initial(author_name)\
                and u'.' not in author_name:
            seq = (author_name, u'.')
            author_name = u''.join(seq)
        return author_name

    def _ensure_dotted_suffixes(author_suffix):
        # Append a trailing dot to an undotted (non-roman) suffix.
        if u'.' not in author_suffix:
            seq = (author_suffix, u'.')
            author_suffix = u''.join(seq)
        return author_suffix

    def _is_roman_numeral(suffix):
        """Controls that the userinput only contains valid roman numerals"""
        valid_roman_numerals = [u'M', u'D', u'C', u'L', u'X', u'V', u'I',
                                u'(', u')']
        # NOTE: an empty suffix vacuously passes this check, which routes it
        # through the upper-casing branch below (harmless: '' stays '').
        return all(letters in valid_roman_numerals
                   for letters in suffix.upper())

    name = HumanName(author, constants=constants)
    name.first = _ensure_dotted_initials(name.first)
    name.middle = _ensure_dotted_initials(name.middle)

    # Two dotted initials are glued together ("J.R."); otherwise keep a
    # space between first and middle.
    if _is_initial(name.first) and _is_initial(name.middle):
        normalized_names = u'{first_name}{middle_name}'
    else:
        normalized_names = u'{first_name} {middle_name}'

    normalized_names = normalized_names.format(
        first_name=name.first,
        middle_name=name.middle,
    )

    if _is_roman_numeral(name.suffix):
        suffix = name.suffix.upper()
    else:
        suffix = _ensure_dotted_suffixes(name.suffix)

    # Join only the non-empty parts: "Last, First M., Suffix".
    final_name = u', '.join(
        part for part in (name.last, normalized_names.strip(), suffix)
        if part)
    return final_name
from nameparser import HumanName as OriginalHumanName
from nameparser.config import Constants

# Disable stripping emoji from names
# https://nameparser.readthedocs.io/en/latest/customize.html#don-t-remove-emojis
constants = Constants()
constants.regexes.emoji = False


class HumanName(OriginalHumanName):
    """``HumanName`` whose default configuration keeps emoji in names."""

    def __init__(self, *args, **kwargs):
        # Use setdefault so an explicit constants= from the caller still
        # wins; the original passed constants= unconditionally, which
        # raised TypeError ("multiple values for keyword argument") when
        # a caller also supplied one.
        kwargs.setdefault('constants', constants)
        super().__init__(*args, **kwargs)
def populateObjectFromHTML(tree):
    """Scrape one museum staff page (BeautifulSoup *tree*) into a list.

    Returned list layout (by index): 0 name (as HumanName dict), 1 email,
    2 phone, 3 fax, 4 address, 5 photo URL, 6 accordion dict of sections.
    NOTE(review): index positions follow from the append order below —
    confirm against callers before changing.
    """
    proplist = []
    base_url = "https://www.museumfuernaturkunde.berlin"
    # scrapes the contact info section
    arguments = ['Name', 'Email', 'Telefon', 'Fax', 'Adresse']
    for info in arguments:
        try:
            proplist.append(tree.find('div', class_=("views-field views-field-" + info)).find('span', class_="field-content").get_text().replace("\r\n", ","))
        except AttributeError:
            # Field missing on the page — keep the slot with None so the
            # positional indices stay stable.
            proplist.append(None)
    # scrapes the photo URI
    proplist.append(base_url + tree.find('div', class_="views-field views-field-img-URL").span.img.get('src'))
    # scrapes the accordion
    accordion = {}
    # get all accordion entries
    for element in tree.find_all('section', class_="ui_segment_accordion"):
        # Section heading, stripped of unusual characters and normalized.
        titel = normalizeTitel(re.sub(r"[^\w .()]", "", element.find('h2', class_="ui_segment_accordion__head").get_text()).strip())
        # get all publications and parse them by <br/>'s
        if titel == "Publikationen":
            accordion[titel] = parseInformation(element, "ui_segment_accordion__content", 'list')
        # search in the "Forschung" entry for an "Forschungsprojekte" entry to extract it
        elif titel == "Forschung":
            research = [re.sub(r"[^\w .()-:/]", "", li.text.strip(' -')) for li in element.find('div', class_="ui_segment_accordion__content").find_all(['li', 'p', 'h2'])]
            # Split the flat text list at lines that look like a
            # "Forschungsprojekte:"/"Projekte:" headline; keep only the
            # groups between headlines.
            groupedResearch = [list(group) for k, group in groupby(research, lambda x: re.match(r".{0,5}(Forschungsprojekt|Projekt)e?:?$", x)) if not k]
            if len(groupedResearch) > 1:
                # The second group holds the research projects themselves.
                if 'Forschungsprojekte' in accordion:
                    accordion['Forschungsprojekte'] += groupedResearch.pop(1)
                else:
                    accordion['Forschungsprojekte'] = groupedResearch.pop(1)
            if len(groupedResearch) > 0:
                # Flatten the remaining groups into the "Forschung" entry.
                accordion[titel] = [element for sublist in groupedResearch for element in sublist if element]
        # if nothing matches just get the text of the element
        else:
            if titel in accordion:
                accordion[titel] += [el for el in [re.sub(r"[^\w .()-:/]", "", li.text.strip(' -')) for li in element.find('div', class_="ui_segment_accordion__content").find_all(['li', 'p'])] if el]
            else:
                accordion[titel] = [el for el in [re.sub(r"[^\w .()-:/]", "", li.text.strip(' -')) for li in element.find('div', class_="ui_segment_accordion__content").find_all(['li', 'p'])] if el]
    proplist.append(accordion)
    # try to find additional informations (sidebar links to sub-pages)
    try:
        for link in tree.find('div', class_="view-display-id-single_person_sidebar_view").findAll('a', href=True):
            print(link.text)
            titel = normalizeTitel(link.text.strip())
            if titel == 'Lebenslauf':
                print("CV Gefunden")
                # Fetch and parse the linked CV page.
                infoTree = BeautifulSoup(requests.get(base_url + link.get('href')).text, 'lxml')
                proplist[6]['CV'] = parseInformation(infoTree, "faqfield-answer", 'text')
            elif titel == 'Publikationen':
                print("Publikation Gefunden")
                infoTree = BeautifulSoup(requests.get(base_url + link.get('href')).text, 'lxml')
                # Merge with any publications already scraped from the accordion.
                if titel in proplist[6]:
                    proplist[6][titel] += parseInformation(infoTree, "faqfield-answer", 'list')
                else:
                    proplist[6][titel] = parseInformation(infoTree, "faqfield-answer", 'list')
            else:
                print("Etwas anderes Gefunden")
                infoTree = BeautifulSoup(requests.get(base_url + link.get('href')).text, 'lxml')
                if titel in proplist[6]:
                    proplist[6][titel] += parseInformation(infoTree, "faqfield-answer", 'text')
                else:
                    proplist[6][titel] = parseInformation(infoTree, "faqfield-answer", 'text')
    except AttributeError:
        # No sidebar present on this page — additional info is optional.
        print('No additional information was found')
    # delete extra headlines
    if 'Forschung' in proplist[6]:
        proplist[6]['Forschung'] = [val for val in proplist[6]['Forschung'] if not match(val)]
    if 'Publikationen' in proplist[6]:
        proplist[6]['Publikationen'] = [val for val in proplist[6]['Publikationen'] if not match(val)]
    # set up the name parser
    constants = Constants()
    constants.titles.add('PD', 'Dipl.', 'des.', 'Professor', 'M.Sc.', 'FH')
    # parse the name and normalize weird writing styles for "Ph.D."
    proplist[0] = HumanName(re.sub(r"Ph\. D\.", "Ph.D.", proplist[0]), constants=constants).as_dict()
    return proplist
def test_add_constant_with_explicit_encoding(self):
    """A byte string added with an explicit encoding is decoded correctly."""
    config = Constants()
    # Latin-1 byte \xe9 decodes to 'é'.
    config.titles.add_with_encoding(b"b\351ck", encoding="latin_1")
    assert "béck" in config.titles
def test_config_pickle(self):
    """A fresh Constants instance must be picklable via dill."""
    config = Constants()
    self.dill.pickles(config)