def __init__(self, string="", first="", middle="", prelast="", last="", lineage=""):
    """Create a person from a full name string and/or explicit name parts.

    :param string: The full name string.  It will be parsed and split
        into separate first, last, middle, pre-last and lineage name
        parts.

        Supported name formats are:

        - von Last, First
        - von Last, Jr, First
        - First von Last

        (see BibTeX manual for explanation)
    """
    self.first_names = []
    self.middle_names = []
    self.prelast_names = []
    self.last_names = []
    self.lineage_names = []

    string = string.strip()
    if string:
        self._parse_string(string)

    # Name parts given explicitly are appended after whatever was
    # parsed out of the full name string.
    explicit_parts = (
        (self.first_names, first),
        (self.middle_names, middle),
        (self.prelast_names, prelast),
        (self.last_names, last),
        (self.lineage_names, lineage),
    )
    for part_list, part_string in explicit_parts:
        part_list.extend(split_tex_string(part_string))
def __init__(self, string="", first="", middle="", prelast="", last="", lineage=""):
    """Create a person from a full name string and/or explicit name parts.

    The full name string, if given, is parsed first; explicitly given
    parts are appended to the parsed ones.
    """
    self._first = []
    self._middle = []
    self._prelast = []
    self._last = []
    self._lineage = []

    string = string.strip()
    if string:
        self.parse_string(string)

    for part_list, part_string in (
        (self._first, first),
        (self._middle, middle),
        (self._prelast, prelast),
        (self._last, last),
        (self._lineage, lineage),
    ):
        part_list.extend(split_tex_string(part_string))
def parse_string(self, name):
    """Extract various parts of the name from a string.

    Supported formats are:
     - von Last, First
     - von Last, Jr, First
     - First von Last
    (see BibTeX manual for explanation)
    """
    def process_first_middle(parts):
        # First word is the first name, the remaining words are middle names.
        try:
            self._first.append(parts[0])
            self._middle.extend(parts[1:])
        except IndexError:
            pass

    def process_von_last(parts):
        # Words up to and including the *last* lowercase word form the
        # "von" part; everything after it is the last name.
        von, last = rsplit_at(parts, lambda part: part.islower())
        if von and not last:
            # The last name must not be empty: steal the final "von" word.
            last.append(von.pop())
        self._prelast.extend(von)
        self._last.extend(last)

    def find_pos(lst, pred):
        # Index of the first item satisfying pred, or the number of items
        # if none does.  Implemented with an explicit counter so that it
        # works for arbitrary iterables (rsplit_at passes a reversed()
        # iterator) and -- bug fix -- no longer raises NameError on an
        # empty sequence (the old version returned an unbound `i + 1`).
        count = 0
        for item in lst:
            if pred(item):
                return count
            count += 1
        return count

    def split_at(lst, pred):
        """Split the given list into two parts.

        The second part starts with the first item for which the given
        predicate is True.
        """
        pos = find_pos(lst, pred)
        return lst[:pos], lst[pos:]

    def rsplit_at(lst, pred):
        # Like split_at, but the second part starts just after the *last*
        # item for which the predicate is True.
        rpos = find_pos(reversed(lst), pred)
        pos = len(lst) - rpos
        return lst[:pos], lst[pos:]

    parts = split_tex_string(name, ',')
    if len(parts) == 3:  # von Last, Jr, First
        process_von_last(split_tex_string(parts[0]))
        self._lineage.extend(split_tex_string(parts[1]))
        process_first_middle(split_tex_string(parts[2]))
    elif len(parts) == 2:  # von Last, First
        process_von_last(split_tex_string(parts[0]))
        process_first_middle(split_tex_string(parts[1]))
    elif len(parts) == 1:  # First von Last
        parts = split_tex_string(name)
        first_middle, von_last = split_at(parts, lambda part: part.islower())
        if not von_last and first_middle:
            # No lowercase "von" word at all: the final word is the last name.
            last = first_middle.pop()
            von_last.append(last)
        process_first_middle(first_middle)
        process_von_last(von_last)
    else:
        raise PybtexError('Invalid name format: %s' % name)
def parse_string(self, name):
    """Extract various parts of the name from a string.

    Supported formats are:
     - von Last, First
     - von Last, Jr, First
     - First von Last
    (see BibTeX manual for explanation)
    """
    def process_first_middle(parts):
        # First word is the first name, the remaining words are middle names.
        try:
            self._first.append(parts[0])
            self._middle.extend(parts[1:])
        except IndexError:
            pass

    def process_von_last(parts):
        # Scan from the right (excluding the final word, which always
        # belongs to the last name) for the last lowercase "von" word.
        # Bug fix: the old loop fell through with its last tested index
        # when *no* lowercase word was present, so a name like
        # "John Smith" wrongly got prelast=['John'].  Now the split
        # position is only set when a lowercase word is actually found,
        # leaving the "von" part empty otherwise (BibTeX semantics).
        pos = 0
        for i, part in enumerate(reversed(parts[:-1])):
            if part.islower():
                pos = len(parts) - i - 1
                break
        von = parts[:pos]
        last = parts[pos:]
        self._prelast.extend(von)
        self._last.extend(last)

    def split_at(lst, pred):
        """Split the given list into two parts.

        The second part starts with the first item for which the given
        predicate is True.  If the predicate is False for all items,
        the last element still comes to the last part.  This is how
        BibTeX parses names.
        """
        # Bug fix: guard the empty list, which previously raised
        # NameError because the loop variable was never bound.
        if not lst:
            return [], []
        pos = len(lst) - 1  # default: only the last element in part two
        for i, item in enumerate(lst):
            if pred(item):
                pos = i
                break
        return lst[:pos], lst[pos:]

    parts = split_tex_string(name, ",")
    if len(parts) == 3:  # von Last, Jr, First
        process_von_last(split_tex_string(parts[0]))
        self._lineage.extend(split_tex_string(parts[1]))
        process_first_middle(split_tex_string(parts[2]))
    elif len(parts) == 2:  # von Last, First
        process_von_last(split_tex_string(parts[0]))
        process_first_middle(split_tex_string(parts[1]))
    elif len(parts) == 1:  # First von Last
        parts = split_tex_string(name)
        first_middle, von_last = split_at(parts, lambda part: part.islower())
        process_first_middle(first_middle)
        process_von_last(von_last)
    else:
        raise PybtexError("Invalid name format: %s" % name)
def _parse_string(self, name):
    """Extract various parts of the name from a string.

    >>> p = Person('Avinash K. Dixit')
    >>> print(p.first_names)
    [u'Avinash']
    >>> print(p.middle_names)
    [u'K.']
    >>> print(p.prelast_names)
    []
    >>> print(p.last_names)
    [u'Dixit']
    >>> print(p.lineage_names)
    []
    >>> print(six.text_type(p))
    Dixit, Avinash K.
    >>> p == Person(six.text_type(p))
    True
    >>> p = Person('Dixit, Jr, Avinash K. ')
    >>> print(p.first_names)
    [u'Avinash']
    >>> print(p.middle_names)
    [u'K.']
    >>> print(p.prelast_names)
    []
    >>> print(p.last_names)
    [u'Dixit']
    >>> print(p.lineage_names)
    [u'Jr']
    >>> print(six.text_type(p))
    Dixit, Jr, Avinash K.
    >>> p == Person(six.text_type(p))
    True

    >>> p = Person('abc')
    >>> print(p.first_names, p.middle_names, p.prelast_names, p.last_names, p.lineage_names)
    [] [] [] [u'abc'] []
    >>> p = Person('Viktorov, Michail~Markovitch')
    >>> print(p.first_names, p.middle_names, p.prelast_names, p.last_names, p.lineage_names)
    [u'Michail'] [u'Markovitch'] [] [u'Viktorov'] []
    """
    def process_first_middle(parts):
        # First word becomes the first name, the rest middle names; an
        # empty list leaves both untouched.
        try:
            self.first_names.append(parts[0])
            self.middle_names.extend(parts[1:])
        except IndexError:
            pass

    def process_von_last(parts):
        # von cannot be the last name in the list
        von_last = parts[:-1]
        definitely_not_von = parts[-1:]
        if von_last:
            von, last = rsplit_at(von_last, is_von_name)
            self.prelast_names.extend(von)
            self.last_names.extend(last)
        self.last_names.extend(definitely_not_von)

    def find_pos(lst, pred):
        # Index of the first item for which pred is true; if none
        # matches, one past the last index (== len for sequences).
        # NOTE(review): raises NameError for an empty lst -- presumably
        # never called with one here; verify against the callers.
        for i, item in enumerate(lst):
            if pred(item):
                return i
        return i + 1

    def split_at(lst, pred):
        """Split the given list into two parts.

        The second part starts with the first item for which the given
        predicate is True.
        """
        pos = find_pos(lst, pred)
        return lst[:pos], lst[pos:]

    def rsplit_at(lst, pred):
        # Like split_at, but the second part starts just after the
        # *last* item for which the predicate is True.
        rpos = find_pos(reversed(lst), pred)
        pos = len(lst) - rpos
        return lst[:pos], lst[pos:]

    def is_von_name(string):
        # A word is a "von" word if its first letter is lowercase;
        # braces and LaTeX special characters are scanned to locate
        # that first significant letter.
        if string[0].isupper():
            return False
        if string[0].islower():
            return True
        else:
            for char, brace_level in scan_bibtex_string(string):
                if brace_level == 0 and char.isalpha():
                    return char.islower()
                elif brace_level == 1 and char.startswith('\\'):
                    return special_char_islower(char)
        return False

    def special_char_islower(special_char):
        # Case of the first letter *after* the control sequence of a
        # LaTeX special character.
        control_sequence = True
        for char in special_char[1:]:  # skip the backslash
            if control_sequence:
                if not char.isalpha():
                    control_sequence = False
            else:
                if char.isalpha():
                    return char.islower()
        return False

    parts = split_tex_string(name, ',')
    if len(parts) > 3:
        # Too many commas: report the malformed name, then fold
        # everything after the second comma into one final part.
        report_error(InvalidNameString(name))
        last_parts = parts[2:]
        parts = parts[:2] + [' '.join(last_parts)]
    if len(parts) == 3:  # von Last, Jr, First
        process_von_last(split_tex_string(parts[0]))
        self.lineage_names.extend(split_tex_string(parts[1]))
        process_first_middle(split_tex_string(parts[2]))
    elif len(parts) == 2:  # von Last, First
        process_von_last(split_tex_string(parts[0]))
        process_first_middle(split_tex_string(parts[1]))
    elif len(parts) == 1:  # First von Last
        parts = split_tex_string(name)
        first_middle, von_last = split_at(parts, is_von_name)
        if not von_last and first_middle:
            # No "von" word at all: the final word is the last name.
            last = first_middle.pop()
            von_last.append(last)
        process_first_middle(first_middle)
        process_von_last(von_last)
    else:
        # should not really happen
        raise ValueError(name)
def _parse_string(self, name):
    """Extract various parts of the name from a string.

    >>> p = Person('Avinash K. Dixit')
    >>> print p.first_names
    ['Avinash']
    >>> print p.middle_names
    ['K.']
    >>> print p.prelast_names
    []
    >>> print p.last_names
    ['Dixit']
    >>> print p.lineage_names
    []
    >>> print unicode(p)
    Dixit, Avinash K.
    >>> p == Person(unicode(p))
    True
    >>> p = Person('Dixit, Jr, Avinash K. ')
    >>> print p.first_names
    ['Avinash']
    >>> print p.middle_names
    ['K.']
    >>> print p.prelast_names
    []
    >>> print p.last_names
    ['Dixit']
    >>> print p.lineage_names
    ['Jr']
    >>> print unicode(p)
    Dixit, Jr, Avinash K.
    >>> p == Person(unicode(p))
    True

    >>> p = Person('abc')
    >>> print p.first_names, p.middle_names, p.prelast_names, p.last_names, p.lineage_names
    [] [] [] ['abc'] []
    >>> p = Person('Viktorov, Michail~Markovitch')
    >>> print p.first_names, p.middle_names, p.prelast_names, p.last_names, p.lineage_names
    ['Michail'] ['Markovitch'] [] ['Viktorov'] []
    """
    def process_first_middle(parts):
        # First word becomes the first name, the rest middle names; an
        # empty list leaves both untouched.
        try:
            self.first_names.append(parts[0])
            self.middle_names.extend(parts[1:])
        except IndexError:
            pass

    def process_von_last(parts):
        # von cannot be the last name in the list
        von_last = parts[:-1]
        definitely_not_von = parts[-1:]
        if von_last:
            von, last = rsplit_at(von_last, is_von_name)
            self.prelast_names.extend(von)
            self.last_names.extend(last)
        self.last_names.extend(definitely_not_von)

    def find_pos(lst, pred):
        # Index of the first item for which pred is true; if none
        # matches, one past the last index (== len for sequences).
        # NOTE(review): raises NameError for an empty lst -- presumably
        # never called with one here; verify against the callers.
        for i, item in enumerate(lst):
            if pred(item):
                return i
        return i + 1

    def split_at(lst, pred):
        """Split the given list into two parts.

        The second part starts with the first item for which the given
        predicate is True.
        """
        pos = find_pos(lst, pred)
        return lst[:pos], lst[pos:]

    def rsplit_at(lst, pred):
        # Like split_at, but the second part starts just after the
        # *last* item for which the predicate is True.
        rpos = find_pos(reversed(lst), pred)
        pos = len(lst) - rpos
        return lst[:pos], lst[pos:]

    def is_von_name(string):
        # A word is a "von" word if its first letter is lowercase;
        # braces and LaTeX special characters are scanned to locate
        # that first significant letter.
        if string[0].isupper():
            return False
        if string[0].islower():
            return True
        else:
            for char, brace_level in scan_bibtex_string(string):
                if brace_level == 0 and char.isalpha():
                    return char.islower()
                elif brace_level == 1 and char.startswith('\\'):
                    return special_char_islower(char)
        return False

    def special_char_islower(special_char):
        # Case of the first letter *after* the control sequence of a
        # LaTeX special character.
        control_sequence = True
        for char in special_char[1:]:  # skip the backslash
            if control_sequence:
                if not char.isalpha():
                    control_sequence = False
            else:
                if char.isalpha():
                    return char.islower()
        return False

    parts = split_tex_string(name, ',')
    if len(parts) > 3:
        # Too many commas: report the malformed name, then fold
        # everything after the second comma into one final part.
        report_error(InvalidNameString(name))
        last_parts = parts[2:]
        parts = parts[:2] + [' '.join(last_parts)]
    if len(parts) == 3:  # von Last, Jr, First
        process_von_last(split_tex_string(parts[0]))
        self.lineage_names.extend(split_tex_string(parts[1]))
        process_first_middle(split_tex_string(parts[2]))
    elif len(parts) == 2:  # von Last, First
        process_von_last(split_tex_string(parts[0]))
        process_first_middle(split_tex_string(parts[1]))
    elif len(parts) == 1:  # First von Last
        parts = split_tex_string(name)
        first_middle, von_last = split_at(parts, is_von_name)
        if not von_last and first_middle:
            # No "von" word at all: the final word is the last name.
            last = first_middle.pop()
            von_last.append(last)
        process_first_middle(first_middle)
        process_von_last(von_last)
    else:
        # should not really happen
        raise ValueError(name)
def __init__(self, string="", first="", middle="", prelast="", last="", lineage=""):
    """
    :param string: The full name string.  It will be parsed and split
        into separate first, last, middle, pre-last and lineage name
        parts.

        Supported name formats are:

        - von Last, First
        - von Last, Jr, First
        - First von Last

        (see BibTeX manual for explanation)
    """
    self.first_names = []
    """
    A list of first names.

    .. versionadded:: 0.19
        Earlier versions used :py:meth:`.first`, which is now deprecated.
    """

    self.middle_names = []
    """
    A list of middle names.

    .. versionadded:: 0.19
        Earlier versions used :py:meth:`.middle`, which is now deprecated.
    """

    self.prelast_names = []
    """
    A list of pre-last (aka von) name parts.

    .. versionadded:: 0.19
        Earlier versions used :py:meth:`.prelast`, which is now deprecated.
    """

    self.last_names = []
    """
    A list of last names.

    .. versionadded:: 0.19
        Earlier versions used :py:meth:`.last`, which is now deprecated.
    """

    self.lineage_names = []
    """
    A list of lineage (aka Jr) name parts.

    .. versionadded:: 0.19
        Earlier versions used :py:meth:`.lineage`, which is now deprecated.
    """

    string = string.strip()
    if string:
        self._parse_string(string)

    # Explicitly supplied parts are appended after whatever the full
    # name string contributed.
    for part_list, part_string in (
        (self.first_names, first),
        (self.middle_names, middle),
        (self.prelast_names, prelast),
        (self.last_names, last),
        (self.lineage_names, lineage),
    ):
        part_list.extend(split_tex_string(part_string))
def filter_bibentry(self, entry):
    """Apply this filter's configured fixes to a single entry, in place.

    ``entry`` is a :py:class:`pybtex.database.Entry`; its ``fields`` and
    ``persons`` are mutated directly.  Returns None.
    """
    #
    # entry is a pybtex.database.Entry object
    #

    # first apply filters that are applied to all fields of the entry
    def thefilter(x):
        # Chain of plain string transformations, each gated by a filter option.
        if self.fix_space_after_escape:
            x = do_fix_space_after_escape(x)
        if self.fix_swedish_a:
            # OBSOLETE, but still accepted for backwards compatibility
            x = re.sub(r'\\AA\s+', r'\\AA{}', x)
            x = re.sub(r'\\o\s+', r'\\o{}', x)
        if self.encode_utf8_to_latex:
            # use custom encoder
            x = custom_utf8tolatex(x)
        if self.encode_latex_to_utf8:
            x = butils.latex_to_text(x)
        return x

    def filter_person(p):
        # Filter the person's full name as one string, then re-parse it.
        oldpstr = unicodestr(p)
        #print(oldpstr)
        newpstr = thefilter(oldpstr)
        #print(newpstr)
        return Person(string=newpstr)
        # does not work this way because of the way Person() splits at spaces:
        #parts = {}
        #for typ in ['first', 'middle', 'prelast', 'last', 'lineage']:
        #    parts[typ] = thefilter(u" ".join(p.get_part(typ)))
        #return Person(**parts)

    for (role, perslist) in iteritems(entry.persons):
        for k in range(len(perslist)):
            entry.persons[role][k] = filter_person(perslist[k])

    for (k, v) in iteritems(entry.fields):
        entry.fields[k] = thefilter(v)

    logger.longdebug("entry %s passed basic filter: %r", entry.key, entry)

    # additionally:
    if self.unprotect_full_last_names:
        # Remove protective braces around single-token last names and
        # re-split the unbraced name.
        for (role, perslist) in iteritems(entry.persons):
            for p in perslist:
                if len(p.last_names) == 1:
                    lname = remove_full_braces(p.last_names[0])
                    p.last_names = split_tex_string(lname)

    def filter_entry_remove_type_from_phd(entry):
        # Drop an explicit type={...} field that merely says "phd".
        if (entry.type != 'phdthesis' or 'type' not in entry.fields):
            return
        if ('phd' in re.sub(r'[^a-z]', '', entry.fields['type'].lower())):
            # entry is phd type, so remove explicit type={}
            del entry.fields['type']

    if (self.remove_type_from_phd):
        filter_entry_remove_type_from_phd(entry)

    if (self.remove_pages_from_book):
        if (entry.type == 'book' and 'pages' in entry.fields):
            del entry.fields['pages']

    #
    # do this before 'self.remove_full_braces', because the latter depends on language
    #
    if (self.rename_language):
        if 'language' in entry.fields:
            logger.longdebug('Maybe fixing language in entry %s: lang=%r',
                             entry.key, entry.fields['language'])
            entry.fields['language'] = self.rename_language_rx.sub(
                lambda m: self.rename_language.get(m.group('lang').lower(), m.group('lang')),
                entry.fields['language']
            )
            logger.longdebug(' --> language is now = %r', entry.fields['language'])

    def filter_entry_remove_full_braces(entry, fieldlist):
        # Strip {...} wrapping entire field values; a fieldlist of None
        # means every field.
        for k, v in iteritems(entry.fields):
            if fieldlist is None or k in fieldlist:
                entry.fields[k] = remove_full_braces(v)

    if self.remove_full_braces:
        if entry.fields.get('language', '').lower() not in self.remove_full_braces_not_lang:
            filter_entry_remove_full_braces(entry, self.remove_full_braces_fieldlist)

    if (self.map_annote_to_note):
        # Merge annote into note (joined by '; ') and delete annote.
        if 'annote' in entry.fields:
            thenote = ''
            if len(entry.fields.get('note', '')):
                thenote = entry.fields['note'] + '; '
            entry.fields['note'] = thenote + entry.fields['annote']
            del entry.fields['annote']

    if (self.auto_urlify):
        for fld in self.auto_urlify:
            if fld in entry.fields:
                entry.fields[fld] = do_auto_urlify(entry.fields[fld])

    def filter_protect_names(entry):
        # Wrap configured names in braces, skipping already-braced spans.
        def repl_ltx_str(n, r, x):
            # scan string until next '{', read latex expression and skip it, etc.
            lw = latexwalker.LatexWalker(x, tolerant_parsing=True)
            pos = 0
            newx = u''
            therx = re.compile(r'((?P<openbrace>\{)|'+r.pattern+r')', re.IGNORECASE)
            while True:
                m = therx.search(x, pos)
                if m is None:
                    newx += x[pos:]
                    break
                newpos = m.start()
                newx += x[pos:newpos]
                if m.group('openbrace'):
                    # we encountered an opening brace, so we need to copy in
                    # everything verbatim
                    (junknode, np, nl) = lw.get_latex_expression(newpos)
                    # just copy the contents as is and move on
                    newx += x[newpos:np+nl]
                    newpos = np + nl
                else:
                    # we found an instance of the string we wanted to protect,
                    # so protect it:
                    newx += '{' + n + '}'
                    newpos = m.end()
                # continue from our last position
                pos = newpos
            return newx

        for key, val in iteritems(entry.fields):
            # never touch machine-readable fields
            if key in ('doi', 'url', 'file'):
                continue
            newval = val
            for n, r in self.protect_names:
                newval = repl_ltx_str(n, r, newval)
            if (newval != val):
                entry.fields[key] = newval

    if (self.protect_names):
        filter_protect_names(entry)

    # include stuff like:
    #
    #   title = "{\textquotedblleft}Relative State{\textquotedblright} Formulation of Quantum Mechanics"
    #
    _rx_prcap_lead = r'([^\w\{]|\\[A-Za-z]+|\{\\[A-Za-z]+\})*'
    if (self.protect_capital_letter_after_dot):
        for fld in self.protect_capital_letter_after_dot:
            if fld in entry.fields:
                entry.fields[fld] = re.sub(
                    r'(?P<dotlead>[.:]'+_rx_prcap_lead+r')(?P<ucletter>[A-Z])',
                    lambda m: m.group('dotlead')+u'{'+m.group('ucletter')+u'}',
                    entry.fields[fld]
                )
    if (self.protect_capital_letter_at_begin):
        for fld in self.protect_capital_letter_at_begin:
            if fld in entry.fields:
                entry.fields[fld] = re.sub(
                    r'^(?P<lead>'+_rx_prcap_lead+r')(?P<ucletter>[A-Z])',
                    lambda m: m.group('lead')+u'{'+m.group('ucletter')+u'}',
                    entry.fields[fld]
                )
    if (self.fix_mendeley_bug_urls):
        for fld in self.fix_mendeley_bug_urls:
            if fld in entry.fields:
                entry.fields[fld] = do_fix_mendeley_bug_urls(entry.fields[fld])

    _rx_dbl_quotes = [
        re.compile(r"``(?P<contents>.*?)''"),
        # this pattern must be tested first, because otherwise we leave stray braces
        re.compile(r'\{\\textquotedblleft\}(?P<contents>.*?)\{\\textquotedblright\}'),
        re.compile(r'\\textquotedblleft(?P<contents>.*?)\\textquotedblright'),
    ]
    _rx_sgl_quotes = [
        # try to match correct quote in " `My dad's dog' is a nice book ".
        re.compile(r"`(?P<contents>.*?)'(?=\W|$)"),
        # this pattern must be tested first, because otherwise we leave stray braces
        re.compile(r'\{\\textquoteleft\}(?P<contents>.*?)\{\\textquoteright\}'),
        re.compile(r'\\textquoteleft(?P<contents>.*?)\\textquoteright'),
    ]
    if (self.convert_dbl_quotes):
        for fld in self.convert_dbl_quotes:
            if fld in entry.fields:
                for rx in _rx_dbl_quotes:
                    entry.fields[fld] = re.sub(
                        rx,
                        lambda m: self.dbl_quote_macro+u"{"+m.group('contents')+u"}",
                        entry.fields[fld]
                    )
    if (self.convert_sgl_quotes):
        for fld in self.convert_sgl_quotes:
            if fld in entry.fields:
                for rx in _rx_sgl_quotes:
                    entry.fields[fld] = re.sub(
                        rx,
                        lambda m: self.sgl_quote_macro+u"{"+m.group('contents')+u"}",
                        entry.fields[fld]
                    )
    if (self.remove_file_field):
        if ('file' in entry.fields):
            del entry.fields['file']
    if (self.remove_fields):
        for fld in self.remove_fields:
            entry.fields.pop(fld, None)
    if (self.remove_doi_prefix):
        # strip a redundant leading "doi:" / "DOI " from the doi field
        if 'doi' in entry.fields:
            entry.fields['doi'] = re.sub(r'^\s*doi[ :]\s*', '', entry.fields['doi'],
                                         flags=re.IGNORECASE)

    logger.longdebug("fixes filter, result: %s -> Authors=%r, fields=%r",
                     entry.key, entry.persons.get('author', None), entry.fields)
    return