def _normalize(self, s):
    """Return ``s`` in the comparison form selected by this object's flags.

    When ``self.strict`` is truthy the string is returned untouched.
    Otherwise LaTeX markup is converted to Unicode, the result is
    NFC-normalized and, unless ``self.case`` is truthy, lower-cased.
    """
    if self.strict:
        return s

    # In theory latex_to_unicode already normalizes; NFC is applied regardless.
    normalized = unicodedata.normalize('NFC', latex_to_unicode(s))
    if self.case:
        return normalized
    return normalized.lower()
def bibent_to_fmt(bibent, fmt):
    """Render one BibTeX entry as a single citation line.

    :param bibent: entry mapping. ``'ID'``, ``'title'`` and ``'author'`` are
        required; ``'year'`` is optional. The venue and URL are looked up
        with ``bibent_get_venue`` and ``bibent_get_url``.
    :param fmt: ``"markdown"`` (footnote definition) or ``"text"``.
    :returns: the formatted citation string.
    :raises ValueError: if ``fmt`` is not one of the supported formats.
    """
    if fmt not in ("markdown", "text"):
        print_error("Unknown format: " + fmt)
        # Raising a plain string is itself a TypeError; raise a real exception.
        raise ValueError("Internal error: unknown format " + repr(fmt))
    markdown = fmt == "markdown"

    # beautiful-jekyll does not accept '+' in footnote names.
    citation_key = bibent['ID'].replace("+", "plus")
    title = bibent['title'].strip("{}")

    # Convert letters accented via LaTeX (e.g., {\'{e}}) to Unicode, so they
    # display in Markdown, then drop the remaining grouping braces.
    authors = latex_to_unicode(bibent['author'])
    authors = authors.replace("{", "").replace("}", "")
    # Author names are sometimes given on separate lines, which breaks the
    # Markdown formatting, so collapse them onto one line.
    authors = authors.replace('\n', ' ').replace('\r', '')

    if markdown:
        to_fmt = "[^" + citation_key + "]: **" + title + "**, by " + authors
    else:
        to_fmt = "[" + citation_key + "] " + title + "; by " + authors

    venue = bibent_get_venue(bibent)
    if venue is not None:
        to_fmt += (", *in " + venue + "*") if markdown else ("; in " + venue)

    year = bibent.get('year')
    if year is not None:
        to_fmt += (", " if markdown else "; ") + year

    url = bibent_get_url(bibent)
    if url is not None:
        to_fmt += (", [[URL]](" + url + ")") if markdown else ("; " + url)

    return to_fmt
def convert_to_unicode(record):
    """
    Replace LaTeX accent markup in every field of a record with Unicode.

    List fields are converted element by element, dict fields value by
    value, and any other field is converted directly. The record is
    updated in place.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record.
    """
    # Only values of existing keys are reassigned, so iterating items() is safe.
    for field, value in record.items():
        if isinstance(value, list):
            converted = [latex_to_unicode(item) for item in value]
        elif isinstance(value, dict):
            converted = {
                name: latex_to_unicode(inner) for name, inner in value.items()
            }
        else:
            converted = latex_to_unicode(value)
        record[field] = converted
    return record
def abbreviate_firstname(name: str, sep: str = " ") -> str:
    """
    Abbreviate first name(s) to initials. For example::

        de Geus, Thomas Willem Jan -> de Geus, T. W. J.

    :param name: The name formatted as "Lastname, firstname secondname ...".
    :param sep: Separator to place between initials.
    :return: Formatted name. A name without a comma is returned unchanged.
    :raises OSError: If the name contains more than one comma.
    """
    parts = name.split(",")
    if len(parts) == 1:
        return name
    if len(parts) > 2:
        raise OSError(f'Unable to interpret name "{name}"')

    # Applied in order to every first-name token.
    match = [
        # Drop a trailing parenthesised remark.
        (re.compile(r"(.*)(\(.*\))", re.UNICODE), r"\1"),
        # Keep the leading character (plus closing braces) and add a period.
        (
            re.compile(r"([\w][\}]*)([\w0-9\{\}\`\'\"\\\.\^\{]*)", re.UNICODE),
            r"\1.",
        ),
        # Remove spaces that follow an initial or a hyphen.
        (re.compile(r"([\w\.][\-]?)([\ ]*)", re.UNICODE), r"\1"),
    ]

    last, first = parts
    first = latex_to_unicode(first)
    # Separate initials and hyphenated parts so each becomes its own token;
    # the trailing space lets the final token match the pattern below.
    first = first.replace(".", ". ").replace("-", "- ").replace(r"\. ", r"\.") + " "

    # Keep every non-empty token. An empty token only appears when the first
    # names start with whitespace; slicing off the first match unconditionally
    # would drop a real name for input such as "Geus,Thomas".
    names = [
        latex_to_unicode(token)
        for token, _ in re.findall(r"([^\s]*)(\s+)", first)
        if token
    ]

    for regex, sub in match:
        names = [regex.sub(sub, token) for token in names]

    return last + ", " + sep.join([rm_unicode(i) for i in names]).upper()
def convert_to_unicode(record):
    """
    Turn LaTeX-style accents in a record into their Unicode form.

    Every field is rewritten in place: lists element-wise, dicts
    value-wise, anything else as a single value.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record.
    """

    def _to_unicode(field):
        # Containers are handled one level deep only.
        if isinstance(field, list):
            return [latex_to_unicode(element) for element in field]
        if isinstance(field, dict):
            return {key: latex_to_unicode(item) for key, item in field.items()}
        return latex_to_unicode(field)

    for name in record:
        record[name] = _to_unicode(record[name])
    return record
def decode(entry):
    """Recursively convert LaTeX markup inside ``entry`` to Unicode.

    Lists and dicts are walked to any depth (dict keys are left alone);
    every other value is handed to ``latex_to_unicode``. If that call
    raises, the value is returned unchanged.
    """
    if isinstance(entry, list):
        return [decode(item) for item in entry]
    if isinstance(entry, dict):
        return {key: decode(value) for key, value in entry.items()}

    # TODO: latex_to_unicode sometimes fails with an exception for reasons
    # not yet understood; fall back to the raw value for now.
    try:
        return latex_to_unicode(entry)
    except Exception:
        return entry
def prepare_string(x, max_length=None):
    """
    Convert a string with LaTeX escapes to Unicode text and optionally
    truncate it.

    :param x: the raw string; it is passed through ``filter_using_re`` and
        ``latex_to_unicode`` before truncation.
    :param max_length: maximum number of characters to keep, or ``None``
        for no limit. Longer text is cut and ends with ``'[...]'`` when
        the limit leaves room for that marker; otherwise it is cut hard.
    :returns: the result of ``smart_text`` on the converted text, or ``""``
        if a ``TypeError`` is raised while processing.
    """
    marker = '[...]'
    try:
        data = latex_to_unicode(filter_using_re(x))
        if max_length is not None and len(data) > max_length:
            if max_length >= len(marker):
                data = data[:max_length - len(marker)] + marker
            else:
                # No room for the marker: a negative slice bound would keep
                # most of the string, so cut at the limit instead.
                data = data[:max(max_length, 0)]
        return smart_text(data)
    except TypeError:
        # Pass x as a logging argument: formatting with "%" here would raise
        # again when x is a tuple.
        logger.warning(
            "Encountered a TypeError which may be linked to unicode handling "
            "in bibtexparser when processing the following string: %s.", x)
        return ""
def bibfile_latex_to_unicode(bibtex_fname):
    """Load a BibTeX file and return its text with LaTeX markup as Unicode.

    Each entry has its ``file`` field removed through ``delete_field`` and
    every field passed through ``latex_to_unicode``. Comments are dropped
    and the database is written back to a string with a fixed field order.

    :param bibtex_fname: path of the BibTeX file to read.
    :returns: the cleaned BibTeX database as a string.
    """
    bib_parser = BibTexParser(common_strings=True)
    with open(bibtex_fname) as handle:
        database = bibtexparser.load(handle, parser=bib_parser)

    for index, record in enumerate(database.entries):
        delete_field(database, index, 'file')
        for key in record:
            database.entries[index][key] = latex_to_unicode(record[key])

    database.comments = []

    bib_writer = BibTexWriter()
    bib_writer.display_order = [
        'title', 'year', 'author', 'journal', 'booktitle',
    ]
    # For debugging, the returned text can be written to a scratch .bib file.
    return bib_writer.write(database)
def latex_to_ascii(tex):
    r"""
    Reduce a LaTeX string to plain ascii text, discarding accents.

    The substitutions stored in ``latex_to_ascii.dict`` are applied first,
    then the text is converted to Unicode, decomposed (NFD) and every
    non-ascii code point is dropped.

    Args:
        tex (str): LaTeX string

    Returns:
        str: unicode string containing only ascii characters

    Examples:
        >>> latex_to_ascii(r"\^ile")
        'ile'
        >>> latex_to_ascii(r"\^ile") == latex_to_ascii('île')
        True
        >>> latex_to_ascii(r"Bartoszy\'nski Ros\l anowski")
        'Bartoszynski Rosl anowski'
    """
    for pattern, replacement in latex_to_ascii.dict.items():
        tex = tex.replace(pattern, replacement)
    # NFD splits accented letters into base letter + combining mark, so the
    # ascii encode below removes only the marks.
    decomposed = unicodedata.normalize('NFD', latex_to_unicode(tex))
    return decomposed.encode('ascii', 'ignore').decode('utf-8')
def test_ignores_trailing_modifier(self):
    """An accent command at the very end of the input is dropped."""
    converted = latex_to_unicode("a\\\'")
    self.assertEqual(converted, 'a')
def test_accents(self):
    """Braced accent commands are converted to the accented letters."""
    # Every backslash is written as "\\": a bare backslash before a backtick
    # is an invalid escape sequence and triggers a warning on modern Python.
    # The string value is unchanged.
    string = "{\\`a} {\\'e} {\\`e} {\\\"o}"
    result = latex_to_unicode(string)
    expected = 'à é è ö'
    self.assertEqual(result, expected)
def test_special_caracter(self):
    """The cedilla command is converted to the cedilla letter."""
    # The backslash is doubled: a bare backslash before "c" is an invalid
    # escape sequence and triggers a warning on modern Python. The string
    # value is unchanged.
    string = '{\\c c}'
    result = latex_to_unicode(string)
    expected = 'ç'
    self.assertEqual(result, expected)
def test_does_not_modify_two_existing_combining(self):
    """Text that already carries two combining marks comes back normalized."""
    # "pho" followed by two combining marks, given as UTF-8 bytes.
    with_marks = b'pho\xcc\x9b\xcc\x89'.decode('utf8')
    self.assertEqual(latex_to_unicode(with_marks), 'phở')  # normalized
def _person_names(article, tag):
    """Return "LastName, ForeName" strings for every ``tag`` element in ``article``."""
    names = []
    for person in article.iter(tag):
        try:
            last_name = person.find('LastName').text
            fore_name = person.find('ForeName').text
        except AttributeError:  # e.g. CollectiveName
            continue
        names.append('{}, {}'.format(last_name, fore_name))
    return names


def _medline_year_month(article):
    """Return ``(year, month)`` taken from the article's MedlineDate text.

    The year is the first four characters; the month is the zero-padded
    number of the month abbreviation at characters 5-8, or ``None`` when
    those characters are not a month abbreviation. ``('', None)`` is
    returned when there is no MedlineDate text at all.
    """
    medline_date = article.find(
        './MedlineCitation/Article/Journal/JournalIssue/PubDate/MedlineDate')
    text = medline_date.text if medline_date is not None else None
    if not text:
        return '', None
    abbreviations = list(calendar.month_abbr)
    token = text[5:8]
    # calendar.month_abbr[0] is '', so an empty token must not be looked up:
    # it would be reported as month "00".
    if token and token in abbreviations:
        return text[:4], '{:02d}'.format(abbreviations.index(token))
    return text[:4], None


def p2b(pmidlist):
    """Fetch PubMed records and return them as BibTeX-style dictionaries.

    Original implementation by Nick Loman.

    :param pmidlist: a single PubMed ID, or a list/tuple of IDs; each ID is
        converted with ``str``.
    :returns: one dict per ``PubmedArticle`` element in the response, with
        every value passed through ``latex_to_unicode``. The keys ``ID``,
        ``Author``, ``Title``, ``Journal``, ``Year`` and ``pmid`` are always
        set; ``Volume``, ``Number``, ``Pages``, ``Month``, ``pmcid``, ``doi``
        and ``ISSN`` only when available. An empty list is returned when
        the HTTP request fails.
    """
    if not isinstance(pmidlist, (list, tuple)):
        pmidlist = [pmidlist]
    ids = [str(pmid) for pmid in pmidlist]

    # Fetch XML data from Entrez.
    efetch = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    url = '{}?db=pubmed&id={}&rettype=abstract'.format(efetch, ','.join(ids))
    try:
        r = requests.get(url)
    except requests.RequestException:
        # Best effort: a failed request yields no entries.
        return []

    article_path = './MedlineCitation/Article'
    issue_path = article_path + '/Journal/JournalIssue'

    # Loop over the returned articles and parse the XML with ElementTree.
    bibout = []
    root = ET.fromstring(r.text)
    for entry in root.iter('PubmedArticle'):
        pmid = entry.find('./MedlineCitation/PMID')
        issn = entry.find(article_path + '/Journal/ISSN')
        volume = entry.find(issue_path + '/Volume')
        issue = entry.find(issue_path + '/Issue')
        year_el = entry.find(issue_path + '/PubDate/Year')
        month_el = entry.find(issue_path + '/PubDate/Month')
        journal = entry.find(article_path + '/Journal/Title')
        article_title = entry.find(article_path + '/ArticleTitle')
        pages = entry.find(article_path + '/Pagination/MedlinePgn')

        # jkb additions
        pmcid = None
        doi = None
        for article_id in entry.findall('./PubmedData/ArticleIdList/ArticleId'):
            id_type = article_id.attrib.get('IdType')
            if id_type == 'pmc':
                pmcid = article_id
            elif id_type == 'doi':
                doi = article_id

        # Format the author list; use the investigators when no author
        # could be read.
        authors = _person_names(entry, 'Author')
        if not authors:
            authors = _person_names(entry, 'Investigator')

        if year_el is None:
            year, month = _medline_year_month(entry)
        else:
            year = year_el.text
            month = month_el.text if month_el is not None else None

        # itertext() also collects text inside inline markup, where .text
        # alone would stop at the first child element or be None.
        title_text = (
            ''.join(article_title.itertext()) if article_title is not None else ''
        )

        # Build the citation key from the first author's last name, up to
        # three title words longer than three characters, and the year.
        authorname = authors[0].split(',')[0] if authors else ''
        titlewords = [word for word in title_text.split(' ') if len(word) > 3]
        titlestring = ''.join(titlewords[:3])
        if not authorname + titlestring:
            titlestring = "PMID{}_".format(pmid.text)
        new_id = re.sub(r'\W+', '', '{}{}{}'.format(authorname, titlestring, year))

        # Make the bibtex formatted output.
        bib = {}
        bib["ID"] = latexchars.replace_accents(new_id)
        bib["Author"] = ' and '.join(authors)
        bib["Title"] = title_text
        bib["Journal"] = journal.text
        bib["Year"] = year
        if volume is not None:
            bib["Volume"] = volume.text
        if issue is not None:
            bib["Number"] = issue.text
        if pages is not None:
            bib["Pages"] = pages.text
        if month is not None:
            bib["Month"] = month
        if pmcid is not None:
            bib["pmcid"] = pmcid.text
        if doi is not None:
            bib["doi"] = doi.text
        if issn is not None:
            bib["ISSN"] = issn.text
        bib["pmid"] = pmid.text

        # Every value is passed through latex_to_unicode before returning.
        bibout.append({key: latex_to_unicode(value) for key, value in bib.items()})
    return bibout