Example #1
0
File: query.py Project: rynxr/pubs
 def _normalize(self, s):
     """Prepare ``s`` for comparison according to this object's flags.

     With ``self.strict`` set the string is returned untouched. Otherwise
     LaTeX markup is converted to Unicode, the result is NFC-normalized,
     and it is lower-cased unless ``self.case`` is set.
     """
     if self.strict:
         return s
     # Note: in theory latex_to_unicode also normalizes
     normalized = unicodedata.normalize('NFC', latex_to_unicode(s))
     if self.case:
         return normalized
     return normalized.lower()
Example #2
0
File: query.py Project: pubs/pubs
 def _normalize(self, s):
     """Return the form of ``s`` used for matching.

     Strict mode (``self.strict``) leaves ``s`` as it is. In non-strict
     mode the text goes through ``latex_to_unicode`` and NFC
     normalization; case is folded to lower case when ``self.case`` is
     falsy.
     """
     if not self.strict:
         # Note: in theory latex_to_unicode also normalizes
         text = unicodedata.normalize('NFC', latex_to_unicode(s))
         return text if self.case else text.lower()
     return s
Example #3
0
def bibent_to_fmt(bibent, fmt):
    """Render one BibTeX entry as a single-line citation.

    :param bibent: mapping for the entry. ``'ID'``, ``'title'`` and
        ``'author'`` are required; ``'year'`` is optional. Venue and URL
        are looked up through ``bibent_get_venue`` / ``bibent_get_url``.
    :param fmt: ``"markdown"`` (footnote-style entry) or ``"text"``.
    :returns: the formatted citation string.
    :raises ValueError: if ``fmt`` is neither ``"markdown"`` nor ``"text"``
        (the problem is also reported through ``print_error``).
    """
    citation_key = bibent['ID']
    title = bibent['title'].strip("{}")
    # Convert letters accented via LaTeX (e.g., {\'{e}}) to Unicode, so they display in Markdown
    authors = latex_to_unicode(bibent['author'])
    year = bibent['year'] if 'year' in bibent else None

    authors = authors.replace("{", "").replace("}", "")
    # Author names are sometimes given on separate lines, which breaks the
    # Markdown formatting. The result must be assigned back to ``authors``
    # (it was previously stored under a misspelled name and then ignored).
    authors = authors.replace('\n', ' ').replace('\r', '')
    # beautiful-jekyll is not that beautiful and doesn't like '+' in footnote names
    citation_key = citation_key.replace("+", "plus")

    # Validate the format once, up front; every later step can rely on it.
    if fmt == "markdown":
        markdown = True
        separator = ", "
        parts = ["[^" + citation_key + "]: **" + title + "**, by " + authors]
    elif fmt == "text":
        markdown = False
        separator = "; "
        parts = ["[" + citation_key + "] " + title + "; by " + authors]
    else:
        print_error("Unknown format: " + fmt)
        # Raising a plain string is not valid in Python 3; use a real exception.
        raise ValueError("Unknown format: " + fmt)

    venue = bibent_get_venue(bibent)
    if venue is not None:
        parts.append("*in " + venue + "*" if markdown else "in " + venue)

    if year is not None:
        parts.append(year)

    url = bibent_get_url(bibent)
    if url is not None:
        parts.append("[[URL]](" + url + ")" if markdown else url)

    return separator.join(parts)
Example #4
0
def convert_to_unicode(record):
    """
    Replace LaTeX accent markup in every field of a record with Unicode.

    List values are converted element by element, dict values are
    converted value by value (keys are left alone), and any other value
    is handed to ``latex_to_unicode`` directly. The record is updated in
    place.

    :param record: the record.
    :type record: dict
    :returns: dict -- the same record object, after conversion.
    """
    for field in record:
        value = record[field]
        if isinstance(value, list):
            converted = [latex_to_unicode(item) for item in value]
        elif isinstance(value, dict):
            converted = {
                key: latex_to_unicode(inner)
                for key, inner in value.items()
            }
        else:
            converted = latex_to_unicode(value)
        record[field] = converted
    return record
Example #5
0
def abbreviate_firstname(name: str, sep: str = " ") -> str:
    """
    Reduce the first name(s) in a "Lastname, Firstnames" string to initials.

    For example::

        de Geus, Thomas Willem Jan ->
        de Geus, T. W. J.

    A name without a comma is returned unchanged.

    :param name: The name formatted as "Lastname, firstname secondname ...".
    :param sep: Separator to place between initials.
    :return: Formatted name.
    :raises OSError: If the name contains more than one comma.
    """

    pieces = name.split(",")

    if len(pieces) == 1:
        return name

    if len(pieces) > 2:
        raise OSError(f'Unable to interpret name "{name}"')

    # Substitutions applied, in this order, to every first-name token.
    rules = (
        (re.compile(r"(.*)(\(.*\))", re.UNICODE), r"\1"),
        (
            re.compile(r"([\w][\}]*)([\w0-9\{\}\`\'\"\\\.\^\{]*)", re.UNICODE),
            r"\1.",
        ),
        (re.compile(r"([\w\.][\-]?)([\ ]*)", re.UNICODE), r"\1"),
    )

    last, first = pieces
    first = latex_to_unicode(first)
    first = first.replace(".", ". ")
    first = first.replace("-", "- ")
    first = first.replace(r"\. ", r"\.")
    first += " "

    # Tokenise on whitespace; the leading token is discarded.
    tokens = [
        latex_to_unicode(found[0])
        for found in re.findall(r"([^\s]*)(\s+)", first)
    ]
    tokens = tokens[1:]

    initials = []
    for token in tokens:
        for pattern, replacement in rules:
            token = re.sub(pattern, replacement, token)
        initials.append(token)

    joined = sep.join([rm_unicode(item) for item in initials])
    return last + ", " + joined.upper()
def convert_to_unicode(record):
    """
    Turn LaTeX-style accents in a record into their Unicode equivalents.

    Every value of the record is rewritten in place: lists item by item,
    dicts value by value (keys unchanged), everything else through a
    single ``latex_to_unicode`` call.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record (the object that was passed in).
    """
    for name in record:
        current = record[name]
        if isinstance(current, list):
            record[name] = [latex_to_unicode(element) for element in current]
            continue
        if isinstance(current, dict):
            record[name] = dict(
                (key, latex_to_unicode(item)) for key, item in current.items()
            )
            continue
        record[name] = latex_to_unicode(current)
    return record
 def decode(entry):
     """Recursively apply ``latex_to_unicode`` to a nested structure.

     Lists and dicts are rebuilt with their items / values decoded
     (dict keys are kept); any other value is converted directly, and is
     returned unchanged if the conversion raises.
     """
     if isinstance(entry, list):
         return [decode(item) for item in entry]
     if isinstance(entry, dict):
         return {key: decode(value) for key, value in entry.items()}
     # TODO: latex_to_unicode sometimes fails with exception. I do
     # not understand why, but let's just sweep it under the rug
     # for now ok? Great.
     try:
         converted = latex_to_unicode(entry)
     except Exception:
         return entry
     return converted
Example #8
0
def prepare_string(x, max_length=None):
    """Convert LaTeX escapes in ``x`` to Unicode and optionally truncate.

    The input is passed through ``filter_using_re`` and
    ``latex_to_unicode``; the result is returned via ``smart_text``.

    :param x: the raw string to convert.
    :param max_length: if given, results longer than this are cut so that
        the text plus a trailing ``'[...]'`` marker is ``max_length``
        characters long. When ``max_length`` is too small to hold the
        marker, the text is simply cut to ``max_length`` characters.
    :returns: the converted string, or ``""`` if a ``TypeError`` occurred
        during processing (a warning is logged in that case).
    """
    # Alternative considered: latex2text(x, tolerant_parsing=True)
    marker = '[...]'
    try:
        data = latex_to_unicode(filter_using_re(x))
        if max_length is not None and len(data) > max_length:
            if max_length >= len(marker):
                data = data[:max_length - len(marker)] + marker
            else:
                # A negative slice bound here would keep most of the string
                # and produce a result longer than max_length.
                data = data[:max(max_length, 0)]
        return smart_text(data)
    except TypeError:
        # Lazy %-args: formatting cannot itself fail on odd values of x.
        logger.warning(
            "Encountered a TypeError which may be linked to unicode handling "
            "in bibtexparser when processing the following string: %s.", x)
    return ""
Example #9
0
def bibfile_latex_to_unicode(bibtex_fname):
    """Load a BibTeX file, convert its fields to Unicode and re-serialise it.

    For each entry ``delete_field`` is called for the ``'file'`` field,
    then every field value of the entry is passed through
    ``latex_to_unicode``. Comments are cleared before writing.

    :param bibtex_fname: path of the BibTeX file to read.
    :returns: whatever ``BibTexWriter.write`` produces for the cleaned
        database.
    """
    bib_parser = BibTexParser(common_strings=True)
    with open(bibtex_fname) as handle:
        bibdb = bibtexparser.load(handle, parser=bib_parser)

    for index, entry in enumerate(bibdb.entries):
        delete_field(bibdb, index, 'file')
        for field, raw_value in entry.items():
            bibdb.entries[index][field] = latex_to_unicode(raw_value)

    bibdb.comments = []

    bib_writer = BibTexWriter()
    bib_writer.display_order = ['title', 'year', 'author', 'journal', 'booktitle']
    # Use for debug purposes:
    # with open('tmp.bib','w') as f:
    #     f.write(bib_writer.write(bibdb))
    return bib_writer.write(bibdb)
Example #10
0
def latex_to_ascii(tex):
    r""" Reduce a LaTeX string to plain ascii text, dropping accents

    The substitutions stored in ``latex_to_ascii.dict`` are applied first,
    then the text is converted to unicode, NFD-decomposed, and every
    character that cannot be encoded as ascii is discarded.

    Args:
        tex (str): LaTeX string

    Returns:
        str: unicode string containing only ascii characters

    Examples:
        >>> latex_to_ascii(r"\^ile")
        'ile'
        >>> latex_to_ascii(r"\^ile") == latex_to_ascii('île')
        True
        >>> latex_to_ascii(r"Bartoszy\'nski Ros\l anowski")
        'Bartoszynski Rosl anowski'
    """
    for pattern, replacement in latex_to_ascii.dict.items():
        tex = tex.replace(pattern, replacement)

    decomposed = unicodedata.normalize('NFD', latex_to_unicode(tex))
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8')
Example #11
0
 def test_ignores_trailing_modifier(self):
     """An accent command at the end of the input, with nothing after it, is dropped."""
     converted = latex_to_unicode("a\\\'")
     self.assertEqual(converted, 'a')
 def test_accents(self):
     """Braced grave, acute and umlaut accent commands become accented letters."""
     # Raw literal: same value as before, but without the invalid ``\``` escape
     # sequences that newer Python versions warn about.
     string = r"{\`a} {\'e} {\`e} {\"o}"
     result = latex_to_unicode(string)
     expected = 'à é è ö'
     self.assertEqual(result, expected)
 def test_ignores_trailing_modifier(self):
     """A dangling accent command after the last letter leaves just the letter."""
     self.assertEqual(latex_to_unicode("a\\\'"), 'a')
 def test_special_caracter(self):
     """The cedilla command ``\\c`` applied to ``c`` yields ``ç``."""
     # Raw literal: same value as before, but ``\c`` is no longer an invalid
     # escape sequence in a normal string literal.
     string = r'{\c c}'
     result = latex_to_unicode(string)
     expected = 'ç'
     self.assertEqual(result, expected)
 def test_does_not_modify_two_existing_combining(self):
     """Text that already carries two combining marks comes back in normalized form."""
     decoded = b'pho\xcc\x9b\xcc\x89'.decode('utf8')
     self.assertEqual(latex_to_unicode(decoded), 'phở')  # normalized
Example #16
0
 def test_accents(self):
     """Grave, acute and umlaut accent commands in braces convert to Unicode letters."""
     # A raw string keeps the exact same characters while avoiding the
     # invalid ``\``` escape sequence of the non-raw spelling.
     string = r"{\`a} {\'e} {\`e} {\"o}"
     result = latex_to_unicode(string)
     expected = 'à é è ö'
     self.assertEqual(result, expected)
Example #17
0
 def test_does_not_modify_two_existing_combining(self):
     """A base letter followed by two combining marks is returned normalized."""
     raw_bytes = b'pho\xcc\x9b\xcc\x89'
     actual = latex_to_unicode(raw_bytes.decode('utf8'))
     self.assertEqual(actual, 'phở')  # normalized
Example #18
0
 def test_special_caracter(self):
     """``{\\c c}`` (cedilla on ``c``) converts to ``ç``."""
     # Raw string: identical characters, no invalid ``\c`` escape sequence.
     string = r'{\c c}'
     result = latex_to_unicode(string)
     expected = 'ç'
     self.assertEqual(result, expected)
Example #19
0
def _p2b_text(element):
    """Return ``element.text``, or '' when the element or its text is missing."""
    if element is None or element.text is None:
        return ''
    return element.text


def _p2b_person_names(article, tag):
    """Collect "Last, Fore" strings for every ``tag`` element below ``article``."""
    names = []
    for person in article.iter(tag):
        try:
            last_name = person.find('LastName').text
            fore_name = person.find('ForeName').text
        except AttributeError:  # e.g. CollectiveName
            continue
        names.append('{}, {}'.format(last_name, fore_name))
    return names


def _p2b_year_month(article, year_el, month_el):
    """Return ``(year, month)`` for an article; ``month`` is None when unknown."""
    if year_el is not None:
        return year_el.text, (month_el.text if month_el is not None else None)

    # No <Year>: fall back to the free-form <MedlineDate> text, reading the
    # first four characters as the year and characters 5-7 as a month name.
    medline_date = article.find(
        './MedlineCitation/Article/Journal/JournalIssue/PubDate/MedlineDate')
    text = _p2b_text(medline_date)
    if not text:
        return '', None
    try:
        month_index = list(calendar.month_abbr).index(text[5:8])
    except ValueError:  # not a month abbreviation (season, year range, ...)
        month_index = 0
    # calendar.month_abbr[0] is the empty string, so index 0 means "no month"
    # and must not be emitted as month "00".
    month = '{:02d}'.format(month_index) if month_index else None
    return text[:4], month


def p2b(pmidlist):
    ''' by Nick Loman

    Fetch PubMed records from Entrez and convert them to BibTeX-style dicts.

    :param pmidlist: a single PubMed ID or a list of IDs (str or int).
    :returns: one dict per ``PubmedArticle`` in the response, with keys
        ``ID``, ``Author``, ``Title``, ``Journal``, ``Year``, ``pmid`` and,
        when available, ``Volume``, ``Number``, ``Pages``, ``Month``,
        ``pmcid``, ``doi`` and ``ISSN``. All values are passed through
        ``latex_to_unicode``. An empty list is returned when the HTTP
        request fails.
    '''

    if not isinstance(pmidlist, list):
        pmidlist = [pmidlist]
    # str() every id so that integer PMIDs inside a list can be joined too.
    ids = [str(pmid) for pmid in pmidlist]

    ## Fetch XML data from Entrez.
    efetch = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    url = '{}?db=pubmed&id={}&rettype=abstract'.format(efetch, ','.join(ids))
    try:
        r = requests.get(url)
    except requests.RequestException:
        # Network-level failure: report "nothing found" instead of raising.
        return []
    ##print(r.text) # to examine the returned xml
    ## Loop over the PubMed IDs and parse the XML using https://docs.python.org/2/library/xml.etree.elementtree.html
    bibout = []
    root = ET.fromstring(r.text)
    for PubmedArticle in root.iter('PubmedArticle'):
        PMID = PubmedArticle.find('./MedlineCitation/PMID')
        ISSN = PubmedArticle.find('./MedlineCitation/Article/Journal/ISSN')
        Volume = PubmedArticle.find(
            './MedlineCitation/Article/Journal/JournalIssue/Volume')
        Issue = PubmedArticle.find(
            './MedlineCitation/Article/Journal/JournalIssue/Issue')
        Year = PubmedArticle.find(
            './MedlineCitation/Article/Journal/JournalIssue/PubDate/Year')
        Month = PubmedArticle.find(
            './MedlineCitation/Article/Journal/JournalIssue/PubDate/Month')
        Title = PubmedArticle.find('./MedlineCitation/Article/Journal/Title')
        ArticleTitle = PubmedArticle.find(
            './MedlineCitation/Article/ArticleTitle')
        MedlinePgn = PubmedArticle.find(
            './MedlineCitation/Article/Pagination/MedlinePgn')

        # jkb additions
        PMCID = None
        DOI = None
        for thisid in PubmedArticle.findall(
                './PubmedData/ArticleIdList/ArticleId'):
            id_type = thisid.attrib.get('IdType')
            if id_type == 'pmc':
                PMCID = thisid
            elif id_type == 'doi':
                DOI = thisid

        # format author list; use InvestigatorList when there are no authors
        authors = (_p2b_person_names(PubmedArticle, 'Author')
                   or _p2b_person_names(PubmedArticle, 'Investigator'))

        year, month = _p2b_year_month(PubmedArticle, Year, Month)

        # A missing title element (or one with no text) is treated as empty
        # rather than crashing on ``None``.
        article_title = _p2b_text(ArticleTitle)
        journal_title = _p2b_text(Title)

        # make the bibtex formatted output.
        authorname = authors[0].split(',')[0] if authors else ''
        # Citation key: first author's last name + up to three title words
        # longer than three characters + year.
        titlewords = [x for x in article_title.split(' ') if len(x) > 3]
        titlestring = ''.join(titlewords[:3])
        if len(authorname + titlestring) == 0:
            titlestring = "PMID{}_".format(PMID.text)
        new_id = '{}{}{}'.format(authorname, titlestring, year)
        new_id = re.sub(r'\W+', '', new_id)

        bib = {}
        bib["ID"] = latexchars.replace_accents(new_id)
        bib["Author"] = ' and '.join(authors)
        bib["Title"] = article_title
        bib["Journal"] = journal_title
        bib["Year"] = year
        if Volume is not None:
            bib["Volume"] = Volume.text
        if Issue is not None:
            bib["Number"] = Issue.text
        if MedlinePgn is not None:
            bib["Pages"] = MedlinePgn.text
        if month is not None:
            bib["Month"] = month
        if PMCID is not None:
            bib["pmcid"] = PMCID.text
        if DOI is not None:
            bib["doi"] = DOI.text
        if ISSN is not None:
            bib["ISSN"] = ISSN.text
        bib["pmid"] = PMID.text
        # always return clean latex
        bib = {key: latex_to_unicode(value) for key, value in bib.items()}
        bibout.append(bib)
    return bibout