Beispiel #1
0
 def test_basic_2b(self):
     """Encode a French sentence with replacement protection set to 'none'.

     The expected output carries no braces around the generated macros, and
     the ASCII double quotes and percent sign are converted as well.
     """
     encoder = UnicodeToLatexEncoder(replacement_latex_protection='none')
     text = "\"\N{LATIN CAPITAL LETTER A WITH GRAVE} votre sant\N{LATIN SMALL LETTER E WITH ACUTE}!\" s'exclama le ma\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}tre de maison \N{LATIN SMALL LETTER A WITH GRAVE} 100%."
     expected = "''\\`A votre sant\\'e!'' s'exclama le ma\\^\\itre de maison \\`a 100\\%."
     self.assertEqual(encoder.unicode_to_latex(text), expected)
Beispiel #2
0
 def test_rules_03(self):
     """Encode with only the built-in 'unicode-xml' conversion rule set."""
     encoder = UnicodeToLatexEncoder(conversion_rules=['unicode-xml'])
     text = "* \"\N{LATIN CAPITAL LETTER A WITH GRAVE} votre sant\N{LATIN SMALL LETTER E WITH ACUTE}!\" s'exclama\N{SUPERSCRIPT TWO} le ma\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}tre de maison \N{LATIN SMALL LETTER A WITH GRAVE} 100%."
     expected = "{\\ast} \"\\`{A} votre sant\\'{e}!\" s{\\textquotesingle}exclama{^2} le ma\\^{\\i}tre de maison \\`{a} 100\\%."
     self.assertEqual(encoder.unicode_to_latex(text), expected)
Beispiel #3
0
 def test_basic_2d(self):
     """With ``non_ascii_only=False`` the ASCII specials must be escaped."""
     encoder = UnicodeToLatexEncoder(non_ascii_only=False)
     ascii_specials = " \" # $ % & \\ _ { } ~ "
     expected = " '' \\# \\$ \\% \\& {\\textbackslash} \\_ \\{ \\} {\\textasciitilde} "
     self.assertEqual(encoder.unicode_to_latex(ascii_specials), expected)
Beispiel #4
0
    def test_rules_01(self):
        """Mix dict, regex, built-in and callable rules in a single encoder."""

        def eacute_or_ellipsis(text, index):
            # Callable rule: returns (consumed length, replacement) on a
            # match at `index`, or None when this rule does not apply.
            if text[index] == "\N{LATIN SMALL LETTER E WITH ACUTE}":
                return (1, r"{\'{e}}")
            if text.startswith('...', index):
                return (3, r"\ldots")
            return None

        dict_rule = latexencode.UnicodeToLatexConversionRule(
            latexencode.RULE_DICT,
            {
                ord("\N{LATIN CAPITAL LETTER A WITH GRAVE}"): r"{{\`{A}}}",
                ord("%"): r"\textpercent",
            },
        )
        regex_rule = latexencode.UnicodeToLatexConversionRule(
            latexencode.RULE_REGEX,
            [
                (re.compile('v(otre)'), r'n\1'),
                (re.compile("s'exclama", flags=re.I), r"s'exprima"),
                (re.compile('\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}'),
                 r"{\^i}"),
            ],
        )
        callable_rule = latexencode.UnicodeToLatexConversionRule(
            latexencode.RULE_CALLABLE, eacute_or_ellipsis)

        # The rules are passed in this exact order; 'unicode-xml' is a
        # built-in rule set referenced by name.
        encoder = UnicodeToLatexEncoder(conversion_rules=[
            dict_rule,
            regex_rule,
            'unicode-xml',
            callable_rule,
        ])

        text = "\"\N{LATIN CAPITAL LETTER A WITH GRAVE} votre sant\N{LATIN SMALL LETTER E WITH ACUTE}!\" s'exclama le ma\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}tre de maison ... \N{LATIN SMALL LETTER A WITH GRAVE} 100%."
        expected = "\"{{\\`{A}}} notre sant\\'{e}!\" s'exprima le ma{\\^i}tre de maison {\\ldots} \\`{a} 100{\\textpercent}."
        self.assertEqual(encoder.unicode_to_latex(text), expected)
Beispiel #5
0
    def escape_special_chars(self, suppress_warnings: bool = True) -> None:
        """Escapes special characters in the bibliographic data.

        Special characters should be escaped to ensure proper rendering in LaTeX documents. This
        function leverages the existing implementation of the `pylatexenc` module to do said
        conversion. The only fields exempted from the conversion are the `file` and `url` fields of
        the `Entry.data` dictionary. Non-string values are left untouched as well.

        The string values of `self.data` are replaced in place.

        Args:
            suppress_warnings: if True, warnings generated by the `pylatexenc` modules will be
                suppressed. This argument will be overridden if the logging level is set to
                `logging.DEBUG`.
        """
        enc = UnicodeToLatexEncoder(
            non_ascii_only=True,
            replacement_latex_protection="braces-all",
            unknown_char_policy="keep",
            # warnings are emitted when explicitly requested OR when debugging
            unknown_char_warning=not suppress_warnings
            or LOGGER.isEnabledFor(logging.DEBUG),
        )
        for key, value in self.data.items():
            # do NOT escape the `file` and `url` fields (their special characters must be kept),
            # and skip anything which is not a string
            if key in ("file", "url") or not isinstance(value, str):
                continue
            # only existing keys are re-assigned, so mutating during iteration is safe
            self.data[key] = enc.unicode_to_latex(value)
Beispiel #6
0
 def test_basic_1(self):
     """Encode non-ASCII characters only, using 'braces-all' protection.

     In the expected output each accented letter becomes a brace-wrapped
     macro, while the ASCII quotes and percent sign stay unchanged.
     """
     encoder = UnicodeToLatexEncoder(
         non_ascii_only=True,
         replacement_latex_protection='braces-all',
     )
     text = "\"\N{LATIN CAPITAL LETTER A WITH GRAVE} votre sant\N{LATIN SMALL LETTER E WITH ACUTE}!\" s'exclama le ma\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}tre de maison \N{LATIN SMALL LETTER A WITH GRAVE} 100%."
     expected = "\"{\\`A} votre sant{\\'e}!\" s'exclama le ma{\\^\\i}tre de maison {\\`a} 100%."
     self.assertEqual(encoder.unicode_to_latex(text), expected)
Beispiel #7
0
 def test_basic_3b(self):
     """The 'replace' unknown-char policy substitutes a marker and warns."""
     text = "A unicode character: \N{THAI CHARACTER THO THONG}"
     expected = "A unicode character: {\\bfseries ?}"
     # assertLogs fails unless at least one WARNING (or higher) record is
     # emitted on the pylatexenc.latexencode logger inside the block, so
     # the warning itself is part of what is being tested.
     with self.assertLogs(logger='pylatexenc.latexencode',
                          level='WARNING'):
         encoder = UnicodeToLatexEncoder(unknown_char_policy='replace')
         self.assertEqual(encoder.unicode_to_latex(text), expected)
Beispiel #8
0
 def test_basic_callable_replacement_latex_protection(self):
     """A callable may be given as ``replacement_latex_protection``.

     The expected output shows every replacement wrapped by the callable.
     """

     def wrap_in_stars(replacement):
         return '{***{' + replacement + '}***}'

     encoder = UnicodeToLatexEncoder(
         replacement_latex_protection=wrap_in_stars)
     text = "\"\N{LATIN CAPITAL LETTER A WITH GRAVE} votre sant\N{LATIN SMALL LETTER E WITH ACUTE}!\" s'exclama le ma\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}tre de maison \N{LATIN SMALL LETTER A WITH GRAVE} 100%."
     expected = "{***{''}***}{***{\\`A}***} votre sant{***{\\'e}***}!{***{''}***} s'exclama le ma{***{\\^\\i}***}tre de maison {***{\\`a}***} 100{***{\\%}***}."
     self.assertEqual(encoder.unicode_to_latex(text), expected)
Beispiel #9
0
 def test_rules_02(self):
     """Encode with the 'defaults' rule set requested explicitly by name.

     Based on test_basic_0().
     """
     encoder = UnicodeToLatexEncoder(conversion_rules=['defaults'])
     text = "* \"\N{LATIN CAPITAL LETTER A WITH GRAVE} votre sant\N{LATIN SMALL LETTER E WITH ACUTE}!\" s'exclama\N{SUPERSCRIPT TWO} le ma\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}tre de maison \N{LATIN SMALL LETTER A WITH GRAVE} 100%."
     expected = "* ''\\`A votre sant\\'e!'' s'exclama{\\texttwosuperior} le ma{\\^\\i}tre de maison \\`a 100\\%."
     self.assertEqual(encoder.unicode_to_latex(text), expected)
Beispiel #10
0
    def test_basic_3c(self):
        """The 'unihex' policy renders an unknown character as its code point."""
        encoder = UnicodeToLatexEncoder(
            unknown_char_policy='unihex',
            unknown_char_warning=False,
        )
        text = "A unicode character: \N{THAI CHARACTER THO THONG}"
        expected = "A unicode character: \\ensuremath{\\langle}\\texttt{U+0E18}\\ensuremath{\\rangle}"
        self.assertEqual(encoder.unicode_to_latex(text), expected)
Beispiel #11
0
 def test_basic_2a(self):
     """Regression test for issue #44 ('braces-after-macro' protection)."""
     encoder = UnicodeToLatexEncoder(
         replacement_latex_protection='braces-after-macro')
     text = "Jabłoński, François, ⟨.⟩, ~"
     expected = "Jab\\l{}o\\'nski, Fran\\c{c}ois, \\ensuremath{\\langle}.\\ensuremath{\\rangle}, \\textasciitilde{}"
     self.assertEqual(encoder.unicode_to_latex(text), expected)
Beispiel #12
0
 def __init__(self):
     """Build the encoder: one custom regex rule ahead of the default rules."""
     # Custom rule for U+1EC5. The replacement uses a doubled \\ for each
     # LaTeX backslash (see UnicodeToLatexConversionRule).
     custom_rule = UnicodeToLatexConversionRule(
         RULE_REGEX,
         [(re.compile(r'\u1ec5'), r'\\~{\\^{{e}}}')],
     )
     self.u = UnicodeToLatexEncoder(
         # custom rule first, then all the default rules
         conversion_rules=[custom_rule, 'defaults'],
         replacement_latex_protection='braces-almost-all',
     )
Beispiel #13
0
    def test_latex_string_class(self):
        """A custom ``latex_string_class`` receives the output chunk by chunk."""

        class LatexChunkList:
            # Minimal accumulator: records every piece added with `+=`.
            def __init__(self):
                self.chunks = []

            def __iadd__(self, piece):
                self.chunks.append(piece)
                return self

        encoder = UnicodeToLatexEncoder(
            replacement_latex_protection='none',
            latex_string_class=LatexChunkList,
        )
        # The result is an instance of the custom LatexChunkList type.
        collected = encoder.unicode_to_latex("A é → α")
        expected_chunks = [
            'A',
            ' ',
            r'\'e',
            ' ',
            r'\textrightarrow',
            ' ',
            r'\ensuremath{\alpha}',
        ]
        self.assertEqual(collected.chunks, expected_chunks)
def latex_encoder():
    """Return the ``unicode_to_latex`` method of a freshly built encoder.

    The encoder applies the regex rules from ``extra_rules()`` first and the
    built-in 'defaults' rule set afterwards.
    """
    custom_rule = UnicodeToLatexConversionRule(
        rule_type=RULE_REGEX,
        rule=extra_rules(),
    )
    encoder = UnicodeToLatexEncoder(conversion_rules=[custom_rule, 'defaults'])
    return encoder.unicode_to_latex
Beispiel #15
0
    def test_issue_no21(self):
        """Protect acronyms with braces, via a callable rule and a regex rule.

        Test for https://github.com/phfaist/pylatexenc/issues/21
        """
        # Compiled once here instead of on every invocation of the callable.
        acronym_rx = re.compile(r'\b[A-Z]{2,}\w*\b')

        def capitalize_acronyms(s, pos):
            # Callable rule: returns (consumed length, replacement) or None.
            if s[pos] in ('{', '}'):
                # preserve existing braces
                return (1, s[pos])
            m = acronym_rx.match(s, pos)
            if m is None:
                return None
            return (m.end() - m.start(), "{" + m.group() + "}")

        text = "Title with {Some} ABC acronyms LIKe this."
        expected = "Title with {Some} {ABC} acronyms {LIKe} this."

        # Variant 1: callable rule placed ahead of the default rules.
        u = UnicodeToLatexEncoder(conversion_rules=[
            latexencode.UnicodeToLatexConversionRule(latexencode.RULE_CALLABLE,
                                                     capitalize_acronyms),
        ] + latexencode.get_builtin_conversion_rules('defaults'))
        self.assertEqual(u.unicode_to_latex(text), expected)

        # Variant 2: the same behaviour expressed as regex rules.
        u = UnicodeToLatexEncoder(conversion_rules=[
            latexencode.UnicodeToLatexConversionRule(
                latexencode.RULE_REGEX,
                [
                    (re.compile(r'([{}])'), r'\1'),  # keep existing braces
                    (re.compile(r'\b([A-Z]{2,}\w*)\b'), r'{\1}'),
                ]),
        ] + latexencode.get_builtin_conversion_rules('defaults'))
        self.assertEqual(u.unicode_to_latex(text), expected)
Beispiel #16
0
    def escape_special_chars(self, suppress_warnings=True):
        """Escapes special characters.

        Special characters should be escaped to ensure proper rendering in LaTeX documents. This
        function leverages the existing implementation of the pylatexenc module. The `ID` and
        `file` fields as well as non-string values are left untouched; all other string values of
        `self.data` are replaced in place.

        Args:
            suppress_warnings (bool): if True, suppresses warnings. Warnings are emitted
                regardless when the logger is enabled for the DEBUG level.
        """
        enc = UnicodeToLatexEncoder(
            non_ascii_only=True,
            replacement_latex_protection='braces-all',
            unknown_char_policy='keep',
            unknown_char_warning=not suppress_warnings
            or LOGGER.isEnabledFor(10))  # 10 = DEBUG logging level
        for key, value in self.data.items():
            # do NOT escape the `ID` and `file` fields (their special characters must be kept),
            # and skip anything which is not a string
            if key in ('ID', 'file') or not isinstance(value, str):
                continue
            # only existing keys are re-assigned, so mutating during iteration is safe
            self.data[key] = enc.unicode_to_latex(value)
Beispiel #17
0
class BibtexWriter(Writer):
    """Writer that LaTeX-encodes text and abbreviates long person lists."""

    # One encoder is built at class-definition time; its bound
    # `unicode_to_latex` method is stored and shared by all instances.
    latex_encode = UnicodeToLatexEncoder(
        non_ascii_only=True,
        replacement_latex_protection="braces-after-macro",
    ).unicode_to_latex

    def _encode(self, text):
        """Return `text` passed through the shared LaTeX encoder."""
        return self.latex_encode(text)

    def _write_persons(self, stream, persons, role):
        """Write the persons for `role`, shortening lists of more than ten.

        Up to ten persons are delegated to the parent implementation; longer
        lists are written as the first formatted name plus " and others".
        """
        if len(persons) <= 10:
            super(BibtexWriter, self)._write_persons(stream, persons, role)
            return
        first_person = self._format_name(stream, persons[0])
        self._write_field(stream, role, first_person + " and others")
Beispiel #18
0
def latex_encode(text, contains_math=False):
    """Return `text` encoded for use in a LaTeX document.

    Args:
        text: the string to encode; ``None`` is passed through unchanged.
        contains_math (bool): when True and the text contains ``$`` or
            ``\\(``, the text is split with ``MATH_EXPRESSION_REGEX`` and
            only the even-indexed parts are encoded, so the odd-indexed
            parts (presumably the math expressions captured by the regex;
            verify against its definition) are kept as-is to avoid double
            escaping. Note that $$...$$ is not handled.

    Returns:
        The encoded string, or ``None`` when `text` is ``None``.
    """
    if text is None:
        return None

    encode = UnicodeToLatexEncoder(
        replacement_latex_protection="braces-after-macro").unicode_to_latex

    has_math = contains_math and ("$" in text or r"\(" in text)
    if not has_math:
        return encode(text)

    pieces = []
    for index, part in enumerate(MATH_EXPRESSION_REGEX.split(text)):
        # Odd indices are kept verbatim; everything else gets encoded.
        pieces.append(part if index % 2 else encode(part))
    return "".join(pieces)
import os

import time

import locale

import json

import argparse

from tika import parser

from pylatexenc.latexencode import UnicodeToLatexConversionRule, UnicodeToLatexEncoder, RULE_REGEX

# Module-level encoder: a regex rule with an empty rule list, followed by
# the built-in 'defaults' rule set.
encoder = UnicodeToLatexEncoder(conversion_rules=[
    UnicodeToLatexConversionRule(RULE_REGEX, []), 'defaults'
])

# Path of config.json inside the working directory at import time.
config_path = os.path.join(os.getcwd(), "config.json")

# Skeleton configuration; every entry starts out empty.
config_template = {
    "title": {},
    "authors": [],
    "packages": [],
    "commands": {},
    "environments": {}
}

tex_project_template = \
"""
\\documentclass[12pt]{{article}}
Beispiel #20
0
# type hints
from typing import (
    Optional,
    Dict,
    List,
    Any,
    Union,
    TypeVar,
    Generic,
    Tuple,
    Set
)

# Module-level encoder, configured with the 'replace' unknown-character
# policy, "braces" replacement protection and non_ascii_only=True.
encoder: UnicodeToLatexEncoder = \
    UnicodeToLatexEncoder(unknown_char_policy='replace',
                          replacement_latex_protection="braces",
                          non_ascii_only=True)


@dataclass
class XMLItem(ABC):
    """
    Base XML wrapper class. This item consists on a dataclass
    with basically two fields:
     + `tag`, containing the XML tag identifier.
     + `item_tag`, containing the XML tag itself.

    This abstract class defines two abstract methods that must be
    override:
     - :func:`parse`
     - :func:`to_table`
Beispiel #21
0
 def test_basic_2c(self):
     """With ``non_ascii_only=True`` a pure-ASCII string must stay unchanged."""
     encoder = UnicodeToLatexEncoder(non_ascii_only=True)
     ascii_specials = " \" # $ % & \\ _ { } ~ "
     encoded = encoder.unicode_to_latex(ascii_specials)
     self.assertEqual(encoded, ascii_specials)
    def test_all(self):
        """Encode every named Unicode character and diff against a reference.

        One line per named code point is encoded and written to
        ``_tmp_uni_chars_test.temp.txt`` next to this file; the result is
        compared with ``uni_chars_test_previous.txt``. Any difference is
        printed as a unified diff and fails the test.

        Root logging is raised to CRITICAL while generating the output and is
        restored afterwards, even when generation or reading raises.
        """
        root_logger = logging.getLogger()
        loglevel = root_logger.level
        root_logger.setLevel(logging.CRITICAL)

        try:
            u = UnicodeToLatexEncoder(
                unknown_char_policy='fail',
                replacement_latex_protection='braces-almost-all')

            def fn(x,
                   bdir=os.path.realpath(os.path.abspath(
                       os.path.dirname(__file__)))):
                # resolve `x` relative to the directory containing this file
                return os.path.join(bdir, x)

            with codecs.open(fn('_tmp_uni_chars_test.temp.txt'),
                             'w',
                             encoding='utf-8') as testf:

                for i in range(0x10FFFF):
                    # iter over all valid unicode characters
                    # NOTE(review): `unichr` is expected to be provided at
                    # module level (Python 2 builtin or a compatibility
                    # alias) -- confirm.
                    try:
                        chrname = unicodedata.name(unichr(
                            i))  # test if valid, i.e., it has a UNICODE NAME
                    except ValueError:
                        continue

                    line = "0x%04X %-50s    |%s|\n" % (i, '[' + chrname + ']',
                                                       unichr(i))

                    # try to encode it using our unicode_to_latex routines;
                    # lines whose encoding raises ValueError are left out
                    try:
                        enc = u.unicode_to_latex(line)
                    except ValueError:
                        continue
                    testf.write(enc)

            with codecs.open(fn('uni_chars_test_previous.txt'), 'r', encoding='utf-8') as reff, \
                 codecs.open(fn('_tmp_uni_chars_test.temp.txt'), 'r', encoding='utf-8') as testf:
                a = reff.readlines()
                b = testf.readlines()
        finally:
            # always restore the root logger level, also on failure
            root_logger.setLevel(loglevel)

        logger = logging.getLogger(__name__)

        # only check up to the supported unicode range
        if sys.maxunicode < 0x10FFFF:
            logger.warning(
                "Only checking up to unicode U+%X, your python build doesn't support higher",
                sys.maxunicode)
            # each reference line starts with the code point (e.g. "0x00E9 ")
            a = [
                aline for aline in a
                if int(aline[:aline.find(' ')], 0) < sys.maxunicode
            ]

        s = difflib.unified_diff(a,
                                 b,
                                 fromfile='uni_chars_test_previous.txt',
                                 tofile='_tmp_uni_chars_test.temp.txt')
        diffmsg = "".join(s).strip()
        if diffmsg:
            print(diffmsg)
            raise self.failureException(
                "Unicode coverage tests failed. See full diff above.")