# Example 1
def test_find_pattern():
    # A single lowercase letter should produce exactly one match that spans
    # the entire one-character input.
    letter = Pattern("letter", r"[a-z]")
    first = next(scan_tex("a", [letter]))
    assert (first.start, first.end) == (0, 1)
    assert first.pattern.name == "letter"
    assert first.text == "a"
# Example 2
 def parse(self, tex: str) -> Optional[BeginDocument]:
     """Return the span of the first '\\begin{document}' in 'tex', or None if absent."""
     begin_pattern = Pattern("begin_document", r"\\begin{document}")
     matches = scan_tex(tex, [begin_pattern], include_unmatched=False)
     # Only the first occurrence matters; returning from inside the loop
     # covers both the found and not-found cases without try/except.
     for m in matches:
         return BeginDocument(m.start, m.end)
     return None
# Example 3
 def parse(self, tex: str) -> Iterator[LengthAssignment]:
     """Yield a LengthAssignment for each array length-parameter assignment in 'tex'."""
     # Build one alternation of parameter names and one of length units,
     # then combine them into a single assignment regex (e.g. '\arraycolsep = 2pt').
     names = (
         r"(?:" + "|".join([r"\\" + p for p in ARRAY_PARAMETERS]) + ")")
     units = r"(?:" + "|".join(LENGTH_UNITS) + ")"
     assignment_regex = names + r"\s*=\s*[0-9\.]+\s*" + units
     for found in scan_tex(tex, [Pattern("length_assignment", assignment_regex)]):
         yield LengthAssignment(found.start, found.end)
# Example 4
    def parse(self, tex_path: str, tex: str) -> Iterator[Equation]:
        """Scan 'tex' for equation tokens and yield each detected Equation."""
        # Reset per-parse state on the instance; '_process_token' reads and
        # mutates these attributes while matches are consumed.
        self._stack: List[Match] = []  # pylint: disable=attribute-defined-outside-init
        self._tex = tex  # pylint: disable=attribute-defined-outside-init
        self._tex_path = tex_path  # pylint: disable=attribute-defined-outside-init
        self._equation_index = 0  # pylint: disable=attribute-defined-outside-init

        for token in scan_tex(tex, self.PATTERNS):
            yield from self._process_token(token)
# Example 5
def test_find_multiple():
    # Two patterns over "<.>": the scanner should report "<" then ">" in
    # document order, skipping the unmatched "." in between.
    patterns = [Pattern("start", r"<"), Pattern("end", r">")]
    matches = scan_tex("<.>", patterns)

    first = next(matches)
    assert (first.start, first.end) == (0, 1)
    assert first.pattern.name == "start"

    second = next(matches)
    assert (second.start, second.end) == (2, 3)
    assert second.pattern.name == "end"
# Example 6
def test_get_unmatched():
    letter = Pattern("letter", r"[a-z]")
    matches = scan_tex("a.b", [letter], include_unmatched=True)

    # Consume the first token ("a") to get to the unmatched span.
    next(matches)

    # With include_unmatched=True, the "." is reported under the sentinel
    # pattern name "UNKNOWN".
    unmatched = next(matches)
    assert unmatched.pattern.name == "UNKNOWN"
    assert (unmatched.start, unmatched.end) == (1, 2)
    assert unmatched.text == "."
# Example 7
 def parse(self, tex: str) -> Iterator[Bibitem]:
     """
     Yield a Bibitem for each '\\bibitem' entry found in 'tex'.

     Entries whose TeX fails to parse are silently skipped; entries with a
     null citation key are skipped with a warning.
     """
     bibitem_pattern = Pattern("bibitem",
                               r"\\bibitem.*?(?=\\bibitem|\n\n|$|\\end{)")
     for bibitem in scan_tex(tex, [bibitem_pattern]):
         # One malformed entry shouldn't abort scanning of the rest.
         try:
             bibitem_soup = parse_soup(bibitem.text)
         except TexSoupParseError:
             continue
         key = self._extract_key(bibitem_soup)
         # Check the key before extracting the text: text extraction is
         # wasted work for entries that are about to be skipped anyway.
         if key is None:
             logging.warning("Detected bibitem with null key %s. Skipping.",
                             str(bibitem_soup))
             continue
         tokens = self._extract_text(bibitem_soup)
         yield Bibitem(key, tokens)
# Example 8
    def parse(self, tex: str) -> Optional[Documentclass]:
        """
        Find the '\\documentclass' declaration in 'tex' and return its span.

        The returned Documentclass covers the '\\documentclass' macro, its
        required '{...}' argument, and a trailing optional '[...]' argument if
        one immediately follows. Returns None if no complete declaration
        (macro plus required argument) is found.
        """
        patterns = [
            Pattern("documentclass", r"\\documentclass"),
            Pattern("optional_arg", r"\[[^\]]*?\]"),
            Pattern("required_arg", r"{[^}]*?}"),
        ]

        # Small state machine driven by the scanned tokens:
        # "start" -> "awaiting-required-arg" -> "awaiting-optional-arg".
        match_stage = "start"
        start: int = -1  # offset of the '\documentclass' token, once seen
        required_arg = None  # match for the required '{...}' argument, once seen

        scanner = scan_tex(tex, patterns, include_unmatched=True)
        for match in scanner:

            if match_stage == "start":
                if match.pattern.name != "documentclass":
                    continue
                start = match.start
                match_stage = "awaiting-required-arg"

            # Once we hit a token that's not the document class or argument, return the document
            # class if the required argument has been found; otherwise, abort.
            elif match.pattern.name == "UNKNOWN":
                if match_stage == "awaiting-optional-arg":
                    return Documentclass(start, match.start)
                if not match.text.isspace():
                    break

            elif match_stage == "awaiting-required-arg":
                if match.pattern.name == "required_arg":
                    match_stage = "awaiting-optional-arg"
                    required_arg = match

            elif match_stage == "awaiting-optional-arg":
                if match.pattern.name == "optional_arg":
                    end = match.end
                    return Documentclass(start, end)

        # Scanner exhausted (or aborted on a non-space unknown token): if the
        # required argument was seen, the declaration ends at that argument.
        if required_arg is not None:
            return Documentclass(start, required_arg.end)
        return None
# Example 9
    def parse(self, tex_path: str, tex: str) -> Iterator[PlaintextSegment]:
        """
        Extract plaintext segments from the TeX. Some TeX will be replaced (e.g., "\\\\" with "\n",
        equations with "[[math]]"). Other TeX will be skipped (e.g., macros, braces, and brackets).
        The 'text' property of the returned segments can be appended to form a string of plaintext.
        """
        # Mask out math first: equations are located with EquationExtractor and
        # overwritten with '█' characters of equal length, so the later pattern
        # scan treats them as opaque runs while every character offset into the
        # original TeX stays valid.
        masked = tex
        for equation in EquationExtractor().parse(tex_path, tex):
            span_length = equation.end - equation.start
            masked = (masked[:equation.start] + "█" * span_length +
                      masked[equation.end:])

        all_patterns = list(self.REPLACE_PATTERNS.keys()) + self.SKIP_PATTERNS

        # Walk every token: drop skip-pattern matches, substitute replace-pattern
        # matches, and pass any other TeX through as plaintext.
        for match in scan_tex(masked, all_patterns, include_unmatched=True):
            if match.pattern in self.SKIP_PATTERNS:
                continue

            if match.pattern in self.REPLACE_PATTERNS:
                yield PlaintextSegment(
                    text=self.REPLACE_PATTERNS[match.pattern],
                    transformed=True,
                    tex_start=match.start,
                    tex_end=match.end,
                )
            else:
                yield PlaintextSegment(
                    text=match.text,
                    transformed=False,
                    tex_start=match.start,
                    tex_end=match.end,
                )
# Example 10
def _replace_unwanted_commands_with_spaces(tex: str) -> str:
    """
    KaTeX isn't programmed to support the entire vocabulary of LaTeX equation markup (though it
    does support a lot, see https://katex.org/docs/support_table.html).

    For those commands that we don't need to have parsed (e.g., 'label'), this function will
    strip those commands out, so that they do not cause KaTeX to crash or have unexpected
    behavior. 'label', for example, if not removed, will have its argument parsed as an
    equation, and will be identified as consisting of many symbols.

    Returns the TeX with each unwanted command, length assignment, and pattern
    replaced in place by spaces (preserving character offsets).
    """
    # Macros to strip, each with its argument signature (e.g. '#1' = one argument).
    UNWANTED_MACROS = [
        MacroDefinition("ref", "#1"),
        MacroDefinition("label", "#1"),
        MacroDefinition("nonumber", ""),
    ]
    macro_extractor = MacroExtractor()
    for macro_definition in UNWANTED_MACROS:
        for macro in macro_extractor.parse(tex, macro_definition):
            tex = _replace_substring_with_space(tex, macro.start, macro.end)

    # Length assignments (e.g. '\arraycolsep=2pt') also confuse KaTeX; blank them out.
    length_assignment_extractor = EquationLengthAssignmentExtractor()
    length_assignments = length_assignment_extractor.parse(tex)
    for assignment in length_assignments:
        tex = _replace_substring_with_space(tex, assignment.start,
                                            assignment.end)

    # Remaining unsupported constructs: alignment ampersands and 'split' environments.
    UNWANTED_PATTERNS = [
        Pattern("ampersand", "&"),
        Pattern("split_start", begin_environment_regex("split")),
        Pattern("split_end", end_environment_regex("split")),
    ]
    unwanted_matches = scan_tex(tex, UNWANTED_PATTERNS)
    for match in unwanted_matches:
        tex = _replace_substring_with_space(tex, match.start, match.end)

    return tex
# Example 11
def expand_tex(
    tex_dir: Path,
    tex_name: str,
    discover_by: FileDiscoveryStrategy = FileDiscoveryStrategy.EXACT,
    within: Optional[str] = None,
    is_input: bool = False,
) -> Optional[str]:
    """
    Unify the TeX in a file by combining together TeX from the files. The TeX file to be read is
    'tex_name' and it will be looked for in 'tex_dir'.

    Files can be searched for in the tex_dir according to special rules using the 'discover_by'
    parameter. The parameter can tell the method to resolve the TeX filename using the rules that
    are used by the '\\input' or '\\include' macros.

    The 'within' parameter makes sure this function doesn't read files it shouldn't. Input files
    are only expanded if their absolute resolved file path is inside the directory specified by
    'within'. If 'within' is not specified, then it will be set to 'tex_dir'.

    Based loosely on the code from the Perl latexpand utility in TeXLive, which is distributed
    under a BSD license: https://ctan.org/pkg/latexpand?lang=en

    Features not supported by this function are:
    * \\includeonly command (which specifies which \\include scripts to process)
    * handling quotation marks around input or included files. In some cases it will work the
      same as LaTeX does, and in some cases it won't. It seems how files are included
      that have quotes differs by LaTeX version https://tex.stackexchange.com/a/515259/198728
    * expanding files that don't use a 'utf-8'-compatible encoding. TeX files can include
      multiple input encodings, even within the same file. However, this function will not expand
      input that fail to open as UTF-8 files.

    Returns the expanded TeX as a string, or None if the file could not be
    found, read, or was outside the 'within' directory.
    """

    # Resolve path to TeX file, and make sure it's in a valid directory.
    within = os.path.abspath(os.path.realpath(within or tex_dir))
    qualified_tex_path = os.path.abspath(
        os.path.realpath(os.path.join(tex_dir, tex_name))
    )
    if os.path.commonpath([within, qualified_tex_path]) != within:
        logging.warning(  # pylint: disable=logging-not-lazy
            "TeX macro attempted to import file %s which is not in %s. This is forbidden. "
            + "This file will not be expanded.",
            qualified_tex_path,
            within,
        )
        return None

    # Add '.tex' extension to the file name if it is being imported using an '\include' macro.
    if discover_by == FileDiscoveryStrategy.INCLUDE:
        qualified_tex_path += ".tex"
    # Add the '.tex' extension to the file name as done for by the '\input' macro. As mentioned in
    # the TeXBook, "TEX automatically supplies the suffix '.tex' if no suffix has been specified."
    elif discover_by == FileDiscoveryStrategy.INPUT:
        if len(os.path.splitext(qualified_tex_path)[1]) == 0:
            qualified_tex_path += ".tex"

    if not os.path.exists(qualified_tex_path):
        logging.warning(  # pylint: disable=logging-not-lazy
            "Could not find file '%s' in directory '%s'. No text will be read from this file.",
            tex_name,
            tex_dir,
        )
        return None

    input_patterns = [
        # Put patterns with braces before those without braces so they have priority in matching.
        Pattern("input_braces", r"\\input\s*{([^}]+)}"),
        Pattern("input_quotes", r'\\input\s+"([^"]+)"'),
        Pattern("input", r"\\input\s+(\S+)"),
    ]
    # Note that while it's supported here, '\include' seem to be pretty rare in research papers.
    # In a specific sample of about 120 conference papers, only 5 had '\include' macros, yet
    # many more had '\input' commands). Only 1 used an '\include' macro to read in text.
    # The rest of the files used '\include' macros to include macros and usepackage statements.
    # XXX(andrewhead): The 'includes' patterns are currently disabled because the TeX that is
    # being inserted in their place is incorrect (i.e., it causes compilation errors).
    include_patterns: List[Pattern] = [
        # Pattern("include_braces", r"\\include\s*{([^}]+)}"),
        # Pattern("include", r"\\include\s+(\S+)"),
    ]
    endinput_pattern = Pattern("endinput", r"\\endinput( |\t|\b|\{.*?\})")
    patterns = input_patterns + include_patterns + [endinput_pattern]

    # Read TeX for a file.
    with open(qualified_tex_path, encoding="utf-8") as tex_file:
        try:
            tex = tex_file.read()
        except Exception as e:  # pylint: disable=broad-except
            logging.warning(  # pylint: disable=logging-not-lazy
                "Could not read file at %s due to error: %s. The TeX for this file will "
                + "not be expanded",
                qualified_tex_path,
                e,
            )
            return None

    replacements: List[Union[Expansion, EndInput]] = []
    end_file_at = None

    # Scan file for input macros, expanding them.
    for match in scan_tex(tex, patterns):

        # If a file is being read and the '\endinput' macro is reached, end output at the end of
        # the line that \endinput appears on. See the TeXBook for a description of the how
        # \endinput macro is handled.
        if match.pattern is endinput_pattern:
            replacements.append(EndInput(start=match.start, end=match.end))

            # Find the newline after the \endinput, after which no more inputs should be expanded
            # and the file should be truncated.
            if end_file_at is None:
                end_of_line = re.compile("$", flags=re.MULTILINE)
                end_of_line_match = end_of_line.search(tex, pos=match.end)
                if end_of_line_match:
                    end_file_at = end_of_line_match.start()

            # Bug fix: always proceed to the next match here. Previously, a second
            # '\endinput' match (seen once 'end_file_at' was already set) fell
            # through to the input-macro handling below and was wrongly treated
            # as an '\input', producing spurious warnings and file lookups.
            continue

        # For input macros (e.g., '\input', '\include', ...)
        # Re-run the pattern against the matched text to extract the path to the file
        # that is meant to be included.
        match_with_groups = re.match(match.pattern.regex, match.text)
        if match_with_groups is None or len(match_with_groups.groups()) < 1:
            logging.warning(  # pylint: disable=logging-not-lazy
                "Unexpected error in extracting path for input / include command %s using "
                + "regular expression %s",
                match.text,
                match.pattern.regex,
            )
            continue
        input_path = match_with_groups.group(1)

        # Clean up the path
        # In TeX, paths are specified in Unix format. Convert to platform-specific path format
        # to let the program search for and read the file.
        input_path = input_path.strip().replace(posixpath.sep, os.path.sep)

        # Expand the input by reading in the expanded text in the input file.
        discovery_strategy = (
            FileDiscoveryStrategy.INCLUDE
            if match.pattern in include_patterns
            else FileDiscoveryStrategy.INPUT
        )
        input_tex = expand_tex(
            # All inputs from expanded files will be resolved relative to the main
            # directory of the project (i.e., the one where the TeX executable is invoked):
            # https://tex.stackexchange.com/a/39084/198728
            tex_dir,
            input_path,
            discover_by=discovery_strategy,
            is_input=True,
            # Specify the 'within' parameter to make sure that all expanded files reside
            # in the directory where the main TeX file was expanded.
            within=within,
        )
        if input_tex is None:
            logging.warning(  # pylint: disable=logging-not-lazy
                "Could not read input TeX file %s included from file %s in directory %s. "
                + "This input macro will not be expanded.",
                input_path,
                tex_name,
                tex_dir,
            )
            continue

        if match.pattern in include_patterns:
            input_tex = INCLUDE_EXPANSION.replace("<CONTENTS>", input_tex)
            input_tex = input_tex.replace("<FILENAME>", input_path)

        replacements.append(Expansion(start=match.start, end=match.end, tex=input_tex))

    # Truncate the TeX file after the end of a line where the first '\endinput' macro appears.
    expanded = tex
    if end_file_at is not None:
        expanded = expanded[:end_file_at]

    # Apply the expansions to the TeX, back to front so earlier offsets stay valid.
    for replacement in reversed(replacements):
        if end_file_at is not None and replacement.start >= end_file_at:
            continue
        if isinstance(replacement, EndInput):
            expanded = expanded[: replacement.start] + "" + expanded[replacement.end :]
            continue
        if isinstance(replacement, Expansion):
            expanded = (
                expanded[: replacement.start]
                + replacement.tex
                + expanded[replacement.end :]
            )

    return expanded
# Example 12
def extract_plaintext(tex_path: str, tex: str) -> JournaledString:
    """
    Extracts plaintext from TeX. Some TeX will be replaced (e.g., "\\\\" with "\n",
    equations with "<<equation-{id}>>"). Other TeX will be skipped (e.g., macros, braces, and brackets).

    The returned string is a 'JournaledString', which contains helper functions that allows
    the client to map from character offsets in the plaintext string back to character offsets in
    the original 'tex' string provided as input to this function.

    It's definitely not perfect: this extracted text will include text extracted from many
    command arguments, because we knew sometimes it would be wanted, and
    other times it wouldn't. Without more sophisticated macro processing, it's not possible to
    tell which arguments would be rendered as text and which wouldn't.

    For the use case of sentence boundary detection, spurious macro arguments are often
    okay to keep in the text as they only infrequently influence the detected boundaries. To
    support other natural language processing tasks, this extractor may need to be further refined.
    """
    # Patterns of text that should be replaced with other plaintext.
    # Values are 're.sub' replacement templates; '\\1' refers to the pattern's
    # first capture group.
    REPLACE_PATTERNS = {
        # Separate sections and captions text from the rest of the text.
        Pattern("section", r"\s*\\(?:sub)*section\*?\{([^}]*)\}\s*"):
        "\n\n\\1.\n\n",
        Pattern("paragraph", r"\s*\\paragraph*?\{([^}]*)\}\s*"):
        "\n\n\\1.\n\n",
        Pattern("caption", r"(.)(?=\\caption\*?\{)"):
        "\\1\n\n",
        # Replace commands for which colorizing the contents will lead to compilation failures.
        CITATION_PATTERN:
        "Citation (\\1)",
        Pattern("label", r"\\label\{([^}]+)\}"):
        "(Label \\1)",
        Pattern("ref", r"\\(?:page|c)?ref\{([^}]+)\}"):
        "(Ref \\1)",
        Pattern("glossary_term", r"\\gls(?:pl)?\*?\{([^}]+)\}"):
        "Glossary term (\\1)",
        # Replace TeX source spaces with semantic spacing.
        Pattern("linebreak_keep", r"(\\\\|\\linebreak)|\n(\s)*\n\s*"):
        "\n",
        Pattern("linebreak_ignore", r"\n"):
        " ",
        Pattern("space_macro", r"\\[ ,]"):
        " ",
        Pattern("tilde", r"~"):
        " ",
        # Replace characters that need to be escaped in TeX with unescaped text.
        Pattern("ampersand", r"\\&"):
        "&",
    }

    # Patterns of text the extractor should skip.
    SKIP_PATTERNS = [
        # Include specific macros first, before the more general-purpose 'macro'.
        Pattern("input", r"\\(input|include)(\s+\S+|\{[^}]+\})"),
        # Many patterns below were written with reference to the LaTeX tokenizer in Python's
        # 'doctools' sources at:
        # http://svn.python.org/projects/doctools/converter/converter/tokenizer.py
        Pattern("environment_tags", r"\\(begin|end)\{[^}]*\}"),
        Pattern("macro", r"\\[a-zA-Z]+\*?[ \t]*"),
        RIGHT_BRACE,
        LEFT_BRACE,
        Pattern("left_bracket", r"\["),
        Pattern("right_bracket", r"\]"),
        # The following macros are a backslash followed by an ASCII symbol. This pattern was
        # written with reference to the command list at:
        # http://www.public.asu.edu/~rjansen/latexdoc/ltx-2.html
        # Pattern("symbol_macro", r"\\[@=><+'`-]"),
    ]

    # All math equations will be replaced in plaintext with the text "<<equation-{id}>>".
    # This ID should be the same as the one output by the equation pipeline.
    # Edit in reverse so earlier equations' offsets are not invalidated.
    plaintext = JournaledString(tex)
    equation_extractor = EquationExtractor()
    equations = list(equation_extractor.parse(tex_path, tex))
    for equation in reversed(equations):
        plaintext = plaintext.edit(equation.start, equation.end,
                                   f"<<equation-{equation.id_}>>")

    patterns = list(REPLACE_PATTERNS.keys()) + SKIP_PATTERNS
    scanner = scan_tex(str(plaintext), patterns, include_unmatched=True)

    # If the scanner yields a span of text, the span is either:
    # 1. a pattern to skip
    # 2. a pattern to replace
    # 3. some other uncommented text
    # If some span of text is not returned by the scanner, then it is a comment,
    # or some other text that the scanner ignores. That text should be removed from the
    # plain text as if it was a pattern to skip.
    # Iterate over matches in reverse so as not to mess up character offsets for
    # earlier matches when replacing TeX in the string.
    # 'keep_after' tracks the start of the last span of text that has been kept;
    # any gap between a match and 'keep_after' was never yielded by the scanner
    # (e.g., a comment) and is deleted.
    keep_after = len(plaintext)
    for match in reversed(list(scanner)):
        if match.end < keep_after:
            # Delete the unscanned gap between this match and the kept text.
            plaintext = plaintext.edit(match.end, keep_after, "")
            keep_after = match.end
        if match.pattern in REPLACE_PATTERNS:
            plaintext = plaintext.edit(
                match.start,
                match.end,
                re.sub(match.pattern.regex, REPLACE_PATTERNS[match.pattern],
                       match.text),
            )
        if match.pattern not in SKIP_PATTERNS:
            # Replaced and plain spans are kept; skip-pattern spans are not,
            # so 'keep_after' is left at their end to delete them next round.
            keep_after = match.start

    if keep_after > 0:
        plaintext = plaintext.edit(0, keep_after, "")

    # Finally, remove adjacent periods (which interfere with the pysbd sentence
    # segmenter), which may only be adjacent because the TeX grouping has been removed.
    # Do a lookahead for the last period (don't include it in the match) in order
    # to change as little of the original TeX as possible, to make it easier to map
    # back from the original period position (which will often occur at the end of
    # an extracted sentence) to its precise position in the original TeX.
    for m in reversed(list(re.finditer(r"[\s\.]+(?=\.)", str(plaintext)))):
        plaintext = plaintext.edit(m.start(), m.end(), "")

    return plaintext
# Example 13
def test_ignore_escaped_comment():
    # An escaped percent sign ("\%") does not start a comment, so the letter
    # that follows it is still visible to the scanner.
    letter = Pattern("letter", r"[a-z]")
    first = next(scan_tex("\\%a", [letter]))
    assert first.text == "a"
# Example 14
def test_skip_comments():
    # Everything from "%" to the end of its line is a comment; only the "a"
    # on the next line should be matched, at offsets past the comment.
    letter = Pattern("letter", r"[a-z]")
    first = next(scan_tex("%a\na", [letter]))
    assert (first.start, first.end) == (3, 4)
    assert first.text == "a"