Example #1
def test_find_multiple():
    scanner = scan_tex("<.>", [Pattern("start", r"<"), Pattern("end", r">")])

    match1 = next(scanner)
    assert match1.start == 0
    assert match1.end == 1
    assert match1.pattern.name == "start"

    match2 = next(scanner)
    assert match2.start == 2
    assert match2.end == 3
    assert match2.pattern.name == "end"
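
The 'scan_tex' and 'Pattern' helpers used throughout these examples are not shown in this section. Below is a minimal sketch of the presumed interface, with names and fields inferred from the tests above and below; it is an assumption, not the actual implementation (in particular, the real scanner also skips TeX comments, which this sketch omits).

import re
from dataclasses import dataclass
from typing import Iterator, List

@dataclass(frozen=True)
class Pattern:
    name: str
    regex: str

@dataclass(frozen=True)
class Match:
    pattern: Pattern
    text: str
    start: int
    end: int

def scan_tex(tex: str, patterns: List[Pattern],
             include_unmatched: bool = False) -> Iterator[Match]:
    # Try every pattern from the current position and yield the earliest match,
    # breaking ties in favor of the pattern listed first.
    unknown = Pattern("UNKNOWN", "")
    pos = 0
    while pos < len(tex):
        candidates = [(p, m) for p in patterns
                      for m in [re.compile(p.regex).search(tex, pos)] if m]
        if not candidates:
            break
        pattern, m = min(candidates, key=lambda c: c[1].start())
        if include_unmatched and m.start() > pos:
            yield Match(unknown, tex[pos:m.start()], pos, m.start())
        yield Match(pattern, m.group(), m.start(), m.end())
        pos = max(m.end(), pos + 1)  # always advance, even on zero-width matches
    if include_unmatched and pos < len(tex):
        yield Match(unknown, tex[pos:], pos, len(tex))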
Example #2
    def _parse(self, tex: str,
               macro_definition: MacroDefinition) -> Iterator[Macro]:
        self.scanner = TexScanner(tex)  # pylint: disable=attribute-defined-outside-init
        name_pattern = Pattern("macro", r"\\" + macro_definition.name)

        # This loop will run until the scanner raises an 'EndOfInput' or indicates another error.
        while True:

            # Parse the macro name.
            step = self.scanner.next([name_pattern])
            macro_start = step.match.start
            token_end = step.match.end

            # Parse each of the expected tokens in the parameter string.
            tokens = re.split(r"(#\d+)", macro_definition.parameter_string)
            if tokens and tokens[0] == "":
                del tokens[0]
            if tokens and tokens[-1] == "":
                del tokens[-1]
            for i, token in enumerate(tokens):
                if re.match(r"#\d+", token):
                    if (i == len(tokens) - 1) or (re.match(
                            r"#\d+", tokens[i + 1])):
                        token_end = self._scan_undelimited_parameter()
                    else:
                        token_end = self._scan_delimited_parameter(
                            tokens[i + 1], tex)
                else:
                    token_end = self._scan_delimiter(token)

            # The macro's text is the text of the name and all parameters.
            yield Macro(macro_start, token_end, tex[macro_start:token_end])
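
A hypothetical usage sketch of the parser above. The 'MacroExtractor.parse(tex, definition)' entry point is taken from Example #12 below; the third field of 'Macro' (called 'tex' here) is an assumption.

# Hypothetical usage: extract a '\label' macro and its full argument span.
definition = MacroDefinition("label", "#1")
extractor = MacroExtractor()
for macro in extractor.parse(r"E = mc^2 \label{eq:energy}", definition):
    # Assumed field name 'tex': the matched text, e.g. '\label{eq:energy}'.
    print(macro.start, macro.end, macro.tex)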
Example #3
def test_find_pattern():
    pattern = Pattern("letter", r"[a-z]")
    match = next(scan_tex("a", [pattern]))
    assert match.start == 0
    assert match.end == 1
    assert match.pattern.name == "letter"
    assert match.text == "a"
Example #4
    def parse(self, tex: str) -> Optional[BeginDocument]:
        pattern = Pattern("begin_document", r"\\begin{document}")
        scanner = scan_tex(tex, [pattern], include_unmatched=False)
        try:
            match = next(scanner)
            return BeginDocument(match.start, match.end)
        except StopIteration:
            return None
Example #5
    def _scan_delimiter(self, delimiter: str) -> int:
        pattern = Pattern("delimiter", re.escape(delimiter))
        step = self.scanner.next([pattern], include_unmatched=True)
        if step.skipped is not None and len(step.skipped) > 0:
            logging.warning(
                "Unexpectedly found unmatched text before macro argument delimiter."
            )
        return step.match.end
Example #6
    def __init__(self) -> None:

        # Patterns of text that should be replaced with other plaintext.
        self.REPLACE_PATTERNS = {
            Pattern("backslash_newline", r"\\\\"):
            "\n",
            Pattern("space_macro", r"\\[ ,]"):
            " ",
            Pattern("tilde", r"~"):
            " ",
            # See the 'parse' method for why this strange character is used for equations.
            Pattern("math", r"█+"):
            "[[math]]",
        }

        # Patterns of text the extractor should skip.
        self.SKIP_PATTERNS = [
            # Many patterns below were written with reference to the LaTeX tokenizer in Python's
            # 'doctools' sources at:
            # http://svn.python.org/projects/doctools/converter/converter/tokenizer.py
            Pattern("macro", r"\\[a-zA-Z]+\*?[ \t]*"),
            RIGHT_BRACE,
            LEFT_BRACE,
            Pattern("left_bracket", r"\["),
            Pattern("right_bracket", r"\]"),
            # The following macros are a backslash followed by an ASCII symbol. This pattern was
            # written with reference to the command list at:
            # http://www.public.asu.edu/~rjansen/latexdoc/ltx-2.html
            # Pattern("symbol_macro", r"\\[@=><+'`-]"),
        ]
Example #7
    def parse(self, tex: str) -> Iterator[LengthAssignment]:
        parameter_names_pattern = (
            r"(?:" + "|".join([r"\\" + p for p in ARRAY_PARAMETERS]) + ")")
        unit_pattern = r"(?:" + "|".join(LENGTH_UNITS) + ")"
        assignment_pattern = (parameter_names_pattern + r"\s*=\s*[0-9\.]+\s*" +
                              unit_pattern)
        pattern = Pattern("length_assignment", assignment_pattern)
        scanner = scan_tex(tex, [pattern])
        for match in scanner:
            yield LengthAssignment(match.start, match.end)
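
To make the assembled regex concrete, here is a hypothetical run, assuming ARRAY_PARAMETERS contains 'tabcolsep' and LENGTH_UNITS contains 'pt' (illustrative values only; 'parser' stands for an instance of the class defining this method):

# With ARRAY_PARAMETERS = ["tabcolsep"] and LENGTH_UNITS = ["pt"], the pattern
# becomes r"(?:\\tabcolsep)\s*=\s*[0-9\.]+\s*(?:pt)".
for assignment in parser.parse(r"\tabcolsep = 0.5 pt x + y"):
    print(assignment.start, assignment.end)  # spans '\tabcolsep = 0.5 pt'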
Example #8
    def parse(self, tex: str) -> Optional[Documentclass]:
        patterns = [
            Pattern("documentclass", r"\\documentclass"),
            Pattern("optional_arg", r"\[[^\]]*?\]"),
            Pattern("required_arg", r"{[^}]*?}"),
        ]

        match_stage = "start"
        start: int = -1
        required_arg = None

        scanner = scan_tex(tex, patterns, include_unmatched=True)
        for match in scanner:

            if match_stage == "start":
                if match.pattern.name != "documentclass":
                    continue
                start = match.start
                match_stage = "awaiting-required-arg"

            # Once we hit a token that's not the document class or argument, return the document
            # class if the required argument has been found; otherwise, abort.
            elif match.pattern.name == "UNKNOWN":
                if match_stage == "awaiting-optional-arg":
                    return Documentclass(start, match.start)
                if not match.text.isspace():
                    break

            elif match_stage == "awaiting-required-arg":
                if match.pattern.name == "required_arg":
                    match_stage = "awaiting-optional-arg"
                    required_arg = match

            elif match_stage == "awaiting-optional-arg":
                if match.pattern.name == "optional_arg":
                    end = match.end
                    return Documentclass(start, end)

        if required_arg is not None:
            return Documentclass(start, required_arg.end)
        return None
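
A hypothetical usage sketch ('DocumentclassParser' is an assumed name for the class containing this method):

parser = DocumentclassParser()
result = parser.parse("\\documentclass[11pt]{article}\n\\usepackage{amsmath}")
if result is not None:
    # The span runs from '\documentclass' through the required argument
    # '{article}'; the trailing newline triggers the 'UNKNOWN' branch that
    # ends the match.
    print(result.start, result.end)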
Example #9
    def _scan_delimited_parameter(self, delimiter: str, tex: str) -> int:
        scan_start = self.scanner.i

        # Scan for the delimiter with a lookahead so that the scanner doesn't consume the tokens
        # for the delimiter while searching for it.
        delimiter_pattern = Pattern("delimiter",
                                    "(?=" + re.escape(delimiter) + ")")

        while True:
            step = self.scanner.next([delimiter_pattern])
            text_before_delimiter = tex[scan_start:step.match.start]
            if has_balanced_braces(text_before_delimiter):
                return step.match.start
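
A worked trace of why the brace-balance check is needed, on a hypothetical input (using the MacroDefinition syntax from Example #12):

# For MacroDefinition("item", "#1;") applied to the TeX '\item{a;b};rest':
#   - The first ';' found sits inside the braces. The text scanned so far,
#     '{a;b', contains an unclosed brace, so the loop keeps scanning.
#   - The next ';' follows the '}'. The text before it, '{a;b}', has balanced
#     braces, so the parameter ends there and that offset is returned.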
Example #10
def test_get_unmatched():
    scanner = scan_tex("a.b", [Pattern("letter", r"[a-z]")],
                       include_unmatched=True)

    # Scan past the first token
    next(scanner)

    # The second token should be an unmatched pattern
    match2 = next(scanner)
    assert match2.pattern.name == "UNKNOWN"
    assert match2.start == 1
    assert match2.end == 2
    assert match2.text == "."
Example #11
    def parse(self, tex: str) -> Iterator[Bibitem]:
        bibitem_pattern = Pattern("bibitem",
                                  r"\\bibitem.*?(?=\\bibitem|\n\n|$|\\end{)")
        for bibitem in scan_tex(tex, [bibitem_pattern]):
            try:
                bibitem_soup = parse_soup(bibitem.text)
            except TexSoupParseError:
                continue
            key = self._extract_key(bibitem_soup)
            tokens = self._extract_text(bibitem_soup)
            if key is None:
                logging.warning("Detected bibitem with null key %s. Skipping.",
                                str(bibitem_soup))
                continue
            yield Bibitem(key, tokens)
Example #12
def _replace_unwanted_commands_with_spaces(tex: str) -> str:
    """
    KaTeX isn't programmed to support the entire vocabulary of LaTeX equation markup (though it
    does support a lot, see https://katex.org/docs/support_table.html).

    For commands that we don't need parsed (e.g., 'label'), this function strips
    them out so that they don't cause KaTeX to crash or behave unexpectedly.
    'label', for example, if not removed, will have its argument parsed as an
    equation, and will be identified as consisting of many symbols.
    """
    UNWANTED_MACROS = [
        MacroDefinition("ref", "#1"),
        MacroDefinition("label", "#1"),
        MacroDefinition("nonumber", ""),
    ]
    macro_extractor = MacroExtractor()
    for macro_definition in UNWANTED_MACROS:
        for macro in macro_extractor.parse(tex, macro_definition):
            tex = _replace_substring_with_space(tex, macro.start, macro.end)

    length_assignment_extractor = EquationLengthAssignmentExtractor()
    length_assignments = length_assignment_extractor.parse(tex)
    for assignment in length_assignments:
        tex = _replace_substring_with_space(tex, assignment.start,
                                            assignment.end)

    UNWANTED_PATTERNS = [
        Pattern("ampersand", "&"),
        Pattern("split_start", begin_environment_regex("split")),
        Pattern("split_end", end_environment_regex("split")),
    ]
    unwanted_matches = scan_tex(tex, UNWANTED_PATTERNS)
    for match in unwanted_matches:
        tex = _replace_substring_with_space(tex, match.start, match.end)

    return tex
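
A hypothetical before/after to show the effect. The exact output depends on '_replace_substring_with_space', which presumably overwrites a span with an equal number of spaces:

equation = r"x + y \label{eq:sum} \nonumber"
cleaned = _replace_unwanted_commands_with_spaces(equation)
# '\label{eq:sum}' and '\nonumber' are blanked out while 'x + y' and all
# character offsets are preserved, so len(cleaned) == len(equation).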
Example #13
    def _scan_undelimited_parameter(self) -> int:
        patterns = [LEFT_BRACE, Pattern("nonspace_character", r"\S")]
        step = self.scanner.next(patterns)

        # If a non-space character, match just the first character.
        if step.match.pattern.name == "nonspace_character":
            return step.match.end

        # If the first match is a left-brace, parse until the braces are balanced.
        brace_depth = 1
        brace_patterns = [LEFT_BRACE, RIGHT_BRACE]
        while True:
            step = self.scanner.next(brace_patterns)
            if step.match.pattern.name == "left_brace":
                brace_depth += 1
            elif step.match.pattern.name == "right_brace":
                brace_depth -= 1
            if brace_depth == 0:
                return step.match.end
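
A worked trace on a hypothetical input:

# Scanning the undelimited parameter at the start of '{a{b}c}x':
#   - The first match is a left brace, so brace_depth starts at 1.
#   - The '{' before 'b' raises the depth to 2; the '}' after 'b' lowers it to 1.
#   - The final '}' lowers the depth to 0, and the offset just past it (7 here)
#     is returned as the end of the parameter.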
Example #14
def make_math_environment_patterns() -> List[Pattern]:

    begin = begin_environment_regex
    end = end_environment_regex

    patterns: List[Pattern] = []
    for name, spec in MATH_ENVIRONMENT_SPECS.items():
        if isinstance(spec, DelimitedEnv):
            patterns.append(Pattern(name + "_delimiter", spec.delimiter))
        elif isinstance(spec, StartEndEnv):
            patterns.append(Pattern(name + "_start", spec.start))
            patterns.append(Pattern(name + "_end", spec.end))
        elif isinstance(spec, NamedEnv):
            patterns.append(
                Pattern(name + "_start", begin(spec.name, spec.arg_pattern)))
            patterns.append(Pattern(name + "_end", end(spec.name)))
            if spec.star:
                patterns.append(
                    Pattern(name + "s_start",
                            begin(spec.name + r"\*", spec.arg_pattern)))
                patterns.append(Pattern(name + "s_end",
                                        end(spec.name + r"\*")))
    return patterns
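
The contents of MATH_ENVIRONMENT_SPECS are not shown in this section. Here is a hypothetical fragment, just to make the three branches concrete ('StartEndEnv' is assumed to carry 'start' and 'end' regexes):

# Hypothetical spec table; the real keys and values live elsewhere.
MATH_ENVIRONMENT_SPECS = {
    "dollar": DelimitedEnv(r"\$"),                 # $ ... $
    "displaymath": StartEndEnv(r"\\\[", r"\\\]"),  # \[ ... \]
    "equation": NamedEnv("equation", star=True),   # \begin{equation}, plus '*' variant
}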
Example #15
def test_skip_comments():
    pattern = Pattern("letter", r"[a-z]")
    match = next(scan_tex("%a\na", [pattern]))
    assert match.start == 3
    assert match.end == 4
    assert match.text == "a"
Example #16
    """
    Interface for a class that extracts entities from TeX. Implement this interface when you
    intend to detect and colorize a new type of entity. This interface enforces the need to return
    'SerializableEntity's, which are embellished with unique identifiers that will be used in
    later stages of the pipeline, and the entity's TeX for debugging purposes.
    """
    @abstractmethod
    def parse(self, tex_path: str, tex: str) -> Iterator[SerializableEntity]:
        """
        Parse the 'tex', returning an iterator over the entities found. Entity extractors should
        not need to use the 'tex_path' for anything except for setting the 'tex_path' attribute
        on extracted entities.
        """


LEFT_BRACE = Pattern("left_brace", r"\{")
RIGHT_BRACE = Pattern("right_brace", r"\}")


@dataclass(frozen=True)
class NamedEnv:
    name: str
    star: bool
    arg_pattern: str = ""


@dataclass(frozen=True)
class DelimitedEnv:
    delimiter: str
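
A minimal sketch of an implementation of this interface, assuming the interface class is named 'EntityExtractor'; the fields on 'SerializableEntity' other than 'tex_path' are assumptions:

class CiteExtractor(EntityExtractor):
    """Hypothetical extractor that finds '\\cite{...}' commands."""

    def parse(self, tex_path: str, tex: str) -> Iterator[SerializableEntity]:
        pattern = Pattern("cite", r"\\cite\{([^}]+)\}")
        for i, match in enumerate(scan_tex(tex, [pattern])):
            yield SerializableEntity(
                tex_path=tex_path,  # set per the interface's docstring
                id_=str(i),         # assumed: a unique identifier field
                start=match.start,
                end=match.end,
                tex=match.text,     # assumed: the entity's TeX, for debugging
            )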

Example #17
def extract_plaintext(tex_path: str, tex: str) -> JournaledString:
    """
    Extracts plaintext from TeX. Some TeX will be replaced (e.g., "\\\\" with "\n",
    equations with "<<equation-{id}>>"). Other TeX will be skipped (e.g., macros, braces, and brackets).

    The returned string is a 'JournaledString', which contains helper functions that allows
    the client to map from character offsets in the plaintext string back to character offsets in
    the original 'tex' string provided as input to this function.

    It's definitely not perfect: the extracted text will include text from many
    command arguments, because sometimes that text is wanted and other times it
    isn't. Without more sophisticated macro processing, it's not possible to
    tell which arguments would be rendered as text and which wouldn't.

    For the use case of sentence boundary detection, spurious macro arguments are often
    okay to keep in the text as they only infrequently influence the detected boundaries. To
    support other natural language processing tasks, this extractor may need to be further refined.
    """
    # Patterns of text that should be replaced with other plaintext.
    REPLACE_PATTERNS = {
        # Separate sections and captions text from the rest of the text.
        Pattern("section", r"\s*\\(?:sub)*section\*?\{([^}]*)\}\s*"):
        "\n\n\\1.\n\n",
        Pattern("paragraph", r"\s*\\paragraph*?\{([^}]*)\}\s*"):
        "\n\n\\1.\n\n",
        Pattern("caption", r"(.)(?=\\caption\*?\{)"):
        "\\1\n\n",
        # Replace commands for which colorizing the contents will lead to compilation failures.
        CITATION_PATTERN:
        "Citation (\\1)",
        Pattern("label", r"\\label\{([^}]+)\}"):
        "(Label \\1)",
        Pattern("ref", r"\\(?:page|c)?ref\{([^}]+)\}"):
        "(Ref \\1)",
        Pattern("glossary_term", r"\\gls(?:pl)?\*?\{([^}]+)\}"):
        "Glossary term (\\1)",
        # Replace TeX source spaces with semantic spacing.
        Pattern("linebreak_keep", r"(\\\\|\\linebreak)|\n(\s)*\n\s*"):
        "\n",
        Pattern("linebreak_ignore", r"\n"):
        " ",
        Pattern("space_macro", r"\\[ ,]"):
        " ",
        Pattern("tilde", r"~"):
        " ",
        # Replace characters that need to be escaped in TeX with unescaped text.
        Pattern("ampersand", r"\\&"):
        "&",
    }

    # Patterns of text the extractor should skip.
    SKIP_PATTERNS = [
        # Include specific macros first, before the more general-purpose 'macro'.
        Pattern("input", r"\\(input|include)(\s+\S+|\{[^}]+\})"),
        # Many patterns below were written with reference to the LaTeX tokenizer in Python's
        # 'doctools' sources at:
        # http://svn.python.org/projects/doctools/converter/converter/tokenizer.py
        Pattern("environment_tags", r"\\(begin|end)\{[^}]*\}"),
        Pattern("macro", r"\\[a-zA-Z]+\*?[ \t]*"),
        RIGHT_BRACE,
        LEFT_BRACE,
        Pattern("left_bracket", r"\["),
        Pattern("right_bracket", r"\]"),
        # The following macros are a backslash followed by an ASCII symbol. This pattern was
        # written with reference to the command list at:
        # http://www.public.asu.edu/~rjansen/latexdoc/ltx-2.html
        # Pattern("symbol_macro", r"\\[@=><+'`-]"),
    ]

    # All math equations will be replaced in plaintext with the text "<<equation-{id}>>".
    # This ID should be the same as the one output by the equation pipeline.
    plaintext = JournaledString(tex)
    equation_extractor = EquationExtractor()
    equations = list(equation_extractor.parse(tex_path, tex))
    for equation in reversed(equations):
        plaintext = plaintext.edit(equation.start, equation.end,
                                   f"<<equation-{equation.id_}>>")

    patterns = list(REPLACE_PATTERNS.keys()) + SKIP_PATTERNS
    scanner = scan_tex(str(plaintext), patterns, include_unmatched=True)

    # If the scanner yields a span of text, the span is either:
    # 1. a pattern to skip
    # 2. a pattern to replace
    # 3. some other uncommented text
    # If some span of text is not returned by the scanner, then it is a comment,
    # or some other text that the scanner ignores. That text should be removed from the
    # plain text as if it was a pattern to skip.
    # Iterate over matches in reverse so as not to mess up character offsets for
    # earlier matches when replacing TeX in the string.
    keep_after = len(plaintext)
    for match in reversed(list(scanner)):
        if match.end < keep_after:
            plaintext = plaintext.edit(match.end, keep_after, "")
            keep_after = match.end
        if match.pattern in REPLACE_PATTERNS:
            plaintext = plaintext.edit(
                match.start,
                match.end,
                re.sub(match.pattern.regex, REPLACE_PATTERNS[match.pattern],
                       match.text),
            )
        if match.pattern not in SKIP_PATTERNS:
            keep_after = match.start

    if keep_after > 0:
        plaintext = plaintext.edit(0, keep_after, "")

    # Finally, remove adjacent periods (which interfere with the pysbd sentence
    # segmenter), which may only be adjacent because the TeX grouping has been removed.
    # Do a lookahead for the last period (don't include it in the match) in order
    # to change as little of the original TeX as possible, to make it easier to map
    # back from the original period position (which will often occur at the end of
    # an extracted sentence) to its precise position in the original TeX.
    for m in reversed(list(re.finditer(r"[\s\.]+(?=\.)", str(plaintext)))):
        plaintext = plaintext.edit(m.start(), m.end(), "")

    return plaintext
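
A hypothetical before/after (the exact citation text depends on CITATION_PATTERN, which is defined elsewhere):

result = extract_plaintext("main.tex", r"Some \textbf{bold} text~\cite{key}.")
# The '\textbf' macro and its braces are skipped, '~' becomes a space, and the
# citation is rewritten, yielding roughly: "Some bold text Citation (key)."
print(str(result))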
Example #18
def test_ignore_escaped_comment():
    pattern = Pattern("letter", r"[a-z]")
    match = next(scan_tex("\\%a", [pattern]))
    assert match.text == "a"
Example #19
def expand_tex(
    tex_dir: Path,
    tex_name: str,
    discover_by: FileDiscoveryStrategy = FileDiscoveryStrategy.EXACT,
    within: Optional[str] = None,
    is_input: bool = False,
) -> Optional[str]:
    """
    Expand the TeX in a file by inlining the contents of the files it inputs or includes.
    The TeX file to be read is 'tex_name', and it will be looked for in 'tex_dir'.

    Files can be searched for in the tex_dir according to special rules using the 'discover_by'
    parameter. The parameter can tell the method to resolve the TeX filename using the rules
    that are used by the '\\input' or '\\include' macros.

    The 'within' parameter makes sure this function doesn't read files it shouldn't. Input files
    are only expanded if their absolute resolved file path is inside the directory specified by
    'within'. If 'within' is not specified, then it will be set to 'tex_dir'.

    Based loosely on the code from the Perl latexpand utility in TeXLive, which is distributed under a
    BSD license: https://ctan.org/pkg/latexpand?lang=en

    Features not supported by this function are:
    * the \\includeonly command (which specifies which \\include'd files to process)
    * handling quotation marks around input or included file names. In some cases this
      function will behave the same way LaTeX does, and in some cases it won't; how quoted
      file names are handled seems to differ by LaTeX version
      (https://tex.stackexchange.com/a/515259/198728)
    * expanding files that don't use a 'utf-8'-compatible encoding. TeX files can mix
      multiple input encodings, even within the same file, but this function will not
      expand inputs that fail to open as UTF-8 files.
    """

    # Resolve path to TeX file, and make sure it's in a valid directory.
    within = os.path.abspath(os.path.realpath(within or tex_dir))
    qualified_tex_path = os.path.abspath(
        os.path.realpath(os.path.join(tex_dir, tex_name))
    )
    if os.path.commonpath([within, qualified_tex_path]) != within:
        logging.warning(  # pylint: disable=logging-not-lazy
            "TeX macro attempted to import file %s which is not in %s. This is forbidden. "
            + "This file will not be expanded.",
            qualified_tex_path,
            within,
        )
        return None

    # Add '.tex' extension to the file name if it is being imported using an '\include' macro.
    if discover_by == FileDiscoveryStrategy.INCLUDE:
        qualified_tex_path += ".tex"
    # Add the '.tex' extension to the file name as done by the '\input' macro. As mentioned
    # in the TeXBook, "TeX automatically supplies the suffix '.tex' if no suffix has been
    # specified."
    elif discover_by == FileDiscoveryStrategy.INPUT:
        if len(os.path.splitext(qualified_tex_path)[1]) == 0:
            qualified_tex_path += ".tex"

    if not os.path.exists(qualified_tex_path):
        logging.warning(  # pylint: disable=logging-not-lazy
            "Could not find file '%s' in directory '%s'. No text will be read from this file.",
            tex_name,
            tex_dir,
        )
        return None

    input_patterns = [
        # Put patterns with braces before those without braces so they have priority in matching.
        Pattern("input_braces", r"\\input\s*{([^}]+)}"),
        Pattern("input_quotes", r'\\input\s+"([^"]+)"'),
        Pattern("input", r"\\input\s+(\S+)"),
    ]
    # Note that '\include' macros seem to be pretty rare in research papers. In a sample of
    # about 120 conference papers, only 5 had '\include' macros (while many more had '\input'
    # macros), and only 1 used an '\include' macro to read in text; the rest used '\include'
    # to pull in macro definitions and 'usepackage' statements.
    # XXX(andrewhead): The 'include' patterns are currently disabled because the TeX that is
    # being inserted in their place is incorrect (i.e., it causes compilation errors).
    include_patterns: List[Pattern] = [
        # Pattern("include_braces", r"\\include\s*{([^}]+)}"),
        # Pattern("include", r"\\include\s+(\S+)"),
    ]
    endinput_pattern = Pattern("endinput", r"\\endinput( |\t|\b|\{.*?\})")
    patterns = input_patterns + include_patterns + [endinput_pattern]

    # Read TeX for a file.
    with open(qualified_tex_path, encoding="utf-8") as tex_file:
        try:
            tex = tex_file.read()
        except Exception as e:  # pylint: disable=broad-except
            logging.warning(  # pylint: disable=logging-not-lazy
                "Could not read file at %s due to error: %s. The TeX for this file will "
                + "not be expanded",
                qualified_tex_path,
                e,
            )
            return None

    replacements: List[Union[Expansion, EndInput]] = []
    endinputs = []
    end_file_at = None

    # Scan file for input macros, expanding them.
    for match in scan_tex(tex, patterns):

        # If a file is being read and the '\endinput' macro is reached, end output at the
        # end of the line that \endinput appears on. See the TeXBook for a description of
        # how the \endinput macro is handled.
        if match.pattern is endinput_pattern:
            endinput = EndInput(start=match.start, end=match.end)
            replacements.append(endinput)
            endinputs.append(endinput)

            # Find the newline after the \endinput, after which no more inputs should be expanded
            # and the file should be truncated.
            if end_file_at is None:
                end_of_line = re.compile("$", flags=re.MULTILINE)
                end_of_line_match = end_of_line.search(tex, pos=match.end)
                if end_of_line_match:
                    end_file_at = end_of_line_match.start()
                    continue

        # For input macros (e.g., '\input', '\include', ...)
        # Re-run the pattern against the matched text to extract the path to the file
        # that is meant to be included.
        match_with_groups = re.match(match.pattern.regex, match.text)
        if match_with_groups is None or len(match_with_groups.groups()) < 1:
            logging.warning(  # pylint: disable=logging-not-lazy
                "Unexpected error in extracting path for input / include command %s using "
                + "regular expression %s",
                match.text,
                match.pattern.regex,
            )
            continue
        input_path = match_with_groups.group(1)

        # Clean up the path
        # In TeX, paths are specified in Unix format. Convert to platform-specific path format
        # to let the program search for and read the file.
        input_path = input_path.strip().replace(posixpath.sep, os.path.sep)

        # Expand the input by reading in the expanded text in the input file.
        discovery_strategy = (
            FileDiscoveryStrategy.INCLUDE
            if match.pattern in include_patterns
            else FileDiscoveryStrategy.INPUT
        )
        input_tex = expand_tex(
            # All inputs from expanded files will be resolved relative to the main
            # directory of the project (i.e., the one where the TeX executable is invoked):
            # https://tex.stackexchange.com/a/39084/198728
            tex_dir,
            input_path,
            discover_by=discovery_strategy,
            is_input=True,
            # Specify the 'within' parameter to make sure that all expanded files reside
            # in the directory where the main TeX file was expanded.
            within=within,
        )
        if input_tex is None:
            logging.warning(  # pylint: disable=logging-not-lazy
                "Could not read input TeX file %s included from file %s in directory %s. "
                + "This input macro will not be expanded.",
                input_path,
                tex_name,
                tex_dir,
            )
            continue

        if match.pattern in include_patterns:
            input_tex = INCLUDE_EXPANSION.replace("<CONTENTS>", input_tex)
            input_tex = input_tex.replace("<FILENAME>", input_path)

        replacements.append(Expansion(start=match.start, end=match.end, tex=input_tex))

    # Truncate the TeX file after the end of a line where the first '\endinput' macro appears.
    expanded = tex
    if end_file_at is not None:
        expanded = expanded[:end_file_at]

    # Apply the expansions to the TeX.
    for replacement in reversed(replacements):
        if end_file_at is not None and replacement.start >= end_file_at:
            continue
        if isinstance(replacement, EndInput):
            expanded = expanded[: replacement.start] + expanded[replacement.end :]
            continue
        if isinstance(replacement, Expansion):
            expanded = (
                expanded[: replacement.start]
                + replacement.tex
                + expanded[replacement.end :]
            )

    return expanded
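
A hypothetical usage sketch:

# Expand 'main.tex', inlining the contents of any files it '\input's. Files
# that resolve outside 'paper_dir' are refused and left unexpanded.
expanded = expand_tex(Path("paper_dir"), "main.tex")
if expanded is not None:
    print(expanded)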