def _replace_unwanted_commands_with_spaces(tex: str) -> str:
    """
    Blank out the parts of an equation's TeX that should not be handed to KaTeX.

    KaTeX supports a large share of LaTeX equation markup, but not all of it (see
    https://katex.org/docs/support_table.html). Commands that don't need to be parsed
    (e.g., 'label') are overwritten with spaces so that they don't cause KaTeX to crash
    or behave unexpectedly. If 'label' were left in, for instance, its argument would be
    parsed as an equation and reported as a collection of symbols.

    Three kinds of text are blanked, in this order:
    1. the macros 'ref', 'label' (each with its one argument), and 'nonumber';
    2. length assignments found by 'EquationLengthAssignmentExtractor';
    3. ampersands and the begin / end markers of the 'split' environment.

    Returns the TeX with each of those spans replaced via '_replace_substring_with_space'.
    """

    def blank_out(text: str, spans) -> str:
        # 'spans' is consumed lazily while 'text' is rebound. The offsets stay usable
        # presumably because the replacement keeps the string length unchanged
        # (assumption about '_replace_substring_with_space' -- confirm in its definition).
        for span in spans:
            text = _replace_substring_with_space(text, span.start, span.end)
        return text

    # Step 1: macros that KaTeX shouldn't see.
    unwanted_macros = [
        MacroDefinition("ref", "#1"),
        MacroDefinition("label", "#1"),
        MacroDefinition("nonumber", ""),
    ]
    macro_extractor = MacroExtractor()
    for definition in unwanted_macros:
        tex = blank_out(tex, macro_extractor.parse(tex, definition))

    # Step 2: length assignments.
    tex = blank_out(tex, EquationLengthAssignmentExtractor().parse(tex))

    # Step 3: alignment characters and 'split' environment markers.
    unwanted_patterns = [
        Pattern("ampersand", "&"),
        Pattern("split_start", begin_environment_regex("split")),
        Pattern("split_end", end_environment_regex("split")),
    ]
    return blank_out(tex, scan_tex(tex, unwanted_patterns))
def _parse(self, tex: str, macro_definition: MacroDefinition) -> Iterator[Macro]:
    """
    Yield a 'Macro' for each use of 'macro_definition' found while scanning 'tex'.

    The definition's parameter string (e.g., '[#1]#2') is broken into parameter tokens
    ('#1', '#2') and the literal delimiter text that surrounds them. For every occurrence
    of the macro name, each token is then consumed in order:
    * a parameter followed by delimiter text is scanned up to that delimiter;
    * a parameter that is last, or directly followed by another parameter, is scanned as
      an undelimited parameter (one non-space character or one brace group);
    * delimiter text is scanned literally.

    The yielded macro spans from the start of the macro name to the end of the last token
    consumed, and carries the corresponding slice of 'tex'.

    This generator never finishes on its own: the loop runs until the scanner raises
    (e.g., 'EndOfInput') or signals another error, which propagates to the caller.
    """
    self.scanner = TexScanner(tex)  # pylint: disable=attribute-defined-outside-init
    # NOTE(review): the name pattern has no trailing boundary, so it would presumably also
    # match longer control sequences sharing this prefix -- confirm against 'TexScanner'.
    name_pattern = Pattern("macro", r"\\" + macro_definition.name)

    # The parameter string is the same for every occurrence, so tokenize it once.
    # Splitting on a capturing group leaves empty strings at the edges AND between two
    # adjacent parameters ('#1#2' -> ['', '#1', '', '#2', '']). All of them must be
    # dropped; otherwise '#1' would look like it is delimited by an empty string instead
    # of being recognized as an undelimited parameter.
    tokens = [
        token
        for token in re.split(r"(#\d+)", macro_definition.parameter_string)
        if token != ""
    ]

    # This loop will run until the scanner raises an 'EndOfInput' or indicates another error.
    while True:
        # Parse the macro name.
        step = self.scanner.next([name_pattern])
        macro_start = step.match.start
        token_end = step.match.end

        # Parse each of the expected tokens in the parameter string.
        for i, token in enumerate(tokens):
            if not re.match(r"#\d+", token):
                token_end = self._scan_delimiter(token)
                continue

            is_last_token = i == len(tokens) - 1
            if is_last_token or re.match(r"#\d+", tokens[i + 1]):
                token_end = self._scan_undelimited_parameter()
            else:
                token_end = self._scan_delimited_parameter(tokens[i + 1], tex)

        # The macro's text is the text of the name and all parameters.
        yield Macro(macro_start, token_end, tex[macro_start:token_end])
def parse(self, tex: str) -> Optional[EndDocument]:
    """
    Locate the first '\\end{document}' in 'tex'.

    Returns an 'EndDocument' holding the start and end offsets of the first match, or
    'None' when the scan yields no match.
    """
    # The pattern is labeled for what it actually matches (it was previously labeled
    # 'begin_document' even though the regex targets the end of the document).
    pattern = Pattern("end_document", r"\\end{document}")
    first_match = next(scan_tex(tex, [pattern], include_unmatched=False), None)
    if first_match is None:
        return None
    return EndDocument(first_match.start, first_match.end)
def _scan_delimiter(self, delimiter: str) -> int:
    """
    Consume the literal text 'delimiter' from the scanner and return the offset where
    the match ends.

    The delimiter is regex-escaped so it is matched verbatim. If the scanner reports that
    it skipped over text before reaching the delimiter, a warning is logged, but the
    scan still succeeds.
    """
    literal = Pattern("delimiter", re.escape(delimiter))
    step = self.scanner.next([literal], include_unmatched=True)

    skipped_text = step.skipped
    if skipped_text:
        logging.warning(
            "Unexpectedly found unmatched text before macro argument delimiter."
        )

    return step.match.end
def make_math_environment_patterns() -> List[Pattern]:
    """
    Build the scanner patterns for every entry of 'MATH_ENVIRONMENT_SPECS'.

    Patterns are emitted in the iteration order of the specs, and per spec as follows:
    * 'DelimitedEnv': one '<name>_delimiter' pattern using the spec's delimiter regex;
    * 'StartEndEnv': '<name>_start' and '<name>_end' using the spec's own regexes;
    * 'NamedEnv': '<name>_start' / '<name>_end' built from the begin / end environment
      regexes, plus '<name>s_start' / '<name>s_end' for the starred variant when the
      spec has 'star' set.

    Specs of any other type contribute no patterns.
    """
    patterns: List[Pattern] = []

    for env_key, env in MATH_ENVIRONMENT_SPECS.items():
        if isinstance(env, DelimitedEnv):
            patterns.append(Pattern(env_key + "_delimiter", env.delimiter))
            continue

        if isinstance(env, StartEndEnv):
            patterns.extend(
                [
                    Pattern(env_key + "_start", env.start),
                    Pattern(env_key + "_end", env.end),
                ]
            )
            continue

        if not isinstance(env, NamedEnv):
            continue

        patterns.extend(
            [
                Pattern(
                    env_key + "_start",
                    begin_environment_regex(env.name, env.arg_pattern),
                ),
                Pattern(env_key + "_end", end_environment_regex(env.name)),
            ]
        )
        if env.star:
            # The starred form of the environment gets its own pair of patterns.
            starred_name = env.name + r"\*"
            patterns.extend(
                [
                    Pattern(
                        env_key + "s_start",
                        begin_environment_regex(starred_name, env.arg_pattern),
                    ),
                    Pattern(env_key + "s_end", end_environment_regex(starred_name)),
                ]
            )

    return patterns
def parse(self, tex: str) -> Optional[Documentclass]:
    """
    Find the first '\\documentclass' command in 'tex', together with its arguments.

    The scan is a small state machine:
    * 'start': everything is ignored until the '\\documentclass' token is seen;
    * 'awaiting-required-arg': waits for a '{...}' argument. Whitespace-only unmatched
      text is tolerated, any other unmatched text aborts the scan, and other tokens
      (such as a bracketed argument) are passed over;
    * 'awaiting-optional-arg': a '[...]' argument directly after the required one is
      included in the result; any unmatched text ends the command right there.

    Returns a 'Documentclass' with the start and end offsets of the command, or 'None'
    if no '\\documentclass' with a required argument was found.
    """
    patterns = [
        Pattern("documentclass", r"\\documentclass"),
        Pattern("optional_arg", r"\[[^\]]*?\]"),
        Pattern("required_arg", r"{[^}]*?}"),
    ]

    stage = "start"
    start: int = -1
    required_arg = None

    for match in scan_tex(tex, patterns, include_unmatched=True):
        token_kind = match.pattern.name

        # Nothing matters until the '\documentclass' token itself shows up.
        if stage == "start":
            if token_kind == "documentclass":
                start = match.start
                stage = "awaiting-required-arg"
            continue

        # Once we hit a token that's not the document class or argument, return the
        # document class if the required argument has been found; otherwise, abort
        # (unless the token is just whitespace).
        if token_kind == "UNKNOWN":
            if stage == "awaiting-optional-arg":
                return Documentclass(start, match.start)
            if not match.text.isspace():
                break
            continue

        if stage == "awaiting-required-arg":
            if token_kind == "required_arg":
                required_arg = match
                stage = "awaiting-optional-arg"
        elif stage == "awaiting-optional-arg":
            if token_kind == "optional_arg":
                return Documentclass(start, match.end)

    # The scan ended (or was aborted) without a trailing token to close the command.
    if required_arg is None:
        return None
    return Documentclass(start, required_arg.end)
def _scan_delimited_parameter(self, delimiter: str, tex: str) -> int:
    """
    Scan a macro parameter that is terminated by the literal text 'delimiter'.

    Starting at the scanner's current position, candidate delimiter positions are tried
    one after another until the text between the parameter start and the candidate has
    balanced braces. The offset where that delimiter begins (i.e., where the parameter
    ends) is returned.

    The loop has no exit of its own when no acceptable delimiter exists; it relies on
    the scanner raising in that case.
    """
    parameter_start = self.scanner.i

    # Search with a lookahead so that the scanner doesn't consume the delimiter's tokens
    # while looking for it.
    lookahead = Pattern("delimiter", "(?=" + re.escape(delimiter) + ")")

    while True:
        candidate_end = self.scanner.next([lookahead]).match.start
        if not has_balanced_braces(tex[parameter_start:candidate_end]):
            # The delimiter sits inside an unclosed brace group; keep looking.
            continue
        return candidate_end
def parse(self, tex: str) -> Iterator[LengthAssignment]:
    """
    Yield a 'LengthAssignment' for each length assignment found in 'tex'.

    An assignment is one of the control sequences named in 'ARRAY_PARAMETERS', followed
    by '=', a number made of digits and periods, and one of the units in 'LENGTH_UNITS',
    with optional whitespace between the parts. Each result carries the start and end
    offsets of the matched text.
    """
    parameter_alternatives = "|".join(r"\\" + name for name in ARRAY_PARAMETERS)
    unit_alternatives = "|".join(LENGTH_UNITS)
    assignment_regex = (
        rf"(?:{parameter_alternatives})\s*=\s*[0-9\.]+\s*(?:{unit_alternatives})"
    )

    for found in scan_tex(tex, [Pattern("length_assignment", assignment_regex)]):
        yield LengthAssignment(found.start, found.end)
def _scan_undelimited_parameter(self) -> int:
    """
    Scan a macro parameter that has no delimiter after it and return the offset where
    the parameter ends.

    The parameter is either the next single non-space character, or, if a left brace
    comes first, everything up to and including the right brace that balances it.
    """
    opening = self.scanner.next([LEFT_BRACE, Pattern("nonspace_character", r"\S")])

    # A bare character: the parameter is just that one character.
    if opening.match.pattern.name == "nonspace_character":
        return opening.match.end

    # A brace group: keep consuming braces until the opening one is closed.
    step = opening
    open_braces = 1
    while open_braces > 0:
        step = self.scanner.next([LEFT_BRACE, RIGHT_BRACE])
        brace_kind = step.match.pattern.name
        if brace_kind == "left_brace":
            open_braces += 1
        elif brace_kind == "right_brace":
            open_braces -= 1

    return step.match.end
# NOTE(review): the three annotated fields below (each followed by a descriptive string)
# appear to be the tail of a class whose header lies above this chunk -- confirm their
# owner and indentation against the full file.
content_tex: str
" TeX for the equation contents, inside the environment (e.g., 'x + y'). "
# NOTE(review): 'santized' in the string below looks like a typo for 'sanitized'.
katex_compatible_tex: str
" A santized version of the equation content meant for KaTeX parsing. "
depth: int
""" Depth within a tree of equations. Most equations will not be nested in others, so will have a depth of 0 if not nested in another equation. As an example, if this equation is nested in another equation, which is nested in another equation, it will have a depth of 2. """

# Shared scanner patterns for a literal opening and closing curly brace. Their names
# ('left_brace' / 'right_brace') are what brace-counting code compares against.
LEFT_BRACE = Pattern("left_brace", r"\{")
RIGHT_BRACE = Pattern("right_brace", r"\}")


# Spec for a math environment identified by its environment name.
@dataclass(frozen=True)
class NamedEnv:
    # Environment name; passed to the begin / end environment regex builders.
    name: str
    # When true, patterns for the starred variant (name followed by '*') are generated too.
    star: bool
    # Regex passed along with the name when building the begin-environment pattern;
    # empty by default.
    arg_pattern: str = ""


# Spec for a math environment marked by a delimiter.
@dataclass(frozen=True)
class DelimitedEnv:
    # Used directly as the regex of the environment's '<name>_delimiter' pattern.
    delimiter: str