def test_many_replacements():
    # A sequence of edits, each applied to the result of the previous one.
    original = JournaledString("hello world")
    result = (
        original.edit(6, 11, "moon")
        .edit(6, 8, "ballo")
        .edit(1, 2, "i")
        .edit(5, 5, ",")
    )
    assert result == "hillo, balloon"
def test_substring():
    # A substring of an edited string keeps its own journal, so offsets in the
    # substring can still be mapped back to the initial string.
    base = JournaledString("starter string")
    snippet = base.edit(5, 7, "ing").substring(3, 10)
    assert snippet == "rting s"
    assert snippet.initial == "rter s"
    assert snippet.initial_offsets(0, 1) == (0, 1)
    assert snippet.initial_offsets(3, 4) == (2, 4)
def test_map_to_mutated_offsets():
    base = JournaledString("starter string")
    mutated = base.edit(8, 14, "changed")
    # Likewise (to the note in the above test), when mapping offsets from the
    # initial string to the mutated string, it's not clear where exactly offsets
    # from the old string map to in a replaced range. So conservatively map
    # offsets in the old range to entire segments in the mutated string.
    assert mutated.current_offsets(0, 1) == (0, 1)
    assert mutated.current_offsets(9, 9) == (8, 15)
def test_uncertainty_grows_with_overlapping_edits():
    base = JournaledString("starter string")
    first = base.edit(1, 7, "tarted")
    second = first.edit(7, 13, " stron")
    third = second.edit(5, 8, "ing_")
    # The third edit crosses the spans of the two edits before it. The string
    # can't tell where characters in the third edited span map---whether to the
    # initial span on the left side or the right side, so it conservatively
    # suggests outer bounds at the edges of all edits so far.
    assert third.initial_offsets(6, 7) == (1, 13)
def test_map_to_initial_offsets():
    base = JournaledString("starter string")
    mutated = base.edit(8, 15, "changed")
    assert mutated.initial_offsets(0, 1) == (0, 1)
    # Offsets within replacement text map to the offset of the start of the
    # replaced span, because it's not clear which characters in the replaced
    # substring (if any) correspond to the initial string.
    assert mutated.initial_offsets(8, 9) == (8, 14)
    assert mutated.initial_offsets(9, 9) == (8, 14)
    assert mutated.initial_offsets(15, 15) == (14, 14)
def delimit_equations(s: JournaledString, equations: List[Equation]) -> JournaledString:
    " Replace delimiters around TeX equations with standardized delimiters. "

    def wants_space_before(string: JournaledString, index: int) -> bool:
        # A space is needed when the delimiter would otherwise touch a
        # non-space character to its left.
        return index > 0 and not string[index - 1].isspace()

    def wants_space_after(string: JournaledString, index: int) -> bool:
        # If equation is used as a possessive (e.g., $x$'s), then don't add a
        # space after the equation, as it can interfere with pysbd's sentence
        # splitting. While it requires further investigation, it may be that
        # putting an apostrophe after a space makes pysbd think that the
        # apostrophe is an opening single quote mark.
        if index < len(string) - 1 and string[index:index + 2] == "'s":
            return False
        return index < len(string) and not string[index].isspace()

    # Queue up one replacement for the opening delimiter and one for the
    # closing delimiter of each equation.
    replacements: Dict[CharacterRange, str] = {}
    for equation in equations:
        for span, marker in (
            (
                CharacterRange(equation.start, equation.content_start),
                f"EQUATION_DEPTH_{equation.depth}_START",
            ),
            (
                CharacterRange(equation.content_end, equation.end),
                f"EQUATION_DEPTH_{equation.depth}_END",
            ),
        ):
            if wants_space_before(s, span.start):
                marker = " " + marker
            if wants_space_after(s, span.end):
                marker = marker + " "
            replacements[span] = marker

    # Apply replacements from the back of the string forward so earlier
    # character offsets stay valid as the string is edited.
    for span in sorted(replacements, key=lambda r: r.start, reverse=True):
        s = s.edit(span.start, span.end, replacements[span])
    return s
def test_serialize():
    journaled = JournaledString("starter string").edit(8, 14, "changed")
    # Serialization exposes the current value plus per-segment history.
    expected = {
        "value": "starter changed",
        "segments": [
            {"initial": "starter ", "current": "starter ", "changed": False},
            {"initial": "string", "current": "changed", "changed": True},
        ],
    }
    assert journaled.to_json() == expected
def test_delete():
    # Deleting a span is an edit whose replacement text is empty; the deleted
    # segment is retained in the journal with an empty 'current' value.
    journaled = JournaledString("starter string").edit(7, 8, "")
    assert journaled.to_json()["segments"] == [
        {"initial": "starter", "current": "starter", "changed": False},
        {"initial": " ", "current": "", "changed": True},
        {"initial": "string", "current": "string", "changed": False},
    ]
def test_substring_includes_initial_edges():
    # A non-greedy substring that ends at a deletion still reports the full
    # initial text up to and including the deleted character.
    journaled = JournaledString("starter string").edit(13, 14, "")
    snippet = journaled.substring(0, 13, greedy=False)
    assert snippet.initial == "starter string"
def test_substring_at_boundaries():
    # A substring whose bounds line up with an edited segment maps back to the
    # characters that segment replaced.
    journaled = JournaledString("starter string").edit(5, 7, "ing")
    snippet = journaled.substring(5, 8)
    assert snippet == "ing"
    assert snippet.initial == "er"
def test_replace():
    original = JournaledString("starter string")
    mutated = original.edit(8, 14, "changed")
    # Editing produces a new value; the source string is left untouched.
    assert original == "starter string"
    assert mutated == "starter changed"
def test_preserve_initial_string():
    # The journal always remembers the string it started from.
    mutated = JournaledString("starter string").edit(8, 15, "changed")
    assert mutated.initial == "starter string"
def test_replace_overlapping_span():
    # A second edit may overlap the span produced by an earlier edit.
    mutated = (
        JournaledString("starter string")
        .edit(8, 14, "changed")
        .edit(13, 15, "ing")
    )
    assert mutated == "starter changing"
def extract_plaintext(tex_path: str, tex: str) -> JournaledString:
    """
    Extracts plaintext from TeX. Some TeX will be replaced (e.g., "\\\\" with "\n",
    equations with "<<equation-{id}>>"). Other TeX will be skipped (e.g., macros,
    braces, and brackets). The returned string is a 'JournaledString', which
    contains helper functions that allows the client to map from character offsets
    in the plaintext string back to character offsets in the original 'tex' string
    provided as input to this function.

    It's definitely not perfect: this extracted text will include text extracted
    from many command arguments, because we knew sometimes it would be wanted, and
    other times it wouldn't. Without more sophisticated macro processing, it's not
    possible to tell which arguments would be rendered as text and which wouldn't.
    For the use case of sentence boundary detection, spurious macro arguments are
    often okay to keep in the text as they only infrequently influence the detected
    boundaries. To support other natural language processing tasks, this extractor
    may need to be further refined.
    """
    # Patterns of text that should be replaced with other plaintext.
    REPLACE_PATTERNS = {
        # Separate sections and captions text from the rest of the text.
        Pattern("section", r"\s*\\(?:sub)*section\*?\{([^}]*)\}\s*"): "\n\n\\1.\n\n",
        # BUG FIX: the pattern previously read r"\s*\\paragraph*?\{...". The
        # unescaped '*?' lazily repeated the literal 'h' rather than matching an
        # optional literal star, so the starred variant '\paragraph*{...}' was
        # never matched. Escape the star, mirroring the 'section' pattern above.
        Pattern("paragraph", r"\s*\\paragraph\*?\{([^}]*)\}\s*"): "\n\n\\1.\n\n",
        Pattern("caption", r"(.)(?=\\caption\*?\{)"): "\\1\n\n",
        # Replace commands for which colorizing the contents will lead to
        # compilation failures.
        CITATION_PATTERN: "Citation (\\1)",
        Pattern("label", r"\\label\{([^}]+)\}"): "(Label \\1)",
        Pattern("ref", r"\\(?:page|c)?ref\{([^}]+)\}"): "(Ref \\1)",
        Pattern("glossary_term", r"\\gls(?:pl)?\*?\{([^}]+)\}"): "Glossary term (\\1)",
        # Replace TeX source spaces with semantic spacing.
        Pattern("linebreak_keep", r"(\\\\|\\linebreak)|\n(\s)*\n\s*"): "\n",
        Pattern("linebreak_ignore", r"\n"): " ",
        Pattern("space_macro", r"\\[ ,]"): " ",
        Pattern("tilde", r"~"): " ",
        # Replace characters that need to be escaped in TeX with unescaped text.
        Pattern("ampersand", r"\\&"): "&",
    }

    # Patterns of text the extractor should skip.
    SKIP_PATTERNS = [
        # Include specific macros first, before the more general-purpose 'macro'.
        Pattern("input", r"\\(input|include)(\s+\S+|\{[^}]+\})"),
        # Many patterns below were written with reference to the LaTeX tokenizer in
        # Python's 'doctools' sources at:
        # http://svn.python.org/projects/doctools/converter/converter/tokenizer.py
        Pattern("environment_tags", r"\\(begin|end)\{[^}]*\}"),
        Pattern("macro", r"\\[a-zA-Z]+\*?[ \t]*"),
        RIGHT_BRACE,
        LEFT_BRACE,
        Pattern("left_bracket", r"\["),
        Pattern("right_bracket", r"\]"),
        # The following macros are a backslash followed by an ASCII symbol. This
        # pattern was written with reference to the command list at:
        # http://www.public.asu.edu/~rjansen/latexdoc/ltx-2.html
        # Pattern("symbol_macro", r"\\[@=><+'`-]"),
    ]

    # All math equations will be replaced in plaintext with the text
    # "<<equation-{id}>>". This ID should be the same as the one output by the
    # equation pipeline.
    plaintext = JournaledString(tex)
    equation_extractor = EquationExtractor()
    equations = list(equation_extractor.parse(tex_path, tex))
    for equation in reversed(equations):
        plaintext = plaintext.edit(
            equation.start, equation.end, f"<<equation-{equation.id_}>>"
        )

    patterns = list(REPLACE_PATTERNS.keys()) + SKIP_PATTERNS
    scanner = scan_tex(str(plaintext), patterns, include_unmatched=True)

    # If the scanner yields a span of text, the span is either:
    # 1. a pattern to skip
    # 2. a pattern to replace
    # 3. some other uncommented text
    # If some span of text is not returned by the scanner, then it is a comment,
    # or some other text that the scanner ignores. That text should be removed from
    # the plain text as if it was a pattern to skip.
    # Iterate over matches in reverse so as not to mess up character offsets for
    # earlier matches when replacing TeX in the string.
    keep_after = len(plaintext)
    for match in reversed(list(scanner)):
        # Remove the gap between this match and the next text to keep; gaps are
        # comments or other text the scanner ignored.
        if match.end < keep_after:
            plaintext = plaintext.edit(match.end, keep_after, "")
            keep_after = match.end
        if match.pattern in REPLACE_PATTERNS:
            plaintext = plaintext.edit(
                match.start,
                match.end,
                re.sub(
                    match.pattern.regex, REPLACE_PATTERNS[match.pattern], match.text
                ),
            )
        if match.pattern not in SKIP_PATTERNS:
            keep_after = match.start
    if keep_after > 0:
        plaintext = plaintext.edit(0, keep_after, "")

    # Finally, remove adjacent periods (which interfere with the pysbd sentence
    # segmenter), which may only be adjacent because the TeX grouping has been
    # removed. Do a lookahead for the last period (don't include it in the match)
    # in order to change as little of the original TeX as possible, to make it
    # easier to map back from the original period position (which will often occur
    # at the end of an extracted sentence) to its precise position in the original
    # TeX.
    for m in reversed(list(re.finditer(r"[\s\.]+(?=\.)", str(plaintext)))):
        plaintext = plaintext.edit(m.start(), m.end(), "")

    return plaintext