def test_many_replacements():
    """Several successive edits compose into the expected final string."""
    result = (
        JournaledString("hello world")
        .edit(6, 11, "moon")
        .edit(6, 8, "ballo")
        .edit(1, 2, "i")
        .edit(5, 5, ",")
    )
    assert result == "hillo, balloon"
def test_substring():
    """substring() yields a journaled slice with its own offset mappings."""
    edited = JournaledString("starter string").edit(5, 7, "ing")
    sub = edited.substring(3, 10)
    assert sub == "rting s"
    assert sub.initial == "rter s"
    assert sub.initial_offsets(0, 1) == (0, 1)
    assert sub.initial_offsets(3, 4) == (2, 4)
def test_map_to_mutated_offsets():
    """Initial-string offsets map conservatively into replaced spans."""
    edited = JournaledString("starter string").edit(8, 14, "changed")
    # Within a replaced range it is unclear where exactly an offset from the
    # initial string lands in the mutated text, so offsets inside the old
    # range conservatively map to the entire mutated segment.
    assert edited.current_offsets(0, 1) == (0, 1)
    assert edited.current_offsets(9, 9) == (8, 15)
def test_uncertainty_grows_with_overlapping_edits():
    """An edit crossing earlier edit boundaries widens the mapped-back span."""
    edited = (
        JournaledString("starter string")
        .edit(1, 7, "tarted")
        .edit(7, 13, " stron")
        .edit(5, 8, "ing_")
    )
    # The final edit straddles the spans of both earlier edits. The string
    # cannot tell whether characters in that span came from the left or the
    # right side of the original text, so it conservatively reports outer
    # bounds covering the edges of all edits so far.
    assert edited.initial_offsets(6, 7) == (1, 13)
def test_map_to_initial_offsets():
    """Offsets in edited text map back to spans of the initial string."""
    edited = JournaledString("starter string").edit(8, 15, "changed")
    assert edited.initial_offsets(0, 1) == (0, 1)
    # Offsets inside replacement text map to the whole replaced span, since
    # there is no way to tell which replacement characters (if any)
    # correspond to which characters of the initial string.
    assert edited.initial_offsets(8, 9) == (8, 14)
    assert edited.initial_offsets(9, 9) == (8, 14)
    assert edited.initial_offsets(15, 15) == (14, 14)
def test_serialize():
    """to_json() captures the current value plus per-segment change history."""
    journaled = JournaledString("starter string").edit(8, 14, "changed")
    expected = {
        "value": "starter changed",
        "segments": [
            {"initial": "starter ", "current": "starter ", "changed": False},
            {"initial": "string", "current": "changed", "changed": True},
        ],
    }
    assert journaled.to_json() == expected
def test_delete():
    """Deleting a span produces an empty 'current' segment flagged as changed."""
    journaled = JournaledString("starter string").edit(7, 8, "")
    assert journaled.to_json()["segments"] == [
        {"initial": "starter", "current": "starter", "changed": False},
        {"initial": " ", "current": "", "changed": True},
        {"initial": "string", "current": "string", "changed": False},
    ]
def delimit_equations(s: JournaledString, equations: List[Equation]) -> JournaledString:
    " Replace delimiters around TeX equations with standardized delimiters. "

    def _space_needed_before(string: JournaledString, index: int) -> bool:
        # Pad on the left unless the marker lands at the string start or
        # immediately after existing whitespace.
        return index > 0 and not string[index - 1].isspace()

    def _space_needed_after(string: JournaledString, index: int) -> bool:
        # If equation is used as a possessive (e.g., $x$'s), then don't add a space
        # after the equation, as it can interfere with pysbd's sentence splitting. While it
        # requires further investigation, it may be that putting an apostrophe after a space
        # makes pysbd think that the apostrophe is an opening single quote mark.
        if index < len(string) - 1 and string[index:index + 2] == "'s":
            return False
        return index < len(string) and not string[index].isspace()

    # Build the full set of delimiter replacements before editing, so that
    # whitespace checks run against the unmodified string.
    replacements: Dict[CharacterRange, str] = {}
    for equation in equations:
        for range_, marker in (
            (
                CharacterRange(equation.start, equation.content_start),
                f"EQUATION_DEPTH_{equation.depth}_START",
            ),
            (
                CharacterRange(equation.content_end, equation.end),
                f"EQUATION_DEPTH_{equation.depth}_END",
            ),
        ):
            text = marker
            if _space_needed_before(s, range_.start):
                text = " " + text
            if _space_needed_after(s, range_.end):
                text = text + " "
            replacements[range_] = text

    # Apply edits right-to-left so earlier character offsets stay valid.
    for range_ in sorted(replacements.keys(), key=lambda r: r.start, reverse=True):
        s = s.edit(range_.start, range_.end, replacements[range_])
    return s
def test_load():
    """from_json() restores both the current value and the edit history."""
    loaded = JournaledString.from_json({
        "value": "starter changed",
        "segments": [
            {"initial": "starter ", "current": "starter ", "changed": False},
            {"initial": "string", "current": "changed", "changed": True},
        ],
    })
    assert loaded.initial == "starter string"
    assert loaded == "starter changed"
    assert loaded.initial_offsets(0, 1) == (0, 1)
    assert loaded.initial_offsets(9, 9) == (8, 14)
def load_from_csv(
    csv_path: Path,
    D: Type[Dataclass],
    encoding: str = "utf-8",
) -> Iterator[Dataclass]:
    """
    Load data from CSV file at 'csv_path', returning an iterator over objects of type 'D'.
    This method assumes that the CSV file was written by 'append_to_csv'. Key to this
    assumption is that each row of the CSV file has all of the data needed to populate an
    object of type 'D'. The headers in the CSV file must exactly match the property names
    of 'D'. There can, however, be extra columns in the CSV file that don't correspond to
    the dataclass. Rows containing values that cannot be parsed into the expected field
    types are logged and skipped rather than raising an exception.
    """
    with open(csv_path, encoding=encoding, newline="") as csv_file:
        reader = csv.DictReader(csv_file, quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            data: Dict[str, Any] = {}
            # Transfer data from the row into a dictionary of arguments. By only including the
            # fields for D, we skip over columns that can't be used to initialize D. At the
            # same time, cast each column to the intended data type.
            invalid = False
            for field in dataclasses.fields(D):
                try:
                    type_ = field.type
                    is_optional = False
                    # If the field is optional, check for the special null value. If it's not
                    # present, determine which primitive type the value should be cast to. See
                    # note for List[str] for cautions about using dynamic type-checks like this
                    # for mypy types like Optional types.
                    if type_ in [
                        Optional[bool],
                        Optional[int],
                        Optional[float],
                        Optional[str],
                    ]:
                        is_optional = True
                        # Unwrap Optional[...] to its underlying primitive type so the
                        # casting rules below can be shared with non-optional fields.
                        type_ = (
                            bool
                            if type_ == Optional[bool]
                            else int
                            if type_ == Optional[int]
                            else float
                            if type_ == Optional[float]
                            else str
                            if type_ == Optional[str]
                            else Type[Any]
                        )
                    # "<!NULL!>" is the sentinel written for None values; presumably
                    # chosen by 'append_to_csv' — confirm against the writer.
                    if is_optional and row[field.name] == "<!NULL!>":
                        data[field.name] = None
                    # Journaled strings should be loaded from JSON.
                    elif type_ == JournaledString:
                        data[field.name] = JournaledString.from_json(
                            json.loads(row[field.name])
                        )
                    # Rules for reading Booleans. Support casting of '0' and '1' or the strings
                    # 'True' and 'False'. 'True' and 'False' are the default output of CSV writer.
                    elif type_ == bool:
                        data[field.name] = bool(ast.literal_eval(row[field.name]))
                    # Handle other primitive values.
                    elif type_ in [int, float, str]:
                        data[field.name] = type_(row[field.name])
                    # XXX(andrewhead): It's not guaranteed that type-checks like this one will work
                    # as the 'typing' library evolves. At the time of writing, it looked like calls
                    # to the '__eq__' method of classes that extend GenericMeta (like List, Tuple)
                    # should work (i.e., comparing a type with '=='). See:
                    # https://github.com/python/typing/blob/c85016137eab6d0784b76252460235638087f468/src/typing.py#L1093-L1098
                    # See also this test for equality in the Tuple class.
                    # https://github.com/python/typing/blob/c85016137eab6d0784b76252460235638087f468/src/test_typing.py#L400
                    # If at some point this comparison stops working, perhaps we can define a custom
                    # type for types of interest (like StrList) and compare the ID of the newly defined type.
                    elif field.type == List[str]:
                        data[field.name] = ast.literal_eval(row[field.name])
                    else:
                        # Unsupported field type: warn and leave the field out of 'data'.
                        logging.warning(  # pylint: disable=logging-not-lazy
                            "Could not decode data for field %s of type %s . "
                            + "This may mean that the rules for reading CSV files need to "
                            + "be extended to support this data type.",
                            field.name,
                            field.type,
                        )
                except (ValueError, json.JSONDecodeError) as e:
                    # A malformed value invalidates the whole row, not just this field.
                    logging.warning(  # pylint: disable=logging-not-lazy
                        "Could not read value '%s' for field '%s' of expected type %s from CSV. "
                        + "Error: %s. This row will be skipped. This value probably had an "
                        + "invalid type when the data for the row was created.",
                        row[field.name],
                        field.name,
                        field.type,
                        e,
                    )
                    invalid = True
            if not invalid:
                # Every field parsed cleanly; construct the dataclass instance.
                yield D(**data)  # type: ignore
def test_substring_includes_initial_edges():
    """A non-greedy substring keeps initial text deleted at its boundary."""
    edited = JournaledString("starter string").edit(13, 14, "")
    sub = edited.substring(0, 13, greedy=False)
    assert sub.initial == "starter string"
def test_substring_at_boundaries():
    """Substring bounds aligned with an edit capture the edited segment."""
    edited = JournaledString("starter string").edit(5, 7, "ing")
    sub = edited.substring(5, 8)
    assert sub == "ing"
    assert sub.initial == "er"
def test_replace():
    """edit() returns a new string and leaves the original untouched."""
    original = JournaledString("starter string")
    replaced = original.edit(8, 14, "changed")
    assert original == "starter string"
    assert replaced == "starter changed"
def test_preserve_initial_string():
    """The pre-edit text remains available through the 'initial' property."""
    edited = JournaledString("starter string").edit(8, 15, "changed")
    assert edited.initial == "starter string"
def test_replace_overlapping_span():
    """A second edit may overlap text produced by a prior edit."""
    edited = JournaledString("starter string").edit(8, 14, "changed")
    assert edited.edit(13, 15, "ing") == "starter changing"
def extract_plaintext(tex_path: str, tex: str) -> JournaledString:
    """
    Extracts plaintext from TeX. Some TeX will be replaced (e.g., "\\\\" with "\n",
    equations with "<<equation-{id}>>"). Other TeX will be skipped (e.g., macros,
    braces, and brackets). The returned string is a 'JournaledString', which contains
    helper functions that allows the client to map from character offsets in the
    plaintext string back to character offsets in the original 'tex' string provided
    as input to this function.
    It's definitely not perfect: this extracted text will include text extracted from
    many command arguments, because we knew sometimes it would be wanted, and other
    times it wouldn't. Without more sophisticated macro processing, it's not possible
    to tell which arguments would be rendered as text and which wouldn't.
    For the use case of sentence boundary detection, spurious macro arguments are
    often okay to keep in the text as they only infrequently influence the detected
    boundaries. To support other natural language processing tasks, this extractor
    may need to be further refined.
    """
    # Patterns of text that should be replaced with other plaintext. Values are
    # 're.sub' replacement templates ("\\1" refers to the pattern's first group).
    REPLACE_PATTERNS = {
        # Separate sections and captions text from the rest of the text.
        Pattern("section", r"\s*\\(?:sub)*section\*?\{([^}]*)\}\s*"): "\n\n\\1.\n\n",
        Pattern("paragraph", r"\s*\\paragraph*?\{([^}]*)\}\s*"): "\n\n\\1.\n\n",
        Pattern("caption", r"(.)(?=\\caption\*?\{)"): "\\1\n\n",
        # Replace commands for which colorizing the contents will lead to compilation failures.
        CITATION_PATTERN: "Citation (\\1)",
        Pattern("label", r"\\label\{([^}]+)\}"): "(Label \\1)",
        Pattern("ref", r"\\(?:page|c)?ref\{([^}]+)\}"): "(Ref \\1)",
        Pattern("glossary_term", r"\\gls(?:pl)?\*?\{([^}]+)\}"): "Glossary term (\\1)",
        # Replace TeX source spaces with semantic spacing.
        Pattern("linebreak_keep", r"(\\\\|\\linebreak)|\n(\s)*\n\s*"): "\n",
        Pattern("linebreak_ignore", r"\n"): " ",
        Pattern("space_macro", r"\\[ ,]"): " ",
        Pattern("tilde", r"~"): " ",
        # Replace characters that need to be escaped in TeX with unescaped text.
        Pattern("ampersand", r"\\&"): "&",
    }
    # Patterns of text the extractor should skip.
    SKIP_PATTERNS = [
        # Include specific macros first, before the more general-purpose 'macro'.
        Pattern("input", r"\\(input|include)(\s+\S+|\{[^}]+\})"),
        # Many patterns below were written with reference to the LaTeX tokenizer in Python's
        # 'doctools' sources at:
        # http://svn.python.org/projects/doctools/converter/converter/tokenizer.py
        Pattern("environment_tags", r"\\(begin|end)\{[^}]*\}"),
        Pattern("macro", r"\\[a-zA-Z]+\*?[ \t]*"),
        RIGHT_BRACE,
        LEFT_BRACE,
        Pattern("left_bracket", r"\["),
        Pattern("right_bracket", r"\]"),
        # The following macros are a backslash followed by an ASCII symbol. This pattern was
        # written with reference to the command list at:
        # http://www.public.asu.edu/~rjansen/latexdoc/ltx-2.html
        # Pattern("symbol_macro", r"\\[@=><+'`-]"),
    ]
    # All math equations will be replaced in plaintext with the text "<<equation-{id}>>".
    # This ID should be the same as the one output by the equation pipeline.
    plaintext = JournaledString(tex)
    equation_extractor = EquationExtractor()
    equations = list(equation_extractor.parse(tex_path, tex))
    for equation in reversed(equations):
        plaintext = plaintext.edit(
            equation.start, equation.end, f"<<equation-{equation.id_}>>"
        )
    patterns = list(REPLACE_PATTERNS.keys()) + SKIP_PATTERNS
    scanner = scan_tex(str(plaintext), patterns, include_unmatched=True)
    # If the scanner yields a span of text, the span is either:
    # 1. a pattern to skip
    # 2. a pattern to replace
    # 3. some other uncommented text
    # If some span of text is not returned by the scanner, then it is a comment,
    # or some other text that the scanner ignores. That text should be removed from the
    # plain text as if it was a pattern to skip.
    #
    # Iterate over matches in reverse so as not to mess up character offsets for
    # earlier matches when replacing TeX in the string. 'keep_after' marks the
    # start of the text already decided to be kept; anything between a match's
    # end and 'keep_after' was not returned by the scanner and is dropped.
    keep_after = len(plaintext)
    for match in reversed(list(scanner)):
        if match.end < keep_after:
            # Delete the unscanned gap (e.g., a comment) between this match
            # and the previously processed text.
            plaintext = plaintext.edit(match.end, keep_after, "")
            keep_after = match.end
        if match.pattern in REPLACE_PATTERNS:
            plaintext = plaintext.edit(
                match.start,
                match.end,
                re.sub(
                    match.pattern.regex, REPLACE_PATTERNS[match.pattern], match.text
                ),
            )
        if match.pattern not in SKIP_PATTERNS:
            # Replaced or unmatched-but-kept text survives; extend the kept
            # region back to this match's start. Skipped matches fall through
            # and will be deleted by the gap-removal step above.
            keep_after = match.start
    if keep_after > 0:
        # Nothing before 'keep_after' was kept; delete the leading span.
        plaintext = plaintext.edit(0, keep_after, "")
    # Finally, remove adjacent periods (which interfere with the pysbd sentence
    # segmenter), which may only be adjacent because the TeX grouping has been removed.
    # Do a lookahead for the last period (don't include it in the match) in order
    # to change as little of the original TeX as possible, to make it easier to map
    # back from the original period position (which will often occur at the end of
    # an extracted sentence) to its precise position in the original TeX.
    for m in reversed(list(re.finditer(r"[\s\.]+(?=\.)", str(plaintext)))):
        plaintext = plaintext.edit(m.start(), m.end(), "")
    return plaintext