def adjust_color_positions(entity: SerializableEntity) -> CharacterRange:
    """
    Color commands sometimes introduce unwanted space when added right before
    or after an equation. One solution is to put color commands right inside
    the equation.
    """
    term = cast(Term, entity)
    equation_extractor = EquationExtractor()
    equations = list(equation_extractor.parse(entity.tex_path, term.tex))
    if len(equations) == 0:
        return CharacterRange(term.start, term.end)

    # If the term starts with an equation, move the coloring command inside the equation.
    adjusted_start = term.start
    first_equation = min(equations, key=lambda e: e.start)
    first_nonspace = re.search(r"\S", term.tex)
    if first_nonspace is not None:
        if first_nonspace.start(0) == first_equation.start:
            adjusted_start = term.start + first_equation.content_start

    # If the term ends with an equation, move the coloring command inside the equation.
    adjusted_end = term.end
    last_equation = max(equations, key=lambda e: e.end)
    last_nonspace = re.search(r"\S(?=\s*$)", term.tex)
    if last_nonspace is not None:
        if last_nonspace.end(0) == last_equation.end:
            adjusted_end = term.start + last_equation.content_end

    return CharacterRange(adjusted_start, adjusted_end)

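# Illustrative sketch (not from the original source): a quick check of the two
# boundary regexes used above. r"\S" finds the first non-space character and
# r"\S(?=\s*$)" finds the last one, which is how the function decides whether
# the term starts or ends with an equation. The sample TeX is an assumption.
def _demo_boundary_regexes() -> None:
    import re

    example_tex = "  $x$  "  # hypothetical term TeX with surrounding whitespace
    first = re.search(r"\S", example_tex)
    last = re.search(r"\S(?=\s*$)", example_tex)
    assert first is not None and first.start(0) == 2  # index of the opening "$"
    assert last is not None and last.end(0) == 5  # one past the closing "$"
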
def delimit_equations(s: JournaledString, equations: List[Equation]) -> JournaledString:
    """
    Replace delimiters around TeX equations with standardized delimiters.
    """
    replacements: Dict[CharacterRange, str] = {}

    def needs_space_before(s: JournaledString, character_index: int) -> bool:
        return character_index > 0 and not s[character_index - 1].isspace()

    def needs_space_after(s: JournaledString, character_index: int) -> bool:
        # If the equation is used as a possessive (e.g., $x$'s), don't add a space
        # after the equation, as it can interfere with pysbd's sentence splitting.
        # While it requires further investigation, it may be that putting an
        # apostrophe after a space makes pysbd think that the apostrophe is an
        # opening single quote mark.
        if (character_index < len(s) - 1) and (
            s[character_index : character_index + 2] == "'s"
        ):
            return False
        if character_index < len(s) and not s[character_index].isspace():
            return True
        return False

    for equation in equations:
        start_replacement_range = CharacterRange(equation.start, equation.content_start)
        start_replacement = f"EQUATION_DEPTH_{equation.depth}_START"
        if needs_space_before(s, start_replacement_range.start):
            start_replacement = " " + start_replacement
        if needs_space_after(s, start_replacement_range.end):
            start_replacement = start_replacement + " "
        replacements[start_replacement_range] = start_replacement

        end_replacement_range = CharacterRange(equation.content_end, equation.end)
        end_replacement = f"EQUATION_DEPTH_{equation.depth}_END"
        if needs_space_before(s, end_replacement_range.start):
            end_replacement = " " + end_replacement
        if needs_space_after(s, end_replacement_range.end):
            end_replacement = end_replacement + " "
        replacements[end_replacement_range] = end_replacement

    # Apply replacements from last to first so that earlier replacements do not
    # shift the offsets of replacements that have not yet been applied.
    for replacement_range in sorted(
        replacements.keys(), key=lambda r: r.start, reverse=True
    ):
        replacement_text = replacements[replacement_range]
        s = s.edit(replacement_range.start, replacement_range.end, replacement_text)

    return s

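# Illustrative sketch (not from the original source) of the replacement
# strategy above: applying edits from last to first keeps the offsets of
# not-yet-applied replacements valid. Plain tuples stand in for
# CharacterRange, and str slicing stands in for JournaledString.edit.
def _demo_reverse_order_replacement() -> None:
    s = "a $x$ b"
    replacements = {(2, 3): "EQUATION_DEPTH_0_START ", (4, 5): " EQUATION_DEPTH_0_END"}
    for (start, end) in sorted(replacements, key=lambda r: r[0], reverse=True):
        s = s[:start] + replacements[(start, end)] + s[end:]
    assert s == "a EQUATION_DEPTH_0_START x EQUATION_DEPTH_0_END b"
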
def get_color_positions(entity: SerializableEntity) -> CharacterRange:
    """
    Override this when you want to set custom positions for inserting color
    commands. One example is for equations, where color commands should be
    inserted inside the bounds of the equation, rather than outside of it.
    """
    return CharacterRange(start=entity.start, end=entity.end)

def adjust_color_positions(entity: SerializableEntity) -> CharacterRange:
    """
    Sometimes, if you try to insert coloring commands at the boundary of where
    a symbol appears in TeX, it can cause errors. For example, it can be
    error-prone to put color commands...

    1. Right outside of braces from the original TeX (e.g., "{x}")
    2. Right after subscripts or superscripts (_, ^, \\sb, \\sp)
    3. Between a dot or a hat and the symbol it modifies (e.g., "\\hat x")

    By putting color commands inside of braces, problems #2 and #3 can be
    avoided. For #1, and for a few other cases, this function adjusts the
    positions where coloring commands will be placed to avoid tricky TeX
    compilation gotchas.
    """
    token = cast(SerializableToken, entity)
    equation_tex = token.equation
    token_string = equation_tex[token.relative_start : token.relative_end]

    token_start = token.start
    token_end = token.end

    # Adjust color commands to be on the inside of a group denoted by curly braces.
    if token_string.startswith("{") and token_string.endswith("}"):
        return CharacterRange(token.start + 1, token.end - 1)

    # If the token contains an ampersand, this is probably a mistake: the ampersand
    # was likely only included because it was replaced with a space before the KaTeX
    # parse, and that space was included in this token in the parse. Remove the
    # ampersand from the token.
    match = re.search(r"\s*&\s*$", token_string)
    if match is not None:
        token_end = token_start + match.start()

    # Coloring commands should never go outside the bounds of the equation.
    equation_start = token.start - token.relative_start
    equation_end = equation_start + len(equation_tex)
    start = max(token_start, equation_start)
    end = min(token_end, equation_end)

    return CharacterRange(start, end)

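# Illustrative sketch (not from the original source): the trailing-ampersand
# regex above trims an alignment "&" (plus surrounding whitespace) from the
# end of a token string. The sample token is an assumption.
def _demo_trailing_ampersand_trim() -> None:
    import re

    token_string = "x  & "  # hypothetical token that absorbed an alignment "&"
    match = re.search(r"\s*&\s*$", token_string)
    assert match is not None and match.start() == 1  # the token text is just "x"
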
def get_token_character_ranges(text: str, tokens: List[str]) -> List[CharacterRange]:
    """
    Extract start and end character positions for each token in a list of
    featurized tokens.
    """
    ranges = []
    current_position = 0
    for token in tokens:
        start_index = text[current_position:].index(token)
        ranges.append(
            CharacterRange(
                current_position + start_index,
                current_position + start_index + len(token) - 1,
            )
        )
        current_position += len(token) + start_index
    return ranges

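# Illustrative sketch (not from the original source) of the offset arithmetic
# above, using plain tuples in place of CharacterRange. Note that the end
# offsets produced by get_token_character_ranges are inclusive.
def _demo_token_ranges() -> None:
    text = "red cat"
    tokens = ["red", "cat"]
    ranges = []
    current_position = 0
    for token in tokens:
        start_index = text[current_position:].index(token)
        start = current_position + start_index
        ranges.append((start, start + len(token) - 1))
        current_position += len(token) + start_index
    assert ranges == [(0, 2), (4, 6)]
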
def parse(self, tex_path: str, tex: str) -> Iterator[Sentence]:
    check_for_pysbd_reserved_characters(tex)

    # Extract plaintext from TeX.
    plaintext = extract_plaintext(tex_path, tex)

    # Segment the plaintext. Return offsets for each sentence relative to the TeX input.
    segmenter = pysbd.Segmenter(language="en", clean=False, char_span=True)

    # As each sentence is scanned, keep track of what sections and environments the
    # sentence appears within.
    section_name = None
    in_figure = False
    in_table = False
    in_itemize = False

    # The pysbd module has several open bugs and issues, which are addressed below.
    # As of 3/23/20, we know the module will fail in the following ways:
    # 1. pysbd will not break up a sentence when it starts with a punctuation mark or space.
    #    ex: ". hello. world. hi."
    #    sol: check for sentences being longer than 1000 characters. Also, see the
    #    plaintext extraction function, which attempts to clean up the text so that
    #    consecutive periods are removed before segmentation.
    # 2. pysbd uses reserved characters for splitting sentences.
    #    ex: see the PYSBD_RESERVED_CHARACTERS list.
    #    sol: throw a warning if the sentence contains any of these characters.
    sentence_ranges: List[CharacterRange] = []
    sentence_start: Optional[int] = None

    for span in segmenter.segment(str(plaintext)):
        if sentence_start is None:
            # Strip leading whitespace from the sentence.
            sentence_start = span.start + regex.search(r"^(\s*)", span.sent).end()

        # Don't detect a sentence boundary in the middle of an equation.
        is_boundary_in_equation = regex.search(
            r"EQUATION_DEPTH_0_START(?!.*EQUATION_DEPTH_0_END)",
            str(plaintext[sentence_start : span.end]),
            flags=regex.DOTALL,
        )
        if not is_boundary_in_equation:
            # Strip trailing whitespace from the sentence.
            end = span.start + regex.search(r"(\s*)$", span.sent).start()
            sentence_ranges.append(CharacterRange(sentence_start, end))
            sentence_start = None

    for i, sentence_range in enumerate(sentence_ranges):
        tex_start, tex_end = plaintext.initial_offsets(
            sentence_range.start, sentence_range.end
        )
        if tex_start is None or tex_end is None:
            logging.warning(  # pylint: disable=logging-not-lazy
                "The span bounds (%d, %d) from pysbd for a sentence could not be mapped "
                + "back to character offsets in the LaTeX for an unknown reason.",
                sentence_range.start,
                sentence_range.end,
            )
            continue
        sentence_tex = tex[tex_start:tex_end]

        # Save the sentence as a journaled string, which will allow the mapping of the
        # cleaned sentence text back to the original TeX.
        sentence = plaintext.substring(
            sentence_range.start,
            sentence_range.end,
            # These truncation options are important for preserving the mapping from
            # offsets in the edited sentence to the initial offsets before the edits.
            include_truncated_left=False,
            include_truncated_right=False,
        )
        if len(sentence) > 1000:
            logging.warning(  # pylint: disable=logging-not-lazy
                "Exceptionally long sentence (length %d). This might indicate that the "
                + "sentence extractor failed to properly split text into sentences.",
                len(sentence),
            )

        # Extract the TeX around the sentence to understand the environment in which
        # it appears.
        context_tex = get_context(tex, tex_start, tex_end)

        # Detect features describing the context the sentence appears in (i.e., the
        # section it's in, whether it's in a figure, etc.) using regular expressions.
        section = regex.findall(
            r"\\(?:sub)*section[*]*\{[A-Za-z0-9 \{\}\\_.,:-]*\}", context_tex
        )
        abstract_begin = regex.findall(r"\\begin\{abstract\}", context_tex)
        abstract_end = regex.findall(r"\\end\{abstract\}", context_tex)
        table_begin = regex.findall(r"\\begin\{tabular\}", context_tex)
        table_end = regex.findall(r"\\end\{tabular\}", context_tex)
        figure_begin = regex.findall(r"\\begin\{figure[*]*\}", context_tex)
        figure_end = regex.findall(r"\\end\{figure[*]*\}", context_tex)
        itemize_begin = regex.findall(r"\\begin\{itemize[*]*\}", context_tex)
        itemize_end = regex.findall(r"\\end\{itemize[*]*\}", context_tex)
        cite = regex.findall(
            r"\\cite[A-Za-z0-9 \\_\[\].,:-]*\{[A-Za-z0-9 \\_.,:-]*\}", context_tex
        )
        url = regex.findall(
            r"\\url\{[A-Za-z0-9 \{\}/\\_.,:-]*\}", context_tex, overlapped=False
        )
        label = regex.findall(r"\\label\{[A-Za-z0-9 \\_.,:-]*\}", context_tex)
        ref = regex.findall(r"\\ref\{[A-Za-z0-9 \\_.,:-]*\}", context_tex)
        tex_macros = set(
            regex.findall(
                r"\\[A-Za-z0-9\\\[\]_.,:-]*[\{[A-Za-z0-9 \\_.,:-]*\}]*", context_tex
            )
        )

        # Save a list of other TeX macros that aren't captured by any of the other
        # categories: { any } - { section, label, ... }.
        other_tex_macros: List[str] = []
        named_macros = {
            m
            for l in [
                abstract_begin,
                abstract_end,
                table_begin,
                table_end,
                figure_begin,
                figure_end,
                itemize_begin,
                itemize_end,
                cite,
            ]
            for m in l
        }
        other_tex_macros = list(tex_macros - named_macros)

        # Save the section name.
        if abstract_begin:
            section_name = "ABSTRACT"
        if abstract_end:
            section_name = None
        if section:
            section_name = extract_text_from_tex_group(section[0])

        # Save information about whether the sentence is in a figure, table, or other
        # environment.
        # TODO(dykang): consider using \label{} in tables/figures to improve matching.
        if figure_begin:
            in_figure = True
        if figure_end:
            in_figure = False
        if table_begin:
            in_table = True
        if table_end:
            in_table = False
        if itemize_begin:
            in_itemize = True
        if itemize_end:
            in_itemize = False

        # Use heuristics about the surrounding text to determine whether or not this
        # sentence is valid. These heuristics have a number of limitations, and should
        # be replaced with more mature rules for detecting whether the sentence is
        # indeed in a named section, the abstract, a figure, a table, etc. See
        # documentation of their limitations here:
        # https://github.com/allenai/scholar-reader/issues/138#issue-678432430
        validity_guess = all(
            [
                # The sentence should appear in a named section.
                (not self.from_named_sections_only) or section_name,
                # The sentence should not appear in a figure or table.
                # TODO(dykang, andrewhead): eventually, this should be rewritten to
                # permit the extraction of sentences from captions.
                not in_figure,
                not in_table,
                # If the sentence contained regular expression patterns for the start
                # or end of an environment, it's probably not a sentence, but rather
                # just TeX macros.
                not abstract_begin,
                not abstract_end,
                not section,
                not table_end,
                not figure_end,
                not itemize_begin,
                not itemize_end,
            ]
        )

        tokens = regex.split(r"[\s,;.!?()]+", str(sentence))
        contains_common_english_word = any(
            [len(t) > 1 and t.lower() in self.english_words for t in tokens]
        )
        ends_with_stop = bool(regex.search(r"[,.:;!?]\s*$", str(sentence)))
        is_clean = contains_common_english_word and ends_with_stop

        # Sanitize the text, replacing macros and unwanted TeX with text that will be
        # easier for the text processing algorithms to process.
        sanitized = sentence
        replace_patterns: List[Tuple[str, str]] = []

        # Replace citations with "CITATION".
        for citation in cite:
            citation_text = extract_text_from_tex_group(citation)
            for key in citation_text.split(","):
                replace_patterns.append((key, "CITATION"))

        # Replace URLs with "URL".
        for url_item in url:
            url_text = extract_text_from_tex_group(url_item)
            replace_patterns.append((url_text, "URL"))

        # Replace references to text elements like figures and tables with a single
        # known word for each type of element. This currently depends on idiomatic
        # patterns for naming elements, like \ref{{fig,tab,sec,eq}:XXX}, to distinguish
        # between element types. Also, the code keeps the token ahead of the reference
        # (e.g., the word "Table" in "Table\ref{...}"), although it might duplicate the
        # information in the replaced label.
        for reference in ref:
            reference_text = extract_text_from_tex_group(reference)
            for r in reference_text.split(","):
                if r.lower().startswith("tab"):
                    replace_patterns.append((r, "TABLE"))
                if r.lower().startswith("fig"):
                    replace_patterns.append((r, "FIGURE"))
                if r.lower().startswith("sec"):
                    replace_patterns.append((r, "SECTION"))
                if r.lower().startswith("eq"):
                    replace_patterns.append((r, "EQUATION"))

        # Substitute patterns with replacements.
        for pattern, replacement in replace_patterns:
            if pattern == "":
                continue
            match_start = 0
            while True:
                match_offset = sanitized.find(pattern, match_start)
                if match_offset == -1:
                    break
                sanitized = sanitized.edit(
                    match_offset, match_offset + len(pattern), replacement
                )
                match_start = match_offset + len(pattern)

        yield Sentence(
            id_=str(i),
            tex_path=tex_path,
            start=tex_start,
            end=tex_end,
            text=str(sentence),
            text_journal=sentence,
            sanitized=str(sanitized),
            sanitized_journal=sanitized,
            tex=sentence_tex,
            context_tex=context_tex,
            validity_guess=validity_guess,
            is_clean=is_clean,
            section_name=section_name,
            in_figure=in_figure,
            in_table=in_table,
            in_itemize=in_itemize,
            label=label,
            ref=ref,
            cite=cite,
            url=url,
            others=other_tex_macros,
        )

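# Illustrative sketch (not from the original source): a minimal smoke test of
# the pysbd configuration used by parse above. With char_span=True, pysbd
# returns spans with .sent, .start, and .end attributes, which is what the
# sentence-range bookkeeping above relies on. The sample text is an assumption.
def _demo_pysbd_char_spans() -> None:
    import pysbd

    segmenter = pysbd.Segmenter(language="en", clean=False, char_span=True)
    for span in segmenter.segment("First sentence. Second sentence."):
        print(span.start, span.end, repr(span.sent))
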
def process(self, item: Task) -> Iterator[EmbellishedSentence]:
    sentence = item.sentence
    equations = item.equations
    symbols = item.symbols

    pattern = r"<<equation-(\d+)>>"
    regex = re.compile(pattern)

    equation_spans: Dict[int, CharacterRange] = {}
    equation_indexes_reversed: List[int] = []
    start = 0
    while True:
        match = regex.search(sentence.sanitized, start)
        if match is None:
            break
        start = match.end()
        equation_index = int(match.group(1))
        equation_indexes_reversed.insert(0, equation_index)
        equation_spans[equation_index] = CharacterRange(
            start=match.start(), end=match.end()
        )

    # Replace equations with more helpful representations. Replace equations in
    # reverse so that earlier replacements don't affect the character offsets for
    # the later replacements.
    with_symbol_and_formula_tags = sentence.sanitized_journal
    with_equation_tex = sentence.sanitized_journal
    with_symbol_tex = sentence.sanitized_journal
    with_bag_of_symbols = sentence.sanitized_journal
    legacy_definition_input = sentence.sanitized_journal
    for ei in equation_indexes_reversed:
        equation = equations[(sentence.tex_path, ei)]
        equation_symbols = symbols[(equation.tex_path, ei)]
        span = equation_spans[ei]

        # Replace the equation with its TeX.
        with_equation_tex = with_equation_tex.edit(
            span.start,
            span.end,
            f"[[FORMULA:{equation.content_tex}]]",
        )

        # Replace equations with tags indicating whether each equation is a symbol
        # or a formula, and additionally with values for the symbols.
        is_symbol = count_top_level_symbols(equation_symbols) == 1
        if is_symbol:
            with_symbol_and_formula_tags = with_symbol_and_formula_tags.edit(
                span.start, span.end, "[[SYMBOL]]"
            )
            with_symbol_tex = with_symbol_tex.edit(
                span.start,
                span.end,
                f"[[SYMBOL({equation.tex.strip()})]]",
            )
        else:
            with_symbol_and_formula_tags = with_symbol_and_formula_tags.edit(
                span.start, span.end, "[[FORMULA]]"
            )
            with_symbol_tex = with_symbol_tex.edit(span.start, span.end, "[[FORMULA]]")

        # Replace each equation with a bag of the symbols that it contains.
        bag_of_symbols = {s.tex.strip() for s in equation_symbols}
        with_bag_of_symbols = with_bag_of_symbols.edit(
            span.start,
            span.end,
            f"[[FORMULA:{bag_of_symbols}]]",
        )

        # Replace each equation with 'SYMBOL'.
        legacy_definition_input = legacy_definition_input.edit(
            span.start, span.end, "SYMBOL"
        )

    yield EmbellishedSentence(
        id_=sentence.id_,
        tex_path=sentence.tex_path,
        start=sentence.start,
        end=sentence.end,
        tex=sentence.tex,
        context_tex=sentence.context_tex,
        text=sentence.text,
        text_journal=sentence.text_journal,
        sanitized=sentence.sanitized,
        sanitized_journal=sentence.sanitized_journal,
        validity_guess=sentence.validity_guess,
        section_name=sentence.section_name,
        in_figure=sentence.in_figure,
        in_table=sentence.in_table,
        in_itemize=sentence.in_itemize,
        label=sentence.label,
        ref=sentence.ref,
        cite=sentence.cite,
        url=sentence.url,
        others=sentence.others,
        with_symbol_and_formula_tags=str(with_symbol_and_formula_tags),
        with_symbol_and_formula_tags_journal=with_symbol_and_formula_tags,
        with_equation_tex=str(with_equation_tex),
        with_equation_tex_journal=with_equation_tex,
        with_symbol_tex=str(with_symbol_tex),
        with_symbol_tex_journal=with_symbol_tex,
        with_bag_of_symbols=str(with_bag_of_symbols),
        with_bag_of_symbols_journal=with_bag_of_symbols,
        legacy_definition_input=str(legacy_definition_input),
        legacy_definition_input_journal=legacy_definition_input,
    )

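# Illustrative sketch (not from the original source) of the marker scan above:
# each "<<equation-N>>" placeholder in the sanitized sentence is mapped to its
# character span. The sample sentence is an assumption; finditer is equivalent
# to the explicit while loop used above.
def _demo_equation_marker_scan() -> None:
    import re

    sanitized = "Let <<equation-0>> where <<equation-1>>."
    pattern = re.compile(r"<<equation-(\d+)>>")
    spans = {int(m.group(1)): (m.start(), m.end()) for m in pattern.finditer(sanitized)}
    assert spans == {0: (4, 18), 1: (25, 39)}
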
def adjust_color_positions(entity: SerializableEntity) -> CharacterRange:
    equation = cast(Equation, entity)
    return CharacterRange(equation.content_start, equation.content_end)

def colorize_entities(
    tex: str,
    entities: Sequence[SerializableEntity],
    insert_color_macros: bool = True,
    batch_size: Optional[int] = None,
    preset_hue: Optional[float] = None,
    when: Optional[ColorWhenFunc] = None,
    get_color_positions: Optional[ColorPositionsFunc] = None,
) -> Iterator[ColorizationBatch]:
    """
    This function assumes that entities do not overlap. It is up to the caller to
    appropriately filter entities to those that do not overlap with each other.
    """
    batch_size = min(batch_size, NUM_HUES) if batch_size is not None else NUM_HUES

    # Order entities from last to first so we can add color commands without messing
    # with the offsets of entities that haven't yet been colored.
    entities_reverse_order = sorted(entities, key=lambda e: e.start, reverse=True)

    hue_generator = generate_hues()
    entity_hues: List[Tuple[Hue, SerializableEntity]] = []

    colorized_tex = tex
    item_index = 0
    for e in entities_reverse_order:
        # Decide whether or not to color this entity.
        if when is not None and not when(e):
            continue

        # Get a hue to color this entity.
        if preset_hue is not None:
            hue = preset_hue
        else:
            hue = next(hue_generator)

        # Save a reference to this colorized entity to return to the caller.
        entity_hues.insert(0, (hue, e))

        # Determine what range of characters to color.
        color_character_range = CharacterRange(e.start, e.end)
        if get_color_positions is not None:
            color_character_range = get_color_positions(e)
        colorized_tex = insert_color_in_tex(
            colorized_tex, hue, color_character_range.start, color_character_range.end
        )

        item_index += 1

        # When the hues run out, notify the caller that a batch has been finished.
        # Provide the caller with the colorized TeX and the list of colors.
        if item_index == batch_size:
            # Only insert color macros after all entities have been wrapped in color
            # commands. The color macros will likely go at the very beginning of the
            # file, and therefore, if they are added before the color commands, they
            # are likely to disrupt the character positions at which we expect to
            # find the entities.
            if insert_color_macros:
                colorized_tex = add_color_macros(colorized_tex)
            yield ColorizationBatch(colorized_tex, entity_hues)

            # Then reset the TeX to its uncolorized state so we can start coloring
            # with the same hues without collisions, and clear the list of assigned
            # colors.
            colorized_tex = tex
            entity_hues = []
            # Reset the hue generator.
            hue_generator = generate_hues()
            # Reset the item counter to 0.
            item_index = 0

    # When finished coloring, yield any colorized entities that haven't yet been
    # yielded.
    if len(entity_hues) > 0:
        if insert_color_macros:
            colorized_tex = add_color_macros(colorized_tex)
        yield ColorizationBatch(colorized_tex, entity_hues)

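# Illustrative sketch (not from the original source) of the batching behavior
# above: the hue generator is re-created for every batch, so the same palette
# is reused across batches while staying collision-free within a batch. The
# three-hue palette is a hypothetical stand-in for generate_hues.
def _demo_hue_batching() -> None:
    def hues():
        yield from (0.0, 0.33, 0.66)

    items = ["a", "b", "c", "d", "e"]
    batch_size = 3
    batches = []
    for i in range(0, len(items), batch_size):
        hue_generator = hues()  # reset the palette for each batch
        batches.append(
            [(next(hue_generator), item) for item in items[i : i + batch_size]]
        )
    assert batches == [
        [(0.0, "a"), (0.33, "b"), (0.66, "c")],
        [(0.0, "d"), (0.33, "e")],
    ]
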
def colorize_entities(
    tex: str,
    entities: Sequence[SerializableEntity],
    options: ColorizeOptions = ColorizeOptions(),
) -> ColorizedTex:
    """
    Entities that overlap with an entity appearing earlier in the list are not
    colorized; they are skipped and returned in the 'skipped' field of the result.
    """
    insert_color_macros = options.insert_color_macros
    preset_hue = options.preset_hue
    adjust_color_positions = options.adjust_color_positions
    braces = options.braces

    # Filter entities to a list where no entity overlaps with any other entity.
    # Those that overlap will be returned as skipped entities.
    entities_filtered: List[SerializableEntity] = []
    skipped = []
    for entity in entities:
        if any([overlaps(entity, e) for e in entities_filtered]):
            skipped.append(entity)
            continue
        entities_filtered.append(entity)

    # Order entities from last to first so we can add color commands without messing
    # with the offsets of entities that haven't yet been colored.
    entities_reverse_order = sorted(
        entities_filtered, key=lambda e: e.start, reverse=True
    )

    hue_generator = generate_hues()
    entity_hues = {}

    colorized_tex = tex
    for e in entities_reverse_order:
        # Get a hue to color this entity.
        if preset_hue is not None:
            hue = preset_hue
        else:
            hue = next(hue_generator)

        # Save a reference to this colorized entity to return to the caller.
        entity_hues[e.id_] = hue

        # Determine what range of characters to color.
        color_character_range = CharacterRange(e.start, e.end)
        if adjust_color_positions is not None:
            color_character_range = adjust_color_positions(e)

        colorized_tex = insert_color_in_tex(
            colorized_tex,
            e.id_,
            hue,
            color_character_range.start,
            color_character_range.end,
            braces=braces,
        )

    # Only insert color macros after all entities have been wrapped in color commands.
    # The color macros will likely go at the very beginning of the file, and therefore,
    # if they are added before the color commands, they are likely to disrupt the
    # character positions at which we expect to find the entities.
    if insert_color_macros:
        colorized_tex = add_color_macros(colorized_tex)

    return ColorizedTex(colorized_tex, entity_hues, skipped=skipped)

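# Illustrative sketch (not from the original source) of the greedy overlap
# filter above, with a hypothetical overlaps() helper based on start/end
# offsets: an entity is skipped if it overlaps any entity already kept.
def _demo_overlap_filter() -> None:
    def overlaps(a, b):
        return a[0] < b[1] and b[0] < a[1]

    entities = [(0, 5), (3, 8), (10, 12)]
    kept, skipped = [], []
    for entity in entities:
        if any(overlaps(entity, e) for e in kept):
            skipped.append(entity)
        else:
            kept.append(entity)
    assert kept == [(0, 5), (10, 12)] and skipped == [(3, 8)]
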