Code example #1
def adjust_color_positions(entity: SerializableEntity) -> CharacterRange:
    """
    Color commands sometimes introduce unwanted space when added right before or after an equation.
    One solution is to put color commands right inside the equation.
    """

    term = cast(Term, entity)

    equation_extractor = EquationExtractor()
    equations = list(equation_extractor.parse(entity.tex_path, term.tex))
    if len(equations) == 0:
        return CharacterRange(term.start, term.end)

    # If the term starts with an equation, move the coloring command inside the equation.
    adjusted_start = term.start
    first_equation = min(equations, key=lambda e: e.start)
    first_nonspace = re.search("\S", term.tex)
    if first_nonspace is not None:
        if first_nonspace.start(0) == first_equation.start:
            adjusted_start = term.start + first_equation.content_start

    # If the term ends with an equation, move the coloring command inside the equation.
    adjusted_end = term.end
    last_equation = max(equations, key=lambda e: e.end)
    last_nonspace = re.search("\S(?=\s*$)", term.tex)
    if last_nonspace is not None:
        if last_nonspace.end(0) == last_equation.end:
            adjusted_end = term.start + last_equation.content_end

    return CharacterRange(adjusted_start, adjusted_end)
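
All of the examples on this page construct or return CharacterRange values. As a point of reference, here is a minimal sketch of what such a type might look like; the real class in scholarphi may carry extra fields or behavior, so treat this as an assumption rather than the project's definition.

from dataclasses import dataclass

@dataclass(frozen=True)
class CharacterRange:
    # Character offsets into a string. 'start' is inclusive; 'end' follows
    # whatever convention the calling code uses (most examples below treat it
    # as exclusive, while example #5 stores an inclusive end).
    start: int
    end: int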
Code example #2
def delimit_equations(s: JournaledString,
                      equations: List[Equation]) -> JournaledString:
    " Replace delimiters around TeX equations with standardized delimiters. "

    replacements: Dict[CharacterRange, str] = {}

    def needs_space_before(s: JournaledString, character_index: int) -> bool:
        return character_index > 0 and not s[character_index - 1].isspace()

    def needs_space_after(s: JournaledString, character_index: int) -> bool:
        # If the equation is used as a possessive (e.g., $x$'s), then don't add a space
        # after the equation, as it can interfere with pysbd's sentence splitting. While it
        # requires further investigation, it may be that putting an apostrophe after a space
        # makes pysbd think that the apostrophe is an opening single quote mark.
        if (character_index < len(s) - 1) and (
                s[character_index:character_index + 2] == "'s"):
            return False
        if character_index < len(s) and not s[character_index].isspace():
            return True
        return False

    for equation in equations:

        start_replacement_range = CharacterRange(equation.start,
                                                 equation.content_start)
        start_replacement = f"EQUATION_DEPTH_{equation.depth}_START"
        if needs_space_before(s, start_replacement_range.start):
            start_replacement = " " + start_replacement
        if needs_space_after(s, start_replacement_range.end):
            start_replacement = start_replacement + " "
        replacements[start_replacement_range] = start_replacement

        end_replacement_range = CharacterRange(equation.content_end,
                                               equation.end)
        end_replacement = f"EQUATION_DEPTH_{equation.depth}_END"
        if needs_space_before(s, end_replacement_range.start):
            end_replacement = " " + end_replacement
        if needs_space_after(s, end_replacement_range.end):
            end_replacement = end_replacement + " "
        replacements[end_replacement_range] = end_replacement

    for replacement_range in sorted(replacements.keys(),
                                    key=lambda r: r.start,
                                    reverse=True):
        replacement_text = replacements[replacement_range]
        s = s.edit(replacement_range.start, replacement_range.end,
                   replacement_text)

    return s
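
To make the output format concrete, here is a plain-string illustration of the replacement (the input and offsets below are hypothetical; the real function operates on a JournaledString so that each edit can be mapped back to the original TeX). The resulting markers are what the sentence extractor in example #6 later searches for.

# Hypothetical TeX with one inline equation "$x$" at offsets 8-11,
# whose content "x" spans offsets 9-10 and whose depth is 0.
tex = "Suppose $x$ is positive."
# After delimit_equations, the text would read:
#   "Suppose EQUATION_DEPTH_0_START x EQUATION_DEPTH_0_END is positive."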
Code example #3
def get_color_positions(entity: SerializableEntity) -> CharacterRange:
    """
    Override this when you want to set custom positions for inserting color commands. One
    example is for equations, where color commands should be inserted inside the bounds of
    the equation, rather than outside of it.
    """
    return CharacterRange(start=entity.start, end=entity.end)
Code example #4
def adjust_color_positions(entity: SerializableEntity) -> CharacterRange:
    """
    Sometimes, if you try to insert coloring commands at the boundary of where a symbol appears
    in TeX, it can cause errors. For example, it can be error-prone to put color commands...

    1. Right outside of braces from the original TeX (e.g., "{x}")
    2. Right after subscripts or superscripts (_, ^, \\sb, \\sp)
    3. Between a dot or a hat and the symbol it modifies (e.g., "\\hat x")

    By putting color commands inside of braces, problems #2 and #3 can be avoided. For #1,
    and for a few other cases, this function adjusts the positions that coloring commands
    will be placed to avoid tricky TeX compilation gotchas.
    """
    token = cast(SerializableToken, entity)
    equation_tex = token.equation
    token_string = equation_tex[token.relative_start:token.relative_end]

    token_start = token.start
    token_end = token.end

    # Adjust color commands to be on the inside of a group denoted by curly braces.
    if token_string.startswith("{") and token_string.endswith("}"):
        return CharacterRange(token.start + 1, token.end - 1)

    # If the token ends with an ampersand, it was probably included by mistake: the
    # ampersand was replaced with a space before the KaTeX parse, and that space was
    # absorbed into this token during the parse. Trim the trailing ampersand from the token.
    match = re.search(r"\s*&\s*$", token_string)
    if match is not None:
        token_end = token_start + match.start()

    # Coloring commands should never go outside the bounds of the equation.
    equation_start = token.start - token.relative_start
    equation_end = equation_start + len(equation_tex)
    start = max(token_start, equation_start)
    end = min(token_end, equation_end)
    return CharacterRange(start, end)
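
As a concrete instance of the brace rule above (the offsets are hypothetical, chosen only for illustration):

token_tex = "{x}"
token_start, token_end = 20, 23  # hypothetical offsets of "{x}" in the file
if token_tex.startswith("{") and token_tex.endswith("}"):
    # The color command wraps only the "x", staying inside the braces.
    color_range = CharacterRange(token_start + 1, token_end - 1)  # -> (21, 22)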
Code example #5
def get_token_character_ranges(text: str,
                               tokens: List[str]) -> List[CharacterRange]:
    """
    Extract start and end character positions for each token in the featurized tokens.
    """
    ranges = []
    current_position = 0
    for token in tokens:
        start_index = text[current_position:].index(token)
        ranges.append(
            CharacterRange(
                current_position + start_index,
                current_position + start_index + len(token) - 1,
            ))
        current_position += len(token) + start_index
    return ranges
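
A quick usage sketch; note that, unlike most of the other examples on this page, the end offsets stored here are inclusive:

ranges = get_token_character_ranges("a bb ccc", ["a", "bb", "ccc"])
# ranges cover (0, 0), (2, 3), and (5, 7), one range per token.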
Code example #6
File: extractor.py  Project: z-314/scholarphi
    def parse(self, tex_path: str, tex: str) -> Iterator[Sentence]:
        check_for_pysbd_reserved_characters(tex)

        # Extract plaintext from TeX.
        plaintext = extract_plaintext(tex_path, tex)

        # Segment the plaintext. Return offsets for each sentence relative to the TeX input.
        segmenter = pysbd.Segmenter(language="en", clean=False, char_span=True)

        # As each sentence is scanned, keep track of what sections and environments the
        # sentence appears within.
        section_name = None
        in_figure = False
        in_table = False
        in_itemize = False

        # The pysbd module has several open bugs and issues which are addressed below.
        # As of 3/23/20 we know the module will fail in the following ways:
        # 1. pysbd will not break up the sentence when it starts with a punctuation mark or space.
        #    ex: ". hello. world. hi."
        #    sol: check for sentences being longer than 1000 characters. Also, see the
        #         plaintext extraction function, which attempts to clean up the text so that
        #         consecutive periods are removed before segmentation.
        # 2. pysbd uses reserved characters for splitting sentences
        #    ex: see PYSBD_RESERVED_CHARACTERS list.
        #    sol: throw a warning if the sentence contains any of these characters.
        sentence_ranges: List[CharacterRange] = []
        sentence_start: Optional[int] = None

        for span in segmenter.segment(str(plaintext)):
            if sentence_start is None:
                # Strip leading whitespace from sentence.
                sentence_start = span.start + regex.search(
                    r"^(\s*)", span.sent).end()

            # Don't detect a sentence boundary in the middle of an equation.
            is_boundary_in_equation = regex.search(
                r"EQUATION_DEPTH_0_START(?!.*EQUATION_DEPTH_0_END)",
                str(plaintext[sentence_start:span.end]),
                flags=regex.DOTALL,
            )
            if not is_boundary_in_equation:
                # Strip trailing whitespace from sentence.
                end = span.start + regex.search(r"(\s*)$", span.sent).start()
                sentence_ranges.append(CharacterRange(sentence_start, end))
                sentence_start = None

        for i, sentence_range in enumerate(sentence_ranges):
            tex_start, tex_end = plaintext.initial_offsets(
                sentence_range.start, sentence_range.end)
            if tex_start is None or tex_end is None:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "The span bounds (%d, %d) from pysbd for a sentence could not be mapped "
                    +
                    "back to character offsets in the LaTeX for an unknown reason.",
                    sentence_range.start,
                    sentence_range.end,
                )
                continue

            sentence_tex = tex[tex_start:tex_end]

            # Save the sentence as a journaled string, which will allow the mapping of the cleaned
            # sentence text to the original TeX.
            sentence = plaintext.substring(
                sentence_range.start,
                sentence_range.end,
                # These truncation options are important for preserving the mapping from offsets in
                # the edited sentence to the initial offsets before the edits.
                include_truncated_left=False,
                include_truncated_right=False,
            )
            if len(sentence) > 1000:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "Exceptionally long sentence (length %d). This might indicate the sentence "
                    +
                    "extractor failed to properly split text into sentences.",
                    len(sentence),
                )

            # Extract TeX around sentence to understand the environment in which it appears
            context_tex = get_context(tex, tex_start, tex_end)

            # Detect features describing the context the sentence appears in (e.g., the section it's in,
            # or if it's in a figure, etc.) using regular expressions.
            section = regex.findall(
                r"\\(?:sub)*section[*]*\{[A-Za-z0-9 \{\}\\_.,:-]*\}",
                context_tex)
            abstract_begin = regex.findall(r"\\begin\{abstract\}", context_tex)
            abstract_end = regex.findall(r"\\end\{abstract\}", context_tex)
            table_begin = regex.findall(r"\\begin\{tabular\}", context_tex)
            table_end = regex.findall(r"\\end\{tabular\}", context_tex)
            figure_begin = regex.findall(r"\\begin\{figure[*]*\}", context_tex)
            figure_end = regex.findall(r"\\end\{figure[*]*\}", context_tex)
            itemize_begin = regex.findall(r"\\begin\{itemize[*]*\}",
                                          context_tex)
            itemize_end = regex.findall(r"\\end\{itemize[*]*\}", context_tex)
            cite = regex.findall(
                r"\\cite[A-Za-z0-9 \\_\[\].,:-]*\{[A-Za-z0-9 \\_.,:-]*\}",
                context_tex)
            url = regex.findall(r"\\url\{[A-Za-z0-9 \{\}/\\_.,:-]*\}",
                                context_tex,
                                overlapped=False)
            label = regex.findall(r"\\label\{[A-Za-z0-9 \\_.,:-]*\}",
                                  context_tex)
            ref = regex.findall(r"\\ref\{[A-Za-z0-9 \\_.,:-]*\}", context_tex)
            tex_macros = set(
                regex.findall(
                    r"\\[A-Za-z0-9\\\[\]_.,:-]*[\{[A-Za-z0-9 \\_.,:-]*\}]*",
                    context_tex))

            # Save a list of other TeX macros that aren't captured by any of the other
            # categories: { any } - { section, label, ... }.
            other_tex_macros: List[str] = []
            named_macros = {
                m
                for l in [
                    abstract_begin,
                    abstract_end,
                    table_begin,
                    table_end,
                    figure_begin,
                    figure_end,
                    itemize_begin,
                    itemize_end,
                    cite,
                ] for m in l
            }
            other_tex_macros = list(tex_macros - named_macros)

            # Save section name.
            if abstract_begin:
                section_name = "ABSTRACT"
            if abstract_end:
                section_name = None
            if section:
                section_name = extract_text_from_tex_group(section[0])

            # Save information about whether a sentence is in a figure, table, or other environment.
            # TODO(dykang): consider using \label{} in table/figure to improve matching.
            if figure_begin:
                in_figure = True
            if figure_end:
                in_figure = False
            if table_begin:
                in_table = True
            if table_end:
                in_table = False
            if itemize_begin:
                in_itemize = True
            if itemize_end:
                in_itemize = False

            # Use heuristics about the surrounding text to determine whether or not this
            # sentence is valid. These heuristics have a number of limitations, and should be
            # replaced with more mature rules for detecting whether the sentence is indeed in
            # a named section, the abstract, a figure, a table, etc. See documentation of its
            # limitations here: https://github.com/allenai/scholar-reader/issues/138#issue-678432430
            validity_guess = all([
                # Sentence should appear in a named section.
                (not self.from_named_sections_only) or section_name,
                # Sentence should not appear in a figure or table.
                # TODO(dykang, andrewhead): eventually, this should be rewritten to permit the
                # extraction of sentences from captions.
                not in_figure,
                not in_table,
                # If the sentence contained regular expression patterns for the start or end of
                # an environment, it's probably not a sentence, but rather just TeX macros.
                not abstract_begin,
                not abstract_end,
                not section,
                not table_end,
                not figure_end,
                not itemize_begin,
                not itemize_end,
            ])

            tokens = regex.split(r"[\s,;.!?()]+", str(sentence))
            contains_common_english_word = any([
                len(t) > 1 and t.lower() in self.english_words for t in tokens
            ])
            ends_with_stop = bool(regex.search(r"[,.:;!?]\s*$", str(sentence)))
            is_clean = contains_common_english_word and ends_with_stop

            # Sanitize the text, replacing macros and unwanted TeX with text that will be easier
            # for the text processing algorithms to process.
            sanitized = sentence
            replace_patterns: List[Tuple[str, str]] = []

            # Replace citations with "CITATION".
            for citation in cite:
                citation_text = extract_text_from_tex_group(citation)
                for key in citation_text.split(","):
                    replace_patterns.append((key, "CITATION"))

            # Replace URLs with "URL".
            for url_item in url:
                url_text = extract_text_from_tex_group(url_item)
                replace_patterns.append((url_text, "URL"))

            # Replace references to text elements like figures and tables with a single
            # known word for each type of element. Currently depends on idiomatic patterns
            # for naming elements, like \ref{{fig,tab,sec,eq}:XXX}, to distinguish between
            # element types. Also, the code keeps the token ahead of the reference (e.g.,
            # the word "Table" in "Table\ref{...}"), although it might duplicate the
            # information in the replaced label.
            for reference in ref:
                reference_text = extract_text_from_tex_group(reference)
                for r in reference_text.split(","):
                    if reference.lower().startswith("tab"):
                        replace_patterns.append((r, "TABLE"))
                    if reference.lower().startswith("fig"):
                        replace_patterns.append((r, "FIGURE"))
                    if reference.lower().startswith("sec"):
                        replace_patterns.append((r, "SECTION"))
                    if reference.lower().startswith("eq"):
                        replace_patterns.append((r, "EQUATION"))

            # Substitute patterns with replacements.
            for pattern, replacement in replace_patterns:
                if pattern == "":
                    continue
                match_start = 0
                while True:
                    match_offset = sanitized.find(pattern, match_start)
                    if match_offset == -1:
                        break
                    sanitized = sanitized.edit(match_offset,
                                               match_offset + len(pattern),
                                               replacement)
                    match_start = match_offset + len(pattern)

            yield Sentence(
                id_=str(i),
                tex_path=tex_path,
                start=tex_start,
                end=tex_end,
                text=str(sentence),
                text_journal=sentence,
                sanitized=str(sanitized),
                sanitized_journal=sanitized,
                tex=sentence_tex,
                context_tex=context_tex,
                validity_guess=validity_guess,
                is_clean=is_clean,
                section_name=section_name,
                in_figure=in_figure,
                in_table=in_table,
                in_itemize=in_itemize,
                label=label,
                ref=ref,
                cite=cite,
                url=url,
                others=other_tex_macros,
            )
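
The segmentation step in the example above depends on pysbd returning character spans. A minimal standalone sketch of that behavior, separate from the surrounding scholarphi machinery:

import pysbd

segmenter = pysbd.Segmenter(language="en", clean=False, char_span=True)
for span in segmenter.segment("First sentence. Second sentence."):
    # Each span carries the sentence text plus its start and end offsets into
    # the input string, which is what lets the extractor map sentences back
    # to positions in the original TeX.
    print(span.sent, span.start, span.end)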
Code example #7
    def process(self, item: Task) -> Iterator[EmbellishedSentence]:
        sentence = item.sentence
        equations = item.equations
        symbols = item.symbols

        pattern = r"<<equation-(\d+)>>"
        regex = re.compile(pattern)

        equation_spans: Dict[int, CharacterRange] = {}
        equation_indexes_reversed: List[int] = []
        start = 0
        while True:
            match = regex.search(sentence.sanitized, start)
            if match is None:
                break
            start = match.end()
            equation_index = int(match.group(1))
            equation_indexes_reversed.insert(0, equation_index)
            equation_spans[equation_index] = CharacterRange(
                start=match.start(), end=match.end())

        # Replace equations with more helpful representations.
        # Replace equations in reverse so that earlier replacements don't affect the character
        # offsets for the later replacements.
        with_symbol_and_formula_tags = sentence.sanitized_journal
        with_equation_tex = sentence.sanitized_journal
        with_symbol_tex = sentence.sanitized_journal
        with_bag_of_symbols = sentence.sanitized_journal
        legacy_definition_input = sentence.sanitized_journal

        for ei in equation_indexes_reversed:

            equation = equations[(sentence.tex_path, ei)]
            equation_symbols = symbols[(equation.tex_path, ei)]
            span = equation_spans[ei]

            # Replace equation with its TeX
            with_equation_tex = with_equation_tex.edit(
                span.start,
                span.end,
                f"[[FORMULA:{equation.content_tex}]]",
            )

            # Replace equations with tags indicating whether each equation is
            # a symbol or a formula, and additionally with values for the symbols.
            is_symbol = count_top_level_symbols(equation_symbols) == 1
            if is_symbol:
                with_symbol_and_formula_tags = with_symbol_and_formula_tags.edit(
                    span.start, span.end, "[[SYMBOL]]")
                with_symbol_tex = with_symbol_tex.edit(
                    span.start,
                    span.end,
                    f"[[SYMBOL({equation.tex.strip()})]]",
                )
            else:
                with_symbol_and_formula_tags = with_symbol_and_formula_tags.edit(
                    span.start, span.end, "[[FORMULA]]")
                with_symbol_tex = with_symbol_tex.edit(span.start, span.end,
                                                       "[[FORMULA]]")

            # Replace each equation with a bag of the symbols that it contains.
            bag_of_symbols = {s.tex.strip() for s in equation_symbols}
            with_bag_of_symbols = with_bag_of_symbols.edit(
                span.start,
                span.end,
                f"[[FORMULA:{bag_of_symbols}]]",
            )

            # Replace each equation with 'SYMBOL'.
            legacy_definition_input = legacy_definition_input.edit(
                span.start, span.end, "SYMBOL")

        yield EmbellishedSentence(
            id_=sentence.id_,
            tex_path=sentence.tex_path,
            start=sentence.start,
            end=sentence.end,
            tex=sentence.tex,
            context_tex=sentence.context_tex,
            text=sentence.text,
            text_journal=sentence.text_journal,
            sanitized=sentence.sanitized,
            sanitized_journal=sentence.sanitized_journal,
            validity_guess=sentence.validity_guess,
            section_name=sentence.section_name,
            in_figure=sentence.in_figure,
            in_table=sentence.in_table,
            in_itemize=sentence.in_itemize,
            label=sentence.label,
            ref=sentence.ref,
            cite=sentence.cite,
            url=sentence.url,
            others=sentence.others,
            with_symbol_and_formula_tags=str(with_symbol_and_formula_tags),
            with_symbol_and_formula_tags_journal=with_symbol_and_formula_tags,
            with_equation_tex=str(with_equation_tex),
            with_equation_tex_journal=with_equation_tex,
            with_symbol_tex=str(with_symbol_tex),
            with_symbol_tex_journal=with_symbol_tex,
            with_bag_of_symbols=str(with_bag_of_symbols),
            with_bag_of_symbols_journal=with_bag_of_symbols,
            legacy_definition_input=str(legacy_definition_input),
            legacy_definition_input_journal=legacy_definition_input,
        )
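
The "<<equation-N>>" placeholders that the regular expression above looks for are assumed to have been inserted into the sanitized sentence text by an earlier pipeline stage. A small sketch of the span extraction on a hypothetical sanitized sentence:

import re

pattern = re.compile(r"<<equation-(\d+)>>")
sanitized = "We define <<equation-0>> and substitute it into <<equation-1>>."
for match in pattern.finditer(sanitized):
    # Prints each equation index along with the placeholder's character span.
    print(int(match.group(1)), match.start(), match.end())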
Code example #8
def adjust_color_positions(entity: SerializableEntity) -> CharacterRange:
    equation = cast(Equation, entity)
    return CharacterRange(equation.content_start, equation.content_end)
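
This relies on the equation's content offsets lying just inside its delimiters. A hypothetical illustration (the field values are invented; the real ones come from the equation extractor):

# For the TeX "$x+y$" starting at file offset 30:
#   equation.start = 30          equation.end = 35
#   equation.content_start = 31  equation.content_end = 34
# so the returned range colors only "x+y", leaving the "$" delimiters alone.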
Code example #9
def colorize_entities(
    tex: str,
    entities: Sequence[SerializableEntity],
    insert_color_macros: bool = True,
    batch_size: Optional[int] = None,
    preset_hue: Optional[float] = None,
    when: Optional[ColorWhenFunc] = None,
    get_color_positions: Optional[ColorPositionsFunc] = None,
) -> Iterator[ColorizationBatch]:
    """
    This function assumes that entities do not overlap. It is up to the caller to appropriately
    filter entities to those that do not overlap with each other.
    """

    batch_size = min(batch_size,
                     NUM_HUES) if batch_size is not None else NUM_HUES

    # Order entities from last-to-first so we can add color commands without messing with the offsets of
    # entities that haven't yet been colored.
    entities_reverse_order = sorted(entities,
                                    key=lambda e: e.start,
                                    reverse=True)

    hue_generator = generate_hues()
    entity_hues: List[Tuple[Hue, SerializableEntity]] = []

    colorized_tex = tex
    item_index = 0
    for e in entities_reverse_order:

        # Decide whether or not to color this entity
        if when is not None and not when(e):
            continue

        # Get a hue to color this entity
        if preset_hue is not None:
            hue = preset_hue
        else:
            hue = next(hue_generator)

        # Save a reference to this colorized entity to return to the caller
        entity_hues.insert(0, (hue, e))

        # Determine what range of characters to color
        color_character_range = CharacterRange(e.start, e.end)
        if get_color_positions is not None:
            color_character_range = get_color_positions(e)
        colorized_tex = insert_color_in_tex(colorized_tex, hue,
                                            color_character_range.start,
                                            color_character_range.end)

        item_index += 1

        # When the hues run out, notify caller that a batch has been finished.
        # Provide the caller with the colorized tex and list of colors.
        if item_index == batch_size:
            # Only insert color macros after all entities have been wrapped in color commands.
            # The color macros will likely go at the very beginning of the file, and therefore
            # if they are added before the color commands, they are likely to disrupt the character
            # positions at which we expect to find the entities.
            if insert_color_macros:
                colorized_tex = add_color_macros(colorized_tex)

            yield ColorizationBatch(colorized_tex, entity_hues)

            # Then reset the TeX to its uncolorized form so we can start coloring again with the
            # same hues without collisions, and clear the list of assigned colors.
            colorized_tex = tex
            entity_hues = []

            # Reset the hue generator.
            hue_generator = generate_hues()

            # Reset the item counter to 0.
            item_index = 0

    # When finished coloring, yield any colorized entities that haven't yet been yielded.
    if len(entity_hues) > 0:
        if insert_color_macros:
            colorized_tex = add_color_macros(colorized_tex)
        yield ColorizationBatch(colorized_tex, entity_hues)
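
The last-to-first ordering used above can be illustrated with plain strings (a toy sketch, not the project's code):

text = "A B C"
# Edit the span for "C" (4-5) before the span for "A" (0-1); working backward
# through the string means the first edit cannot shift the later offsets.
for start, end, replacement in [(4, 5, "CC"), (0, 1, "AA")]:
    text = text[:start] + replacement + text[end:]
# text == "AA B CC"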
Code example #10
def colorize_entities(
        tex: str,
        entities: Sequence[SerializableEntity],
        options: ColorizeOptions = ColorizeOptions(),
) -> ColorizedTex:
    """
    This function assumes that entities do not overlap. It is up to the caller to appropriately
    filter entities to those that do not overlap with each other.
    """

    insert_color_macros = options.insert_color_macros
    preset_hue = options.preset_hue
    adjust_color_positions = options.adjust_color_positions
    braces = options.braces

    # Filter entities to a list where no entity overlaps with any other entity. Those
    # that overlap will be returned as skipped entities.
    entities_filtered: List[SerializableEntity] = []
    skipped = []
    for entity in entities:
        if any([overlaps(entity, e) for e in entities_filtered]):
            skipped.append(entity)
            continue
        entities_filtered.append(entity)

    # Order entities from last-to-first so we can add color commands without messing with the offsets of
    # entities that haven't yet been colored.
    entities_reverse_order = sorted(entities_filtered,
                                    key=lambda e: e.start,
                                    reverse=True)

    hue_generator = generate_hues()
    entity_hues = {}

    colorized_tex = tex
    for e in entities_reverse_order:

        # Get a hue to color this entity
        if preset_hue is not None:
            hue = preset_hue
        else:
            hue = next(hue_generator)

        # Save a reference to this colorized entity to return to the caller
        entity_hues[e.id_] = hue

        # Determine what range of characters to color
        color_character_range = CharacterRange(e.start, e.end)
        if adjust_color_positions is not None:
            color_character_range = adjust_color_positions(e)
        colorized_tex = insert_color_in_tex(
            colorized_tex,
            e.id_,
            hue,
            color_character_range.start,
            color_character_range.end,
            braces=braces,
        )

    # Only insert color macros after all entities have been wrapped in color commands.
    # The color macros will likely go at the very beginning of the file, and therefore
    # if they are added before the color commands, they are likely to disrupt the character
    # positions at which we expect to find the entities.
    if insert_color_macros:
        colorized_tex = add_color_macros(colorized_tex)

    return ColorizedTex(colorized_tex, entity_hues, skipped=skipped)