Ejemplo n.º 1
0
def annotate_symbols_and_equations_for_file(
        tex: str, tex_path: RelativePath, symbols: SymbolDict,
        characters: CharacterDict) -> Tuple[str, Set[str]]:
    """
    Insert annotations for every equation found in the TeX of one file.

    Equations are extracted from 'tex', each one is paired with the symbols
    registered for it in 'symbols', and the annotations produced for each
    equation are spliced into a copy of the TeX.

    Args:
        tex: contents of the TeX file.
        tex_path: path of the TeX file; used when parsing equations and as
            part of each equation's ID.
        symbols: symbols keyed by symbol ID; grouped by equation before use.
        characters: passed through unchanged to the per-equation annotator.

    Returns:
        A tuple of (the annotated TeX, the union of the 'symbol_tex' values
        reported for all equations).
    """
    # Pull every equation out of the TeX before doing anything else.
    extractor = EquationExtractor()
    equations = list(extractor.parse(tex_path, tex))

    # Index the symbols by the equation they belong to.
    symbols_by_equation_id = _group_by_equation(symbols)

    # Gather the annotations (and the symbol TeX) produced for each equation.
    pending: List[Annotation] = []
    symbol_tex: Set[str] = set()
    for equation in equations:
        equation_id = EquationId(tex_path, equation.i)
        result = _create_annotations_for_equation(
            tex, equation, equation_id,
            symbols_by_equation_id.get(equation_id, []), characters)
        pending += result.annotations
        symbol_tex.update(result.symbol_tex)

    # Splice the annotations in from the highest position to the lowest, so
    # that an insertion never shifts the position of one still to be applied.
    pending.sort(key=lambda a: a.position, reverse=True)
    annotated_tex = tex
    for annotation in pending:
        split_at = annotation.position
        before, after = annotated_tex[:split_at], annotated_tex[split_at:]
        annotated_tex = before + annotation.text + after

    return annotated_tex, symbol_tex
Ejemplo n.º 2
0
def _group_by_equation(symbols: SymbolDict) -> SymbolsByEquationId:
    """
    Bucket symbols by the equation they belong to.

    The equation ID for a symbol is built from the 'tex_path' and
    'equation_index' of its symbol ID. Within each bucket, symbols keep the
    order in which 'symbols' yields them.
    """
    grouped: SymbolsByEquationId = {}
    for symbol_id, symbol in symbols.items():
        key = EquationId(symbol_id.tex_path, symbol_id.equation_index)
        grouped.setdefault(key, []).append(symbol)
    return grouped
Ejemplo n.º 3
0
def load_equations(arxiv_id: ArxivId) -> Optional[Dict[EquationId, Equation]]:
    """
    Load the detected equations for a paper, keyed by equation ID.

    Reads 'entities.csv' from the paper's "detected-equations" directory.

    Returns:
        A dictionary from equation ID (TeX path plus integer equation index)
        to the loaded equation, or None (after logging a warning) if the CSV
        file does not exist.
    """
    equations_dir = directories.arxiv_subdir("detected-equations", arxiv_id)
    equations_path = os.path.join(equations_dir, "entities.csv")

    # Nothing to load for this paper: warn and let the caller skip it.
    if not os.path.exists(equations_path):
        logging.warning("No equation data found for paper %s. Skipping.", arxiv_id)
        return None

    # If two rows map to the same ID, the later row wins.
    return {
        EquationId(tex_path=equation.tex_path, equation_index=int(equation.i)): equation
        for equation in load_from_csv(equations_path, Equation)
    }
Ejemplo n.º 4
0
def colorize_equation_tokens(
    file_contents: Dict[TexFileName, FileContents],
    tokens: List[SerializableToken],
    insert_color_macros: bool = True,
    preset_hue: Optional[float] = None,
) -> Iterator[TokenColorizationBatch]:
    """
    Yield batches of TeX files in which equation tokens have been colorized.

    Each batch is produced by one pass over all files that contain at least
    one of the given tokens. A pass skips the first 'token_skip' tokens, where
    'token_skip' starts at 0 and grows by NUM_HUES after every yielded batch.
    Iteration stops at the first pass that colorizes no tokens.

    Args:
        file_contents: contents of the TeX files, keyed by file name. Files
            that none of the tokens refer to are left out of the batches.
        tokens: the tokens to colorize. Tokens with a non-zero
            'equation_depth' are never colorized.
        insert_color_macros: if True, 'add_color_macros' is applied to each
            file after its tokens have been colorized.
        preset_hue: passed through unchanged to the per-equation colorizer.

    Yields:
        One TokenColorizationBatch per pass, holding the colorized files and
        the tokens that were colorized in that pass.
    """
    equations_by_file: Dict[TexFileName, Set[EquationId]] = {}
    tokens_by_equation: Dict[EquationId, List[SerializableToken]] = {}

    for token in tokens:
        equation_id = EquationId(token.tex_path, token.equation_index)

        # Every equation gets an entry, even if none of its tokens end up
        # being eligible for coloring.
        eligible_tokens = tokens_by_equation.setdefault(equation_id, [])

        # Tokens in nested equations (equations contained in other equations)
        # are left alone: coloring them would break the expected positions of
        # the tokens on the second coloring pass.
        if token.equation_depth == 0:
            eligible_tokens.append(token)

        equations_by_file.setdefault(token.tex_path, set()).add(equation_id)

    # Number of tokens to skip when coloring. Several passes are needed because
    # the distinct hues run out fast; all equations from all files are colored
    # in the same pass, as colors are later searched for within each equation's
    # bounding boxes independently.
    token_skip = 0

    while True:
        batch_files: Dict[TexFileName, FileContents] = {}
        batch_tokens = []

        for tex_filename, tex_file_contents in file_contents.items():
            equation_ids = equations_by_file.get(tex_filename)
            if equation_ids is None:
                continue

            colorized_tex = tex_file_contents.contents

            # Process equations from the highest equation index to the lowest.
            for equation_id in sorted(equation_ids,
                                      key=lambda e: e.equation_index,
                                      reverse=True):
                equation_tokens = tokens_by_equation.get(equation_id)
                if equation_tokens is None:
                    continue
                colorized_tex, newly_colorized = _colorize_tokens_for_equation(
                    colorized_tex, equation_tokens, token_skip, preset_hue)
                batch_tokens.extend(newly_colorized)

            # Color macros are only inserted once all tokens have been wrapped
            # in color commands. The macros will likely go at the very start of
            # the file, so adding them earlier would disrupt the character
            # positions at which the tokens are expected.
            if insert_color_macros:
                colorized_tex = add_color_macros(colorized_tex)
            batch_files[tex_filename] = FileContents(
                tex_file_contents.path, colorized_tex,
                tex_file_contents.encoding)

        # A pass that colorized nothing means there is nothing left to do.
        if not batch_tokens:
            break

        yield TokenColorizationBatch(batch_files, batch_tokens)

        # Continue coloring, starting from the next set of tokens.
        token_skip += NUM_HUES
Ejemplo n.º 5
0
    def load_hues(self, arxiv_id: ArxivId,
                  iteration: str) -> List[HueSearchRegion]:
        """
        Build the list of hue searches for the colorized equation tokens.

        Two CSV files are combined:
        * 'hue_locations.csv' from the paper's "hue-locations-for-equations"
          directory, which gives bounding boxes for each equation, per file.
        * 'entity_hues.csv' from this iteration's
          "sources-with-colorized-equation-tokens" directory, which gives the
          colorization record (including the hue) of each token.

        Args:
            arxiv_id: paper whose data should be loaded.
            iteration: colorization iteration whose token hues should be loaded.

        Returns:
            One HueSearchRegion per (equation, file, token record) combination.
            The masks of a region are the equation's bounding boxes in that
            file, grouped by page. Equations that have bounding boxes but no
            token records contribute nothing.
        """
        # Collect bounding boxes: equation ID -> relative file path -> boxes.
        equation_boxes_path = os.path.join(
            directories.arxiv_subdir("hue-locations-for-equations", arxiv_id),
            "hue_locations.csv",
        )
        bounding_boxes: Dict[EquationId, BoundingBoxesByFile] = {}
        for location_info in file_utils.load_from_csv(equation_boxes_path,
                                                      HueLocationInfo):
            equation_id = EquationId(
                tex_path=location_info.tex_path,
                equation_index=int(location_info.entity_id),
            )
            boxes_by_file = bounding_boxes.setdefault(equation_id, {})
            boxes_by_file.setdefault(location_info.relative_file_path, []).append(
                BoundingBox(
                    page=location_info.page,
                    left=location_info.left,
                    top=location_info.top,
                    width=location_info.width,
                    height=location_info.height,
                ))

        # Collect token records: equation ID -> token index -> record.
        token_hues_path = os.path.join(
            directories.iteration(
                "sources-with-colorized-equation-tokens",
                arxiv_id,
                iteration,
            ),
            "entity_hues.csv",
        )
        token_records_by_equation: Dict[EquationId, Dict[
            int, EquationTokenColorizationRecord]] = {}
        for record in file_utils.load_from_csv(
                token_hues_path, EquationTokenColorizationRecord):
            # The equation index is normalized to int, exactly as is done for
            # the bounding-box keys above. Without this, a loader that returns
            # the index as a string would produce keys that never match the
            # bounding-box keys, and no hue search would ever be created.
            # NOTE(review): a no-op if the loader already yields an int.
            equation_id = EquationId(
                tex_path=record.tex_path,
                equation_index=int(record.equation_index),
            )
            token_index = int(record.token_index)
            token_records_by_equation.setdefault(equation_id,
                                                 {})[token_index] = record

        hue_searches = []
        for equation_id, boxes_by_file in bounding_boxes.items():
            records = token_records_by_equation.get(equation_id)
            if records is None:
                # No tokens to search for in this equation; skip building masks.
                continue

            for file_path, boxes in boxes_by_file.items():
                # Group the equation's boxes by page. The same mask dictionary
                # is shared by every search region created for this file.
                masks_by_page: MasksForPages = {}
                for box in boxes:
                    masks_by_page.setdefault(box.page, []).append(
                        Rectangle(box.left, box.top, box.width, box.height))

                for record in records.values():
                    hue_searches.append(
                        HueSearchRegion(
                            hue=record.hue,
                            record=record,
                            relative_file_path=file_path,
                            masks=masks_by_page,
                        ))

        return hue_searches