Esempio n. 1
0
def colorize_citations(
    tex: str,
    bibitems: List[SerializableEntity],
    options: ColorizeOptions = ColorizeOptions()
) -> ColorizedTex:
    """
    Build one color command per bibitem and hand them to 'add_color_macros' for this TeX.

    To save time, only the main document file is processed. A file counts as the main
    document when a "documentclass" macro is found in it; any other file (including plain,
    non-LaTeX TeX) is returned unchanged together with an empty hue mapping.

    Each bibitem receives 'options.preset_hue' when that is set; otherwise the next hue
    from 'generate_hues()' is used. The result pairs the TeX returned by
    'add_color_macros' with a mapping from bibitem ID to the hue assigned to it.
    """

    # Bail out early for anything that is not the main LaTeX document.
    if not DocumentclassExtractor().parse(tex):
        return ColorizedTex(tex, {})

    hues_by_bibitem_id = {}
    color_command_lines = []
    fresh_hues = generate_hues()
    for item in bibitems:
        preset = options.preset_hue
        # A fresh hue is only consumed when no preset hue has been requested.
        assigned_hue = preset if preset is not None else next(fresh_hues)
        color_command_lines.append(
            _get_color_citation_tex(item.id_, assigned_hue) + "\n")
        hues_by_bibitem_id[item.id_] = assigned_hue

    # Each command already ends with a newline, so no separator is needed when joining.
    return ColorizedTex(
        add_color_macros(tex, after_macros="".join(color_command_lines)),
        hues_by_bibitem_id,
    )
Esempio n. 2
0
def make_locate_entities_command(
    entity_name: str,
    DetectedEntityType: Optional[Type[SerializableEntity]] = None,
    colorize_options: ColorizeOptions = ColorizeOptions(),
    colorize_func: Optional[ColorizeFunc] = None,
    sanity_check_images: Optional[bool] = None,
) -> Type[LocateEntitiesCommand]:
    """
    Build a 'LocateEntitiesCommand' subclass for finding the bounding boxes of one kind of
    entity.

    Pass a 'DetectedEntityType' so that the command casts loaded entities to the right data
    type; when it is omitted, the parent class's detected entity type is used. Colorization
    can be customized either with a dedicated 'colorize_func' or with a set of
    'colorize_options'. Set 'sanity_check_images' to force visual validation of image
    differences; bounding boxes are omitted for entities whose image differences contain
    unexpected visual artifacts.
    """
    # The name and description depend only on 'entity_name', so they are computed once here.
    command_name = f"locate-bounding-boxes-for-{entity_name}"
    command_description = f"Find bounding boxes of {entity_name}."

    class C(LocateEntitiesCommand):
        # Each hook below returns a value captured from the factory's arguments.

        @staticmethod
        def get_entity_name() -> str:
            return entity_name

        @staticmethod
        def get_name() -> str:
            return command_name

        @staticmethod
        def get_description() -> str:
            return command_description

        @staticmethod
        def get_detected_entity_type() -> Type[SerializableEntity]:
            if DetectedEntityType is not None:
                return DetectedEntityType
            # No explicit type was given: defer to the parent class. The two-argument form
            # of 'super' is required because a static method has no implicit class binding.
            return super(C, C).get_detected_entity_type()

        @staticmethod
        def get_colorize_func() -> Optional[ColorizeFunc]:
            return colorize_func

        @staticmethod
        def get_colorize_options() -> ColorizeOptions:
            return colorize_options

        @staticmethod
        def should_sanity_check_images() -> Optional[bool]:
            return sanity_check_images

    return C
Esempio n. 3
0
def create_entity_localization_command_sequence(
    entity_name: str,
    EntityExtractorType: Type[EntityExtractor],
    DetectedEntityType: Optional[Type[SerializableEntity]] = None,
    upload_func: Optional[EntityUploadCallable] = None,
    colorize_options: ColorizeOptions = ColorizeOptions(),
    colorize_func: Optional[ColorizeFunc] = None,
) -> List[Type[Command]]:  # type: ignore
    """
    Create the sequence of commands used to locate a new type of entity.

    In the simplest case, all you have to provide is an 'entity_name', which is used for
    naming the output directories and the commands, and an 'EntityExtractorType' that
    locates all instances of that entity in the TeX. The returned sequence contains a
    command that detects the entities, followed by a command that locates them; an upload
    command is appended when an 'upload_func' is given. You may pass additional parameters
    (e.g., 'colorize_options' or 'colorize_func') to fine-tune how entities are colorized,
    and a 'DetectedEntityType' to control the data type that detected entities are loaded as.

    As a side effect, this function registers the output directories for the intermediate
    pipeline stages of this entity.

    If you are trying to find the locations of a new type of entity, it is highly recommended
    that you use this convenience function instead of creating new commands yourself.
    """

    # Register directories for output from intermediate pipeline stages, in stage order.
    stage_directory_names = (
        f"detected-{entity_name}",
        f"sources-with-colorized-{entity_name}",
        f"compiled-sources-with-colorized-{entity_name}",
        f"paper-images-with-colorized-{entity_name}",
        f"diffed-images-with-colorized-{entity_name}",
        f"{entity_name}-locations",
    )
    for directory_name in stage_directory_names:
        directories.register(directory_name)

    detect_command = make_detect_entities_command(entity_name, EntityExtractorType)
    locate_command = make_locate_entities_command(
        entity_name, DetectedEntityType, colorize_options, colorize_func)
    sequence: CommandList = [detect_command, locate_command]

    # Uploading is optional: without an 'upload_func' the sequence ends after locating.
    if upload_func is None:
        return sequence

    sequence.append(
        make_upload_entities_command(
            entity_name, upload_func, DetectedEntityType=DetectedEntityType))
    return sequence
Esempio n. 4
0
from common import directories
from common.colorize_tex import ColorizeOptions
from common.commands.base import CommandList
from entities.common import create_entity_localization_command_sequence
from scripts.pipelines import EntityPipeline, register_entity_pipeline

from .colorize import adjust_color_positions
from .extractor import SentenceExtractor
from .types import Sentence
from .upload import upload_sentences

# Colorization settings for sentences: color positions are adjusted by this package's
# own 'adjust_color_positions' helper.
_sentence_colorize_options = ColorizeOptions(
    adjust_color_positions=adjust_color_positions,
)

# Command sequence for the "sentences" entity: entities are found with 'SentenceExtractor',
# loaded as 'Sentence' objects, and uploaded with 'upload_sentences'.
commands = create_entity_localization_command_sequence(
    "sentences",
    SentenceExtractor,
    DetectedEntityType=Sentence,
    upload_func=upload_sentences,
    colorize_options=_sentence_colorize_options,
)

# Register the pipeline under the name "sentences".
sentences_pipeline = EntityPipeline("sentences", commands)
register_entity_pipeline(sentences_pipeline)
Esempio n. 5
0
# TeX wrapped around a symbol when its contexts are extracted.
_match_highlight_wrapper = TexWrapper(
    before=r"\htmlClass{match-highlight}{",
    after="}",
    braces=True,
)
_extract_symbol_contexts = make_extract_contexts_command(
    "symbols",
    EntityType=SerializableSymbol,
    entity_key=entity_key_for_contexts,
    tex_wrapper=_match_highlight_wrapper,
)

# Bounding boxes are located for "equation-tokens" rather than for "symbols".
_locate_equation_tokens = make_locate_entities_command(
    "equation-tokens",
    DetectedEntityType=SerializableToken,
    colorize_options=ColorizeOptions(
        adjust_color_positions=adjust_color_positions,
        braces=True,
    ),
)

# Commands of the symbols pipeline, listed in the order they appear in the sequence.
commands = [
    ExtractSymbols,
    FindSymbolMatches,
    _extract_symbol_contexts,
    _locate_equation_tokens,
    LocateSymbols,
    UploadSymbols,
]


def make_digest(_: str, arxiv_id: ArxivId) -> EntityProcessingDigest:
    """
    Custom digest creator for this pipeline.

    The first argument is ignored. The digest is always built for "equation-tokens" instead
    of 'symbols', because the default entity counters can be used on the outputs of the
    equation token commands.
    """
    counted_entity_name = "equation-tokens"
    return make_default_paper_digest(counted_entity_name, arxiv_id)


symbols_pipeline = EntityPipeline(
Esempio n. 6
0
from common.parse_tex import EquationExtractor
from common.types import CharacterRange, Equation, SerializableEntity
from entities.common import create_entity_localization_command_sequence
from scripts.pipelines import EntityPipeline, register_entity_pipeline

from .upload import upload_equations


def colorize_equation_when(entity: SerializableEntity) -> bool:
    """
    Decide whether an equation should be colorized: only equations with a 'depth' of 0
    qualify (presumably top-level, non-nested equations; confirm against 'Equation').
    """
    # The entity is treated as an 'Equation' so that its 'depth' can be read.
    return cast(Equation, entity).depth == 0


def adjust_color_positions(entity: SerializableEntity) -> CharacterRange:
    """
    Return the character range to colorize for an equation, running from its
    'content_start' to its 'content_end'.
    """
    # The entity is treated as an 'Equation' so that its content bounds can be read.
    as_equation = cast(Equation, entity)
    range_start, range_end = as_equation.content_start, as_equation.content_end
    return CharacterRange(range_start, range_end)


# Colorization settings for equations: 'colorize_equation_when' selects which equations
# are colorized, and 'adjust_color_positions' supplies the character range to color.
_equation_colorize_options = ColorizeOptions(
    when=colorize_equation_when,
    adjust_color_positions=adjust_color_positions,
)

# Command sequence for the "equations" entity: entities are found with 'EquationExtractor',
# 'Equation' is passed as the third positional argument, and uploads go through
# 'upload_equations'.
commands = create_entity_localization_command_sequence(
    "equations",
    EquationExtractor,
    Equation,
    upload_func=upload_equations,
    colorize_options=_equation_colorize_options,
)

# Register the pipeline under the name "equations".
equations_pipeline = EntityPipeline("equations", commands)
register_entity_pipeline(equations_pipeline)