Esempio n. 1
0
def create_entity_localization_command_sequence(
    entity_name: str,
    EntityExtractorType: Type[EntityExtractor],
    extract_contexts: bool = False,
    DetectedEntityType: Optional[Type[SerializableEntity]] = None,
    upload_func: Optional[EntityUploadCallable] = None,
    colorize_options: ColorizeOptions = ColorizeOptions(),
    colorize_func: Optional[ColorizeFunc] = None,
) -> List[Type[Command]]:  # type: ignore
    """
    Build the ordered list of commands used to locate a new type of entity.

    In the simplest case, provide only an 'entity_name', which is used to name the output
    directories, and an 'EntityExtractorType' that locates all instances of the entity in the
    TeX. The returned commands detect the entities and then locate them (colorizing the
    entities, compiling the LaTeX, rastering the pages, and finding the colors in the pages).
    The remaining parameters fine-tune those commands.

    Args:
        entity_name: name embedded in every registered directory name and passed to each
            command factory.
        EntityExtractorType: extractor class handed to the entity detection command.
        extract_contexts: when True, also add a command that extracts the contexts of the
            entities (i.e., the sentences in which the entities appear).
        DetectedEntityType: forwarded unchanged to the locate command and the upload command.
        upload_func: when not None, an upload command built around this callable is appended
            as the last command.
        colorize_options: forwarded unchanged to the locate command.
        colorize_func: forwarded unchanged to the locate command.

    Returns:
        The commands in the order: detect, (optional) extract contexts, locate,
        (optional) upload.

    If you are trying to find the locations of a new type of entity, it is highly recommended
    that you use this convenience function instead of creating new commands yourself.
    """

    pipeline_commands: CommandList = []

    # Step 1: detection of the entities.
    directories.register(f"detected-{entity_name}")
    detect_command = make_detect_entities_command(entity_name, EntityExtractorType)
    pipeline_commands.append(detect_command)

    # Step 2 (optional): extraction of the contexts the entities appear in.
    if extract_contexts:
        directories.register(f"contexts-for-{entity_name}")
        context_command = make_extract_contexts_command(entity_name)
        pipeline_commands.append(context_command)

    # Step 3: localization. The directories are registered in this order before the
    # locate command is created.
    for directory_name in (
        f"sources-with-colorized-{entity_name}",
        f"compiled-sources-with-colorized-{entity_name}",
        f"paper-images-with-colorized-{entity_name}",
        f"diffed-images-with-colorized-{entity_name}",
        f"{entity_name}-locations",
    ):
        directories.register(directory_name)
    locate_command = make_locate_entities_command(
        entity_name,
        None,
        DetectedEntityType,
        colorize_options,
        colorize_func,
    )
    pipeline_commands.append(locate_command)

    # Step 4 (optional): upload. Without an upload function the sequence is complete.
    if upload_func is None:
        return pipeline_commands

    pipeline_commands.append(
        make_upload_entities_command(
            entity_name, upload_func, DetectedEntityType=DetectedEntityType
        )
    )
    return pipeline_commands
Esempio n. 2
0
# Commands for this pipeline, in list order.
commands = [
    ExtractSymbols,
    FindSymbolMatches,
    # Context extraction for "symbols". The TeX wrapper supplies the text placed before
    # (an opening \htmlClass{match-highlight}{ ) and after (a closing brace) something in
    # the context -- presumably each matching symbol; confirm against TexWrapper's usage.
    make_extract_contexts_command(
        "symbols",
        EntityType=SerializableSymbol,
        entity_key=entity_key_for_contexts,
        tex_wrapper=TexWrapper(before=r"\htmlClass{match-highlight}{",
                               after="}",
                               braces=True),
    ),
    # Localization of "equation-tokens", with detected entities typed as SerializableToken.
    # NOTE(review): 'when' looks like a per-entity filter deciding which tokens get
    # colorized -- confirm against ColorizeOptions.
    make_locate_entities_command(
        "equation-tokens",
        DetectedEntityType=SerializableToken,
        colorize_options=ColorizeOptions(
            adjust_color_positions=adjust_color_positions,
            braces=True,
            when=filter_atom_tokens,
        ),
    ),
    # Localization of "symbols-with-affixes", using the "symbols" entities as its input.
    # NOTE(review): 'group' presumably splits the symbols into batches that are colorized
    # separately so that overlapping symbols do not collide -- confirm against ColorizeOptions.
    make_locate_entities_command(
        "symbols-with-affixes",
        input_entity_name="symbols",
        DetectedEntityType=SerializableSymbol,
        colorize_options=ColorizeOptions(
            adjust_color_positions=adjust_color_positions,
            braces=True,
            when=filter_symbols_with_affixes,
            group=divide_symbols_into_nonoverlapping_groups,
        ),
    ),
Esempio n. 3
0
from .commands.locate_citations import LocateCitations
from .commands.resolve_bibitems import ResolveBibitems
from .commands.upload_citations import UploadCitations
from .make_digest import make_digest
from .types import Bibitem

# Register, in this order, every data directory named by the citations pipeline.
# NOTE(review): "sources-with-annotated-symbols" is registered here although nothing else
# in this block refers to symbols -- confirm it is needed by this pipeline.
for _directory_name in (
    "detected-citations",
    "bibitem-resolutions",
    "sources-with-colorized-citations",
    "compiled-sources-with-colorized-citations",
    "paper-images-with-colorized-citations",
    "diffed-images-with-colorized-citations",
    "citations-locations",
    "citation-cluster-locations",
    "sources-with-annotated-symbols",
):
    directories.register(_directory_name)
del _directory_name  # do not leave the loop variable behind in the module namespace

# Commands of the citations pipeline, in list order.
commands: CommandList = [
    ExtractBibitems,
    ResolveBibitems,
    # Generic localization command for "citations", using a custom colorize function.
    make_locate_entities_command(
        "citations",
        DetectedEntityType=Bibitem,
        colorize_func=colorize_citations,
    ),
    LocateCitations,
    UploadCitations,
]

# Bundle the commands into a pipeline with a custom digest function and register it.
citations_pipeline = EntityPipeline("citations", commands, make_digest=make_digest)
register_entity_pipeline(citations_pipeline)
Esempio n. 4
0
# Register, in this order, every data directory named by the definitions pipeline.
for _directory_name in (
    "detected-definitions",
    "sources-with-colorized-definitions",
    "compiled-sources-with-colorized-definitions",
    "paper-images-with-colorized-definitions",
    "diffed-images-with-colorized-definitions",
    "definitions-locations",
):
    directories.register(_directory_name)
del _directory_name  # do not leave the loop variable behind in the module namespace

# Upload command for "definitions". Here DetectedEntityType is a mapping from CSV file
# names to entity types rather than a single type.
upload_command = make_upload_entities_command(
    "definitions",
    upload_definitions,
    DetectedEntityType={
        "entities-definiendums.csv": Definiendum,
        "entities-definitions.csv": Definition,
        "entities-term-references.csv": TermReference,
    },
)

# Commands of the definitions pipeline, in list order.
commands: CommandList = [
    EmbellishSentences,
    DetectDefinitions,
    make_locate_entities_command("definitions"),
    upload_command,
]

# The pipeline declares dependencies on the "symbols" and "sentences" pipelines.
definitions_pipeline = EntityPipeline(
    "definitions", commands, depends_on=["symbols", "sentences"]
)
register_entity_pipeline(definitions_pipeline)
Esempio n. 5
0
    type_: str


def exclude_symbols(entity: SerializableEntity) -> bool:
    """
    Return False only for definiendum / term entities whose type is "symbol".

    Entities whose ID does not start with "definiendum" or "term" always pass (True).
    For the others, the entity is treated as an EntityWithType and passes only when its
    'type_' is something other than "symbol".
    """
    has_typed_id = entity.id_.startswith(("definiendum", "term"))
    if not has_typed_id:
        return True

    # 'cast' has no runtime effect; it only tells the type checker 'type_' is available.
    typed_entity = cast(EntityWithType, entity)
    return typed_entity.type_ != "symbol"


# Commands of the definitions pipeline, in list order.
commands: CommandList = [
    TokenizeSentences,
    CreateAnnotationFiles,
    DetectDefinitions,
    make_extract_contexts_command(entity_name="definitions"),
    make_locate_entities_command(
        "definitions",
        DetectedEntityType=EntityWithType,
        # Do not locate terms that are symbols, because these will already be detected more
        # robustly by the dedicated commands for symbol localization.
        colorize_options=ColorizeOptions(when=exclude_symbols),
    ),
    # 'upload_command' is created elsewhere in this module, before this list.
    upload_command,
]

# The pipeline declares dependencies on the "symbols" and "sentences" pipelines.
definitions_pipeline = EntityPipeline(
    "definitions",
    commands,
    depends_on=["symbols", "sentences"],
)
register_entity_pipeline(definitions_pipeline)
Esempio n. 6
0
from .commands.upload_citations import UploadCitations
from .make_digest import make_digest
from .types import Bibitem

# Register, in this order, every data directory named by the citations pipeline.
# NOTE(review): "sources-with-annotated-symbols" is registered here although nothing else
# in this block refers to symbols -- confirm it is needed by this pipeline.
for _directory_name in (
    "detected-citations",
    "bibitem-resolutions",
    "sources-with-colorized-citation-fragments",
    "compiled-sources-with-colorized-citation-fragments",
    "paper-images-with-colorized-citation-fragments",
    "diffed-images-with-colorized-citation-fragments",
    "citation-fragments-locations",
    "citations-locations",
    "sources-with-annotated-symbols",
):
    directories.register(_directory_name)
del _directory_name  # do not leave the loop variable behind in the module namespace


# Commands of the citations pipeline, in list order.
commands: CommandList = [
    ExtractBibitems,
    ResolveBibitems,
    # Localization of "citation-fragments", taking the "citations" entities as its input
    # and using a custom colorize function.
    make_locate_entities_command(
        colorize_func=colorize_citations,
        DetectedEntityType=Bibitem,
        input_entity_name="citations",
        entity_name="citation-fragments",
    ),
    LocateCitations,
    UploadCitations,
]

# Bundle the commands into a pipeline with a custom digest function and register it.
citations_pipeline = EntityPipeline(
    "citations",
    commands,
    make_digest=make_digest,
)
register_entity_pipeline(citations_pipeline)
Esempio n. 7
0

# Commands of the symbols pipeline, in list order.
commands = [
    ExtractSymbols,
    FindSymbolMatches,
    # Context extraction for "symbols". The TeX wrapper supplies the text placed before
    # (an opening \htmlClass{match-highlight}{ ) and after (a closing brace) something in
    # the context -- presumably each matching symbol; confirm against TexWrapper's usage.
    make_extract_contexts_command(
        "symbols",
        EntityType=SerializableSymbol,
        entity_key=entity_key_for_contexts,
        tex_wrapper=TexWrapper(before=r"\htmlClass{match-highlight}{",
                               after="}",
                               braces=True),
    ),
    # Localization of "equation-tokens", with detected entities typed as SerializableToken.
    make_locate_entities_command(
        "equation-tokens",
        DetectedEntityType=SerializableToken,
        colorize_options=ColorizeOptions(
            adjust_color_positions=adjust_color_positions, braces=True),
    ),
    LocateSymbols,
    UploadSymbols,
]


def make_digest(_: str, arxiv_id: ArxivId) -> EntityProcessingDigest:
    """
    Create the processing digest for this pipeline from the equation-token outputs.

    The first argument is ignored. The digest is always the default paper digest for
    'equation-tokens' rather than for 'symbols', because the default entity counters can be
    used on the outputs of the equation token commands.
    """
    counted_entity_name = "equation-tokens"
    return make_default_paper_digest(counted_entity_name, arxiv_id)