            adjust_color_positions=adjust_color_positions,
            braces=True,
            when=filter_symbols_with_affixes,
            group=divide_symbols_into_nonoverlapping_groups,
        ),
    ),
    LocateCompositeSymbols,
    CollectSymbolLocations,
    make_upload_entities_command(
        "symbols", upload_symbols, DetectedEntityType=SerializableSymbol
    ),
]


def make_digest(_: str, arxiv_id: ArxivId) -> EntityProcessingDigest:
    """
    Custom digest creator. Count the equation tokens, instead of the 'symbols',
    as we can use the default entity counters for the outputs of equation token
    commands.
    """
    return make_default_paper_digest("equation-tokens", arxiv_id)


symbols_pipeline = EntityPipeline(
    "symbols",
    commands,
    depends_on=["equations"],
    optional_depends_on=["sentences"],
    make_digest=make_digest,
)
register_entity_pipeline(symbols_pipeline)
from common import directories
from common.colorize_tex import ColorizeOptions
from common.commands.base import CommandList
from entities.common import create_entity_localization_command_sequence
from scripts.pipelines import EntityPipeline, register_entity_pipeline

from .colorize import adjust_color_positions
from .extractor import SentenceExtractor
from .types import Sentence
from .upload import upload_sentences

commands = create_entity_localization_command_sequence(
    "sentences",
    SentenceExtractor,
    DetectedEntityType=Sentence,
    colorize_options=ColorizeOptions(adjust_color_positions=adjust_color_positions),
    upload_func=upload_sentences,
)

sentences_pipeline = EntityPipeline("sentences", commands)
register_entity_pipeline(sentences_pipeline)
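# The sentences pipeline above passes `adjust_color_positions` from
# `.colorize`, but that module isn't shown in this section. Below is a
# minimal, hypothetical sketch of what such a function could look like,
# modeled on the concrete equations example later in this section; the
# `start` and `end` fields are assumed to be the entity's character offsets.

from typing import cast

from common.types import CharacterRange, SerializableEntity

from .types import Sentence


def adjust_color_positions_sketch(entity: SerializableEntity) -> CharacterRange:
    sentence = cast(Sentence, entity)
    # Color the detected sentence span as-is; a real implementation might
    # shrink the range to avoid wrapping TeX markup at the boundaries.
    return CharacterRange(sentence.start, sentence.end)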
directories.register("detected-definitions") directories.register("sources-with-colorized-definitions") directories.register("compiled-sources-with-colorized-definitions") directories.register("paper-images-with-colorized-definitions") directories.register("diffed-images-with-colorized-definitions") directories.register("definitions-locations") upload_command = make_upload_entities_command( "definitions", upload_definitions, DetectedEntityType={ "entities-definiendums.csv": Definiendum, "entities-definitions.csv": Definition, "entities-term-references.csv": TermReference, }, ) commands: CommandList = [ EmbellishSentences, DetectDefinitions, make_locate_entities_command("definitions"), upload_command, ] definitions_pipeline = EntityPipeline( "definitions", commands, depends_on=["symbols", "sentences"], ) register_entity_pipeline(definitions_pipeline)
# NOTE: the non-relative imports and the first three relative imports below
# are reconstructed; their exact module paths are assumptions based on the
# sibling pipeline modules.
from common import directories
from common.commands.base import CommandList
from common.commands.locate_entities import make_locate_entities_command
from scripts.pipelines import EntityPipeline, register_entity_pipeline

from .colorize import colorize_citations
from .commands.extract_bibitems import ExtractBibitems
from .commands.locate_citations import LocateCitations
from .commands.resolve_bibitems import ResolveBibitems
from .commands.upload_citations import UploadCitations
from .make_digest import make_digest
from .types import Bibitem

directories.register("detected-citations")
directories.register("bibitem-resolutions")
directories.register("sources-with-colorized-citations")
directories.register("compiled-sources-with-colorized-citations")
directories.register("paper-images-with-colorized-citations")
directories.register("diffed-images-with-colorized-citations")
directories.register("citations-locations")
directories.register("citation-cluster-locations")
directories.register("sources-with-annotated-symbols")

commands: CommandList = [
    ExtractBibitems,
    ResolveBibitems,
    make_locate_entities_command(
        "citations", DetectedEntityType=Bibitem, colorize_func=colorize_citations
    ),
    LocateCitations,
    UploadCitations,
]

citations_pipeline = EntityPipeline("citations", commands, make_digest=make_digest)
register_entity_pipeline(citations_pipeline)
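# The citations pipeline imports a custom digest function from
# `.make_digest`, which isn't shown in this section. A plausible sketch, by
# analogy with the symbols pipeline's make_digest elsewhere in this section
# (all import paths here are assumptions):

from common.make_digest import make_default_paper_digest
from common.types import ArxivId, EntityProcessingDigest


def make_digest_sketch(_: str, arxiv_id: ArxivId) -> EntityProcessingDigest:
    # Count the detected citations with the default per-paper counters.
    return make_default_paper_digest("citations", arxiv_id)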
# NOTE: the non-relative imports below are reconstructed; their exact module
# paths are assumptions based on the sibling pipeline modules.
from common import directories
from common.commands.extract_contexts import make_extract_contexts_command
from common.types import ColorizeOptions, Term, TexWrapper
from entities.common import create_entity_localization_command_sequence
from scripts.pipelines import EntityPipeline, register_entity_pipeline

from .colorize import adjust_color_positions
from .extractor import GlossaryTermExtractor
from .upload import upload_terms

commands = create_entity_localization_command_sequence(
    "glossary-terms",
    GlossaryTermExtractor,
    DetectedEntityType=Term,
    colorize_options=ColorizeOptions(adjust_color_positions=adjust_color_positions),
    upload_func=upload_terms,
)

# Before uploading entities, extract the contexts that each term appeared in.
# Find the upload command so the context-extraction command can be inserted
# just before it (defaulting to the end of the list if it isn't found).
upload_command_index = len(commands)
for i, command in enumerate(commands):
    if command.get_name() == "upload-glossary-terms":
        upload_command_index = i

directories.register("contexts-for-glossary-terms")
commands.insert(
    upload_command_index,
    make_extract_contexts_command(
        "glossary-terms",
        EntityType=Term,
        tex_wrapper=TexWrapper(before="**", after="**"),
    ),
)

terms_pipeline = EntityPipeline("glossary-terms", commands)
register_entity_pipeline(terms_pipeline)
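# Plain-Python illustration of the TexWrapper above (assumed behavior): when
# contexts are extracted, each appearance of the term is wrapped in the
# `before`/`after` strings so it can be highlighted downstream.

snippet = "A neural network is trained on labeled data."
term = "neural network"
wrapped = snippet.replace(term, "**" + term + "**")
assert wrapped == "A **neural network** is trained on labeled data."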
from common.types import ColorizeOptions, Term
from entities.common import create_entity_localization_command_sequence
from scripts.pipelines import EntityPipeline, register_entity_pipeline

from .colorize import adjust_color_positions
from .extractor import GlossaryTermExtractor
from .upload import upload_terms

commands = create_entity_localization_command_sequence(
    "glossary-terms",
    GlossaryTermExtractor,
    extract_contexts=True,
    DetectedEntityType=Term,
    colorize_options=ColorizeOptions(adjust_color_positions=adjust_color_positions),
    upload_func=upload_terms,
)

terms_pipeline = EntityPipeline("glossary-terms", commands, depends_on=["sentences"])
register_entity_pipeline(terms_pipeline)
from common import directories
from common.types import SerializableEntity
from entities.common import create_entity_localization_command_sequence
from scripts.pipelines import EntityPipeline, register_entity_pipeline

from .colorize import get_term_color_positions
from .extractor import TermExtractor
from .types import Term
from .upload import upload_terms

commands = create_entity_localization_command_sequence(
    "terms",
    TermExtractor,
    DetectedEntityType=Term,
    get_color_positions=get_term_color_positions,
    upload_func=upload_terms,
)

terms_pipeline = EntityPipeline("terms", commands)
register_entity_pipeline(terms_pipeline)
from common import directories
from common.commands.base import CommandList
from entities.common import create_entity_localization_command_sequence
from scripts.pipelines import EntityPipeline, register_entity_pipeline

from .colorize import get_sentence_color_positions
from .extractor import SentenceExtractor
from .types import Sentence
from .upload import Sentence as SentenceModel
from .upload import upload_sentences

commands = create_entity_localization_command_sequence(
    "sentences",
    SentenceExtractor,
    DetectedEntityType=Sentence,
    get_color_positions=get_sentence_color_positions,
    upload_func=upload_sentences,
)

# Register additional directories to be used by the upload function.
directories.register("sentences-model-ids")

sentences_pipeline = EntityPipeline(
    "sentences", commands, database_models=[SentenceModel]
)
register_entity_pipeline(sentences_pipeline)
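# Only this version of the sentences pipeline passes `database_models`, and
# it aliases the upload module's Sentence to SentenceModel because the
# detected-entity dataclass shares the same name. An illustrative check of
# that distinction (assumed semantics, not repository code):

from .types import Sentence
from .upload import Sentence as SentenceModel

assert Sentence is not SentenceModel  # same name, two different classes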
from typing import cast

from common.parse_tex import EquationExtractor
from common.types import CharacterRange, ColorizeOptions, Equation, SerializableEntity
from entities.common import create_entity_localization_command_sequence
from scripts.pipelines import EntityPipeline, register_entity_pipeline

from .upload import upload_equations


def colorize_equation_when(entity: SerializableEntity) -> bool:
    equation = cast(Equation, entity)
    # Only colorize outermost equations; nested equations are covered by
    # their parents.
    return equation.depth == 0


def adjust_color_positions(entity: SerializableEntity) -> CharacterRange:
    equation = cast(Equation, entity)
    # Color only the equation's contents, not its delimiters.
    return CharacterRange(equation.content_start, equation.content_end)


commands = create_entity_localization_command_sequence(
    "equations",
    EquationExtractor,
    DetectedEntityType=Equation,
    colorize_options=ColorizeOptions(
        when=colorize_equation_when, adjust_color_positions=adjust_color_positions
    ),
    upload_func=upload_equations,
)

equations_pipeline = EntityPipeline("equations", commands)
register_entity_pipeline(equations_pipeline)
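# Worked illustration of adjust_color_positions above, assuming
# content_start/content_end are the offsets of the TeX between the math
# delimiters: coloring only that range keeps the injected color commands
# inside the equation environment.

tex = "$x + y$"
content_start, content_end = 1, len(tex) - 1  # offsets of "x + y"
assert tex[content_start:content_end] == "x + y"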
    FindSymbolMatches,
    make_find_entity_sentences_command("equation-tokens"),
    FindSymbolSentences,
    ColorizeEquationTokens,
    make_compile_tex_command("equation-tokens"),
    make_raster_pages_command("equation-tokens"),
    make_diff_images_command("equation-tokens"),
    make_locate_hues_command("equation-tokens"),
    LocateSymbols,
    UploadSymbols,
]


def make_digest(_: str, arxiv_id: ArxivId) -> EntityProcessingDigest:
    """
    Custom digest creator. Count the equation tokens, instead of the 'symbols',
    as we can use the default entity counters for the outputs of equation token
    commands.
    """
    return make_default_paper_digest("equation-tokens", arxiv_id)


symbols_pipeline = EntityPipeline(
    "symbols",
    commands,
    depends_on=["equations"],
    optional_depends_on=["sentences"],
    database_models=[MathMl, MathMlMatch, Symbol, SymbolChild, SymbolSentence],
    make_digest=make_digest,
)
register_entity_pipeline(symbols_pipeline)
from common import directories
from common.commands.base import CommandList
from entities.common import create_entity_localization_command_sequence
from scripts.pipelines import EntityPipeline, register_entity_pipeline

from .extractor import AbbreviationExtractor
from .types import Abbreviation

commands = create_entity_localization_command_sequence(
    "abbreviations", AbbreviationExtractor, DetectedEntityType=Abbreviation
)

abbreviations_pipeline = EntityPipeline("abbreviations", commands)
register_entity_pipeline(abbreviations_pipeline)
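# The abbreviations module above is the smallest example of the pattern
# shared by every file in this section. A hypothetical new entity type
# ("widgets", WidgetExtractor, and Widget are invented names) would be wired
# up the same way:

from entities.common import create_entity_localization_command_sequence
from scripts.pipelines import EntityPipeline, register_entity_pipeline

from .extractor import WidgetExtractor  # hypothetical extractor
from .types import Widget  # hypothetical entity dataclass

commands = create_entity_localization_command_sequence(
    "widgets", WidgetExtractor, DetectedEntityType=Widget
)

widgets_pipeline = EntityPipeline("widgets", commands)
register_entity_pipeline(widgets_pipeline)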