def load(self) -> Iterator[SymbolSentencesTask]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("sentences-for-symbols", arxiv_id)
        file_utils.clean_directory(output_dir)

        token_sentences_path = os.path.join(
            directories.arxiv_subdir("sentences-for-equation-tokens", arxiv_id),
            "entity_sentences.csv",
        )
        if not os.path.exists(token_sentences_path):
            logging.warning(  # pylint: disable=logging-not-lazy
                "Could not find links between sentences and equation tokens at "
                + "path %s for arXiv paper %s. Skipping the detection of symbol sentences.",
                token_sentences_path,
                arxiv_id,
            )
            continue

        token_sentence_pairs = list(
            file_utils.load_from_csv(token_sentences_path, EntitySentencePairIds)
        )

        symbols = file_utils.load_symbols(arxiv_id)
        if not symbols:
            continue

        # Filter to only those symbols for which tokens have been detected.
        symbols = [s for s in symbols if len(s.symbol.characters) > 0]

        yield SymbolSentencesTask(arxiv_id, symbols, token_sentence_pairs)

def load(self) -> Iterator[LocationTask]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("composite-symbols-locations", arxiv_id)
        file_utils.clean_directory(output_dir)

        token_locations = file_utils.load_equation_token_locations(arxiv_id)
        if token_locations is None:
            continue

        symbols_with_ids = file_utils.load_symbols(arxiv_id)
        if symbols_with_ids is None:
            continue

        for symbol_with_id in symbols_with_ids:
            # Symbols with affixes (e.g., arrows, hats) cannot be localized by taking the union
            # of their tokens' bounding boxes, because the bounding boxes of affix tokens
            # cannot be detected on their own.
            if not symbol_with_id.symbol.contains_affix:
                yield LocationTask(
                    arxiv_id=arxiv_id,
                    token_locations=token_locations,
                    symbol_with_id=symbol_with_id,
                )
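
# The comment in the loader above refers to localizing a composite symbol by taking the
# union of its tokens' bounding boxes. Below is a minimal sketch of that union operation,
# assuming all boxes lie on the same page and use the (page, left, top, width, height)
# fields that appear elsewhere in this pipeline. `union_boxes` is an illustrative helper,
# not part of the pipeline's API.
def union_boxes(boxes: List[BoundingBox]) -> BoundingBox:
    left = min(b.left for b in boxes)
    top = min(b.top for b in boxes)
    right = max(b.left + b.width for b in boxes)
    bottom = max(b.top + b.height for b in boxes)
    return BoundingBox(
        page=boxes[0].page, left=left, top=top, width=right - left, height=bottom - top
    )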

def load(self) -> Iterator[MathMLForPaper]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("symbol-matches", arxiv_id)
        file_utils.clean_directory(output_dir)

        symbols_with_ids = file_utils.load_symbols(arxiv_id)
        if symbols_with_ids is None:
            continue

        symbols_mathml = {swi.symbol.mathml for swi in symbols_with_ids}

        yield MathMLForPaper(arxiv_id=arxiv_id, mathml_equations=symbols_mathml)

def load(self) -> Iterator[TexAndSymbols]:
    for arxiv_id in self.arxiv_ids:
        output_root = directories.arxiv_subdir("sources-with-annotated-symbols", arxiv_id)
        file_utils.clean_directory(output_root)

        symbols_dir = directories.arxiv_subdir("detected-equation-tokens", arxiv_id)
        tokens_path = os.path.join(symbols_dir, "entities.csv")
        if not os.path.exists(tokens_path):
            logging.info("No equation token data found for paper %s. Skipping.", arxiv_id)
            continue

        symbols_with_ids = file_utils.load_symbols(arxiv_id)
        if symbols_with_ids is None:
            continue
        symbols = {swi.symbol_id: swi.symbol for swi in symbols_with_ids}

        tokens = file_utils.load_tokens(arxiv_id)
        if tokens is None:
            continue
        tex_paths = {t.tex_path for t in tokens}

        characters: Dict[CharacterId, Character] = {}
        for token in tokens:
            character_id = CharacterId(token.tex_path, token.equation_index, token.token_index)
            characters[character_id] = Character(
                token.text, token.token_index, token.start, token.end
            )

        # Load original sources for TeX files that need to be colorized.
        contents_by_file = {}
        for tex_path in tex_paths:
            absolute_tex_path = os.path.join(
                directories.arxiv_subdir("sources", arxiv_id), tex_path
            )
            file_contents = file_utils.read_file_tolerant(absolute_tex_path)
            if file_contents is not None:
                contents_by_file[tex_path] = file_contents

        yield TexAndSymbols(arxiv_id, contents_by_file, symbols, characters)

def load(self) -> Iterator[Task]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("sentence-tokens", arxiv_id)
        file_utils.clean_directory(output_dir)

        # Load symbols, for use in embellishing equations.
        symbols: Dict[str, List[Symbol]] = defaultdict(list)
        symbol_data = file_utils.load_symbols(arxiv_id)
        if symbol_data is not None:
            for id_, symbol in symbol_data:
                symbols[id_.tex_path].append(symbol)
        else:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No symbol data found for arXiv ID %s. It will not be "
                + "possible to expand equations in sentences with symbol data. This should only "
                + "be a problem if it's expected that there are no symbols in paper %s.",
                arxiv_id,
                arxiv_id,
            )

        # Load sentences.
        detected_sentences_path = os.path.join(
            directories.arxiv_subdir("detected-sentences", arxiv_id),
            "entities.csv",
        )
        if not os.path.exists(detected_sentences_path):
            logging.warning(  # pylint: disable=logging-not-lazy
                "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                + "this time enabling the processing of sentences. If that doesn't work, "
                + "there is likely an error in detecting sentences for this paper.",
                arxiv_id,
            )
            continue

        sentences = file_utils.load_from_csv(detected_sentences_path, Sentence)

        for sentence in sentences:
            yield Task(arxiv_id, sentence, symbols[sentence.tex_path])

def load(self) -> Iterator[LocationTask]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("symbol-locations", arxiv_id)
        file_utils.clean_directory(output_dir)

        token_locations = file_utils.load_equation_token_locations(arxiv_id)
        if token_locations is None:
            continue

        symbols_with_ids = file_utils.load_symbols(arxiv_id)
        if symbols_with_ids is None:
            continue

        for symbol_with_id in symbols_with_ids:
            yield LocationTask(
                arxiv_id=arxiv_id,
                token_locations=token_locations,
                symbol_with_id=symbol_with_id,
            )

def load(self) -> Iterator[Task]:
    for arxiv_id in self.arxiv_ids:
        output_dir = directories.arxiv_subdir("embellished-sentences", arxiv_id)
        file_utils.clean_directory(output_dir)

        # Load equation data.
        equations: Equations = {}
        equations_path = os.path.join(
            directories.arxiv_subdir("detected-equations", arxiv_id), "entities.csv"
        )
        try:
            equation_data = file_utils.load_from_csv(equations_path, Equation)
            for equation in equation_data:
                equations[(equation.tex_path, int(equation.id_))] = equation
        except FileNotFoundError:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No equation data found for arXiv ID %s. It will not be "
                + "possible to expand equations in sentences with equation data. This should only "
                + "be a problem if it's expected that there are no equations in paper %s.",
                arxiv_id,
                arxiv_id,
            )

        # Load symbols, for use in embellishing equations.
        symbols: Symbols = defaultdict(list)
        symbol_data = file_utils.load_symbols(arxiv_id)
        if symbol_data is not None:
            for id_, symbol in symbol_data:
                symbols[(id_.tex_path, id_.equation_index)].append(symbol)
        else:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No symbol data found for arXiv ID %s. It will not be "
                + "possible to expand equations in sentences with symbol data. This should only "
                + "be a problem if it's expected that there are no symbols in paper %s.",
                arxiv_id,
                arxiv_id,
            )

        # Load sentences.
        detected_sentences_path = os.path.join(
            directories.arxiv_subdir("detected-sentences", arxiv_id),
            "entities.csv",
        )
        try:
            sentences = file_utils.load_from_csv(detected_sentences_path, Sentence)
        except FileNotFoundError:
            logging.warning(  # pylint: disable=logging-not-lazy
                "No sentences data found for arXiv paper %s. Try re-running the pipeline, "
                + "this time enabling the processing of sentences. If that doesn't work, "
                + "there is likely an error in detecting sentences for this paper.",
                arxiv_id,
            )
            continue

        for sentence in sentences:
            yield Task(arxiv_id, sentence, equations, symbols)

def load(self) -> Iterator[SymbolData]:
    for arxiv_id in self.arxiv_ids:
        s2_id = get_s2_id(arxiv_id)
        if s2_id is None:
            continue

        symbols_with_ids = file_utils.load_symbols(arxiv_id)
        if symbols_with_ids is None:
            continue
        symbols_by_id = {s.symbol_id: s.symbol for s in symbols_with_ids}

        boxes: Dict[SymbolId, BoundingBox] = {}
        boxes_path = os.path.join(
            directories.arxiv_subdir("symbol-locations", arxiv_id),
            "symbol_locations.csv",
        )
        if not os.path.exists(boxes_path):
            logging.warning(
                "Could not find bounding boxes information for %s. Skipping",
                arxiv_id,
            )
            continue
        for location in file_utils.load_from_csv(boxes_path, SymbolLocation):
            symbol_id = SymbolId(
                tex_path=location.tex_path,
                equation_index=location.equation_index,
                symbol_index=location.symbol_index,
            )
            box = BoundingBox(
                page=int(location.page),
                left=location.left,
                top=location.top,
                width=location.width,
                height=location.height,
            )
            boxes[symbol_id] = box

        matches: Matches = {}
        matches_path = os.path.join(
            directories.arxiv_subdir("symbol-matches", arxiv_id), "matches.csv"
        )
        if not os.path.exists(matches_path):
            logging.warning(
                "Could not find symbol matches information for %s. Skipping",
                arxiv_id,
            )
            continue
        for match in file_utils.load_from_csv(matches_path, Match):
            if match.queried_mathml not in matches:
                matches[match.queried_mathml] = []
            matches[match.queried_mathml].append(match)

        context_data_missing = False
        contexts_path = os.path.join(
            directories.arxiv_subdir("contexts-for-symbols", arxiv_id),
            "contexts.csv",
        )
        if not os.path.exists(contexts_path):
            logging.warning(  # pylint: disable=logging-not-lazy
                "Contexts have not been found for symbols for arXiv paper %s. "
                + "Symbol data will be uploaded without contexts.",
                arxiv_id,
            )
            context_data_missing = True

        symbol_contexts = {}
        mathml_contexts = defaultdict(list)
        if not context_data_missing:
            for context in file_utils.load_from_csv(contexts_path, Context):
                tex_path = context.tex_path
                equation_index, symbol_index = [
                    int(t) for t in context.entity_id.split("-")
                ]
                symbol_id = SymbolId(tex_path, equation_index, symbol_index)
                symbol_contexts[symbol_id] = context
                symbol = symbols_by_id[symbol_id]
                mathml_contexts[symbol.mathml].append(context)

        symbol_formulas = {}
        mathml_formulas = defaultdict(set)
        for id_, symbol in symbols_by_id.items():
            if (
                symbol.is_definition
                and symbol.equation is not None
                and symbol.relative_start is not None
                and symbol.relative_end is not None
            ):
                highlighted = wrap_span(
                    symbol.equation,
                    symbol.relative_start,
                    symbol.relative_end,
                    before=r"\htmlClass{match-highlight}{",
                    after="}",
                    braces=True,
                )
                formula = DefiningFormula(
                    tex=highlighted,
                    tex_path=id_.tex_path,
                    equation_id=id_.equation_index,
                )
                symbol_formulas[id_] = formula
                mathml_formulas[symbol.mathml].add(formula)

        yield SymbolData(
            arxiv_id,
            s2_id,
            symbols_with_ids,
            boxes,
            symbol_contexts,
            symbol_formulas,
            mathml_contexts,
            mathml_formulas,
            matches,
        )
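
# The `wrap_span` call above highlights the defining span of an equation's TeX. Below is
# a minimal sketch of that wrapping behavior, assuming `before` and `after` are inserted
# around the [start, end) span and `braces=True` wraps the result in a TeX group. This
# stand-in is illustrative only; it is not the pipeline's implementation of `wrap_span`.
def wrap_span_sketch(tex: str, start: int, end: int, before: str, after: str) -> str:
    return "{" + tex[:start] + before + tex[start:end] + after + tex[end:] + "}"

# For example, highlighting the "x" in "x = 1" would yield:
#   wrap_span_sketch("x = 1", 0, 1, r"\htmlClass{match-highlight}{", "}")
#   == r"{\htmlClass{match-highlight}{x} = 1}"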

def load(self) -> Iterator[SymbolData]:
    for arxiv_id in self.arxiv_ids:
        s2_id = get_s2_id(arxiv_id)
        if s2_id is None:
            continue

        symbols_with_ids = file_utils.load_symbols(arxiv_id)
        if symbols_with_ids is None:
            continue

        boxes: Dict[SymbolId, BoundingBox] = {}
        boxes_path = os.path.join(
            directories.arxiv_subdir("symbol-locations", arxiv_id),
            "symbol_locations.csv",
        )
        if not os.path.exists(boxes_path):
            logging.warning(
                "Could not find bounding boxes information for %s. Skipping",
                arxiv_id,
            )
            continue
        for location in file_utils.load_from_csv(boxes_path, SymbolLocation):
            symbol_id = SymbolId(
                tex_path=location.tex_path,
                equation_index=location.equation_index,
                symbol_index=location.symbol_index,
            )
            box = BoundingBox(
                page=int(location.page),
                left=location.left,
                top=location.top,
                width=location.width,
                height=location.height,
            )
            boxes[symbol_id] = box

        matches: Matches = {}
        matches_path = os.path.join(
            directories.arxiv_subdir("symbol-matches", arxiv_id), "matches.csv"
        )
        if not os.path.exists(matches_path):
            logging.warning(
                "Could not find symbol matches information for %s. Skipping",
                arxiv_id,
            )
            continue
        for match in file_utils.load_from_csv(matches_path, Match):
            if match.queried_mathml not in matches:
                matches[match.queried_mathml] = []
            matches[match.queried_mathml].append(match)

        # Initialize the map from symbols to sentences before checking whether sentence
        # data exists, so that it is defined even when that data is missing.
        symbol_sentences: Dict[SymbolId, SentenceKey] = {}
        sentences_path = os.path.join(
            directories.arxiv_subdir("sentences-for-symbols", arxiv_id),
            "entity_sentences.csv",
        )
        if not os.path.exists(sentences_path):
            logging.warning(  # pylint: disable=logging-not-lazy
                "Symbols for arXiv paper %s have not been aligned to sentences. "
                + "Symbol data will be uploaded without links to sentences.",
                arxiv_id,
            )
        else:
            for pair in file_utils.load_from_csv(sentences_path, EntitySentencePairIds):
                # Symbol entity IDs have the form "<equation-index>-<symbol-index>".
                equation_index, symbol_index = [
                    int(t) for t in pair.entity_id.split("-")
                ]
                sentence_key = SentenceKey(pair.tex_path, pair.sentence_id)
                symbol_id = SymbolId(pair.tex_path, equation_index, symbol_index)
                symbol_sentences[symbol_id] = sentence_key

        yield SymbolData(
            arxiv_id,
            s2_id,
            symbols_with_ids,
            boxes,
            symbol_sentences,
            matches,
        )

def upload_symbol_definitions(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:
    """Upload symbols and their definitions."""

    # Associate definitions with symbols as follows:
    # Definitions will be associated with entire equations as per the current implementation
    # of the definition detector. Conservatively, associate a definition for an equation
    # with a single symbol only if that symbol is the *only* top-level symbol in that equation.

    # Load symbols from files. Group symbols by equation to make it easy to detect whether a
    # symbol is the only top-level symbol in the equation.
    symbols_by_equation: Dict[
        Tuple[TexPath, EquationIndex], List[Symbol]
    ] = defaultdict(list)
    symbols: List[Symbol] = []

    symbols_with_ids = file_utils.load_symbols(processing_summary.arxiv_id)
    if symbols_with_ids is None:
        logging.info(  # pylint: disable=logging-not-lazy
            "No symbols were loaded for paper %s. Therefore, no definitions for symbols "
            + "will be uploaded for this paper.",
            processing_summary.arxiv_id,
        )
        return

    for _, symbol in symbols_with_ids:
        symbols_by_equation[symbol.tex_path, symbol.equation_index].append(symbol)
        symbols.append(symbol)

    # Group symbols by their MathML. These groups will be used to propagate definitions from
    # one defined symbol to all other appearances of that symbol.
    symbols_by_mathml: Dict[MathML, List[Symbol]] = defaultdict(list)
    for symbol in symbols:
        symbols_by_mathml[symbol.mathml].append(symbol)

    # Construct a map from definitions to the sentences that contain them.
    contexts_by_definition: Dict[EntityId, Context] = {}
    for entity_summary in processing_summary.entities:
        entity_id = entity_summary.entity.id_
        context = entity_summary.context
        if entity_id.startswith("definition") and context is not None:
            contexts_by_definition[entity_id] = context

    # Fetch rows for all entities for this paper that have already been uploaded to the
    # database. This allows lookup of the row IDs for the sentences that contain
    # definitions of symbols.
    entity_models = fetch_entity_models(processing_summary.s2_id, data_version)

    # Create a list of rows to insert into the database containing definition data.
    entity_data_models: List[EntityDataModel] = []
    for entity_summary in processing_summary.entities:
        entity = entity_summary.entity
        if not entity.id_.startswith("definiendum"):
            continue

        # Attempt to match definienda (defined terms) to the symbols they define.
        definiendum = cast(Definiendum, entity)
        defined_symbol = None
        for symbol in symbols:

            # Is the definiendum a symbol?
            if definiendum.type_ != "symbol":
                continue

            # Does the symbol fall within the range of characters being defined?
            if symbol.start < definiendum.start or symbol.end > definiendum.end:
                continue

            # Is the symbol a top-level symbol (i.e., one without a parent)?
            if symbol.parent is not None:
                continue

            # Is it the *only* top-level symbol in its equation?
            top_level_symbols_in_equation = filter(
                lambda s: s.parent is None,
                symbols_by_equation[(symbol.tex_path, symbol.equation_index)],
            )
            if len(list(top_level_symbols_in_equation)) > 1:
                continue

            defined_symbol = symbol
            logging.debug(  # pylint: disable=logging-not-lazy
                "Matched definiendum %s at position (%d, %d) to symbol %s at position "
                + "(%s, %s) for paper %s. A definition for this symbol will be uploaded.",
                definiendum.tex,
                definiendum.start,
                definiendum.end,
                symbol.tex,
                symbol.start,
                symbol.end,
                processing_summary.arxiv_id,
            )
            break

        if defined_symbol is None:
            continue

        # Assemble data about definitions for the symbol.
        definitions = definiendum.definitions
        definition_texs = definiendum.definition_texs
        sources = definiendum.sources

        definition_sentence_ids: List[Optional[str]] = []
        for definition_id in definiendum.definition_ids:
            context = contexts_by_definition.get(definition_id)
            if context is None:
                definition_sentence_ids.append(None)
            else:
                definition_sentence_ids.append(
                    f"{context.tex_path}-{context.sentence_id}"
                )

        # Find all symbols that are the same (i.e., that have the same MathML
        # representation). Then save definition data so that it applies to all of
        # those symbols.
        matching_symbols = symbols_by_mathml.get(defined_symbol.mathml)
        if matching_symbols is not None:
            for s in matching_symbols:
                entity_model = entity_models.get(("symbol", sid(s)))
                data: EntityData = {
                    "definitions": definitions,
                    "definition_texs": definition_texs,
                    "sources": sources,
                }
                entity_data_models.extend(make_data_models(None, entity_model, data))

                relationships: EntityRelationships = {
                    "definition_sentences": [
                        EntityReference(type_="sentence", id_=id_)
                        for id_ in definition_sentence_ids
                    ],
                }
                entity_data_models.extend(
                    make_relationship_models(
                        ("symbol", sid(s)), relationships, entity_models
                    )
                )

    with output_database.atomic():
        EntityDataModel.bulk_create(entity_data_models, 200)
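
# A self-contained sketch of the matching criteria used above, built on a hypothetical
# minimal stand-in for Symbol (names here are illustrative, not the pipeline's API).
# It shows why a definition attaches only when a symbol is the sole top-level symbol
# of its equation, and how that definition then propagates by shared MathML.
from dataclasses import dataclass
from typing import List, Optional


@dataclass(frozen=True)
class ToySymbol:
    mathml: str
    parent: Optional["ToySymbol"] = None


def is_sole_top_level(symbol: ToySymbol, equation_symbols: List[ToySymbol]) -> bool:
    # Top-level symbols are those without a parent. A definition is attached only if
    # the candidate is top-level and no other top-level symbol shares its equation.
    top_level = [s for s in equation_symbols if s.parent is None]
    return symbol.parent is None and len(top_level) == 1


x = ToySymbol("<mi>x</mi>")
x_sub_i = ToySymbol("<mi>i</mi>", parent=x)
assert is_sole_top_level(x, [x, x_sub_i])  # "x_i": "x" is the lone top-level symbol.
assert not is_sole_top_level(x, [x, ToySymbol("<mi>y</mi>")])  # "x, y": ambiguous.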