def save(self, item: SymbolData, _: None) -> None:
    """Assemble and upload one entity record per extracted symbol.

    For each symbol in 'item', gathers its bounding box, context sentences
    (snippets where symbols with the same MathML appear), defining formulas,
    and parent/child relationships, then uploads all records at once via
    'upload_entities'.
    """
    symbols_with_ids = item.symbols_with_ids
    boxes = item.boxes
    matches = item.matches
    symbol_contexts = item.symbol_contexts
    mathml_contexts = item.mathml_contexts
    mathml_formulas = item.mathml_formulas

    # Map each symbol object (by identity) to its ID so that children, which
    # are stored as object references, can be resolved to ID strings below.
    symbol_ids_by_symbol_object_ids = {}
    for symbol_with_id in symbols_with_ids:
        symbol_ids_by_symbol_object_ids[
            id(symbol_with_id.symbol)
        ] = symbol_with_id.symbol_id

    def create_symbol_id_string(sid: SymbolId) -> str:
        # Canonical string form of a symbol ID, used for cross-entity references.
        # (Was a loop-local lambda assigned to a name; hoisted and made a 'def'.)
        return f"{sid.tex_path}-{sid.equation_index}-{sid.symbol_index}"

    entity_infos = []
    for symbol_with_id in symbols_with_ids:
        symbol = symbol_with_id.symbol
        # TODO(andrewhead): move this filtering condition into 'parse_equation'
        if symbol.tex in ["$|$", "|"]:
            continue
        symbol_id = symbol_with_id.symbol_id

        # Get context and formula of the symbol, and other matching ones.
        context = symbol_contexts.get(symbol_id)
        matching_contexts = mathml_contexts.get(symbol.mathml, [])
        other_context_texs = []
        other_context_sentence_ids = []
        for c in matching_contexts:
            matching_sentence_id = f"{c.tex_path}-{c.sentence_id}"
            if matching_sentence_id not in other_context_sentence_ids:
                other_context_texs.append(c.snippet)
                other_context_sentence_ids.append(matching_sentence_id)

        matching_formulas = mathml_formulas.get(symbol.mathml, [])
        other_formula_texs = []
        other_formula_ids = []
        for f in matching_formulas:
            equation_id = f"{f.tex_path}-{f.equation_id}"
            if equation_id not in other_formula_ids:
                other_formula_texs.append(f.tex)
                other_formula_ids.append(equation_id)

        # Symbols without a detected bounding box cannot be shown; skip them.
        box = boxes.get(symbol_id)
        if box is None:
            continue

        data: EntityData = {
            "tex": f"${symbol.tex}$",
            "tex_start": symbol.start,
            "tex_end": symbol.end,
            "mathml": symbol.mathml,
            # NOTE(review): 'matches' is indexed directly; this assumes every
            # symbol's MathML has an entry in 'matches' --- confirm upstream.
            "mathml_near_matches": [
                m.matching_mathml for m in matches[symbol.mathml]
            ],
            "snippets": other_context_texs,
            "defining_formulas": other_formula_texs,
            "is_definition": symbol.is_definition or False,
        }

        sentence_id = (
            f"{context.tex_path}-{context.sentence_id}"
            if context is not None
            else None
        )

        # Find this symbol's parent: any symbol that lists it as a child.
        # If several symbols do, the last match wins (preserving the original
        # no-break behavior). A membership test replaces the original
        # try/except around '.index()'; both compare with '=='.
        parent_id: Optional[str] = None
        for other_symbol_with_id in symbols_with_ids:
            if symbol in other_symbol_with_id.symbol.children:
                parent_id = create_symbol_id_string(other_symbol_with_id.symbol_id)

        child_ids = []
        for child_symbol in symbol.children:
            child_symbol_id = symbol_ids_by_symbol_object_ids[id(child_symbol)]
            child_ids.append(create_symbol_id_string(child_symbol_id))

        relationships: EntityRelationships = {
            "equation": EntityReference(
                type_="equation",
                id_=f"{symbol_id.tex_path}-{symbol_id.equation_index}",
            ),
            "parent": EntityReference(type_="symbol", id_=parent_id),
            "children": [
                EntityReference(type_="symbol", id_=id_) for id_ in child_ids
            ],
            # Both branches of the original conditional built the same
            # reference, so build it unconditionally ('sentence_id' may be None).
            "sentence": EntityReference(type_="sentence", id_=sentence_id),
            "defining_formula_equations": [
                EntityReference(type_="equation", id_=id_)
                for id_ in other_formula_ids
            ],
            "snippet_sentences": [
                EntityReference(type_="sentence", id_=id_)
                for id_ in other_context_sentence_ids
            ],
        }

        entity_information = EntityInformation(
            id_=create_symbol_id_string(symbol_id),
            type_="symbol",
            bounding_boxes=[box],
            data=data,
            relationships=relationships,
        )
        entity_infos.append(entity_information)

    upload_entities(item.s2_id, item.arxiv_id, entity_infos, self.args.data_version)
def upload_symbols(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:
    """Upload symbols for a paper, with contexts, formulas, and relationships.

    Loads auxiliary data (MathML matches, parent-child relationships, sentence
    contexts) from the processing directories on disk, assembles one upload
    record per localized symbol, and uploads all records at once.
    """
    arxiv_id = processing_summary.arxiv_id
    entities = [es.entity for es in processing_summary.entities]
    symbols = cast(List[SerializableSymbol], entities)
    symbols_by_id = {sid(s): s for s in symbols}

    # Load MathML matches for partial matching of symbols.
    matches: Matches = {}
    matches_path = os.path.join(
        directories.arxiv_subdir("symbol-matches", processing_summary.arxiv_id),
        "matches.csv",
    )
    if os.path.exists(matches_path):
        for match in file_utils.load_from_csv(matches_path, Match):
            if match.queried_mathml not in matches:
                matches[match.queried_mathml] = []
            matches[match.queried_mathml].append(match)
    else:
        logging.warning(
            "Could not find symbol matches information for paper %s.", arxiv_id,
        )

    # Load parent-child relationships for symbols. Keys and values are
    # formatted symbol ID strings ("<tex-path>-<equation>-<symbol>"), so the
    # annotations use 'str' (they were previously mis-annotated as 'SymbolId').
    children: Dict[str, List[str]] = defaultdict(list)
    parents: Dict[str, str] = {}
    children_path = os.path.join(
        directories.arxiv_subdir("detected-symbols", arxiv_id), "symbol_children.csv"
    )
    if os.path.exists(children_path):
        for parent in file_utils.load_from_csv(children_path, SerializableChild):
            pid = f"{parent.tex_path}-{parent.equation_index}-{parent.symbol_index}"
            cid = f"{parent.tex_path}-{parent.equation_index}-{parent.child_index}"
            parents[cid] = pid
            children[pid].append(cid)
    else:
        logging.warning(
            "Could not find file mapping from symbol to their children for paper %s.",
            arxiv_id,
        )

    # Load contexts that the symbols appear in. Sort them by the symbol MathML.
    context_data_missing = False
    contexts_path = os.path.join(
        directories.arxiv_subdir("contexts-for-symbols", arxiv_id),
        "contexts.csv",
    )
    if not os.path.exists(contexts_path):
        logging.warning(  # pylint: disable=logging-not-lazy
            "Contexts have not been found for symbols for arXiv paper %s. "
            + "Symbol data will be uploaded without contexts.",
            arxiv_id,
        )
        context_data_missing = True

    symbol_contexts = {}
    mathml_contexts = defaultdict(list)
    if not context_data_missing:
        for context in file_utils.load_from_csv(contexts_path, Context):
            tex_path = context.tex_path
            symbol_id = f"{tex_path}-{context.entity_id}"
            symbol_contexts[symbol_id] = context
            symbol = symbols_by_id[symbol_id]
            mathml_contexts[symbol.mathml].append(context)

    # Prepare collections of formulae that each symbol was found in.
    # NOTE(review): 'symbol_formulas' is populated but never read in this
    # function --- confirm whether it is dead and can be removed.
    symbol_formulas = {}
    mathml_formulas: Dict[str, Set[DefiningFormula]] = defaultdict(set)
    for symbol in symbols:
        if (
            symbol.is_definition
            and symbol.equation is not None
            and symbol.relative_start is not None
            and symbol.relative_end is not None
        ):
            # Highlight the defined span of the symbol within its equation.
            highlighted = wrap_span(
                symbol.equation,
                symbol.relative_start,
                symbol.relative_end,
                before=r"\htmlClass{match-highlight}{",
                after="}",
                braces=True,
            )
            formula = DefiningFormula(
                tex=highlighted,
                tex_path=symbol.tex_path,
                equation_id=str(symbol.equation_index),
            )
            symbol_formulas[sid(symbol)] = formula
            mathml_formulas[symbol.mathml].add(formula)

    # (This list was previously initialized twice; once is enough.)
    entity_infos: List[EntityUploadInfo] = []
    for localized_entity in processing_summary.entities:
        symbol = cast(SerializableSymbol, localized_entity.entity)
        boxes = [
            BoundingBox(l.left, l.top, l.width, l.height, l.page)
            for l in localized_entity.locations
        ]

        # Get context and formula of the symbol, and other matching ones.
        symbol_context = symbol_contexts.get(sid(symbol))
        matching_contexts = mathml_contexts.get(symbol.mathml, [])
        other_context_texs = []
        other_context_sentence_ids = []
        for c in matching_contexts:
            matching_sentence_id = f"{c.tex_path}-{c.sentence_id}"
            if matching_sentence_id not in other_context_sentence_ids:
                other_context_texs.append(c.snippet)
                other_context_sentence_ids.append(matching_sentence_id)

        # NOTE(review): 'matching_formulas' is a set, so the order of
        # 'defining_formulas' is not deterministic across runs --- confirm
        # whether ordering matters to consumers.
        matching_formulas = mathml_formulas.get(symbol.mathml, set())
        other_formula_texs = []
        other_formula_ids = []
        for f in matching_formulas:
            equation_id = f"{f.tex_path}-{f.equation_id}"
            if equation_id not in other_formula_ids:
                other_formula_texs.append(f.tex)
                other_formula_ids.append(equation_id)

        # Package up data for the symbol. Tag symbols with unexpectedly large
        # bounding boxes so consumers can treat them specially.
        tags: List[str] = []
        MAX_BOX_HEIGHT = 0.1
        for b in boxes:
            if b.height > MAX_BOX_HEIGHT:
                logging.debug(  # pylint: disable=logging-not-lazy
                    "Detected large bounding box for symbol with height %f for entity %s of paper "
                    + "%s. Entity will be given a tag indicating it is unexpectedly large.",
                    b.height,
                    f"{localized_entity.entity.tex_path}-{localized_entity.entity.id_}",
                    arxiv_id,
                )
                tags.append("large")
                break

        data: EntityData = {
            "tex": f"${symbol.tex}$",
            "tex_start": symbol.start,
            "tex_end": symbol.end,
            "type": symbol.type_,
            "mathml": symbol.mathml,
            # NOTE(review): direct indexing assumes every symbol's MathML has
            # an entry in 'matches' --- confirm upstream guarantees this.
            "mathml_near_matches": [
                m.matching_mathml for m in matches[symbol.mathml]
            ],
            "snippets": other_context_texs,
            "defining_formulas": other_formula_texs,
            "is_definition": symbol.is_definition or False,
            "tags": tags,
        }

        # Create links between this symbol, its sentence, and related symbols.
        sentence_id = (
            f"{symbol_context.tex_path}-{symbol_context.sentence_id}"
            if symbol_context is not None
            else None
        )
        parent_id = parents.get(sid(symbol))
        child_ids = children.get(sid(symbol), [])
        relationships: EntityRelationships = {
            "equation": EntityReference(
                type_="equation",
                id_=f"{symbol.tex_path}-{symbol.equation_index}",
            ),
            "parent": EntityReference(type_="symbol", id_=parent_id),
            "children": [EntityReference(type_="symbol", id_=id_) for id_ in child_ids],
            # Both branches of the original conditional built the same
            # reference; build it unconditionally ('sentence_id' may be None).
            "sentence": EntityReference(type_="sentence", id_=sentence_id),
            "defining_formula_equations": [
                EntityReference(type_="equation", id_=id_)
                for id_ in other_formula_ids
            ],
            "snippet_sentences": [
                EntityReference(type_="sentence", id_=id_)
                for id_ in other_context_sentence_ids
            ],
        }

        # Save all data for this symbol.
        entity_infos.append(
            EntityUploadInfo(
                id_=sid(symbol),
                type_="symbol",
                bounding_boxes=boxes,
                data=data,
                relationships=relationships,
            )
        )

    upload_entities(
        processing_summary.s2_id,
        arxiv_id,
        entity_infos,
        data_version,
    )
def upload_terms(processing_summary: PaperProcessingResult, data_version: Optional[int]) -> None:
    """Upload glossary terms for a paper, along with their context snippets."""
    arxiv_id = processing_summary.arxiv_id

    contexts_path = os.path.join(
        directories.arxiv_subdir("contexts-for-glossary-terms", arxiv_id),
        "contexts.csv",
    )
    contexts = file_utils.load_from_csv(contexts_path, Context)
    contexts_by_entity = {(c.tex_path, c.entity_id): c for c in contexts}

    # Assemble contexts that should be shown for each term.
    contexts_by_term: Dict[str, List[Context]] = defaultdict(list)
    for located_term in processing_summary.localized_entities:
        term = cast(Term, located_term.entity)
        key = (term.tex_path, term.id_)
        if key in contexts_by_entity:
            contexts_by_term[term.text].append(contexts_by_entity[key])

    entity_infos = []
    for located_term in processing_summary.localized_entities:
        term = cast(Term, located_term.entity)
        context = contexts_by_entity.get((term.tex_path, term.id_))
        boxes = [cast(BoundingBox, l) for l in located_term.locations]
        term_contexts = contexts_by_term.get(term.text, [])
        sentence_id = (
            f"{context.tex_path}-{context.sentence_id}"
            if context is not None
            else None
        )

        # Cluster bounding boxes, in case any of these terms are defined as a
        # macro, in which case all appearances of that term on the same page
        # will have been lumped together.
        for i, cluster in enumerate(cluster_boxes(boxes, vertical_split=0.005)):
            entity_infos.append(
                EntityInformation(
                    id_=f"{term.tex_path}-{term.id_}-{i}",
                    type_="term",
                    bounding_boxes=list(cluster),
                    data={
                        "name": term.text,
                        "definitions": term.definitions,
                        # NOTE(review): 'definition_texs' is populated from
                        # 'term.definitions' (not 'term.definition_texs');
                        # preserved as-is --- confirm this is intentional.
                        "definition_texs": term.definitions,
                        "sources": term.sources,
                        "snippets": [c.snippet for c in term_contexts],
                    },
                    relationships={
                        "sentence": EntityReference(
                            type_="sentence", id_=sentence_id,
                        ),
                        "snippet_sentences": [
                            EntityReference(
                                type_="sentence",
                                id_=f"{c.tex_path}-{c.sentence_id}",
                            )
                            for c in term_contexts
                        ],
                    },
                )
            )

    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        entity_infos,
        data_version,
    )
def upload_definitions(processing_summary: PaperProcessingResult, data_version: Optional[int]) -> None:
    """Upload definition entities and the term entities they define."""
    term_infos = []
    definition_infos = []

    for located in processing_summary.localized_entities:
        boxes = [cast(BoundingBox, loc) for loc in located.locations]
        entity = located.entity
        entity_id = entity.id_

        if entity_id.startswith("definition"):
            definition = cast(Definition, entity)
            definition_sentence = (
                f"{definition.tex_path}-{definition.sentence_id}"
                if definition.sentence_id is not None
                else None
            )
            definition_infos.append(
                EntityInformation(
                    id_=definition.id_,
                    type_="definition",
                    bounding_boxes=boxes,
                    data={
                        "definiendum": definition.definiendum,
                        "definition": definition.text,
                        "tex": definition.tex,
                    },
                    relationships={
                        "sentence": EntityReference(
                            type_="sentence", id_=definition_sentence,
                        ),
                    },
                )
            )

        if entity_id.startswith(("definiendum", "term-reference")):
            term = cast(TermReference, entity)
            term_sentence = (
                f"{term.tex_path}-{term.sentence_id}"
                if term.sentence_id is not None
                else None
            )
            term_infos.append(
                EntityInformation(
                    id_=term.id_,
                    type_="term",
                    bounding_boxes=boxes,
                    data={
                        "name": term.text,
                        "definitions": term.definitions,
                        "definition_texs": term.definition_texs,
                        "sources": term.sources,
                        "term_type": term.type_ or "unknown",
                    },
                    relationships={
                        "sentence": EntityReference(
                            type_="sentence", id_=term_sentence,
                        ),
                        "definitions": [
                            EntityReference(type_="definition", id_=definition_id)
                            for definition_id in term.definition_ids
                        ],
                    },
                )
            )

    # Upload definitions before terms, because terms hold references to
    # definitions that can only be resolved once the definitions have been
    # uploaded.
    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        definition_infos,
        data_version,
    )
    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        term_infos,
        data_version,
    )
def save(self, item: SymbolData, _: None) -> None:
    """Build and upload one entity record per symbol that has a bounding box.

    Each record carries the symbol's TeX and MathML, near-match MathML
    strings, and references to its child symbols and containing sentence.
    """
    symbols_with_ids = item.symbols_with_ids
    boxes = item.boxes
    matches = item.matches
    symbol_sentences = item.symbol_sentences

    # Resolve symbol objects (stored as references) to their IDs by identity.
    id_lookup = {id(entry.symbol): entry.symbol_id for entry in symbols_with_ids}

    def id_string(sym_id) -> str:
        # Canonical string form of a symbol ID.
        return f"{sym_id.tex_path}-{sym_id.equation_index}-{sym_id.symbol_index}"

    entity_infos = []
    for entry in symbols_with_ids:
        symbol = entry.symbol
        symbol_id = entry.symbol_id

        # Symbols without a detected bounding box are skipped entirely.
        box = boxes.get(symbol_id)
        if box is None:
            continue

        data: EntityData = {
            "tex": f"${symbol.tex}$",
            "tex_start": symbol.start,
            "tex_end": symbol.end,
            "mathml": symbol.mathml,
            "mathml_near_matches": [
                m.matching_mathml for m in matches[symbol.mathml]
            ],
        }

        sentence_key = symbol_sentences.get(symbol_id)
        if sentence_key is None:
            sentence_id = None
        else:
            sentence_id = f"{sentence_key.tex_path}-{sentence_key.sentence_id}"

        child_ids = [id_string(id_lookup[id(child)]) for child in symbol.children]

        relationships: EntityRelationships = {
            "children": [
                EntityReference(type_="symbol", id_=id_) for id_ in child_ids
            ],
            # Both branches of the original conditional built the same
            # reference; build it unconditionally ('sentence_id' may be None).
            "sentence": EntityReference(type_="sentence", id_=sentence_id),
        }

        entity_infos.append(
            EntityInformation(
                id_=id_string(symbol_id),
                type_="symbol",
                bounding_boxes=[box],
                data=data,
                relationships=relationships,
            )
        )

    upload_entities(item.s2_id, item.arxiv_id, entity_infos, self.args.data_version)
def upload_term_definitions(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:
    """Upload textual terms and their definitions."""

    # Group contextual snippets for each term.
    term_infos = []
    contexts_by_term_name: Dict[TermName, List[Context]] = defaultdict(list)
    for entity_summary in processing_summary.entities:
        entity = entity_summary.entity
        context = entity_summary.context
        if is_textual_term(entity) and context is not None:
            contexts_by_term_name[entity.text].append(context)  # type: ignore

    # Construct mapping from definitions to the sentences that contain them.
    contexts_by_definition: Dict[EntityId, Context] = {}
    for entity_summary in processing_summary.entities:
        entity_id = entity_summary.entity.id_
        context = entity_summary.context
        if entity_id.startswith("definition") and context is not None:
            contexts_by_definition[entity_id] = context

    # Upload information for each term.
    for entity_summary in processing_summary.entities:
        boxes = [cast(BoundingBox, l) for l in entity_summary.locations]
        entity = entity_summary.entity
        context = entity_summary.context
        if not is_textual_term(entity):
            continue
        term = cast(TermReference, entity)

        # Assemble list of snippets that include this term.
        contexts_matching_term = contexts_by_term_name.get(term.text, [])
        snippets = [c.snippet for c in contexts_matching_term]
        snippet_sentences = [
            f"{c.tex_path}-{c.sentence_id}" for c in contexts_matching_term
        ]

        # Create links to the sentences containing definitions for this term.
        definition_sentences: List[Optional[str]] = []
        for definition_id in term.definition_ids:
            if definition_id not in contexts_by_definition:
                definition_sentences.append(None)
                # BUG FIX: without this 'continue', the lookup below raised a
                # KeyError for any definition ID with no known context (the
                # analogous loop in 'upload_symbol_definitions' handles the
                # missing case correctly).
                continue
            definition_context = contexts_by_definition[definition_id]
            definition_sentences.append(
                f"{definition_context.tex_path}-{definition_context.sentence_id}"
            )

        term_info = EntityUploadInfo(
            id_=term.id_,
            type_="term",
            bounding_boxes=boxes,
            data={
                "name": term.text,
                "term_type": term.type_ or "unknown",
                "definitions": term.definitions,
                "definition_texs": term.definition_texs,
                "sources": term.sources,
                # A list of all other sentences the term appears in elsewhere
                # in the paper.
                "snippets": snippets,
            },
            relationships={
                # Link the term to the sentence it belongs to. This link is
                # necessary to enable visual filtering in the UI where, when a
                # term is clicked, the sentence is highlighted and all others
                # are lowlighted.
                "sentence": EntityReference(
                    type_="sentence",
                    id_=f"{context.tex_path}-{context.sentence_id}"
                    if context is not None
                    else None,
                ),
                # IDs of the sentences that contain each of the definitions for
                # a term. These IDs can be used to establish links that take a
                # user to the site of a definition.
                "definition_sentences": [
                    EntityReference(type_="sentence", id_=id_)
                    for id_ in definition_sentences
                ],
                # The IDs of each sentence where the term appears elsewhere in
                # the paper (i.e., for each of the 'snippets' in the entity
                # data above). Used to link from a snippet shown in a list of
                # snippets to where that snippet appears in the paper.
                "snippet_sentences": [
                    EntityReference(type_="sentence", id_=id_)
                    for id_ in snippet_sentences
                ],
            },
        )
        term_infos.append(term_info)

    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        term_infos,
        data_version,
    )
def upload_symbol_definitions(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:
    """Upload symbols and their definitions.

    Associate definitions with symbols as follows: definitions are associated
    with entire equations as per the current implementation of the definition
    detector. Conservatively, associate a definition for an equation with a
    single symbol only if that symbol is the *only* top-level symbol in that
    equation.
    """
    # Load symbols from files. Group symbols by equation to make it easy to
    # detect whether a symbol is the only top-level symbol in the equation.
    symbols_by_equation: Dict[
        Tuple[TexPath, EquationIndex], List[Symbol]
    ] = defaultdict(list)
    symbols: List[Symbol] = []
    symbols_with_ids = file_utils.load_symbols(processing_summary.arxiv_id)
    if symbols_with_ids is None:
        logging.info(  # pylint: disable=logging-not-lazy
            "No symbols were loaded for paper %s. Therefore, no definitions for symbols "
            + "will be uploaded for this paper.",
            processing_summary.arxiv_id,
        )
        return
    for _, symbol in symbols_with_ids:
        symbols_by_equation[symbol.tex_path, symbol.equation_index].append(symbol)
        symbols.append(symbol)

    # Group symbols by their MathML. These groups will be used to propagate
    # definitions from one defined symbol to all other appearances of that
    # symbol.
    symbols_by_mathml: Dict[MathML, List[Symbol]] = defaultdict(list)
    for symbol in symbols:
        symbols_by_mathml[symbol.mathml].append(symbol)

    # Construct map from definitions to the sentences that contain them.
    contexts_by_definition: Dict[EntityId, Context] = {}
    for entity_summary in processing_summary.entities:
        entity_id = entity_summary.entity.id_
        context = entity_summary.context
        if entity_id.startswith("definition") and context is not None:
            contexts_by_definition[entity_id] = context

    # Fetch rows for all entities for this paper that have already been
    # uploaded to the database. This allows lookup of the row IDs for the
    # sentences that contain definitions of symbols.
    entity_models = fetch_entity_models(processing_summary.s2_id, data_version)

    # Create a list of rows to insert into the database containing definition
    # data.
    entity_data_models: List[EntityDataModel] = []
    for entity_summary in processing_summary.entities:
        entity = entity_summary.entity
        if not entity.id_.startswith("definiendum"):
            continue

        definiendum = cast(Definiendum, entity)

        # Only definienda that are symbols (not textual terms) can be matched
        # to symbols. This test does not depend on the symbol, so it is done
        # once here rather than on every iteration of the loop below.
        if definiendum.type_ != "symbol":
            continue

        # Attempt to match the definiendum (defined term) to the symbol that
        # is being defined.
        defined_symbol = None
        for symbol in symbols:
            # Does the symbol fall within the range of characters being defined?
            if symbol.start < definiendum.start or symbol.end > definiendum.end:
                continue
            # Is the symbol a top-level symbol?
            if symbol.parent is not None:
                continue
            # Is it the *only* top-level symbol in its equation?
            # BUG FIX: the previous filter selected symbols whose parent is
            # NOT None (i.e., nested symbols), inverting the check described
            # here; top-level symbols are those with no parent.
            top_level_symbol_count = sum(
                1
                for s in symbols_by_equation[(symbol.tex_path, symbol.equation_index)]
                if s.parent is None
            )
            if top_level_symbol_count > 1:
                continue

            defined_symbol = symbol
            logging.debug(  # pylint: disable=logging-not-lazy
                "Matched definiendum %s at position (%d, %d) to symbol %s at position "
                + "(%s, %s) for paper %s. A definition for this symbol will be uploaded.",
                definiendum.tex,
                definiendum.start,
                definiendum.end,
                symbol.tex,
                symbol.start,
                symbol.end,
                processing_summary.arxiv_id,
            )
            break

        if defined_symbol is None:
            continue

        # Assemble data about definitions for the symbol.
        definitions = definiendum.definitions
        definition_texs = definiendum.definition_texs
        sources = definiendum.sources

        definition_sentence_ids: List[Optional[str]] = []
        for definition_id in definiendum.definition_ids:
            context = contexts_by_definition.get(definition_id)
            if context is None:
                definition_sentence_ids.append(None)
            else:
                definition_sentence_ids.append(
                    f"{context.tex_path}-{context.sentence_id}"
                )

        # Find all symbols that are the same (i.e., that have the same MathML
        # representation). Then save definition data so that it applies to all
        # of those symbols.
        matching_symbols = symbols_by_mathml.get(defined_symbol.mathml)
        if matching_symbols is not None:
            for s in matching_symbols:
                entity_model = entity_models.get(("symbol", sid(s)))
                data: EntityData = {
                    "definitions": definitions,
                    "definition_texs": definition_texs,
                    "sources": sources,
                }
                entity_data_models.extend(make_data_models(None, entity_model, data))

                relationships: EntityRelationships = {
                    "definition_sentences": [
                        EntityReference(type_="sentence", id_=id_)
                        for id_ in definition_sentence_ids
                    ],
                }
                entity_data_models.extend(
                    make_relationship_models(
                        ("symbol", sid(s)), relationships, entity_models
                    )
                )

    # Insert all rows within a single transaction, in batches of 200.
    with output_database.atomic():
        EntityDataModel.bulk_create(entity_data_models, 200)