def save(self, item: CitationData, _: None) -> None:
    citation_locations = item.citation_locations
    key_s2_ids = item.key_s2_ids

    entity_infos = []

    citation_index = 0
    for citation_key, locations in citation_locations.items():
        if citation_key not in key_s2_ids:
            logging.warning(  # pylint: disable=logging-not-lazy
                "Not uploading bounding box information for citation with key "
                + "%s because it was not resolved to a paper S2 ID.",
                citation_key,
            )
            continue

        for cluster_index, location_set in locations.items():
            boxes = cast(List[BoundingBox], list(location_set))
            entity_info = EntityUploadInfo(
                id_=f"{citation_key}-{cluster_index}",
                type_="citation",
                bounding_boxes=boxes,
                data={"key": citation_key, "paper_id": key_s2_ids[citation_key]},
            )
            entity_infos.append(entity_info)
            citation_index += 1

    upload_entities(item.s2_id, item.arxiv_id, entity_infos, self.args.data_version)
def upload_sentences(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:
    entity_infos = []

    for entity_summary in processing_summary.entities:
        sentence = cast(SentenceEntity, entity_summary.entity)
        boxes = [cast(BoundingBox, l) for l in entity_summary.locations]
        entity_info = EntityUploadInfo(
            id_=f"{sentence.tex_path}-{sentence.id_}",
            type_="sentence",
            bounding_boxes=boxes,
            data={
                "text": sentence.text,
                "tex": sentence.tex,
                "tex_start": sentence.start,
                "tex_end": sentence.end,
            },
        )
        entity_infos.append(entity_info)

    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        entity_infos,
        data_version,
    )
def upload_equations(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:
    entity_infos = []

    for entity_summary in processing_summary.entities:
        equation = cast(Equation, entity_summary.entity)
        boxes = [cast(BoundingBox, l) for l in entity_summary.locations]
        entity_info = EntityUploadInfo(
            id_=f"{equation.tex_path}-{equation.id_}",
            type_="equation",
            bounding_boxes=boxes,
            data={"tex": equation.tex},
        )
        entity_infos.append(entity_info)

    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        entity_infos,
        data_version,
    )
def upload_symbols(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:

    arxiv_id = processing_summary.arxiv_id
    entities = [es.entity for es in processing_summary.entities]
    symbols = cast(List[SerializableSymbol], entities)
    symbols_by_id = {sid(s): s for s in symbols}

    # Load MathML matches for partial matching of symbols.
    matches: Matches = {}
    matches_path = os.path.join(
        directories.arxiv_subdir("symbol-matches", processing_summary.arxiv_id),
        "matches.csv",
    )
    if os.path.exists(matches_path):
        for match in file_utils.load_from_csv(matches_path, Match):
            if match.queried_mathml not in matches:
                matches[match.queried_mathml] = []
            matches[match.queried_mathml].append(match)
    else:
        logging.warning(
            "Could not find symbol matches information for paper %s.", arxiv_id,
        )

    # Load parent-child relationships for symbols.
    children: Dict[SymbolId, List[SymbolId]] = defaultdict(list)
    parents: Dict[SymbolId, SymbolId] = {}
    children_path = os.path.join(
        directories.arxiv_subdir("detected-symbols", arxiv_id), "symbol_children.csv"
    )
    if os.path.exists(children_path):
        for parent in file_utils.load_from_csv(children_path, SerializableChild):
            pid = f"{parent.tex_path}-{parent.equation_index}-{parent.symbol_index}"
            cid = f"{parent.tex_path}-{parent.equation_index}-{parent.child_index}"
            parents[cid] = pid
            children[pid].append(cid)
    else:
        logging.warning(
            "Could not find file mapping from symbols to their children for paper %s.",
            arxiv_id,
        )

    # Load contexts that the symbols appear in. Sort them by the symbol MathML.
    context_data_missing = False
    contexts_path = os.path.join(
        directories.arxiv_subdir("contexts-for-symbols", arxiv_id), "contexts.csv",
    )
    if not os.path.exists(contexts_path):
        logging.warning(  # pylint: disable=logging-not-lazy
            "Contexts have not been found for symbols for arXiv paper %s. "
            + "Symbol data will be uploaded without contexts.",
            arxiv_id,
        )
        context_data_missing = True

    symbol_contexts = {}
    mathml_contexts = defaultdict(list)
    if not context_data_missing:
        for context in file_utils.load_from_csv(contexts_path, Context):
            tex_path = context.tex_path
            symbol_id = f"{tex_path}-{context.entity_id}"
            symbol_contexts[symbol_id] = context
            symbol = symbols_by_id[symbol_id]
            mathml_contexts[symbol.mathml].append(context)

    # Prepare collections of formulae that each symbol was found in.
    symbol_formulas = {}
    mathml_formulas: Dict[str, Set[DefiningFormula]] = defaultdict(set)
    for symbol in symbols:
        if (
            symbol.is_definition
            and symbol.equation is not None
            and symbol.relative_start is not None
            and symbol.relative_end is not None
        ):
            highlighted = wrap_span(
                symbol.equation,
                symbol.relative_start,
                symbol.relative_end,
                before=r"\htmlClass{match-highlight}{",
                after="}",
                braces=True,
            )
            formula = DefiningFormula(
                tex=highlighted,
                tex_path=symbol.tex_path,
                equation_id=str(symbol.equation_index),
            )
            symbol_formulas[sid(symbol)] = formula
            mathml_formulas[symbol.mathml].add(formula)

    entity_infos: List[EntityUploadInfo] = []
    for localized_entity in processing_summary.entities:

        symbol = cast(SerializableSymbol, localized_entity.entity)
        boxes = [
            BoundingBox(l.left, l.top, l.width, l.height, l.page)
            for l in localized_entity.locations
        ]

        # Get the context and formula of the symbol, and other matching ones.
        symbol_context = symbol_contexts.get(sid(symbol))
        matching_contexts = mathml_contexts.get(symbol.mathml, [])
        other_context_texs = []
        other_context_sentence_ids = []
        for c in matching_contexts:
            matching_sentence_id = f"{c.tex_path}-{c.sentence_id}"
            if matching_sentence_id not in other_context_sentence_ids:
                other_context_texs.append(c.snippet)
                other_context_sentence_ids.append(matching_sentence_id)

        matching_formulas = mathml_formulas.get(symbol.mathml, set())
        other_formula_texs = []
        other_formula_ids = []
        for f in matching_formulas:
            equation_id = f"{f.tex_path}-{f.equation_id}"
            if equation_id not in other_formula_ids:
                other_formula_texs.append(f.tex)
                other_formula_ids.append(equation_id)

        # Package up data for the symbol.
        tags: List[str] = []
        MAX_BOX_HEIGHT = 0.1
        for b in boxes:
            if b.height > MAX_BOX_HEIGHT:
                logging.debug(  # pylint: disable=logging-not-lazy
                    "Detected large bounding box for symbol with height %f for entity %s of paper "
                    + "%s. Entity will be given a tag indicating it is unexpectedly large.",
                    b.height,
                    f"{localized_entity.entity.tex_path}-{localized_entity.entity.id_}",
                    arxiv_id,
                )
                tags.append("large")
                break

        data: EntityData = {
            "tex": f"${symbol.tex}$",
            "tex_start": symbol.start,
            "tex_end": symbol.end,
            "type": symbol.type_,
            "mathml": symbol.mathml,
            # Use 'get' so symbols with no near matches (e.g., when matches.csv is
            # missing) do not raise a KeyError.
            "mathml_near_matches": [
                m.matching_mathml for m in matches.get(symbol.mathml, [])
            ],
            "snippets": other_context_texs,
            "defining_formulas": other_formula_texs,
            "is_definition": symbol.is_definition or False,
            "tags": tags,
        }

        # Create links between this symbol, its sentence, and related symbols.
        sentence_id = (
            f"{symbol_context.tex_path}-{symbol_context.sentence_id}"
            if symbol_context is not None
            else None
        )
        parent_id = parents.get(sid(symbol))
        child_ids = children.get(sid(symbol), [])
        relationships: EntityRelationships = {
            "equation": EntityReference(
                type_="equation", id_=f"{symbol.tex_path}-{symbol.equation_index}",
            ),
            "parent": EntityReference(type_="symbol", id_=parent_id),
            "children": [EntityReference(type_="symbol", id_=id_) for id_ in child_ids],
            "sentence": EntityReference(type_="sentence", id_=sentence_id),
            "defining_formula_equations": [
                EntityReference(type_="equation", id_=id_) for id_ in other_formula_ids
            ],
            "snippet_sentences": [
                EntityReference(type_="sentence", id_=id_)
                for id_ in other_context_sentence_ids
            ],
        }

        # Save all data for this symbol.
        entity_information = EntityUploadInfo(
            id_=sid(symbol),
            type_="symbol",
            bounding_boxes=boxes,
            data=data,
            relationships=relationships,
        )
        entity_infos.append(entity_information)

    upload_entities(
        processing_summary.s2_id, arxiv_id, entity_infos, data_version,
    )
def upload_terms(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:

    arxiv_id = processing_summary.arxiv_id
    contexts = file_utils.load_from_csv(
        os.path.join(
            directories.arxiv_subdir("contexts-for-glossary-terms", arxiv_id),
            "contexts.csv",
        ),
        Context,
    )
    contexts_by_entity = {(c.tex_path, c.entity_id): c for c in contexts}

    # Assemble contexts that should be shown for each term.
    contexts_by_term: Dict[str, List[Context]] = defaultdict(list)
    for entity_summary in processing_summary.entities:
        term = cast(Term, entity_summary.entity)
        if (term.tex_path, term.id_) in contexts_by_entity:
            contexts_by_term[term.text].append(
                contexts_by_entity[(term.tex_path, term.id_)]
            )

    entity_infos = []
    for entity_summary in processing_summary.entities:
        term = cast(Term, entity_summary.entity)
        context = contexts_by_entity.get((term.tex_path, term.id_))
        boxes = [cast(BoundingBox, l) for l in entity_summary.locations]

        # Cluster bounding boxes, in case any of these terms are defined as a macro (in which
        # case all appearances of that term on the same page will have been lumped together).
        clusters = cluster_boxes(boxes, vertical_split=0.005)
        for i, cluster in enumerate(clusters):
            entity_info = EntityUploadInfo(
                id_=f"{term.tex_path}-{term.id_}-{i}",
                type_="term",
                bounding_boxes=list(cluster),
                data={
                    "name": term.text,
                    "definitions": term.definitions,
                    "definition_texs": term.definitions,
                    "sources": term.sources,
                    "snippets": [
                        c.snippet for c in contexts_by_term.get(term.text, [])
                    ],
                },
                relationships={
                    "sentence": EntityReference(
                        type_="sentence",
                        id_=f"{context.tex_path}-{context.sentence_id}"
                        if context is not None
                        else None,
                    ),
                    "snippet_sentences": [
                        EntityReference(
                            type_="sentence", id_=f"{c.tex_path}-{c.sentence_id}"
                        )
                        for c in contexts_by_term.get(term.text, [])
                    ],
                },
            )
            entity_infos.append(entity_info)

    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        entity_infos,
        data_version,
    )
def upload_term_definitions(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:
    " Upload textual terms and their definitions. "

    # Group contextual snippets for each term.
    term_infos = []
    contexts_by_term_name: Dict[TermName, List[Context]] = defaultdict(list)
    for entity_summary in processing_summary.entities:
        entity = entity_summary.entity
        context = entity_summary.context
        if is_textual_term(entity) and context is not None:
            contexts_by_term_name[entity.text].append(context)  # type: ignore

    # Construct mapping from definitions to the sentences that contain them.
    contexts_by_definition: Dict[EntityId, Context] = {}
    for entity_summary in processing_summary.entities:
        entity_id = entity_summary.entity.id_
        context = entity_summary.context
        if entity_id.startswith("definition") and context is not None:
            contexts_by_definition[entity_id] = context

    # Upload information for each term.
    for entity_summary in processing_summary.entities:
        boxes = [cast(BoundingBox, l) for l in entity_summary.locations]
        entity = entity_summary.entity
        context = entity_summary.context

        if not is_textual_term(entity):
            continue

        term = cast(TermReference, entity)

        # Assemble list of snippets that include this term.
        contexts_matching_term = contexts_by_term_name.get(term.text, [])
        snippets = [c.snippet for c in contexts_matching_term]
        snippet_sentences = [
            f"{c.tex_path}-{c.sentence_id}" for c in contexts_matching_term
        ]

        # Create links to the sentences containing definitions for this term.
        definition_sentences: List[Optional[str]] = []
        for definition_id in term.definition_ids:
            if definition_id not in contexts_by_definition:
                definition_sentences.append(None)
                continue
            definition_context = contexts_by_definition[definition_id]
            definition_sentences.append(
                f"{definition_context.tex_path}-{definition_context.sentence_id}"
            )

        term_info = EntityUploadInfo(
            id_=term.id_,
            type_="term",
            bounding_boxes=boxes,
            data={
                "name": term.text,
                "term_type": term.type_ or "unknown",
                "definitions": term.definitions,
                "definition_texs": term.definition_texs,
                "sources": term.sources,
                # A list of all other sentences where the term appears elsewhere in the paper.
                "snippets": snippets,
            },
            relationships={
                # Link the term to the sentence it belongs to. This link is necessary to enable
                # visual filtering in the UI where, when a term is clicked, the sentence is
                # highlighted and all others are lowlighted.
                "sentence": EntityReference(
                    type_="sentence",
                    id_=f"{context.tex_path}-{context.sentence_id}"
                    if context is not None
                    else None,
                ),
                # IDs of the sentences that contain each of the definitions for a term. These IDs
                # can be used to establish links that take a user to the site of a definition.
                "definition_sentences": [
                    EntityReference(type_="sentence", id_=id_)
                    for id_ in definition_sentences
                ],
                # The IDs of each sentence where the term appears elsewhere in the paper (i.e.,
                # for each of the 'snippets' in the entity data above). Used to link from a snippet
                # that is shown in a list of snippets to where that snippet appears in the paper.
                "snippet_sentences": [
                    EntityReference(type_="sentence", id_=id_)
                    for id_ in snippet_sentences
                ],
            },
        )
        term_infos.append(term_info)

    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        term_infos,
        data_version,
    )
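# Illustrative usage (a sketch, not part of this module): each uploader above takes a
# per-paper PaperProcessingResult and an optional data version. A driver that ran the
# term uploads for a batch of papers might look roughly like the commented sketch below;
# `load_processing_summaries` is a hypothetical helper standing in for whatever earlier
# pipeline stage produces the processing results.
#
#     for summary in load_processing_summaries():  # hypothetical helper
#         upload_terms(summary, data_version=None)
#         upload_term_definitions(summary, data_version=None)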