import logging
import os
from collections import defaultdict
from typing import Dict, List, Optional, cast

# Project-local imports shared by the functions in this section. The module
# paths below are assumptions about the repository layout; adjust them to
# wherever these names actually live.
from common import directories, file_utils
from common.bounding_box import cluster_boxes
from common.types import (
    BoundingBox,
    CitationData,
    Context,
    Definition,
    EntityData,
    EntityInformation,
    EntityReference,
    EntityRelationships,
    Equation,
    PaperProcessingResult,
    SentenceEntity,
    SymbolData,
    SymbolId,
    Term,
    TermEntity,
    TermReference,
)
from common.upload_entities import upload_entities


def upload_sentences(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:
    entity_infos = []
    for entity_and_location in processing_summary.localized_entities:
        sentence = cast(SentenceEntity, entity_and_location.entity)
        boxes = [cast(BoundingBox, l) for l in entity_and_location.locations]
        entity_info = EntityInformation(
            id_=f"{sentence.tex_path}-{sentence.id_}",
            type_="sentence",
            bounding_boxes=boxes,
            data={
                "text": sentence.text,
                "tex": sentence.tex,
                "tex_start": sentence.start,
                "tex_end": sentence.end,
            },
        )
        entity_infos.append(entity_info)

    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        entity_infos,
        data_version,
    )

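# All of the upload functions in this section follow the same pattern: map each
# localized entity to an EntityInformation record, then hand the batch to
# upload_entities. A minimal usage sketch follows; `result`, DATA_VERSION, and
# run_sentence_upload_step are hypothetical names for illustration, not part of
# the pipeline above.
DATA_VERSION: Optional[int] = None  # None defers the version choice to the upload layer.


def run_sentence_upload_step(result: PaperProcessingResult) -> None:
    # Each entity pipeline exposes an upload function with this signature.
    upload_sentences(result, DATA_VERSION)
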
def upload_terms(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:
    entity_infos = []
    for entity_and_location in processing_summary.localized_entities:
        term = cast(TermEntity, entity_and_location.entity)
        boxes = [cast(BoundingBox, l) for l in entity_and_location.locations]
        entity_info = EntityInformation(
            id_=f"{term.tex_path}-{term.id_}",
            type_="term",
            bounding_boxes=boxes,
            data={
                "name": term.name,
                "definitions": term.definitions,
                "sources": term.sources,
                "val": term.val,
            },
        )
        entity_infos.append(entity_info)

    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        entity_infos,
        data_version,
    )

def save(self, item: CitationData, _: None) -> None:
    citation_locations = item.citation_locations
    key_s2_ids = item.key_s2_ids

    entity_infos = []
    for citation_key, locations in citation_locations.items():
        if citation_key not in key_s2_ids:
            logging.warning(  # pylint: disable=logging-not-lazy
                "Not uploading bounding box information for citation with key "
                + "%s because it was not resolved to a paper S2 ID.",
                citation_key,
            )
            continue

        # A citation key can appear in several places; each cluster of
        # co-located boxes becomes its own uploaded citation entity.
        for cluster_index, location_set in locations.items():
            boxes = cast(List[BoundingBox], list(location_set))
            entity_info = EntityInformation(
                id_=f"{citation_key}-{cluster_index}",
                type_="citation",
                bounding_boxes=boxes,
                data={"key": citation_key, "paper_id": key_s2_ids[citation_key]},
            )
            entity_infos.append(entity_info)

    upload_entities(item.s2_id, item.arxiv_id, entity_infos, self.args.data_version)

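# Illustrative shapes of the two inputs consumed by `save` above, inferred from
# how the loops traverse them. ToyBox is a hypothetical stand-in for the
# project's BoundingBox, whose exact constructor is not shown here, and the
# example values are invented.
from dataclasses import dataclass
from typing import Set


@dataclass(frozen=True)
class ToyBox:
    page: int
    left: float
    top: float
    width: float
    height: float


# citation key -> cluster index -> boxes for that cluster of appearances
example_citation_locations: Dict[str, Dict[int, Set[ToyBox]]] = {
    "lample2016neural": {0: {ToyBox(1, 0.1, 0.2, 0.08, 0.015)}},
}
# citation key -> resolved Semantic Scholar paper ID (hypothetical value)
example_key_s2_ids: Dict[str, str] = {"lample2016neural": "0123abcd"}
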
def upload_equations(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:
    entity_infos = []
    for entity_and_location in processing_summary.localized_entities:
        equation = cast(Equation, entity_and_location.entity)
        boxes = [cast(BoundingBox, l) for l in entity_and_location.locations]
        entity_info = EntityInformation(
            id_=f"{equation.tex_path}-{equation.id_}",
            type_="equation",
            bounding_boxes=boxes,
            data={"tex": equation.tex},
        )
        entity_infos.append(entity_info)

    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        entity_infos,
        data_version,
    )

def save(self, item: SymbolData, _: None) -> None:
    symbols_with_ids = item.symbols_with_ids
    boxes = item.boxes
    matches = item.matches
    symbol_contexts = item.symbol_contexts
    mathml_contexts = item.mathml_contexts
    symbol_formulas = item.symbol_formulas
    mathml_formulas = item.mathml_formulas

    # Index symbol IDs by the identity of the symbol objects, so that child
    # symbols (held as objects) can be mapped back to their IDs.
    symbol_ids_by_symbol_object_ids = {}
    for symbol_with_id in symbols_with_ids:
        symbol_ids_by_symbol_object_ids[
            id(symbol_with_id.symbol)
        ] = symbol_with_id.symbol_id

    def create_symbol_id_string(sid: SymbolId) -> str:
        return f"{sid.tex_path}-{sid.equation_index}-{sid.symbol_index}"

    entity_infos = []
    for symbol_with_id in symbols_with_ids:
        symbol = symbol_with_id.symbol
        # TODO(andrewhead): move this filtering condition into 'parse_equation'.
        if symbol.tex in ["$|$", "|"]:
            continue

        symbol_id = symbol_with_id.symbol_id

        # Get the context and formula for this symbol, and for other symbols
        # with matching MathML.
        context = symbol_contexts.get(symbol_id)
        matching_contexts = mathml_contexts.get(symbol.mathml, [])
        other_context_texs = []
        other_context_sentence_ids = []
        for c in matching_contexts:
            matching_sentence_id = f"{c.tex_path}-{c.sentence_id}"
            if (
                matching_sentence_id not in other_context_sentence_ids
                # and c.sentence_id != context.sentence_id
            ):
                other_context_texs.append(c.snippet)
                other_context_sentence_ids.append(matching_sentence_id)

        formula = symbol_formulas.get(symbol_id)
        matching_formulas = mathml_formulas.get(symbol.mathml, [])
        other_formula_texs = []
        other_formula_ids = []
        for f in matching_formulas:
            equation_id = f"{f.tex_path}-{f.equation_id}"
            if (
                equation_id not in other_formula_ids
                # and (
                #     formula is None or equation_id != formula.equation_id
                # )
            ):
                other_formula_texs.append(f.tex)
                other_formula_ids.append(equation_id)

        box = boxes.get(symbol_id)
        if box is None:
            continue

        data: EntityData = {
            "tex": f"${symbol.tex}$",
            "tex_start": symbol.start,
            "tex_end": symbol.end,
            "mathml": symbol.mathml,
            "mathml_near_matches": [
                m.matching_mathml for m in matches[symbol.mathml]
            ],
            # "snippet": context.snippet,
            "snippets": other_context_texs,
            "defining_formulas": other_formula_texs,
            "is_definition": symbol.is_definition or False,
        }
        # if formula is not None:
        #     data['formula'] = formula.tex

        sentence_id = (
            f"{context.tex_path}-{context.sentence_id}"
            if context is not None
            else None
        )

        # Find this symbol's parent: the symbol that lists it as a child.
        parent_id: Optional[str] = None
        for other_symbol_with_id in symbols_with_ids:
            other_symbol = other_symbol_with_id.symbol
            if symbol in other_symbol.children:
                parent_id = create_symbol_id_string(other_symbol_with_id.symbol_id)

        child_ids = []
        for child_symbol in symbol.children:
            child_symbol_id = symbol_ids_by_symbol_object_ids[id(child_symbol)]
            child_ids.append(create_symbol_id_string(child_symbol_id))

        relationships: EntityRelationships = {
            "equation": EntityReference(
                type_="equation",
                id_=f"{symbol_id.tex_path}-{symbol_id.equation_index}",
            ),
            "parent": EntityReference(type_="symbol", id_=parent_id),
            "children": [
                EntityReference(type_="symbol", id_=id_) for id_ in child_ids
            ],
            "sentence": EntityReference(type_="sentence", id_=sentence_id),
            "defining_formula_equations": [
                EntityReference(type_="equation", id_=id_)
                for id_ in other_formula_ids
            ],
            "snippet_sentences": [
                EntityReference(type_="sentence", id_=id_)
                for id_ in other_context_sentence_ids
            ],
            # "snippet_sentence": EntityReference(
            #     type_="sentence",
            #     id_=f"{symbol_id.tex_path}-{context.sentence_id}"
            #     if context is not None
            #     else None,
            # ),
            # "formula_equation": EntityReference(
            #     type_="equation",
            #     id_=f"{symbol_id.tex_path}-{formula.equation_id}"
            #     if formula is not None
            #     else None,
            # ),
        }

        entity_information = EntityInformation(
            id_=create_symbol_id_string(symbol_id),
            type_="symbol",
            bounding_boxes=[box],
            data=data,
            relationships=relationships,
        )
        entity_infos.append(entity_information)

    upload_entities(item.s2_id, item.arxiv_id, entity_infos, self.args.data_version)

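# The string ids used for symbols and for their parent/child/equation references
# all follow the "<tex_path>-<equation_index>-<symbol_index>" convention applied
# by create_symbol_id_string above. A self-contained sketch; ToySymbolId is a
# hypothetical stand-in for the pipeline's SymbolId type.
from typing import NamedTuple


class ToySymbolId(NamedTuple):
    tex_path: str
    equation_index: int
    symbol_index: int


_sid = ToySymbolId("main.tex", 2, 5)
assert f"{_sid.tex_path}-{_sid.equation_index}-{_sid.symbol_index}" == "main.tex-2-5"
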
def upload_terms(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:
    arxiv_id = processing_summary.arxiv_id
    contexts = file_utils.load_from_csv(
        os.path.join(
            directories.arxiv_subdir("contexts-for-glossary-terms", arxiv_id),
            "contexts.csv",
        ),
        Context,
    )
    contexts_by_entity = {(c.tex_path, c.entity_id): c for c in contexts}

    # Assemble the contexts that should be shown for each term.
    contexts_by_term: Dict[str, List[Context]] = defaultdict(list)
    for entity_and_location in processing_summary.localized_entities:
        term = cast(Term, entity_and_location.entity)
        if (term.tex_path, term.id_) in contexts_by_entity:
            contexts_by_term[term.text].append(
                contexts_by_entity[(term.tex_path, term.id_)]
            )

    entity_infos = []
    for entity_and_location in processing_summary.localized_entities:
        term = cast(Term, entity_and_location.entity)
        context = contexts_by_entity.get((term.tex_path, term.id_))
        boxes = [cast(BoundingBox, l) for l in entity_and_location.locations]

        # Cluster the bounding boxes, in case any of these terms are defined as
        # macros, in which case all appearances of the term on the same page
        # will have been lumped together (see the sketch after this function).
        clusters = cluster_boxes(boxes, vertical_split=0.005)
        for i, cluster in enumerate(clusters):
            entity_info = EntityInformation(
                id_=f"{term.tex_path}-{term.id_}-{i}",
                type_="term",
                bounding_boxes=list(cluster),
                data={
                    "name": term.text,
                    "definitions": term.definitions,
                    "definition_texs": term.definitions,
                    "sources": term.sources,
                    "snippets": [
                        c.snippet for c in contexts_by_term.get(term.text, [])
                    ],
                },
                relationships={
                    "sentence": EntityReference(
                        type_="sentence",
                        id_=f"{context.tex_path}-{context.sentence_id}"
                        if context is not None
                        else None,
                    ),
                    "snippet_sentences": [
                        EntityReference(
                            type_="sentence", id_=f"{c.tex_path}-{c.sentence_id}"
                        )
                        for c in contexts_by_term.get(term.text, [])
                    ],
                },
            )
            entity_infos.append(entity_info)

    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        entity_infos,
        data_version,
    )

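# A toy illustration of why the boxes above are clustered: when a term comes
# from a macro, all of its appearances on a page can share a single entity id,
# so boxes for *different* on-page occurrences must be split apart again before
# upload. This stand-in groups values whose vertical positions are within
# `vertical_split` of each other; it is only an illustration of the idea, and
# the project's cluster_boxes may use different logic.
def toy_cluster_by_top(
    tops: List[float], vertical_split: float = 0.005
) -> List[List[float]]:
    clusters: List[List[float]] = []
    for top in sorted(tops):
        if clusters and top - clusters[-1][-1] <= vertical_split:
            clusters[-1].append(top)  # close enough: same cluster
        else:
            clusters.append([top])  # far apart: start a new cluster
    return clusters


# Two occurrences far apart on the page end up in separate clusters.
print(toy_cluster_by_top([0.100, 0.102, 0.800]))  # [[0.1, 0.102], [0.8]]
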
def upload_definitions(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:
    term_infos = []
    definition_infos = []
    for entity_and_location in processing_summary.localized_entities:
        boxes = [cast(BoundingBox, l) for l in entity_and_location.locations]
        entity = entity_and_location.entity

        if entity.id_.startswith("definition"):
            definition = cast(Definition, entity)
            definition_info = EntityInformation(
                id_=definition.id_,
                type_="definition",
                bounding_boxes=boxes,
                data={
                    "definiendum": definition.definiendum,
                    "definition": definition.text,
                    "tex": definition.tex,
                },
                relationships={
                    "sentence": EntityReference(
                        type_="sentence",
                        id_=f"{definition.tex_path}-{definition.sentence_id}"
                        if definition.sentence_id is not None
                        else None,
                    ),
                },
            )
            definition_infos.append(definition_info)

        if entity.id_.startswith("definiendum") or entity.id_.startswith(
            "term-reference"
        ):
            term = cast(TermReference, entity)
            term_info = EntityInformation(
                id_=term.id_,
                type_="term",
                bounding_boxes=boxes,
                data={
                    "name": term.text,
                    "definitions": term.definitions,
                    "definition_texs": term.definition_texs,
                    "sources": term.sources,
                    "term_type": term.type_ or "unknown",
                },
                relationships={
                    "sentence": EntityReference(
                        type_="sentence",
                        id_=f"{term.tex_path}-{term.sentence_id}"
                        if term.sentence_id is not None
                        else None,
                    ),
                    "definitions": [
                        EntityReference(type_="definition", id_=d)
                        for d in term.definition_ids
                    ],
                },
            )
            term_infos.append(term_info)

    # Upload definitions before terms, because terms hold references to
    # definitions that can only be resolved once the definitions have been
    # uploaded.
    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        definition_infos,
        data_version,
    )
    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        term_infos,
        data_version,
    )

def save(self, item: SymbolData, _: None) -> None:
    symbols_with_ids = item.symbols_with_ids
    boxes = item.boxes
    matches = item.matches
    symbol_sentences = item.symbol_sentences

    # Index symbol IDs by the identity of the symbol objects, so that child
    # symbols (held as objects) can be mapped back to their IDs.
    symbol_ids_by_symbol_object_ids = {}
    for symbol_with_id in symbols_with_ids:
        symbol_ids_by_symbol_object_ids[
            id(symbol_with_id.symbol)
        ] = symbol_with_id.symbol_id

    entity_infos = []
    for symbol_with_id in symbols_with_ids:
        symbol = symbol_with_id.symbol
        symbol_id = symbol_with_id.symbol_id

        box = boxes.get(symbol_id)
        if box is None:
            continue

        data: EntityData = {
            "tex": f"${symbol.tex}$",
            "tex_start": symbol.start,
            "tex_end": symbol.end,
            "mathml": symbol.mathml,
            "mathml_near_matches": [
                m.matching_mathml for m in matches[symbol.mathml]
            ],
        }

        sentence_key = symbol_sentences.get(symbol_id)
        sentence_id = (
            f"{sentence_key.tex_path}-{sentence_key.sentence_id}"
            if sentence_key is not None
            else None
        )

        child_ids = []
        for child_symbol in symbol.children:
            child_symbol_id = symbol_ids_by_symbol_object_ids[id(child_symbol)]
            string_id = (
                f"{child_symbol_id.tex_path}"
                f"-{child_symbol_id.equation_index}"
                f"-{child_symbol_id.symbol_index}"
            )
            child_ids.append(string_id)

        relationships: EntityRelationships = {
            "children": [
                EntityReference(type_="symbol", id_=id_) for id_ in child_ids
            ],
            "sentence": EntityReference(type_="sentence", id_=sentence_id),
        }

        entity_information = EntityInformation(
            id_=f"{symbol_id.tex_path}-{symbol_id.equation_index}-{symbol_id.symbol_index}",
            type_="symbol",
            bounding_boxes=[box],
            data=data,
            relationships=relationships,
        )
        entity_infos.append(entity_information)

    upload_entities(item.s2_id, item.arxiv_id, entity_infos, self.args.data_version)