def load_symbols(arxiv_id: ArxivId) -> Optional[List[SymbolWithId]]:
    data_dir = directories.arxiv_subdir("detected-equation-tokens", arxiv_id)
    symbols_path = os.path.join(data_dir, "symbols.csv")
    symbol_tokens_path = os.path.join(data_dir, "symbol_tokens.csv")
    symbol_children_path = os.path.join(data_dir, "symbol_children.csv")

    file_not_found = False
    if not os.path.exists(symbols_path):
        logging.info("Symbols data missing for paper %s. Skipping.", arxiv_id)
        file_not_found = True
    if not os.path.exists(symbol_tokens_path):
        logging.info("Symbol tokens data missing for paper %s. Skipping.", arxiv_id)
        file_not_found = True
    if not os.path.exists(symbol_children_path):
        logging.info("No symbol children data found for paper %s.", arxiv_id)
        file_not_found = True
    if file_not_found:
        return None

    loaded_symbols = load_from_csv(symbols_path, SerializableSymbol)
    loaded_symbol_tokens = load_from_csv(symbol_tokens_path, SerializableSymbolToken)
    loaded_symbol_children = load_from_csv(symbol_children_path, SerializableChild)

    symbols_by_id: Dict[SymbolId, Symbol] = {}
    for s in loaded_symbols:
        symbol_id = SymbolId(s.tex_path, s.equation_index, s.symbol_index)
        symbols_by_id[symbol_id] = Symbol(
            tokens=[], start=s.start, end=s.end, tex=s.tex, mathml=s.mathml, children=[]
        )

    for t in loaded_symbol_tokens:
        symbol_id = SymbolId(t.tex_path, t.equation_index, t.symbol_index)
        symbols_by_id[symbol_id].tokens.append(t.token_index)

    for c in loaded_symbol_children:
        parent_id = SymbolId(c.tex_path, c.equation_index, c.symbol_index)
        child_id = SymbolId(c.tex_path, c.equation_index, c.child_index)
        child_symbol = symbols_by_id[child_id]
        symbols_by_id[parent_id].children.append(child_symbol)

    return [
        SymbolWithId(symbol_id, symbol)
        for symbol_id, symbol in symbols_by_id.items()
    ]
def test_annotate_one_symbol():
    tex = "some text... $x$"
    symbols = {SymbolId(TEX_PATH, 0, 0): Symbol([0], ARBITRARY_MATHML, [])}
    characters = {CharacterId(TEX_PATH, 0, 0): Character("x", 0, 0, 1)}
    annotated_tex, _ = annotate_symbols_and_equations_for_file(
        tex, TEX_PATH, symbols, characters
    )
    assert (
        annotated_tex == "some text... <<equation>>$<<symbol>>x<</symbol>>$<</equation>>"
    )
def test_annotate_nested_symbols():
    tex = "$x_i$"
    x = Symbol([0], ARBITRARY_MATHML, [])
    i = Symbol([1], ARBITRARY_MATHML, [])
    x_i = Symbol([0, 1], ARBITRARY_MATHML, children=[x, i])
    symbols = {
        SymbolId(TEX_PATH, 0, 0): x_i,
        SymbolId(TEX_PATH, 0, 1): x,
        SymbolId(TEX_PATH, 0, 2): i,
    }
    characters = {
        CharacterId(TEX_PATH, 0, 0): Character("x", 0, 0, 1),
        CharacterId(TEX_PATH, 0, 1): Character("i", 1, 2, 3),
    }
    annotated_tex, _ = annotate_symbols_and_equations_for_file(
        tex, TEX_PATH, symbols, characters
    )
    assert annotated_tex == (
        "<<equation>>$"
        + "<<symbol>>"
        + "<<symbol>>x<</symbol>>"
        + "_"
        + "<<symbol>>i<</symbol>>"
        + "<</symbol>>"
        + "$<</equation>>"
    )
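# A hedged illustration (not from the test suite): strip the <<equation>> / <<symbol>>
# markers that the assertions above check for, recovering the plain TeX. The tag
# strings are taken directly from the expected outputs; strip_annotations is a
# hypothetical helper added only to make the annotation format explicit.
import re


def strip_annotations(annotated_tex: str) -> str:
    # Remove opening and closing equation/symbol tags, leaving the TeX untouched.
    return re.sub(r"<</?(?:equation|symbol)>>", "", annotated_tex)


def test_strip_annotations_recovers_tex():
    annotated = "some text... <<equation>>$<<symbol>>x<</symbol>>$<</equation>>"
    assert strip_annotations(annotated) == "some text... $x$"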
def symbol_id():
    return SymbolId("tex-path", 0, 0)
def load_symbols(arxiv_id: ArxivId) -> Optional[List[SymbolWithId]]:
    data_dir = directories.arxiv_subdir("detected-symbols", arxiv_id)
    symbols_path = os.path.join(data_dir, "entities.csv")
    symbol_tokens_path = os.path.join(data_dir, "symbol_tokens.csv")
    symbol_children_path = os.path.join(data_dir, "symbol_children.csv")

    file_not_found = False
    if not os.path.exists(symbols_path):
        logging.info("Symbols data missing for paper %s. Skipping.", arxiv_id)
        file_not_found = True
    if not os.path.exists(symbol_tokens_path):
        logging.info("Symbol tokens data missing for paper %s. Skipping.", arxiv_id)
        file_not_found = True
    if not os.path.exists(symbol_children_path):
        logging.info("No symbol children data found for paper %s.", arxiv_id)
        file_not_found = True
    if file_not_found:
        return None

    loaded_symbols = load_from_csv(symbols_path, SerializableSymbol)
    loaded_symbol_tokens = load_from_csv(symbol_tokens_path, SerializableSymbolToken)
    loaded_symbol_children = load_from_csv(symbol_children_path, SerializableChild)

    symbols_by_id: Dict[SymbolId, Symbol] = {}
    for s in loaded_symbols:
        symbol_id = SymbolId(s.tex_path, s.equation_index, s.symbol_index)
        symbols_by_id[symbol_id] = Symbol(
            tokens=[],
            start=s.start,
            end=s.end,
            tex=s.tex,
            mathml=s.mathml,
            children=[],
            is_definition=s.is_definition,
            equation=s.equation,
            relative_start=s.relative_start,
            relative_end=s.relative_end,
        )

    for t in loaded_symbol_tokens:
        symbol_id = SymbolId(t.tex_path, t.equation_index, t.symbol_index)
        symbols_by_id[symbol_id].tokens.append(t.token_index)

    for c in loaded_symbol_children:
        parent_id = SymbolId(c.tex_path, c.equation_index, c.symbol_index)
        child_id = SymbolId(c.tex_path, c.equation_index, c.child_index)
        try:
            child_symbol = symbols_by_id[child_id]
            symbols_by_id[parent_id].children.append(child_symbol)
        except KeyError:
            logging.warning(  # pylint: disable=logging-not-lazy
                "Could not load child symbol %s for symbol %s for paper %s. "
                + "There may have been an error in the equation parser, like a failure to "
                + "find tokens for the child symbol.",
                child_id,
                parent_id,
                arxiv_id,
            )

    return [
        SymbolWithId(symbol_id, symbol)
        for symbol_id, symbol in symbols_by_id.items()
    ]
def load_symbols(arxiv_id: ArxivId) -> Optional[List[SymbolWithId]]:
    tokens_dir = directories.arxiv_subdir("detected-equation-tokens", arxiv_id)
    tokens_path = os.path.join(tokens_dir, "entities.csv")

    symbols_dir = directories.arxiv_subdir("detected-symbols", arxiv_id)
    symbols_path = os.path.join(symbols_dir, "entities.csv")
    symbol_tokens_path = os.path.join(symbols_dir, "symbol_tokens.csv")
    symbol_children_path = os.path.join(symbols_dir, "symbol_children.csv")

    file_not_found = False
    if not os.path.exists(tokens_path):
        logging.info("Tokens data missing for paper %s. Skipping.", arxiv_id)
        file_not_found = True
    if not os.path.exists(symbols_path):
        logging.info("Symbols data missing for paper %s. Skipping.", arxiv_id)
        file_not_found = True
    if not os.path.exists(symbol_tokens_path):
        logging.info("Symbol tokens data missing for paper %s. Skipping.", arxiv_id)
        file_not_found = True
    if not os.path.exists(symbol_children_path):
        logging.info("No symbol children data found for paper %s.", arxiv_id)
        file_not_found = True
    if file_not_found:
        return None

    loaded_tokens = load_from_csv(tokens_path, SerializableToken)
    loaded_symbols = load_from_csv(symbols_path, SerializableSymbol)
    loaded_symbol_tokens = load_from_csv(symbol_tokens_path, SerializableSymbolToken)
    loaded_symbol_children = load_from_csv(symbol_children_path, SerializableChild)

    tokens_by_id: Dict[TokenId, Token] = {}
    for t in loaded_tokens:
        token_id = TokenId(t.tex_path, t.equation_index, t.relative_start, t.relative_end)
        tokens_by_id[token_id] = Token(
            start=t.relative_start, end=t.relative_end, text=t.text, type_=t.type_
        )

    symbols_by_id: Dict[SymbolId, Symbol] = {}
    for s in loaded_symbols:
        symbol_id = SymbolId(s.tex_path, s.equation_index, s.symbol_index)
        symbols_by_id[symbol_id] = Symbol(
            tex_path=s.tex_path,
            equation_index=s.equation_index,
            symbol_index=s.symbol_index,
            tokens=[],
            start=s.start,
            end=s.end,
            tex=s.tex,
            mathml=s.mathml,
            children=[],
            parent=None,
            is_definition=s.is_definition,
            equation=s.equation,
            relative_start=s.relative_start,
            relative_end=s.relative_end,
            contains_affix=s.contains_affix,
        )

    for st in loaded_symbol_tokens:
        symbol_id = SymbolId(st.tex_path, st.equation_index, st.symbol_index)
        token_id = TokenId(st.tex_path, st.equation_index, st.start, st.end)
        if token_id in tokens_by_id and symbol_id in symbols_by_id:
            symbols_by_id[symbol_id].tokens.append(tokens_by_id[token_id])

    for c in loaded_symbol_children:
        parent_id = SymbolId(c.tex_path, c.equation_index, c.symbol_index)
        child_id = SymbolId(c.tex_path, c.equation_index, c.child_index)
        try:
            child_symbol = symbols_by_id[child_id]
            parent_symbol = symbols_by_id[parent_id]
            parent_symbol.children.append(child_symbol)
            child_symbol.parent = parent_symbol
        except KeyError:
            logging.warning(  # pylint: disable=logging-not-lazy
                "Could not load child symbol %s or parent symbol %s for paper %s when associating "
                + "the two as parent and child. There may have been an error in the equation "
                + "parser, like a failure to find tokens for the child symbol.",
                child_id,
                parent_id,
                arxiv_id,
            )

    return [
        SymbolWithId(symbol_id, symbol)
        for symbol_id, symbol in symbols_by_id.items()
    ]
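# A minimal usage sketch, not part of the pipeline: load the symbols for one paper and
# print each root symbol with its immediate children. It assumes the load_symbols
# variant above and that SymbolWithId exposes .symbol_id and .symbol, as it does in the
# loaders below; the printing logic itself is only illustrative.
def print_symbol_trees(arxiv_id: ArxivId) -> None:
    symbols_with_ids = load_symbols(arxiv_id)
    if symbols_with_ids is None:
        logging.info("No symbol data for paper %s.", arxiv_id)
        return
    for with_id in symbols_with_ids:
        symbol = with_id.symbol
        # Start only from root symbols; nested symbols are reachable via .children.
        if symbol.parent is not None:
            continue
        print(with_id.symbol_id, symbol.mathml)
        for child in symbol.children:
            print("  child:", child.mathml)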
def load(self) -> Iterator[SymbolData]:
    for arxiv_id in self.arxiv_ids:
        s2_id = get_s2_id(arxiv_id)
        if s2_id is None:
            continue

        symbols_with_ids = file_utils.load_symbols(arxiv_id)
        if symbols_with_ids is None:
            continue
        symbols_by_id = {s.symbol_id: s.symbol for s in symbols_with_ids}

        boxes: Dict[SymbolId, BoundingBox] = {}
        boxes_path = os.path.join(
            directories.arxiv_subdir("symbol-locations", arxiv_id),
            "symbol_locations.csv",
        )
        if not os.path.exists(boxes_path):
            logging.warning(
                "Could not find bounding boxes information for %s. Skipping",
                arxiv_id,
            )
            continue
        for location in file_utils.load_from_csv(boxes_path, SymbolLocation):
            symbol_id = SymbolId(
                tex_path=location.tex_path,
                equation_index=location.equation_index,
                symbol_index=location.symbol_index,
            )
            box = BoundingBox(
                page=int(location.page),
                left=location.left,
                top=location.top,
                width=location.width,
                height=location.height,
            )
            boxes[symbol_id] = box

        matches: Matches = {}
        matches_path = os.path.join(
            directories.arxiv_subdir("symbol-matches", arxiv_id), "matches.csv"
        )
        if not os.path.exists(matches_path):
            logging.warning(
                "Could not find symbol matches information for %s. Skipping",
                arxiv_id,
            )
            continue
        for match in file_utils.load_from_csv(matches_path, Match):
            if match.queried_mathml not in matches:
                matches[match.queried_mathml] = []
            matches[match.queried_mathml].append(match)

        context_data_missing = False
        contexts_path = os.path.join(
            directories.arxiv_subdir("contexts-for-symbols", arxiv_id),
            "contexts.csv",
        )
        if not os.path.exists(contexts_path):
            logging.warning(  # pylint: disable=logging-not-lazy
                "Contexts have not been found for symbols for arXiv paper %s. "
                + "Symbol data will be uploaded without contexts.",
                arxiv_id,
            )
            context_data_missing = True

        symbol_contexts = {}
        mathml_contexts = defaultdict(list)
        if not context_data_missing:
            for context in file_utils.load_from_csv(contexts_path, Context):
                tex_path = context.tex_path
                equation_index, symbol_index = [
                    int(t) for t in context.entity_id.split("-")
                ]
                symbol_id = SymbolId(tex_path, equation_index, symbol_index)
                symbol_contexts[symbol_id] = context
                symbol = symbols_by_id[symbol_id]
                mathml_contexts[symbol.mathml].append(context)

        symbol_formulas = {}
        mathml_formulas = defaultdict(set)
        for id_, symbol in symbols_by_id.items():
            if (
                symbol.is_definition
                and symbol.equation is not None
                and symbol.relative_start is not None
                and symbol.relative_end is not None
            ):
                highlighted = wrap_span(
                    symbol.equation,
                    symbol.relative_start,
                    symbol.relative_end,
                    before=r"\htmlClass{match-highlight}{",
                    after="}",
                    braces=True,
                )
                formula = DefiningFormula(
                    tex=highlighted,
                    tex_path=id_.tex_path,
                    equation_id=id_.equation_index,
                )
                symbol_formulas[id_] = formula
                mathml_formulas[symbol.mathml].add(formula)

        yield SymbolData(
            arxiv_id,
            s2_id,
            symbols_with_ids,
            boxes,
            symbol_contexts,
            symbol_formulas,
            mathml_contexts,
            mathml_formulas,
            matches,
        )
def load(self) -> Iterator[SymbolData]:
    for arxiv_id in self.arxiv_ids:
        s2_id = get_s2_id(arxiv_id)
        if s2_id is None:
            continue

        symbols_with_ids = file_utils.load_symbols(arxiv_id)
        if symbols_with_ids is None:
            continue

        boxes: Dict[SymbolId, BoundingBox] = {}
        boxes_path = os.path.join(
            directories.arxiv_subdir("symbol-locations", arxiv_id),
            "symbol_locations.csv",
        )
        if not os.path.exists(boxes_path):
            logging.warning(
                "Could not find bounding boxes information for %s. Skipping",
                arxiv_id,
            )
            continue
        for location in file_utils.load_from_csv(boxes_path, SymbolLocation):
            symbol_id = SymbolId(
                tex_path=location.tex_path,
                equation_index=location.equation_index,
                symbol_index=location.symbol_index,
            )
            box = BoundingBox(
                page=int(location.page),
                left=location.left,
                top=location.top,
                width=location.width,
                height=location.height,
            )
            boxes[symbol_id] = box

        matches: Matches = {}
        matches_path = os.path.join(
            directories.arxiv_subdir("symbol-matches", arxiv_id), "matches.csv"
        )
        if not os.path.exists(matches_path):
            logging.warning(
                "Could not find symbol matches information for %s. Skipping",
                arxiv_id,
            )
            continue
        for match in file_utils.load_from_csv(matches_path, Match):
            if match.queried_mathml not in matches:
                matches[match.queried_mathml] = []
            matches[match.queried_mathml].append(match)

        sentence_data_missing = False
        sentences_path = os.path.join(
            directories.arxiv_subdir("sentences-for-symbols", arxiv_id),
            "entity_sentences.csv",
        )
        if not os.path.exists(sentences_path):
            logging.warning(  # pylint: disable=logging-not-lazy
                "Symbols for arXiv paper %s have not been aligned to sentences. "
                + "Symbol data will be uploaded without links to sentences.",
                arxiv_id,
            )
            sentence_data_missing = True

        # Initialize before the check so an empty mapping is yielded when sentence
        # data is missing, rather than referencing an unbound variable.
        symbol_sentences = {}
        if not sentence_data_missing:
            for pair in file_utils.load_from_csv(sentences_path, EntitySentencePairIds):
                tex_path = pair.tex_path
                equation_index, symbol_index = [
                    int(t) for t in pair.entity_id.split("-")
                ]
                sentence_key = SentenceKey(pair.tex_path, pair.sentence_id)
                symbol_id = SymbolId(tex_path, equation_index, symbol_index)
                symbol_sentences[symbol_id] = sentence_key

        yield SymbolData(
            arxiv_id,
            s2_id,
            symbols_with_ids,
            boxes,
            symbol_sentences,
            matches,
        )
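# A consumption sketch under stated assumptions: 'loader' stands for a hypothetical
# object exposing the load() generator above, and SymbolData is assumed to carry
# fields named after the arguments it is constructed with (arxiv_id, symbols_with_ids,
# boxes). The real pipeline uploads this data; here it is only summarized per paper.
def summarize_symbol_data(loader) -> None:
    for data in loader.load():
        # Count symbols for which a bounding box was located on a PDF page.
        located = sum(
            1 for with_id in data.symbols_with_ids if with_id.symbol_id in data.boxes
        )
        print(
            f"{data.arxiv_id}: {len(data.symbols_with_ids)} symbols, "
            f"{located} with bounding boxes"
        )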