def test_parse_prime():
    result = parse_element(load_fragment_tag("x_prime.xml"))
    assert len(result.symbols) == 1

    symbol = result.symbols[0]
    assert len(symbol.children) == 1
    assert str(symbol.children[0].element) == "<mi>x</mi>"
    assert symbol.tokens == [Token(0, 1, "x", 0), Token(1, 2, "′", 2)]
def _extract_tokens(element: Tag) -> List[Token]:
    """
    Get the tokens defined in this element. Tokens are characters or spans of text that make
    up symbols. There should be a token returned for each glyph in a symbol that needs to be
    detected separately (e.g., a symbol's base and its subscript are different tokens).

    Tokens are only found in low-level elements like "<mi>" and "<mn>". This function will not
    find tokens in higher-level nodes that solely group other low-level elements (like "<mrow>"
    and "<msub>").
    """
    tokens = []

    if _is_atomic_token(element):
        tokens.append(
            Token(
                # Convert text to a primitive type. 'element.string' is a NavigableString,
                # which causes recursion errors when serialized.
                text=str(element.string),
                start=int(element["s2:start"]),
                end=int(element["s2:end"]),
                type_="atom",
            )
        )
    elif _is_affix_token(element):
        tokens.append(
            Token(
                text=str(element.string),
                start=int(element["s2:start"]),
                end=int(element["s2:end"]),
                type_="affix",
            )
        )

    return tokens
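# The version of '_extract_tokens' above defers to two predicates, '_is_atomic_token' and
# '_is_affix_token', that are not shown in this section. The following is a minimal sketch of
# what such predicates might check, modeled on the tag-name and "s2:*" annotation test used by
# the older '_extract_tokens' variant further below. The tag sets and the accent check are
# assumptions for illustration, not the project's actual definitions.
from bs4 import Tag

# Assumed tag sets; the real TOKEN_TAGS constant may differ.
ATOM_TAGS = {"mi", "mn"}
AFFIX_TAGS = {"mo"}


def _has_s2_token_annotations(element: Tag) -> bool:
    # A token element is expected to carry character offsets back into the TeX.
    return all(key in element.attrs for key in ("s2:start", "s2:end"))


def _is_atomic_token(element: Tag) -> bool:
    # Identifiers and numbers with offset annotations are treated as atoms.
    return element.name in ATOM_TAGS and _has_s2_token_annotations(element)


def _is_affix_token(element: Tag) -> bool:
    # Operators are treated as affixes when their parent marks them as accents
    # (e.g., the bar in a "<mover accent='true'>" element).
    parent = element.parent
    return (
        element.name in AFFIX_TAGS
        and _has_s2_token_annotations(element)
        and parent is not None
        and parent.get("accent") == "true"
    )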
def test_parse_accent():
    result = parse_element(load_fragment_tag("bar_x.xml"))
    assert len(result.symbols) == 1
    assert str(result.element) == '<mover accent="true"><mi>x</mi><mo>ˉ</mo></mover>'

    symbol = result.symbols[0]
    assert symbol.contains_affix_token
    assert symbol.tokens == [Token("x", "atom", 5, 6), Token("ˉ", "affix", 0, 5)]
def test_detect_function_declaration():
    result = parse_element(load_fragment_tag("function.xml"))
    symbol = result.symbols[0]
    assert symbol.element.text == "p(x;θ,y)"
    assert symbol.type_ == NodeType.FUNCTION
    assert symbol.start == 0
    assert symbol.end == 16
    assert (
        Token("(", "atom", 1, 2) in symbol.tokens
    ), "function tokens should include parentheses"
    assert (
        Token(")", "atom", 15, 16) in symbol.tokens
    ), "function tokens should include parentheses"
    assert not any(
        [t.text == "," for t in symbol.tokens]
    ), "function tokens should not include commas"
    assert not any(
        [t.text == ";" for t in symbol.tokens]
    ), "function tokens should not include semicolons"

    child_symbols = symbol.child_symbols
    assert len(child_symbols) == 4
    assert str(child_symbols[0].element) == "<mi>p</mi>"
    assert str(child_symbols[1].element) == "<mi>x</mi>"
    assert str(child_symbols[2].element) == "<mi>θ</mi>"
    assert str(child_symbols[3].element) == "<mi>y</mi>"
def test_parse_single_symbol():
    result = parse_element(load_fragment_tag("x.xml"))
    assert len(result.symbols) == 1

    symbol = result.symbols[0]
    assert str(symbol.element) == "<mi>x</mi>"
    assert symbol.children == []
    assert symbol.tokens == [Token(0, 1, "x", 0)]
    assert result.tokens == [Token(0, 1, "x", 0)]
def test_parse_node_with_child_nodes():
    result = parse_element(load_fragment_tag("x_sub_i.xml"))
    symbol = result.symbols[0]
    assert str(symbol.element) == "<msub><mi>x</mi><mi>i</mi></msub>"
    assert len(symbol.children) == 2
    assert str(symbol.children[0].element) == "<mi>x</mi>"
    assert str(symbol.children[1].element) == "<mi>i</mi>"
    assert symbol.tokens == [
        Token(0, 1, "x", 0),
        Token(2, 3, "i", 1),
    ]
def test_parse_single_symbol():
    result = parse_element(load_fragment_tag("x.xml"))
    assert len(result.symbols) == 1

    symbol = result.symbols[0]
    assert str(symbol.element) == "<mi>x</mi>"
    assert symbol.type_ == NodeType.IDENTIFIER
    assert symbol.children == []
    assert symbol.tokens == [Token("x", "atom", 0, 1)]
    assert symbol.start == 0
    assert symbol.end == 1
    assert not symbol.defined
    assert not symbol.contains_affix_token
    assert result.tokens == [Token("x", "atom", 0, 1)]
def test_get_token_bounding_box():
    s = symbol(tokens=[Token("x", "atom", 0, 1)])
    token_locations = {token_id(0, 1): [BoundingBox(0.01, 0.01, 0.01, 0.01, 0)]}
    box = get_symbol_bounding_box(s, symbol_id(), token_locations)
    assert box == BoundingBox(0.01, 0.01, 0.01, 0.01, 0)
def test_merge_contiguous_symbols():
    result = parse_element(load_fragment_tag("relu.xml"))
    assert str(result.element) == "<mi>ReLU</mi>"

    symbol = result.symbols[0]
    assert str(symbol.element) == "<mi>ReLU</mi>"
    assert symbol.children == []
    assert symbol.tokens == [
        Token(0, 4, "ReLU", 0),
    ]
def test_merge_bounding_boxes():
    s = symbol(tokens=[Token("x", "atom", 0, 1), Token("y", "atom", 2, 3)])
    token_locations = {
        token_id(0, 1): [
            BoundingBox(0.01, 0.01, 0.01, 0.01, 0),
            # Expand the bounding box downward by .01 of the page
            BoundingBox(0.01, 0.02, 0.01, 0.01, 0),
        ],
        # Expand the bounding box rightward by .01 of the page
        token_id(2, 3): [BoundingBox(0.02, 0.01, 0.01, 0.01, 0)],
        # Ignore this bounding box for an irrelevant token
        token_id(4, 5): [BoundingBox(0.03, 0.01, 0.01, 0.01, 0)],
    }
    box = get_symbol_bounding_box(s, symbol_id(), token_locations)
    assert box.left == 0.01
    assert box.top == 0.01
    assert abs(box.width - 0.02) < 0.0001
    assert abs(box.height - 0.02) < 0.0001
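# The two tests above exercise the merging behavior of 'get_symbol_bounding_box': the symbol's
# box is the union of the boxes of its relevant tokens. The following is a minimal sketch of
# that union operation, consistent with the assertions in 'test_merge_bounding_boxes', not the
# project's actual implementation. It assumes box coordinates are page-relative fractions and
# that the fifth BoundingBox field is exposed as a '.page' attribute.
from typing import Iterable, Optional


def merge_boxes(boxes: Iterable[BoundingBox]) -> Optional[BoundingBox]:
    # Union of axis-aligned boxes: take the extreme edges, keep the page of the first box.
    boxes = list(boxes)
    if not boxes:
        return None
    left = min(b.left for b in boxes)
    top = min(b.top for b in boxes)
    right = max(b.left + b.width for b in boxes)
    bottom = max(b.top + b.height for b in boxes)
    return BoundingBox(left, top, right - left, bottom - top, boxes[0].page)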
def _extract_tokens(element: Tag) -> List[Token]:
    """
    Get the tokens defined in this element. Tokens are only found in low-level elements like
    "<mi>" and "<mn>". This function will find no tokens in higher-level nodes that solely
    group other low-level elements (like "<mrow>" and "<msub>").
    """
    tokens = []

    if element.name in TOKEN_TAGS and _has_s2_token_annotations(element):
        tokens.append(
            Token(
                text=element.string,
                token_index=int(element["s2:index"]),
                start=int(element["s2:start"]),
                end=int(element["s2:end"]),
            )
        )

    return tokens
def _extract_tokens(element: Tag) -> List[Token]:
    """
    Get the tokens defined in this element. Tokens are only found in low-level elements like
    "<mi>" and "<mn>". This function will find no tokens in higher-level nodes that solely
    group other low-level elements (like "<mrow>" and "<msub>").
    """
    tokens = []

    if _is_token(element):
        tokens.append(
            Token(
                # Convert text to a primitive type. 'element.string' is a NavigableString,
                # which causes recursion errors when serialized.
                text=str(element.string),
                token_index=int(element["s2:index"]),
                start=int(element["s2:start"]),
                end=int(element["s2:end"]),
            )
        )

    return tokens
def test_d_is_symbol_on_its_own():
    result = parse_element(load_fragment_tag("d.xml"))
    assert len(result.symbols) == 1
    assert str(result.symbols[0].element) == "<mi>d</mi>"
    assert result.symbols[0].tokens == [Token(0, 1, "d", 0)]
def test_number_is_not_a_symbol():
    result = parse_element(load_fragment_tag("1.xml"))
    assert str(result.element) == "<mn>1</mn>"
    assert result.symbols == []
    assert result.tokens == [Token(0, 1, "1", 0)]
def load_symbols(arxiv_id: ArxivId) -> Optional[List[SymbolWithId]]:

    tokens_dir = directories.arxiv_subdir("detected-equation-tokens", arxiv_id)
    tokens_path = os.path.join(tokens_dir, "entities.csv")

    symbols_dir = directories.arxiv_subdir("detected-symbols", arxiv_id)
    symbols_path = os.path.join(symbols_dir, "entities.csv")
    symbol_tokens_path = os.path.join(symbols_dir, "symbol_tokens.csv")
    symbol_children_path = os.path.join(symbols_dir, "symbol_children.csv")

    file_not_found = False
    if not os.path.exists(tokens_path):
        logging.info("Tokens data missing for paper %s. Skipping.", arxiv_id)
        file_not_found = True
    if not os.path.exists(symbols_path):
        logging.info("Symbols data missing for paper %s. Skipping.", arxiv_id)
        file_not_found = True
    if not os.path.exists(symbol_tokens_path):
        logging.info("Symbol tokens data missing for paper %s. Skipping.", arxiv_id)
        file_not_found = True
    if not os.path.exists(symbol_children_path):
        logging.info("No symbol children data found for paper %s.", arxiv_id)
        file_not_found = True
    if file_not_found:
        return None

    loaded_tokens = load_from_csv(tokens_path, SerializableToken)
    loaded_symbols = load_from_csv(symbols_path, SerializableSymbol)
    loaded_symbol_tokens = load_from_csv(symbol_tokens_path, SerializableSymbolToken)
    loaded_symbol_children = load_from_csv(symbol_children_path, SerializableChild)

    tokens_by_id: Dict[TokenId, Token] = {}
    for t in loaded_tokens:
        token_id = TokenId(
            t.tex_path, t.equation_index, t.relative_start, t.relative_end
        )
        tokens_by_id[token_id] = Token(
            start=t.relative_start, end=t.relative_end, text=t.text, type_=t.type_
        )

    symbols_by_id: Dict[SymbolId, Symbol] = {}
    for s in loaded_symbols:
        symbol_id = SymbolId(s.tex_path, s.equation_index, s.symbol_index)
        symbols_by_id[symbol_id] = Symbol(
            tex_path=s.tex_path,
            equation_index=s.equation_index,
            symbol_index=s.symbol_index,
            tokens=[],
            start=s.start,
            end=s.end,
            tex=s.tex,
            mathml=s.mathml,
            children=[],
            parent=None,
            is_definition=s.is_definition,
            equation=s.equation,
            relative_start=s.relative_start,
            relative_end=s.relative_end,
            contains_affix=s.contains_affix,
        )

    for st in loaded_symbol_tokens:
        symbol_id = SymbolId(st.tex_path, st.equation_index, st.symbol_index)
        token_id = TokenId(st.tex_path, st.equation_index, st.start, st.end)
        if token_id in tokens_by_id and symbol_id in symbols_by_id:
            symbols_by_id[symbol_id].tokens.append(tokens_by_id[token_id])

    for c in loaded_symbol_children:
        parent_id = SymbolId(c.tex_path, c.equation_index, c.symbol_index)
        child_id = SymbolId(c.tex_path, c.equation_index, c.child_index)
        try:
            child_symbol = symbols_by_id[child_id]
            parent_symbol = symbols_by_id[parent_id]
            parent_symbol.children.append(child_symbol)
            child_symbol.parent = parent_symbol
        except KeyError:
            logging.warning(  # pylint: disable=logging-not-lazy
                "Could not load child symbol %s or parent symbol %s for paper %s when associating "
                + "the two as parent and child. There may have been an error in the equation "
                + "parser, like a failure to find tokens for the child symbol.",
                child_id,
                parent_id,
                arxiv_id,
            )

    return [
        SymbolWithId(symbol_id, symbol)
        for symbol_id, symbol in symbols_by_id.items()
    ]
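# A hypothetical usage sketch for 'load_symbols'. It assumes SymbolWithId unpacks into a
# (symbol_id, symbol) pair, as the return statement above suggests; the function name and the
# printed fields are illustrative only, not part of the pipeline.
def print_symbol_summary(arxiv_id: ArxivId) -> None:
    symbols_with_ids = load_symbols(arxiv_id)
    if symbols_with_ids is None:
        # One of the expected CSV files was missing; nothing to report.
        return
    for symbol_id, symbol in symbols_with_ids:
        print(symbol_id, symbol.mathml, len(symbol.tokens), len(symbol.children))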