def test_parse_prime():
    result = parse_element(load_fragment_tag("x_prime.xml"))
    assert len(result.symbols) == 1
    symbol = result.symbols[0]
    assert len(symbol.children) == 1
    assert str(symbol.children[0].element) == "<mi>x</mi>"
    assert symbol.tokens == [Token(0, 1, "x", 0), Token(1, 2, "′", 2)]
Example #2
def _extract_tokens(element: Tag) -> List[Token]:
    """
    Get the tokens defined in this element. Tokens are characters or spans of text that
    make up symbols. There should be a token returned for each glyph in a symbol that needs
    to be detected separately (e.g., a symbol's base and its subscript are different tokens).

    Tokens are only found in low-level elements like "<mi>" and "<mn>". This function will
    not find tokens in higher-level nodes that solely group other low-level elements (like
    "<mrow>" and "<msub>").
    """

    tokens = []

    if _is_atomic_token(element):
        tokens.append(
            Token(
                # Convert text to a primitive type. 'element.string' is a NavigableString,
                # which causes recursion errors when serialized.
                text=str(element.string),
                start=int(element["s2:start"]),
                end=int(element["s2:end"]),
                type_="atom",
            )
        )
    elif _is_affix_token(element):
        tokens.append(
            Token(
                text=str(element.string),
                start=int(element["s2:start"]),
                end=int(element["s2:end"]),
                type_="affix",
            )
        )

    return tokens
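The predicates `_is_atomic_token` and `_is_affix_token` are referenced above but not shown. A minimal sketch of what they might look like, assuming atomic tokens are annotated leaf tags like <mi>/<mn> and affix tokens are accent operators such as the <mo> in test_parse_accent below; both helpers and the ATOMIC_TOKEN_TAGS list are assumptions inferred from the surrounding tests, not the project's actual implementation:

from bs4 import Tag

# Hypothetical tag list; the docstring above names <mi> and <mn>.
ATOMIC_TOKEN_TAGS = ["mi", "mn"]


def _is_atomic_token(element: Tag) -> bool:
    # A leaf element carrying the s2:* character-offset annotations.
    return (
        element.name in ATOMIC_TOKEN_TAGS
        and element.has_attr("s2:start")
        and element.has_attr("s2:end")
    )


def _is_affix_token(element: Tag) -> bool:
    # An accent operator, e.g., the <mo> inside <mover accent="true">.
    parent = element.parent
    return (
        element.name == "mo"
        and parent is not None
        and parent.name == "mover"
        and parent.get("accent") == "true"
        and element.has_attr("s2:start")
        and element.has_attr("s2:end")
    )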
def test_parse_accent():
    result = parse_element(load_fragment_tag("bar_x.xml"))
    assert len(result.symbols) == 1
    assert str(result.element) == '<mover accent="true"><mi>x</mi><mo>ˉ</mo></mover>'
    symbol = result.symbols[0]
    assert symbol.contains_affix_token
    assert symbol.tokens == [Token("x", "atom", 5, 6), Token("ˉ", "affix", 0, 5)]
def test_detect_function_declaration():
    result = parse_element(load_fragment_tag("function.xml"))
    symbol = result.symbols[0]

    assert symbol.element.text == "p(x;θ,y)"
    assert symbol.type_ == NodeType.FUNCTION
    assert symbol.start == 0
    assert symbol.end == 16
    assert (
        Token("(", "atom", 1, 2) in symbol.tokens
    ), "function tokens should include parentheses"
    assert (
        Token(")", "atom", 15, 16) in symbol.tokens
    ), "function tokens should include parentheses"
    assert not any(
        [t.text == "," for t in symbol.tokens]
    ), "function tokens should not include commas"
    assert not any(
        [t.text == ";" for t in symbol.tokens]
    ), "function tokens should not include semicolons"

    child_symbols = symbol.child_symbols
    assert len(child_symbols) == 4
    assert str(child_symbols[0].element) == "<mi>p</mi>"
    assert str(child_symbols[1].element) == "<mi>x</mi>"
    assert str(child_symbols[2].element) == "<mi>θ</mi>"
    assert str(child_symbols[3].element) == "<mi>y</mi>"
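The separator assertions above are consistent with a parser that keeps grouping parentheses as tokens while dropping argument separators. A hedged sketch of that filtering, using the Token class from the tests above; SEPARATORS and _function_tokens are hypothetical names, not necessarily how the parser does it:

from typing import List

SEPARATORS = {",", ";"}  # hypothetical; inferred from the assertions above


def _function_tokens(all_tokens: List[Token]) -> List[Token]:
    # Keep parentheses (and all other tokens); drop separators between arguments.
    return [t for t in all_tokens if t.text not in SEPARATORS]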
Example #5
def test_parse_single_symbol():
    result = parse_element(load_fragment_tag("x.xml"))
    assert len(result.symbols) == 1
    symbol = result.symbols[0]
    assert str(symbol.element) == "<mi>x</mi>"
    assert symbol.children == []
    assert symbol.tokens == [Token(0, 1, "x", 0)]
    assert result.tokens == [Token(0, 1, "x", 0)]
def test_parse_node_with_child_nodes():
    result = parse_element(load_fragment_tag("x_sub_i.xml"))
    symbol = result.symbols[0]
    assert str(symbol.element) == "<msub><mi>x</mi><mi>i</mi></msub>"
    assert len(symbol.children) == 2
    assert str(symbol.children[0].element) == "<mi>x</mi>"
    assert str(symbol.children[1].element) == "<mi>i</mi>"
    assert symbol.tokens == [
        Token(0, 1, "x", 0),
        Token(2, 3, "i", 1),
    ]
def test_parse_single_symbol():
    result = parse_element(load_fragment_tag("x.xml"))
    assert len(result.symbols) == 1
    symbol = result.symbols[0]
    assert str(symbol.element) == "<mi>x</mi>"
    assert symbol.type_ == NodeType.IDENTIFIER
    assert symbol.children == []
    assert symbol.tokens == [Token("x", "atom", 0, 1)]
    assert symbol.start == 0
    assert symbol.end == 1
    assert not symbol.defined
    assert not symbol.contains_affix_token
    assert result.tokens == [Token("x", "atom", 0, 1)]
def test_get_token_bounding_box():
    s = symbol(tokens=[Token("x", "atom", 0, 1)])
    token_locations = {
        token_id(0, 1): [BoundingBox(0.01, 0.01, 0.01, 0.01, 0)]
    }
    box = get_symbol_bounding_box(s, symbol_id(), token_locations)
    assert box == BoundingBox(0.01, 0.01, 0.01, 0.01, 0)
def test_merge_contiguous_symbols():
    result = parse_element(load_fragment_tag("relu.xml"))
    assert str(result.element) == "<mi>ReLU</mi>"
    symbol = result.symbols[0]
    assert str(symbol.element) == "<mi>ReLU</mi>"
    assert symbol.children == []
    assert symbol.tokens == [
        Token(0, 4, "ReLU", 0),
    ]
Example #10
def test_merge_bounding_boxes():
    s = symbol(tokens=[Token("x", "atom", 0, 1), Token("y", "atom", 2, 3)])
    token_locations = {
        token_id(0, 1): [
            BoundingBox(0.01, 0.01, 0.01, 0.01, 0),
            # Expand the bounding box downward .01 of the page
            BoundingBox(0.01, 0.02, 0.01, 0.01, 0),
        ],
        # Expand the bounding box rightward .01 of the page
        token_id(2, 3): [BoundingBox(0.02, 0.01, 0.01, 0.01, 0)],
        # Ignore this bounding box for an irrelevant token
        token_id(4, 5): [BoundingBox(0.03, 0.01, 0.01, 0.01, 0)],
    }
    box = get_symbol_bounding_box(s, symbol_id(), token_locations)
    assert box.left == 0.01
    assert box.top == 0.01
    assert abs(box.width - 0.02) < 0.0001
    assert abs(box.height - 0.02) < 0.0001
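Both bounding-box tests above are consistent with a simple union over the boxes of a symbol's tokens. A minimal sketch of that merge, assuming BoundingBox exposes left/top/width/height/page as in the fixtures; _merge_token_boxes is a hypothetical helper, not necessarily how get_symbol_bounding_box is implemented:

from typing import List, Optional


def _merge_token_boxes(boxes: List[BoundingBox]) -> Optional[BoundingBox]:
    if not boxes:
        return None
    # Union of all boxes: smallest top-left corner, largest bottom-right extent.
    left = min(b.left for b in boxes)
    top = min(b.top for b in boxes)
    right = max(b.left + b.width for b in boxes)
    bottom = max(b.top + b.height for b in boxes)
    return BoundingBox(left, top, right - left, bottom - top, boxes[0].page)

On the fixtures in test_merge_bounding_boxes this yields left 0.01, top 0.01, width 0.02, and height 0.02, matching the assertions.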
def _extract_tokens(element: Tag) -> List[Token]:
    """
    Get the tokens defined in this element. Tokens are only found in low-level elements like
    "<mi>" and "<mn>". This function will find no tokens in higher-level nodes that solely
    group other low-level elements (like "<mrow>" and "<msub>").
    """

    tokens = []
    if element.name in TOKEN_TAGS and _has_s2_token_annotations(element):
        tokens.append(
            Token(
                # Convert text to a primitive type. 'element.string' is a NavigableString,
                # which causes recursion errors when serialized.
                text=str(element.string),
                token_index=int(element["s2:index"]),
                start=int(element["s2:start"]),
                end=int(element["s2:end"]),
            ))

    return tokens
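`TOKEN_TAGS` and `_has_s2_token_annotations` are referenced but not shown. A plausible sketch, assuming the predicate simply verifies that the s2:* attributes read above are present; both definitions are assumptions inferred from this snippet:

from bs4 import Tag

TOKEN_TAGS = ["mi", "mn"]  # hypothetical; the docstring names <mi> and <mn>


def _has_s2_token_annotations(element: Tag) -> bool:
    # The s2:* attributes accessed by _extract_tokens above.
    return all(
        element.has_attr(attr) for attr in ("s2:index", "s2:start", "s2:end")
    )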
Example #12
def _extract_tokens(element: Tag) -> List[Token]:
    """
    Get the tokens defined in this element. Tokens are only found in low-level elements like
    "<mi>" and "<mn>". This function will find no tokens in higher-level nodes that solely
    group other low-level elements (like "<mrow>" and "<msub>").
    """

    tokens = []

    if _is_token(element):
        tokens.append(
            Token(
                # Convert text to a primitive type. 'element.string' is a NavigableString,
                # which causes recursion errors when serialized.
                text=str(element.string),
                token_index=int(element["s2:index"]),
                start=int(element["s2:start"]),
                end=int(element["s2:end"]),
            ))

    return tokens
Example #13
def test_d_is_symbol_on_its_own():
    result = parse_element(load_fragment_tag("d.xml"))
    assert len(result.symbols) == 1
    assert str(result.symbols[0].element) == "<mi>d</mi>"
    assert result.symbols[0].tokens == [Token(0, 1, "d", 0)]
Example #14
def test_number_is_not_a_symbol():
    result = parse_element(load_fragment_tag("1.xml"))
    assert str(result.element) == "<mn>1</mn>"
    assert result.symbols == []
    assert result.tokens == [Token(0, 1, "1", 0)]
Example #15
def load_symbols(arxiv_id: ArxivId) -> Optional[List[SymbolWithId]]:
    tokens_dir = directories.arxiv_subdir("detected-equation-tokens", arxiv_id)
    tokens_path = os.path.join(tokens_dir, "entities.csv")

    symbols_dir = directories.arxiv_subdir("detected-symbols", arxiv_id)
    symbols_path = os.path.join(symbols_dir, "entities.csv")
    symbol_tokens_path = os.path.join(symbols_dir, "symbol_tokens.csv")
    symbol_children_path = os.path.join(symbols_dir, "symbol_children.csv")

    file_not_found = False
    if not os.path.exists(tokens_path):
        logging.info("Tokens data missing for paper %s. Skipping.", arxiv_id)
        file_not_found = True
    if not os.path.exists(symbols_path):
        logging.info("Symbols data missing for paper %s. Skipping.", arxiv_id)
        file_not_found = True
    if not os.path.exists(symbol_tokens_path):
        logging.info("Symbol tokens data missing for paper %s. Skipping.",
                     arxiv_id)
        file_not_found = True
    if not os.path.exists(symbol_children_path):
        logging.info("No symbol children data found for paper %s.", arxiv_id)
        file_not_found = True

    if file_not_found:
        return None

    loaded_tokens = load_from_csv(tokens_path, SerializableToken)
    loaded_symbols = load_from_csv(symbols_path, SerializableSymbol)
    loaded_symbol_tokens = load_from_csv(symbol_tokens_path,
                                         SerializableSymbolToken)
    loaded_symbol_children = load_from_csv(symbol_children_path,
                                           SerializableChild)

    tokens_by_id: Dict[TokenId, Token] = {}
    for t in loaded_tokens:
        token_id = TokenId(t.tex_path, t.equation_index, t.relative_start,
                           t.relative_end)
        tokens_by_id[token_id] = Token(start=t.relative_start,
                                       end=t.relative_end,
                                       text=t.text,
                                       type_=t.type_)

    symbols_by_id: Dict[SymbolId, Symbol] = {}
    for s in loaded_symbols:
        symbol_id = SymbolId(s.tex_path, s.equation_index, s.symbol_index)
        symbols_by_id[symbol_id] = Symbol(
            tex_path=s.tex_path,
            equation_index=s.equation_index,
            symbol_index=s.symbol_index,
            tokens=[],
            start=s.start,
            end=s.end,
            tex=s.tex,
            mathml=s.mathml,
            children=[],
            parent=None,
            is_definition=s.is_definition,
            equation=s.equation,
            relative_start=s.relative_start,
            relative_end=s.relative_end,
            contains_affix=s.contains_affix,
        )

    for st in loaded_symbol_tokens:
        symbol_id = SymbolId(st.tex_path, st.equation_index, st.symbol_index)
        token_id = TokenId(st.tex_path, st.equation_index, st.start, st.end)
        if token_id in tokens_by_id and symbol_id in symbols_by_id:
            symbols_by_id[symbol_id].tokens.append(tokens_by_id[token_id])

    for c in loaded_symbol_children:
        parent_id = SymbolId(c.tex_path, c.equation_index, c.symbol_index)
        child_id = SymbolId(c.tex_path, c.equation_index, c.child_index)
        try:
            child_symbol = symbols_by_id[child_id]
            parent_symbol = symbols_by_id[parent_id]
            parent_symbol.children.append(child_symbol)
            child_symbol.parent = parent_symbol
        except KeyError:
            logging.warning(
                "Could not load child symbol %s or parent symbol %s for paper %s when "
                "associating the two as parent and child. There may have been an error "
                "in the equation parser, like a failure to find tokens for the child symbol.",
                child_id,
                parent_id,
                arxiv_id,
            )

    return [
        SymbolWithId(symbol_id, symbol)
        for symbol_id, symbol in symbols_by_id.items()
    ]
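A hedged usage sketch for load_symbols: the arXiv ID value is illustrative, ArxivId is assumed to be a string alias, and the symbol_id/symbol attribute names are assumed from the SymbolWithId constructor call above:

symbols = load_symbols("1811.12889")  # illustrative arXiv ID
if symbols is not None:
    for symbol_with_id in symbols:
        # Each entry pairs a SymbolId with its fully linked Symbol.
        print(symbol_with_id.symbol_id, symbol_with_id.symbol.mathml)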