Beispiel #1
0
def load_symbols(arxiv_id: ArxivId) -> Optional[List[SymbolWithId]]:
    """Load a paper's detected symbols, with token indexes and children attached.

    Reads "symbols.csv", "symbol_tokens.csv" and "symbol_children.csv" from the
    directory returned by
    ``directories.arxiv_subdir("detected-equation-tokens", arxiv_id)``.

    Rows in the token and children files that refer to a symbol absent from
    "symbols.csv" are skipped with a warning instead of raising ``KeyError``,
    so a single dangling reference does not abort loading of the whole paper.

    Args:
        arxiv_id: ID of the paper whose symbols should be loaded.

    Returns:
        One ``SymbolWithId`` per distinct symbol ID found in "symbols.csv"
        (a later row replaces an earlier row with the same ID), or ``None``
        if any of the three files does not exist.
    """
    data_dir = directories.arxiv_subdir("detected-equation-tokens", arxiv_id)
    symbols_path = os.path.join(data_dir, "symbols.csv")
    symbol_tokens_path = os.path.join(data_dir, "symbol_tokens.csv")
    symbol_children_path = os.path.join(data_dir, "symbol_children.csv")

    # Check every file before bailing out so that each missing one is logged.
    file_not_found = False
    if not os.path.exists(symbols_path):
        logging.info("Symbols data missing for paper %s. Skipping.", arxiv_id)
        file_not_found = True
    if not os.path.exists(symbol_tokens_path):
        logging.info("Symbol tokens data missing for paper %s. Skipping.",
                     arxiv_id)
        file_not_found = True
    if not os.path.exists(symbol_children_path):
        logging.info("No symbol children data found for paper %s.", arxiv_id)
        file_not_found = True

    if file_not_found:
        return None

    loaded_symbols = load_from_csv(symbols_path, SerializableSymbol)
    loaded_symbol_tokens = load_from_csv(symbol_tokens_path,
                                         SerializableSymbolToken)
    loaded_symbol_children = load_from_csv(symbol_children_path,
                                           SerializableChild)

    # First pass: create every symbol with empty token and child lists.
    symbols_by_id: Dict[SymbolId, Symbol] = {}
    for s in loaded_symbols:
        symbol_id = SymbolId(s.tex_path, s.equation_index, s.symbol_index)
        symbols_by_id[symbol_id] = Symbol(tokens=[],
                                          start=s.start,
                                          end=s.end,
                                          tex=s.tex,
                                          mathml=s.mathml,
                                          children=[])

    # Second pass: attach token indexes to the symbols that own them.
    for t in loaded_symbol_tokens:
        symbol_id = SymbolId(t.tex_path, t.equation_index, t.symbol_index)
        owner = symbols_by_id.get(symbol_id)
        if owner is None:
            logging.warning(
                "Could not find symbol %s when loading its tokens for paper %s. "
                "Skipping token %s.",
                symbol_id,
                arxiv_id,
                t.token_index,
            )
            continue
        owner.tokens.append(t.token_index)

    # Third pass: link parents to children. Both ends must have been loaded.
    for c in loaded_symbol_children:
        parent_id = SymbolId(c.tex_path, c.equation_index, c.symbol_index)
        child_id = SymbolId(c.tex_path, c.equation_index, c.child_index)
        child_symbol = symbols_by_id.get(child_id)
        parent_symbol = symbols_by_id.get(parent_id)
        if child_symbol is None or parent_symbol is None:
            logging.warning(
                "Could not load child symbol %s or parent symbol %s for paper %s. "
                "Skipping this parent-child link.",
                child_id,
                parent_id,
                arxiv_id,
            )
            continue
        parent_symbol.children.append(child_symbol)

    return [
        SymbolWithId(symbol_id, symbol)
        for symbol_id, symbol in symbols_by_id.items()
    ]
Beispiel #2
0
def test_annotate_one_symbol():
    """A lone one-character symbol gets wrapped in symbol and equation tags."""
    source_tex = "some text... $x$"
    symbol_table = {SymbolId(TEX_PATH, 0, 0): Symbol([0], ARBITRARY_MATHML, [])}
    character_table = {CharacterId(TEX_PATH, 0, 0): Character("x", 0, 0, 1)}

    # The second element of the returned pair is not checked by this test.
    annotated, _ignored = annotate_symbols_and_equations_for_file(
        source_tex,
        TEX_PATH,
        symbol_table,
        character_table,
    )

    expected = "some text... <<equation>>$<<symbol>>x<</symbol>>$<</equation>>"
    assert annotated == expected
Beispiel #3
0
def test_annotate_nested_symbols():
    """A composite symbol and both of its children are each tagged, nested."""
    source_tex = "$x_i$"

    base = Symbol([0], ARBITRARY_MATHML, [])
    subscript = Symbol([1], ARBITRARY_MATHML, [])
    composite = Symbol([0, 1], ARBITRARY_MATHML, children=[base, subscript])

    symbol_table = {
        SymbolId(TEX_PATH, 0, 0): composite,
        SymbolId(TEX_PATH, 0, 1): base,
        SymbolId(TEX_PATH, 0, 2): subscript,
    }
    character_table = {
        CharacterId(TEX_PATH, 0, 0): Character("x", 0, 0, 1),
        CharacterId(TEX_PATH, 0, 1): Character("i", 1, 2, 3),
    }

    # The second element of the returned pair is not checked by this test.
    annotated, _ignored = annotate_symbols_and_equations_for_file(
        source_tex,
        TEX_PATH,
        symbol_table,
        character_table,
    )

    # Outer symbol tags enclose the tagged "x", the bare "_" and the tagged "i".
    expected = (
        "<<equation>>$"
        "<<symbol>>"
        "<<symbol>>x<</symbol>>"
        "_"
        "<<symbol>>i<</symbol>>"
        "<</symbol>>"
        "$<</equation>>"
    )
    assert annotated == expected
def symbol_id():
    """Return a ``SymbolId`` built from the fixed values ("tex-path", 0, 0)."""
    tex_path, equation_index, symbol_index = "tex-path", 0, 0
    return SymbolId(tex_path, equation_index, symbol_index)
Beispiel #5
0
def load_symbols(arxiv_id: ArxivId) -> Optional[List[SymbolWithId]]:
    """Load a paper's detected symbols, with token indexes and children attached.

    Reads "entities.csv", "symbol_tokens.csv" and "symbol_children.csv" from
    the directory returned by
    ``directories.arxiv_subdir("detected-symbols", arxiv_id)``.

    Token rows and parent-child rows that refer to a symbol absent from
    "entities.csv" are skipped with a warning rather than raising
    ``KeyError``, so a single dangling reference does not abort the load.

    Args:
        arxiv_id: ID of the paper whose symbols should be loaded.

    Returns:
        One ``SymbolWithId`` per distinct symbol ID found in "entities.csv"
        (a later row replaces an earlier row with the same ID), or ``None``
        if any of the three files does not exist.
    """
    data_dir = directories.arxiv_subdir("detected-symbols", arxiv_id)
    symbols_path = os.path.join(data_dir, "entities.csv")
    symbol_tokens_path = os.path.join(data_dir, "symbol_tokens.csv")
    symbol_children_path = os.path.join(data_dir, "symbol_children.csv")

    # Check every file before bailing out so that each missing one is logged.
    file_not_found = False
    if not os.path.exists(symbols_path):
        logging.info("Symbols data missing for paper %s. Skipping.", arxiv_id)
        file_not_found = True
    if not os.path.exists(symbol_tokens_path):
        logging.info("Symbol tokens data missing for paper %s. Skipping.", arxiv_id)
        file_not_found = True
    if not os.path.exists(symbol_children_path):
        logging.info("No symbol children data found for paper %s.", arxiv_id)
        file_not_found = True

    if file_not_found:
        return None

    loaded_symbols = load_from_csv(symbols_path, SerializableSymbol)
    loaded_symbol_tokens = load_from_csv(symbol_tokens_path, SerializableSymbolToken)
    loaded_symbol_children = load_from_csv(symbol_children_path, SerializableChild)

    # First pass: create every symbol with empty token and child lists.
    symbols_by_id: Dict[SymbolId, Symbol] = {}
    for s in loaded_symbols:
        symbol_id = SymbolId(s.tex_path, s.equation_index, s.symbol_index)
        symbols_by_id[symbol_id] = Symbol(
            tokens=[],
            start=s.start,
            end=s.end,
            tex=s.tex,
            mathml=s.mathml,
            children=[],
            is_definition=s.is_definition,
            equation=s.equation,
            relative_start=s.relative_start,
            relative_end=s.relative_end,
        )

    # Second pass: attach token indexes to the symbols that own them.
    for t in loaded_symbol_tokens:
        symbol_id = SymbolId(t.tex_path, t.equation_index, t.symbol_index)
        owner = symbols_by_id.get(symbol_id)
        if owner is None:
            logging.warning(
                "Could not find symbol %s when loading its tokens for paper %s. "
                "Skipping token %s.",
                symbol_id,
                arxiv_id,
                t.token_index,
            )
            continue
        owner.tokens.append(t.token_index)

    # Third pass: link parents to children; a missing end of the link is
    # reported and the link is dropped.
    for c in loaded_symbol_children:
        parent_id = SymbolId(c.tex_path, c.equation_index, c.symbol_index)
        child_id = SymbolId(c.tex_path, c.equation_index, c.child_index)
        try:
            child_symbol = symbols_by_id[child_id]
            symbols_by_id[parent_id].children.append(child_symbol)
        except KeyError:
            logging.warning(
                "Could not load child symbol %s for symbol %s for paper %s. "
                "There may have been an error in the equation parser, like a failure to "
                "find tokens for the child symbol.",
                child_id,
                parent_id,
                arxiv_id,
            )

    return [
        SymbolWithId(symbol_id, symbol) for symbol_id, symbol in symbols_by_id.items()
    ]
Beispiel #6
0
def load_symbols(arxiv_id: ArxivId) -> Optional[List[SymbolWithId]]:
    """Load a paper's symbols with their tokens and parent/child links resolved.

    Tokens come from "entities.csv" under the "detected-equation-tokens"
    directory; symbols, symbol-token rows and symbol-child rows come from
    "entities.csv", "symbol_tokens.csv" and "symbol_children.csv" under the
    "detected-symbols" directory.

    Symbol-token rows whose token or symbol was not loaded are skipped
    silently; symbol-child rows whose child or parent was not loaded are
    skipped with a warning.

    Args:
        arxiv_id: ID of the paper whose symbols should be loaded.

    Returns:
        One ``SymbolWithId`` per loaded symbol, or ``None`` if any of the
        four input files does not exist.
    """
    tokens_dir = directories.arxiv_subdir("detected-equation-tokens", arxiv_id)
    symbols_dir = directories.arxiv_subdir("detected-symbols", arxiv_id)

    tokens_path = os.path.join(tokens_dir, "entities.csv")
    symbols_path = os.path.join(symbols_dir, "entities.csv")
    symbol_tokens_path = os.path.join(symbols_dir, "symbol_tokens.csv")
    symbol_children_path = os.path.join(symbols_dir, "symbol_children.csv")

    # Every required file is checked (in this order) so that each missing one
    # gets its own log line before the function gives up.
    required_files = (
        (tokens_path, "Tokens data missing for paper %s. Skipping."),
        (symbols_path, "Symbols data missing for paper %s. Skipping."),
        (symbol_tokens_path, "Symbol tokens data missing for paper %s. Skipping."),
        (symbol_children_path, "No symbol children data found for paper %s."),
    )
    any_missing = False
    for required_path, missing_message in required_files:
        if not os.path.exists(required_path):
            logging.info(missing_message, arxiv_id)
            any_missing = True
    if any_missing:
        return None

    token_rows = load_from_csv(tokens_path, SerializableToken)
    symbol_rows = load_from_csv(symbols_path, SerializableSymbol)
    symbol_token_rows = load_from_csv(symbol_tokens_path,
                                      SerializableSymbolToken)
    symbol_child_rows = load_from_csv(symbol_children_path, SerializableChild)

    # Tokens are keyed by file, equation, and their span within the equation.
    tokens_by_id: Dict[TokenId, Token] = {
        TokenId(row.tex_path, row.equation_index, row.relative_start,
                row.relative_end): Token(
                    start=row.relative_start,
                    end=row.relative_end,
                    text=row.text,
                    type_=row.type_,
                )
        for row in token_rows
    }

    # Symbols start out with no tokens, children or parent; those are filled
    # in from the other two files below.
    symbols_by_id: Dict[SymbolId, Symbol] = {
        SymbolId(row.tex_path, row.equation_index, row.symbol_index): Symbol(
            tex_path=row.tex_path,
            equation_index=row.equation_index,
            symbol_index=row.symbol_index,
            tokens=[],
            start=row.start,
            end=row.end,
            tex=row.tex,
            mathml=row.mathml,
            children=[],
            parent=None,
            is_definition=row.is_definition,
            equation=row.equation,
            relative_start=row.relative_start,
            relative_end=row.relative_end,
            contains_affix=row.contains_affix,
        )
        for row in symbol_rows
    }

    for row in symbol_token_rows:
        owner_id = SymbolId(row.tex_path, row.equation_index, row.symbol_index)
        token_id = TokenId(row.tex_path, row.equation_index, row.start, row.end)
        if token_id not in tokens_by_id or owner_id not in symbols_by_id:
            continue
        symbols_by_id[owner_id].tokens.append(tokens_by_id[token_id])

    for row in symbol_child_rows:
        parent_id = SymbolId(row.tex_path, row.equation_index, row.symbol_index)
        child_id = SymbolId(row.tex_path, row.equation_index, row.child_index)
        try:
            child_symbol = symbols_by_id[child_id]
            parent_symbol = symbols_by_id[parent_id]
        except KeyError:
            logging.warning(
                "Could not load child symbol %s or parent symbol %s for paper %s when associating "
                "the two as parent and child. There may have been an error in the equation "
                "parser, like a failure to find tokens for the child symbol.",
                child_id,
                parent_id,
                arxiv_id,
            )
        else:
            # Link in both directions only once both ends are known to exist.
            parent_symbol.children.append(child_symbol)
            child_symbol.parent = parent_symbol

    return [
        SymbolWithId(id_, loaded_symbol)
        for id_, loaded_symbol in symbols_by_id.items()
    ]
Beispiel #7
0
    def load(self) -> Iterator[SymbolData]:
        """Yield one ``SymbolData`` bundle for each loadable paper in ``self.arxiv_ids``.

        A paper is skipped when it has no S2 ID, when
        ``file_utils.load_symbols`` returns ``None``, or when its
        "symbol_locations.csv" or "matches.csv" file does not exist. A
        missing "contexts.csv" is not fatal: the paper is still yielded,
        with empty context collections.
        """
        for arxiv_id in self.arxiv_ids:
            s2_id = get_s2_id(arxiv_id)
            if s2_id is None:
                continue

            symbols_with_ids = file_utils.load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue
            symbols_by_id = {
                entry.symbol_id: entry.symbol for entry in symbols_with_ids
            }

            # Bounding boxes, keyed by symbol ID.
            locations_dir = directories.arxiv_subdir("symbol-locations",
                                                     arxiv_id)
            boxes_path = os.path.join(locations_dir, "symbol_locations.csv")
            if not os.path.exists(boxes_path):
                logging.warning(
                    "Could not find bounding boxes information for %s. Skipping",
                    arxiv_id,
                )
                continue
            boxes: Dict[SymbolId, BoundingBox] = {}
            for location in file_utils.load_from_csv(boxes_path,
                                                     SymbolLocation):
                located_id = SymbolId(
                    tex_path=location.tex_path,
                    equation_index=location.equation_index,
                    symbol_index=location.symbol_index,
                )
                boxes[located_id] = BoundingBox(
                    page=int(location.page),
                    left=location.left,
                    top=location.top,
                    width=location.width,
                    height=location.height,
                )

            # Matches, grouped by the MathML that was queried.
            matches_dir = directories.arxiv_subdir("symbol-matches", arxiv_id)
            matches_path = os.path.join(matches_dir, "matches.csv")
            if not os.path.exists(matches_path):
                logging.warning(
                    "Could not find symbol matches information for %s. Skipping",
                    arxiv_id,
                )
                continue
            matches: Matches = {}
            for match in file_utils.load_from_csv(matches_path, Match):
                matches.setdefault(match.queried_mathml, []).append(match)

            # Contexts are optional; both collections stay empty without them.
            symbol_contexts = {}
            mathml_contexts = defaultdict(list)
            contexts_dir = directories.arxiv_subdir("contexts-for-symbols",
                                                    arxiv_id)
            contexts_path = os.path.join(contexts_dir, "contexts.csv")
            if os.path.exists(contexts_path):
                for context in file_utils.load_from_csv(contexts_path,
                                                        Context):
                    # The entity ID is parsed as "<equation index>-<symbol index>".
                    equation_index, symbol_index = [
                        int(part) for part in context.entity_id.split("-")
                    ]
                    context_symbol_id = SymbolId(context.tex_path,
                                                 equation_index, symbol_index)
                    symbol_contexts[context_symbol_id] = context
                    context_symbol = symbols_by_id[context_symbol_id]
                    mathml_contexts[context_symbol.mathml].append(context)
            else:
                logging.warning(
                    "Contexts have not been found for symbols for arXiv paper %s. "
                    "Symbol data will be uploaded without contexts.",
                    arxiv_id,
                )

            # Defining formulas: only for symbols flagged as definitions that
            # carry an equation and a span within it.
            symbol_formulas = {}
            mathml_formulas = defaultdict(set)
            for id_, symbol in symbols_by_id.items():
                if (not symbol.is_definition or symbol.equation is None
                        or symbol.relative_start is None
                        or symbol.relative_end is None):
                    continue
                highlighted_tex = wrap_span(
                    symbol.equation,
                    symbol.relative_start,
                    symbol.relative_end,
                    before=r"\htmlClass{match-highlight}{",
                    after="}",
                    braces=True,
                )
                formula = DefiningFormula(
                    tex=highlighted_tex,
                    tex_path=id_.tex_path,
                    equation_id=id_.equation_index,
                )
                symbol_formulas[id_] = formula
                mathml_formulas[symbol.mathml].add(formula)

            yield SymbolData(
                arxiv_id,
                s2_id,
                symbols_with_ids,
                boxes,
                symbol_contexts,
                symbol_formulas,
                mathml_contexts,
                mathml_formulas,
                matches,
            )
Beispiel #8
0
    def load(self) -> Iterator[SymbolData]:
        """Yield one ``SymbolData`` bundle for each loadable paper in ``self.arxiv_ids``.

        A paper is skipped when it has no S2 ID, when
        ``file_utils.load_symbols`` returns ``None``, or when its
        "symbol_locations.csv" or "matches.csv" file does not exist. A
        missing "entity_sentences.csv" is not fatal: the paper is still
        yielded, with an empty symbol-to-sentence mapping.
        """
        for arxiv_id in self.arxiv_ids:

            s2_id = get_s2_id(arxiv_id)
            if s2_id is None:
                continue

            symbols_with_ids = file_utils.load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue

            # Bounding boxes, keyed by symbol ID.
            boxes: Dict[SymbolId, BoundingBox] = {}
            boxes_path = os.path.join(
                directories.arxiv_subdir("symbol-locations", arxiv_id),
                "symbol_locations.csv",
            )
            if not os.path.exists(boxes_path):
                logging.warning(
                    "Could not find bounding boxes information for %s. Skipping",
                    arxiv_id,
                )
                continue
            for location in file_utils.load_from_csv(boxes_path,
                                                     SymbolLocation):
                symbol_id = SymbolId(
                    tex_path=location.tex_path,
                    equation_index=location.equation_index,
                    symbol_index=location.symbol_index,
                )
                box = BoundingBox(
                    page=int(location.page),
                    left=location.left,
                    top=location.top,
                    width=location.width,
                    height=location.height,
                )
                boxes[symbol_id] = box

            # Matches, grouped by the MathML that was queried.
            matches: Matches = {}
            matches_path = os.path.join(
                directories.arxiv_subdir("symbol-matches", arxiv_id),
                "matches.csv")
            if not os.path.exists(matches_path):
                logging.warning(
                    "Could not find symbol matches information for %s. Skipping",
                    arxiv_id,
                )
                continue
            for match in file_utils.load_from_csv(matches_path, Match):
                matches.setdefault(match.queried_mathml, []).append(match)

            # Start from an empty mapping for EVERY paper. Previously this was
            # only assigned when the sentences file existed, so a paper
            # without it either raised UnboundLocalError (first iteration) or
            # was yielded with the previous paper's sentences.
            symbol_sentences: Dict[SymbolId, SentenceKey] = {}
            sentences_path = os.path.join(
                directories.arxiv_subdir("sentences-for-symbols", arxiv_id),
                "entity_sentences.csv",
            )
            if os.path.exists(sentences_path):
                for pair in file_utils.load_from_csv(sentences_path,
                                                     EntitySentencePairIds):
                    # The entity ID is parsed as "<equation index>-<symbol index>".
                    equation_index, symbol_index = [
                        int(t) for t in pair.entity_id.split("-")
                    ]
                    sentence_key = SentenceKey(pair.tex_path, pair.sentence_id)
                    symbol_id = SymbolId(pair.tex_path, equation_index,
                                         symbol_index)
                    symbol_sentences[symbol_id] = sentence_key
            else:
                logging.warning(
                    "Symbols for arXiv paper %s have not been aligned to sentences. "
                    "Symbol data will be uploaded without links to sentences",
                    arxiv_id,
                )

            yield SymbolData(
                arxiv_id,
                s2_id,
                symbols_with_ids,
                boxes,
                symbol_sentences,
                matches,
            )