Beispiel #1
0
    def save(self, item: CitationData, _: None) -> None:
        """Upload bounding boxes for the citations of one paper.

        One entity is created for each (citation key, cluster) pair found in
        'item.citation_locations'. Citation keys that are missing from
        'item.key_s2_ids' are skipped with a warning. All created entities are
        passed to 'upload_entities' in a single call, together with
        'self.args.data_version'.

        Args:
            item: citation data for the paper. This method reads its
                'citation_locations', 'key_s2_ids', 's2_id', and 'arxiv_id'.
            _: unused.
        """
        citation_locations = item.citation_locations
        key_s2_ids = item.key_s2_ids

        entity_infos = []
        for citation_key, locations in citation_locations.items():

            if citation_key not in key_s2_ids:
                logging.warning(
                    "Not uploading bounding box information for citation with key "
                    "%s because it was not resolved to a paper S2 ID.",
                    citation_key,
                )
                continue

            # The paper ID is the same for every cluster of this citation key.
            paper_id = key_s2_ids[citation_key]
            for cluster_index, location_set in locations.items():
                boxes = cast(List[BoundingBox], list(location_set))
                entity_info = EntityUploadInfo(
                    id_=f"{citation_key}-{cluster_index}",
                    type_="citation",
                    bounding_boxes=boxes,
                    data={"key": citation_key, "paper_id": paper_id},
                )
                entity_infos.append(entity_info)

        upload_entities(item.s2_id, item.arxiv_id, entity_infos, self.args.data_version)
Beispiel #2
0
def upload_sentences(processing_summary: PaperProcessingResult,
                     data_version: Optional[int]) -> None:
    """Upload every sentence in 'processing_summary' together with its bounding boxes.

    Each sentence is identified by its TeX path and ID. Its text, its TeX, and its
    start and end offsets are attached as data. 'data_version' is passed through
    unchanged to 'upload_entities'.
    """

    def to_upload_info(summary):
        # Package a single sentence and its locations for upload.
        sentence_boxes = [cast(BoundingBox, location) for location in summary.locations]
        found = cast(SentenceEntity, summary.entity)
        sentence_data = {
            "text": found.text,
            "tex": found.tex,
            "tex_start": found.start,
            "tex_end": found.end,
        }
        return EntityUploadInfo(
            id_=f"{found.tex_path}-{found.id_}",
            type_="sentence",
            bounding_boxes=sentence_boxes,
            data=sentence_data,
        )

    uploads = [to_upload_info(summary) for summary in processing_summary.entities]
    upload_entities(
        processing_summary.s2_id, processing_summary.arxiv_id, uploads, data_version
    )
Beispiel #3
0
def upload_equations(processing_summary: PaperProcessingResult,
                     data_version: Optional[int]) -> None:
    """Upload every equation in 'processing_summary' together with its bounding boxes.

    Each equation is identified by its TeX path and ID, and its TeX is attached as
    data. 'data_version' is passed through unchanged to 'upload_entities'.
    """

    uploads = []
    for summary in processing_summary.entities:
        locations = [cast(BoundingBox, location) for location in summary.locations]
        eq = cast(Equation, summary.entity)
        uploads.append(
            EntityUploadInfo(
                id_=f"{eq.tex_path}-{eq.id_}",
                type_="equation",
                bounding_boxes=locations,
                data={"tex": eq.tex},
            )
        )

    upload_entities(
        processing_summary.s2_id, processing_summary.arxiv_id, uploads, data_version
    )
Beispiel #4
0
def upload_symbols(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:
    """Upload all symbols in 'processing_summary' with their data and relationships.

    Besides the symbols themselves, three CSV files from the paper's data
    directories are consulted: MathML matches ('matches.csv'), parent-child
    relationships between symbols ('symbol_children.csv'), and the contexts that
    symbols appear in ('contexts.csv'). If any of these files is missing, a warning
    is logged and the symbols are uploaded without that information. In particular,
    a symbol that has no recorded MathML matches is uploaded with an empty
    'mathml_near_matches' list.

    Args:
        processing_summary: the symbols found in a paper and their locations.
        data_version: passed through unchanged to 'upload_entities'.
    """

    arxiv_id = processing_summary.arxiv_id
    entities = [es.entity for es in processing_summary.entities]
    symbols = cast(List[SerializableSymbol], entities)
    symbols_by_id = {sid(s): s for s in symbols}

    # Load MathML matches, used for partial matching of symbols. Matches are grouped
    # by the MathML that was queried.
    matches: Matches = {}
    matches_path = os.path.join(
        directories.arxiv_subdir("symbol-matches", arxiv_id), "matches.csv",
    )
    if os.path.exists(matches_path):
        for match in file_utils.load_from_csv(matches_path, Match):
            matches.setdefault(match.queried_mathml, []).append(match)
    else:
        logging.warning(
            "Could not find symbol matches information for paper %s.", arxiv_id,
        )

    # Load parent-child relationships for symbols.
    children: Dict[SymbolId, List[SymbolId]] = defaultdict(list)
    parents: Dict[SymbolId, SymbolId] = {}
    children_path = os.path.join(
        directories.arxiv_subdir("detected-symbols", arxiv_id), "symbol_children.csv"
    )
    if os.path.exists(children_path):
        for parent in file_utils.load_from_csv(children_path, SerializableChild):
            pid = f"{parent.tex_path}-{parent.equation_index}-{parent.symbol_index}"
            cid = f"{parent.tex_path}-{parent.equation_index}-{parent.child_index}"
            parents[cid] = pid
            children[pid].append(cid)
    else:
        logging.warning(
            "Could not find file mapping from symbol to their children for paper %s.",
            arxiv_id,
        )

    # Load the contexts that the symbols appear in. Contexts are indexed by symbol ID
    # and also grouped by the MathML of the symbol they belong to.
    symbol_contexts = {}
    mathml_contexts = defaultdict(list)
    contexts_path = os.path.join(
        directories.arxiv_subdir("contexts-for-symbols", arxiv_id), "contexts.csv",
    )
    if os.path.exists(contexts_path):
        for context in file_utils.load_from_csv(contexts_path, Context):
            symbol_id = f"{context.tex_path}-{context.entity_id}"
            symbol_contexts[symbol_id] = context
            # NOTE(review): this lookup raises KeyError if a context refers to a
            # symbol that is not in 'processing_summary'. Confirm whether such
            # contexts can occur and should be skipped instead.
            symbol = symbols_by_id[symbol_id]
            mathml_contexts[symbol.mathml].append(context)
    else:
        logging.warning(
            "Contexts have not been found for symbols for arXiv paper %s. "
            "Symbol data will be uploaded without contexts.",
            arxiv_id,
        )

    # Prepare collections of formulae that each symbol was found in.
    symbol_formulas = {}
    mathml_formulas: Dict[str, Set[DefiningFormula]] = defaultdict(set)
    for symbol in symbols:
        if (
            symbol.is_definition
            and symbol.equation is not None
            and symbol.relative_start is not None
            and symbol.relative_end is not None
        ):
            # Highlight the symbol within the TeX of the equation that defines it.
            highlighted = wrap_span(
                symbol.equation,
                symbol.relative_start,
                symbol.relative_end,
                before=r"\htmlClass{match-highlight}{",
                after="}",
                braces=True,
            )
            formula = DefiningFormula(
                tex=highlighted,
                tex_path=symbol.tex_path,
                equation_id=str(symbol.equation_index),
            )
            symbol_formulas[sid(symbol)] = formula
            mathml_formulas[symbol.mathml].add(formula)

    # Bounding boxes taller than this get the symbol tagged as "large".
    MAX_BOX_HEIGHT = 0.1

    entity_infos: List[EntityUploadInfo] = []
    for localized_entity in processing_summary.entities:

        symbol = cast(SerializableSymbol, localized_entity.entity)
        symbol_id = sid(symbol)
        boxes = [
            BoundingBox(l.left, l.top, l.width, l.height, l.page)
            for l in localized_entity.locations
        ]

        # Get context and formula of the symbol, and other matching ones. Only the
        # first snippet is kept for each sentence, and only the first formula for
        # each equation.
        symbol_context = symbol_contexts.get(symbol_id)
        matching_contexts = mathml_contexts.get(symbol.mathml, [])
        other_context_texs = []
        other_context_sentence_ids = []
        for c in matching_contexts:
            matching_sentence_id = f"{c.tex_path}-{c.sentence_id}"
            if matching_sentence_id not in other_context_sentence_ids:
                other_context_texs.append(c.snippet)
                other_context_sentence_ids.append(matching_sentence_id)

        matching_formulas = mathml_formulas.get(symbol.mathml, set())
        other_formula_texs = []
        other_formula_ids = []
        for f in matching_formulas:
            equation_id = f"{f.tex_path}-{f.equation_id}"
            if equation_id not in other_formula_ids:
                other_formula_texs.append(f.tex)
                other_formula_ids.append(equation_id)

        # Package up data for the symbol.
        tags: List[str] = []
        for b in boxes:
            if b.height > MAX_BOX_HEIGHT:
                logging.debug(
                    "Detected large bounding box for symbol with height %f for entity %s of paper "
                    "%s. Entity will be given a tag indicating it is unexpectedly large.",
                    b.height,
                    f"{localized_entity.entity.tex_path}-{localized_entity.entity.id_}",
                    arxiv_id,
                )
                tags.append("large")
                break

        data: EntityData = {
            "tex": f"${symbol.tex}$",
            "tex_start": symbol.start,
            "tex_end": symbol.end,
            "type": symbol.type_,
            "mathml": symbol.mathml,
            # A symbol may have no recorded matches (for instance, when the matches
            # file is missing), in which case it has no near matches.
            "mathml_near_matches": [
                m.matching_mathml for m in matches.get(symbol.mathml, [])
            ],
            "snippets": other_context_texs,
            "defining_formulas": other_formula_texs,
            "is_definition": symbol.is_definition or False,
            "tags": tags,
        }

        # Create links between this symbol, its sentence, and related symbols.
        sentence_id = (
            f"{symbol_context.tex_path}-{symbol_context.sentence_id}"
            if symbol_context is not None
            else None
        )

        parent_id = parents.get(symbol_id)
        child_ids = children.get(symbol_id, [])

        relationships: EntityRelationships = {
            "equation": EntityReference(
                type_="equation", id_=f"{symbol.tex_path}-{symbol.equation_index}",
            ),
            "parent": EntityReference(type_="symbol", id_=parent_id),
            "children": [EntityReference(type_="symbol", id_=id_) for id_ in child_ids],
            # 'sentence_id' is None when no context was found for the symbol.
            "sentence": EntityReference(type_="sentence", id_=sentence_id),
            "defining_formula_equations": [
                EntityReference(type_="equation", id_=id_) for id_ in other_formula_ids
            ],
            "snippet_sentences": [
                EntityReference(type_="sentence", id_=id_)
                for id_ in other_context_sentence_ids
            ],
        }

        # Save all data for this symbol.
        entity_information = EntityUploadInfo(
            id_=symbol_id,
            type_="symbol",
            bounding_boxes=boxes,
            data=data,
            relationships=relationships,
        )
        entity_infos.append(entity_information)

    upload_entities(
        processing_summary.s2_id, arxiv_id, entity_infos, data_version,
    )
Beispiel #5
0
def upload_terms(processing_summary: PaperProcessingResult,
                 data_version: Optional[int]) -> None:
    """Upload the glossary terms in 'processing_summary'.

    Contexts for the terms are read from 'contexts.csv' in the paper's
    'contexts-for-glossary-terms' directory. The bounding boxes of each term are
    clustered, and one entity is uploaded per cluster. 'data_version' is passed
    through unchanged to 'upload_entities'.
    """

    arxiv_id = processing_summary.arxiv_id
    contexts_path = os.path.join(
        directories.arxiv_subdir("contexts-for-glossary-terms", arxiv_id),
        "contexts.csv",
    )
    contexts_by_entity = {
        (c.tex_path, c.entity_id): c
        for c in file_utils.load_from_csv(contexts_path, Context)
    }

    # Group contexts by the text of the term, so that every appearance of a term can
    # show the contexts of all appearances with the same text.
    contexts_by_term: Dict[str, List[Context]] = defaultdict(list)
    for summary in processing_summary.entities:
        term = cast(Term, summary.entity)
        entity_key = (term.tex_path, term.id_)
        if entity_key in contexts_by_entity:
            contexts_by_term[term.text].append(contexts_by_entity[entity_key])

    uploads = []
    for summary in processing_summary.entities:
        term = cast(Term, summary.entity)
        context = contexts_by_entity.get((term.tex_path, term.id_))
        boxes = [cast(BoundingBox, location) for location in summary.locations]
        term_contexts = contexts_by_term.get(term.text, [])

        # Cluster bounding boxes, in case any of these terms are defined as a macro (in which
        # case all appearances of that term on the same page will have been lumped together).
        for cluster_index, cluster in enumerate(
                cluster_boxes(boxes, vertical_split=0.005)):
            if context is None:
                sentence_id = None
            else:
                sentence_id = f"{context.tex_path}-{context.sentence_id}"

            term_data = {
                "name": term.text,
                "definitions": term.definitions,
                "definition_texs": term.definitions,
                "sources": term.sources,
                "snippets": [c.snippet for c in term_contexts],
            }
            term_relationships = {
                "sentence": EntityReference(type_="sentence", id_=sentence_id),
                "snippet_sentences": [
                    EntityReference(type_="sentence",
                                    id_=f"{c.tex_path}-{c.sentence_id}")
                    for c in term_contexts
                ],
            }
            uploads.append(
                EntityUploadInfo(
                    id_=f"{term.tex_path}-{term.id_}-{cluster_index}",
                    type_="term",
                    bounding_boxes=list(cluster),
                    data=term_data,
                    relationships=term_relationships,
                )
            )

    upload_entities(
        processing_summary.s2_id, processing_summary.arxiv_id, uploads, data_version
    )
Beispiel #6
0
def upload_term_definitions(
    processing_summary: PaperProcessingResult, data_version: Optional[int]
) -> None:
    """Upload textual terms and their definitions.

    Only entities for which 'is_textual_term' is true are uploaded. Each uploaded
    term is linked to the sentence it appears in, to the sentences that contain its
    definitions, and to the sentences of every snippet that contains a term with the
    same text. A definition whose sentence is unknown is linked with an ID of None.

    Args:
        processing_summary: the entities found in a paper, with their locations and
            contexts.
        data_version: passed through unchanged to 'upload_entities'.
    """

    # Group contextual snippets for each term.
    term_infos = []
    contexts_by_term_name: Dict[TermName, List[Context]] = defaultdict(list)
    for entity_summary in processing_summary.entities:
        entity = entity_summary.entity
        context = entity_summary.context
        if is_textual_term(entity) and context is not None:
            contexts_by_term_name[entity.text].append(context)  # type: ignore

    # Construct mapping from definitions to the sentences that contain them.
    contexts_by_definition: Dict[EntityId, Context] = {}
    for entity_summary in processing_summary.entities:
        entity_id = entity_summary.entity.id_
        context = entity_summary.context
        if entity_id.startswith("definition") and context is not None:
            contexts_by_definition[entity_id] = context

    # Upload information for each term.
    for entity_summary in processing_summary.entities:
        entity = entity_summary.entity
        if not is_textual_term(entity):
            continue

        term = cast(TermReference, entity)
        context = entity_summary.context
        boxes = [cast(BoundingBox, l) for l in entity_summary.locations]

        # Assemble list of snippets that include this term.
        contexts_matching_term = contexts_by_term_name.get(term.text, [])
        snippets = [c.snippet for c in contexts_matching_term]
        snippet_sentences = [
            f"{c.tex_path}-{c.sentence_id}" for c in contexts_matching_term
        ]

        # Create links to the sentences containing definitions for this term. There is
        # exactly one entry for each definition ID, in the same order. The entry is None
        # when no context is known for the definition.
        definition_sentences: List[Optional[str]] = []
        for definition_id in term.definition_ids:
            definition_context = contexts_by_definition.get(definition_id)
            if definition_context is None:
                definition_sentences.append(None)
            else:
                definition_sentences.append(
                    f"{definition_context.tex_path}-{definition_context.sentence_id}"
                )

        term_info = EntityUploadInfo(
            id_=term.id_,
            type_="term",
            bounding_boxes=boxes,
            data={
                "name": term.text,
                "term_type": term.type_ or "unknown",
                "definitions": term.definitions,
                "definition_texs": term.definition_texs,
                "sources": term.sources,
                # A list of snippets from everywhere the term appears in the paper.
                "snippets": snippets,
            },
            relationships={
                # Link the term to the sentence it belongs to. This link is necessary to enable
                # visual filtering in the UI where, when a term is clicked, the sentence is
                # highlighted and all others are lowlighted.
                "sentence": EntityReference(
                    type_="sentence",
                    id_=f"{context.tex_path}-{context.sentence_id}"
                    if context is not None
                    else None,
                ),
                # IDs of the sentences that contain each of the definitions for a term. These IDs
                # can be used to establish links that take a user to the site of a definition.
                "definition_sentences": [
                    EntityReference(type_="sentence", id_=id_)
                    for id_ in definition_sentences
                ],
                # The IDs of each sentence where the term appears elsewhere in the paper (i.e.,
                # one for each of the 'snippets' in the entity data above). Used to link from a
                # snippet that is shown in a list of snippets to where it appears in the paper.
                "snippet_sentences": [
                    EntityReference(type_="sentence", id_=id_)
                    for id_ in snippet_sentences
                ],
            },
        )
        term_infos.append(term_info)

    upload_entities(
        processing_summary.s2_id, processing_summary.arxiv_id, term_infos, data_version,
    )