Example #1
0
def upload_sentences(processing_summary: PaperProcessingResult,
                     data_version: Optional[int]) -> None:
    """Upload the localized sentences of a paper as "sentence" entities.

    Every item of ``processing_summary.localized_entities`` is treated as a
    sentence together with its locations. One ``EntityInformation`` is built
    per item, with an ID of the form ``<tex_path>-<id_>``, and the complete
    list is handed to ``upload_entities`` in a single call along with the
    paper's S2 ID, arXiv ID, and ``data_version``.
    """
    # Pair each sentence with its bounding boxes before building the entities.
    localized_sentences = [
        (
            cast(SentenceEntity, localized.entity),
            [cast(BoundingBox, location) for location in localized.locations],
        )
        for localized in processing_summary.localized_entities
    ]

    entity_infos = [
        EntityInformation(
            id_=f"{sentence.tex_path}-{sentence.id_}",
            type_="sentence",
            bounding_boxes=sentence_boxes,
            data={
                "text": sentence.text,
                "tex": sentence.tex,
                "tex_start": sentence.start,
                "tex_end": sentence.end,
            },
        )
        for sentence, sentence_boxes in localized_sentences
    ]

    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        entity_infos,
        data_version,
    )
Example #2
0
def upload_terms(processing_summary: PaperProcessingResult,
                 data_version: Optional[int]) -> None:
    """Upload the localized terms of a paper as "term" entities.

    Each item of ``processing_summary.localized_entities`` is treated as a
    term. The uploaded entity is identified by ``<tex_path>-<id_>`` and
    carries the term's name, definitions, sources, and ``val`` attribute,
    plus the bounding boxes of the term's locations.
    """
    uploads = []
    for localized in processing_summary.localized_entities:
        term = cast(TermEntity, localized.entity)
        uploads.append(
            EntityInformation(
                id_=f"{term.tex_path}-{term.id_}",
                type_="term",
                bounding_boxes=[
                    cast(BoundingBox, location)
                    for location in localized.locations
                ],
                data={
                    "name": term.name,
                    "definitions": term.definitions,
                    "sources": term.sources,
                    "val": term.val,
                },
            ))

    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        uploads,
        data_version,
    )
Example #3
0
    def save(self, item: CitationData, _: None) -> None:
        """Upload bounding boxes for the citations that resolved to an S2 paper.

        ``item.citation_locations`` is walked as citation key -> cluster
        index -> collection of locations. One "citation" entity is created
        per cluster, with the ID ``<citation_key>-<cluster_index>`` and the
        cluster's locations as its bounding boxes. Citation keys that are
        missing from ``item.key_s2_ids`` are logged with a warning and
        skipped, because there is no paper ID to attach to them.

        All entities are uploaded with a single ``upload_entities`` call
        using ``self.args.data_version``.
        """
        citation_locations = item.citation_locations
        key_s2_ids = item.key_s2_ids

        entity_infos = []
        for citation_key, locations in citation_locations.items():

            if citation_key not in key_s2_ids:
                # Adjacent literals are joined at compile time, so the message
                # stays a single lazily-formatted logging template.
                logging.warning(
                    "Not uploading bounding box information for citation with key "
                    "%s because it was not resolved to a paper S2 ID.",
                    citation_key,
                )
                continue

            # The paper ID is the same for every cluster of this citation.
            paper_id = key_s2_ids[citation_key]
            for cluster_index, location_set in locations.items():
                boxes = cast(List[BoundingBox], list(location_set))
                entity_infos.append(
                    EntityInformation(
                        id_=f"{citation_key}-{cluster_index}",
                        type_="citation",
                        bounding_boxes=boxes,
                        data={
                            "key": citation_key,
                            "paper_id": paper_id,
                        },
                    ))

        upload_entities(item.s2_id, item.arxiv_id, entity_infos,
                        self.args.data_version)
Example #4
0
def upload_equations(processing_summary: PaperProcessingResult,
                     data_version: Optional[int]) -> None:
    """Upload the localized equations of a paper as "equation" entities.

    Each item of ``processing_summary.localized_entities`` is treated as an
    equation. The uploaded entity is identified by ``<tex_path>-<id_>``,
    carries the equation's TeX as its only data field, and uses the item's
    locations as bounding boxes.
    """
    equation_infos = []
    for localized in processing_summary.localized_entities:
        equation = cast(Equation, localized.entity)
        equation_boxes = [
            cast(BoundingBox, location) for location in localized.locations
        ]
        equation_infos.append(
            EntityInformation(
                id_=f"{equation.tex_path}-{equation.id_}",
                type_="equation",
                bounding_boxes=equation_boxes,
                data={"tex": equation.tex},
            ))

    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        equation_infos,
        data_version,
    )
Example #5
0
    def save(self, item: SymbolData, _: None) -> None:
        """Upload every symbol that has a bounding box as a "symbol" entity.

        Symbols whose TeX is ``$|$`` or ``|`` and symbols without an entry in
        ``item.boxes`` are skipped. For each remaining symbol, the uploaded
        entity contains:

        * data: the symbol's TeX (wrapped in ``$``), its start and end
          offsets, its MathML, the MathML of its near matches, the snippets
          and formula TeXs registered under the same MathML in
          ``item.mathml_contexts`` and ``item.mathml_formulas``, and an
          ``is_definition`` flag;
        * relationships: references to its equation, parent symbol, child
          symbols, and sentence, and to the sentences and equations that the
          snippets and formulas came from.

        All entities are uploaded with a single ``upload_entities`` call
        using ``self.args.data_version``.
        """
        symbols_with_ids = item.symbols_with_ids
        boxes = item.boxes
        matches = item.matches
        symbol_contexts = item.symbol_contexts
        mathml_contexts = item.mathml_contexts
        mathml_formulas = item.mathml_formulas

        def create_symbol_id_string(sid: "SymbolId") -> str:
            """Format a symbol ID as it is written into uploaded entities."""
            return f"{sid.tex_path}-{sid.equation_index}-{sid.symbol_index}"

        # Child symbols are mapped back to their IDs by object identity.
        symbol_ids_by_symbol_object_ids = {
            id(symbol_with_id.symbol): symbol_with_id.symbol_id
            for symbol_with_id in symbols_with_ids
        }

        entity_infos = []

        for symbol_with_id in symbols_with_ids:
            symbol = symbol_with_id.symbol
            # TODO(andrewhead): move this filtering condition into 'parse_equation'
            if symbol.tex in ["$|$", "|"]:
                continue

            symbol_id = symbol_with_id.symbol_id

            # A symbol without a bounding box is never uploaded, so skip it
            # before gathering its snippets, formulas, and relationships.
            box = boxes.get(symbol_id)
            if box is None:
                continue

            # Collect one snippet per distinct sentence among the contexts
            # registered under this symbol's MathML, keeping the first snippet
            # seen for each sentence.
            # NOTE: the symbol's own sentence is not excluded from this list.
            context = symbol_contexts.get(symbol_id)
            other_context_texs = []
            other_context_sentence_ids = []
            for c in mathml_contexts.get(symbol.mathml, []):
                matching_sentence_id = f"{c.tex_path}-{c.sentence_id}"
                if matching_sentence_id not in other_context_sentence_ids:
                    other_context_texs.append(c.snippet)
                    other_context_sentence_ids.append(matching_sentence_id)

            # Likewise, collect one formula per distinct equation.
            # NOTE: the symbol's own equation is not excluded from this list.
            other_formula_texs = []
            other_formula_ids = []
            for f in mathml_formulas.get(symbol.mathml, []):
                equation_id = f"{f.tex_path}-{f.equation_id}"
                if equation_id not in other_formula_ids:
                    other_formula_texs.append(f.tex)
                    other_formula_ids.append(equation_id)

            data: EntityData = {
                "tex": f"${symbol.tex}$",
                "tex_start": symbol.start,
                "tex_end": symbol.end,
                "mathml": symbol.mathml,
                "mathml_near_matches":
                [m.matching_mathml for m in matches[symbol.mathml]],
                "snippets": other_context_texs,
                "defining_formulas": other_formula_texs,
                "is_definition": symbol.is_definition or False,
            }

            sentence_id = (f"{context.tex_path}-{context.sentence_id}"
                           if context is not None else None)

            # Find the symbol that lists this one among its children. There
            # is deliberately no 'break': if several symbols contain this one,
            # the last match is used, as in the previous implementation.
            parent_id: Optional[str] = None
            for other_symbol_with_id in symbols_with_ids:
                if symbol in other_symbol_with_id.symbol.children:
                    parent_id = create_symbol_id_string(
                        other_symbol_with_id.symbol_id)

            child_ids = [
                create_symbol_id_string(
                    symbol_ids_by_symbol_object_ids[id(child_symbol)])
                for child_symbol in symbol.children
            ]

            relationships: EntityRelationships = {
                "equation":
                EntityReference(
                    type_="equation",
                    id_=f"{symbol_id.tex_path}-{symbol_id.equation_index}",
                ),
                "parent":
                EntityReference(type_="symbol", id_=parent_id),
                "children": [
                    EntityReference(type_="symbol", id_=id_)
                    for id_ in child_ids
                ],
                # 'sentence_id' is None when the symbol has no known context.
                "sentence":
                EntityReference(type_="sentence", id_=sentence_id),
                "defining_formula_equations": [
                    EntityReference(type_="equation", id_=id_)
                    for id_ in other_formula_ids
                ],
                "snippet_sentences": [
                    EntityReference(type_="sentence", id_=id_)
                    for id_ in other_context_sentence_ids
                ],
            }

            entity_information = EntityInformation(
                id_=create_symbol_id_string(symbol_id),
                type_="symbol",
                bounding_boxes=[box],
                data=data,
                relationships=relationships,
            )
            entity_infos.append(entity_information)

        upload_entities(item.s2_id, item.arxiv_id, entity_infos,
                        self.args.data_version)
Example #6
0
def upload_terms(processing_summary: PaperProcessingResult,
                 data_version: Optional[int]) -> None:
    """Upload glossary terms, with context snippets, as "term" entities.

    Contexts are loaded from ``contexts.csv`` in the paper's
    ``contexts-for-glossary-terms`` arXiv subdirectory and indexed by
    ``(tex_path, entity_id)``. Every appearance of a term is uploaded with
    the snippets of all contexts found for appearances sharing the same
    term text. The bounding boxes of one appearance are clustered, and one
    entity is uploaded per cluster, identified by
    ``<tex_path>-<id_>-<cluster index>``.
    """
    contexts_path = os.path.join(
        directories.arxiv_subdir("contexts-for-glossary-terms",
                                 processing_summary.arxiv_id),
        "contexts.csv",
    )
    contexts_by_entity = {
        (loaded.tex_path, loaded.entity_id): loaded
        for loaded in file_utils.load_from_csv(contexts_path, Context)
    }

    # Group the contexts of all appearances by the text of the term, so that
    # each appearance can show the snippets of every other appearance.
    contexts_by_term: Dict[str, List[Context]] = defaultdict(list)
    for localized in processing_summary.localized_entities:
        term = cast(Term, localized.entity)
        own_context = contexts_by_entity.get((term.tex_path, term.id_))
        if own_context is not None:
            contexts_by_term[term.text].append(own_context)

    entity_infos = []
    for localized in processing_summary.localized_entities:
        term = cast(Term, localized.entity)
        own_context = contexts_by_entity.get((term.tex_path, term.id_))
        boxes = [
            cast(BoundingBox, location) for location in localized.locations
        ]

        # These values are the same for every cluster of this appearance.
        if own_context is not None:
            sentence_id = f"{own_context.tex_path}-{own_context.sentence_id}"
        else:
            sentence_id = None
        term_contexts = contexts_by_term.get(term.text, [])

        # Cluster bounding boxes, in case any of these terms are defined as a
        # macro (in which case all appearances of that term on the same page
        # will have been lumped together).
        for cluster_index, cluster in enumerate(
                cluster_boxes(boxes, vertical_split=0.005)):
            entity_infos.append(
                EntityInformation(
                    id_=f"{term.tex_path}-{term.id_}-{cluster_index}",
                    type_="term",
                    bounding_boxes=list(cluster),
                    data={
                        "name": term.text,
                        "definitions": term.definitions,
                        "definition_texs": term.definitions,
                        "sources": term.sources,
                        "snippets": [c.snippet for c in term_contexts],
                    },
                    relationships={
                        "sentence":
                        EntityReference(type_="sentence", id_=sentence_id),
                        "snippet_sentences": [
                            EntityReference(
                                type_="sentence",
                                id_=f"{c.tex_path}-{c.sentence_id}")
                            for c in term_contexts
                        ],
                    },
                ))

    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        entity_infos,
        data_version,
    )
Example #7
0
def upload_definitions(processing_summary: PaperProcessingResult,
                       data_version: Optional[int]) -> None:
    """Upload definitions and the terms that refer to them.

    The kind of each localized entity is decided by the prefix of its ID:
    IDs starting with ``definition`` are uploaded as "definition" entities,
    and IDs starting with ``definiendum`` or ``term-reference`` are uploaded
    as "term" entities. Entities with any other ID prefix are ignored. Both
    kinds reference their sentence when a sentence ID is available, and
    terms additionally reference their definitions.
    """
    definition_infos = []
    term_infos = []

    for localized in processing_summary.localized_entities:
        boxes = [
            cast(BoundingBox, location) for location in localized.locations
        ]
        entity = localized.entity

        if entity.id_.startswith("definition"):
            definition = cast(Definition, entity)
            if definition.sentence_id is not None:
                definition_sentence_id = (
                    f"{definition.tex_path}-{definition.sentence_id}")
            else:
                definition_sentence_id = None

            definition_infos.append(
                EntityInformation(
                    id_=definition.id_,
                    type_="definition",
                    bounding_boxes=boxes,
                    data={
                        "definiendum": definition.definiendum,
                        "definition": definition.text,
                        "tex": definition.tex,
                    },
                    relationships={
                        "sentence":
                        EntityReference(type_="sentence",
                                        id_=definition_sentence_id),
                    },
                ))

        if entity.id_.startswith(("definiendum", "term-reference")):
            term = cast(TermReference, entity)
            if term.sentence_id is not None:
                term_sentence_id = f"{term.tex_path}-{term.sentence_id}"
            else:
                term_sentence_id = None

            term_infos.append(
                EntityInformation(
                    id_=term.id_,
                    type_="term",
                    bounding_boxes=boxes,
                    data={
                        "name": term.text,
                        "definitions": term.definitions,
                        "definition_texs": term.definition_texs,
                        "sources": term.sources,
                        "term_type": term.type_ or "unknown"
                    },
                    relationships={
                        "sentence":
                        EntityReference(type_="sentence",
                                        id_=term_sentence_id),
                        "definitions": [
                            EntityReference(type_="definition",
                                            id_=definition_id)
                            for definition_id in term.definition_ids
                        ],
                    },
                ))

    # Upload definitions before terms, because terms hold references to
    # definitions that can only be resolved once the definitions have been
    # uploaded. The tuple order below is therefore significant.
    for infos in (definition_infos, term_infos):
        upload_entities(
            processing_summary.s2_id,
            processing_summary.arxiv_id,
            infos,
            data_version,
        )
Example #8
0
    def save(self, item: SymbolData, _: None) -> None:
        """Upload every symbol that has a bounding box as a "symbol" entity.

        Symbols without an entry in ``item.boxes`` are skipped. Each uploaded
        entity carries the symbol's TeX (wrapped in ``$``), its start and end
        offsets, its MathML, and the MathML of its near matches. It also
        references its child symbols and, when one is recorded in
        ``item.symbol_sentences``, the sentence it belongs to.
        """
        symbols_with_ids = item.symbols_with_ids
        boxes = item.boxes
        matches = item.matches
        symbol_sentences = item.symbol_sentences

        # Child symbols are mapped back to their IDs by object identity.
        ids_by_object_id = {
            id(entry.symbol): entry.symbol_id
            for entry in symbols_with_ids
        }

        entity_infos = []
        for entry in symbols_with_ids:
            symbol = entry.symbol
            symbol_id = entry.symbol_id

            box = boxes.get(symbol_id)
            if box is None:
                continue

            data: EntityData = {
                "tex": f"${symbol.tex}$",
                "tex_start": symbol.start,
                "tex_end": symbol.end,
                "mathml": symbol.mathml,
                "mathml_near_matches":
                [m.matching_mathml for m in matches[symbol.mathml]],
            }

            sentence_key = symbol_sentences.get(symbol_id)
            if sentence_key is None:
                sentence_id = None
            else:
                sentence_id = (
                    f"{sentence_key.tex_path}-{sentence_key.sentence_id}")

            child_references = [
                EntityReference(
                    type_="symbol",
                    id_=f"{child_id.tex_path}-{child_id.equation_index}-{child_id.symbol_index}",
                )
                for child_id in [
                    ids_by_object_id[id(child)] for child in symbol.children
                ]
            ]

            relationships: EntityRelationships = {
                "children": child_references,
                # The reference is still emitted, with a None ID, when the
                # symbol has no recorded sentence.
                "sentence":
                EntityReference(type_="sentence", id_=sentence_id),
            }

            entity_infos.append(
                EntityInformation(
                    id_=
                    f"{symbol_id.tex_path}-{symbol_id.equation_index}-{symbol_id.symbol_index}",
                    type_="symbol",
                    bounding_boxes=[box],
                    data=data,
                    relationships=relationships,
                ))

        upload_entities(item.s2_id, item.arxiv_id, entity_infos,
                        self.args.data_version)