Code example #1
File: locate_citations.py Project: silky/scholarphi
def process(self, item: LocationTask) -> Iterator[CitationLocation]:
    # Emit one CitationLocation per box, tagged with the index of the
    # cluster that the box belongs to.
     for i, cluster in enumerate(cluster_boxes(item.boxes)):
         logging.debug(
             "Found cluster of %d box(es) for citations of key %s for paper %s",
             len(cluster),
             item.citation_key,
             item.arxiv_id,
         )
         for box in cluster:
             yield CitationLocation(
                 key=item.citation_key,
                 cluster_index=i,
                 page=box.page,
                 left=box.left,
                 top=box.top,
                 width=box.width,
                 height=box.height,
             )
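
A minimal sketch of how this step might be invoked. The LocationTask constructor shown here is an assumption, inferred from the attributes that process() reads (arxiv_id, citation_key, boxes); the IDs and the locator variable are hypothetical placeholders, not the project's actual wiring:

# Hypothetical driver code; all names below are placeholders.
task = LocationTask(
    arxiv_id="0000.00000",        # placeholder arXiv ID
    citation_key="example2020",   # placeholder citation key
    boxes=detected_boxes,         # boxes previously found for this key
)
for location in locator.process(task):  # locator: instance of the class above
    print(location.page, location.cluster_index, location.left, location.top)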
Code example #2
def test_cluster_boxes():

    cluster1_boxes = [
        box(0, 0, 0.01, 0.01, page=1),
        box(0.02, 0, 0.01, 0.01, page=1),  # boxes need not overlap horizontally
        box(0, 0.005, 0.01, 0.02, page=1),
        box(0, 0.014, 0.01, 0.01, page=1),
    ]
    cluster2_boxes = [box(0, 0.05, 0.01, 0.01, page=1)]  # too far below cluster 1
    cluster3_boxes = [box(0, 0, 0.01, 0.01, page=2)]  # on a new page

    all_boxes = cluster1_boxes + cluster2_boxes + cluster3_boxes
    clusters = list(cluster_boxes(all_boxes))
    assert len(clusters) == 3
    assert clusters[0] == set(cluster1_boxes)
    assert clusters[1] == set(cluster2_boxes)
    assert clusters[2] == set(cluster3_boxes)
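
The test pins down the contract that cluster_boxes has to satisfy: boxes on different pages never share a cluster, clusters are yielded as sets in top-to-bottom page order, and a large enough vertical gap opens a new cluster. Below is a minimal sketch consistent with that contract, not the project's actual implementation; it assumes boxes are hashable and expose page, top, and height attributes (the test's box(left, top, width, height, page=...) helper is presumed to build such objects), and that the default vertical_split is 0.01 in page-relative coordinates:

from collections import defaultdict
from typing import Dict, Iterator, List, Set

def cluster_boxes(boxes, vertical_split: float = 0.01) -> Iterator[Set]:
    # Boxes on different pages never share a cluster.
    by_page: Dict[int, List] = defaultdict(list)
    for b in boxes:
        by_page[b.page].append(b)

    for page in sorted(by_page):
        # Scan top to bottom; a vertical gap wider than 'vertical_split'
        # closes the current cluster and starts a new one.
        page_boxes = sorted(by_page[page], key=lambda b: b.top)
        cluster: Set = {page_boxes[0]}
        bottom = page_boxes[0].top + page_boxes[0].height
        for b in page_boxes[1:]:
            if b.top - bottom > vertical_split:
                yield cluster
                cluster = set()
            cluster.add(b)
            bottom = max(bottom, b.top + b.height)
        yield cluster

With this sketch, the test above passes; upload.py below tightens vertical_split to 0.005 so that nearby but distinct term appearances are not merged.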
Code example #3
File: upload.py Project: silky/scholarphi
def upload_terms(processing_summary: PaperProcessingResult,
                 data_version: Optional[int]) -> None:

    arxiv_id = processing_summary.arxiv_id
    contexts = file_utils.load_from_csv(
        os.path.join(
            directories.arxiv_subdir("contexts-for-glossary-terms", arxiv_id),
            "contexts.csv",
        ),
        Context,
    )
    contexts_by_entity = {(c.tex_path, c.entity_id): c for c in contexts}

    # Assemble contexts that should be shown for each term.
    contexts_by_term: Dict[str, List[Context]] = defaultdict(list)
    for entity_and_location in processing_summary.localized_entities:
        term = cast(Term, entity_and_location.entity)
        if (term.tex_path, term.id_) in contexts_by_entity:
            contexts_by_term[term.text].append(
                contexts_by_entity[(term.tex_path, term.id_)])

    entity_infos = []
    for entity_and_location in processing_summary.localized_entities:
        term = cast(Term, entity_and_location.entity)
        context = contexts_by_entity.get((term.tex_path, term.id_))
        boxes = [cast(BoundingBox, loc) for loc in entity_and_location.locations]

        # Cluster bounding boxes, in case any of these terms are defined as a
        # macro, in which case all appearances of that term on the same page
        # will have been lumped together.
        clusters = cluster_boxes(boxes, vertical_split=0.005)
        for i, cluster in enumerate(clusters):
            entity_info = EntityInformation(
                id_=f"{term.tex_path}-{term.id_}-{i}",
                type_="term",
                bounding_boxes=list(cluster),
                data={
                    "name": term.text,
                    "definitions": term.definitions,
                    "definition_texs": term.definitions,
                    "sources": term.sources,
                    "snippets": [
                        c.snippet for c in contexts_by_term.get(term.text, [])
                    ],
                },
                relationships={
                    "sentence": EntityReference(
                        type_="sentence",
                        id_=f"{context.tex_path}-{context.sentence_id}"
                        if context is not None
                        else None,
                    ),
                    "snippet_sentences": [
                        EntityReference(
                            type_="sentence",
                            id_=f"{c.tex_path}-{c.sentence_id}",
                        )
                        for c in contexts_by_term.get(term.text, [])
                    ],
                },
            )
            entity_infos.append(entity_info)

    upload_entities(
        processing_summary.s2_id,
        processing_summary.arxiv_id,
        entity_infos,
        data_version,
    )
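
For reference, the Context records loaded from contexts.csv must carry at least the fields this function reads; a minimal sketch of such a record follows (the project's real Context class may define more fields):

from dataclasses import dataclass

@dataclass(frozen=True)
class Context:
    tex_path: str     # TeX source file the entity appears in
    entity_id: str    # ID of the entity within that file
    sentence_id: str  # ID of the sentence containing the entity
    snippet: str      # excerpt of surrounding text, shown in the UI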