Exemple #1
0
def subtract(rect1: FloatRectangle,
             rect2: FloatRectangle) -> Iterator[FloatRectangle]:
    """
    Yield the rectangles that cover the part of 'rect1' not overlapped by 'rect2'.

    If the two rectangles do not intersect, a single rectangle with the same position and
    size as 'rect1' is yielded. Otherwise between zero and four rectangles are yielded, in
    the order 'split 1' to 'split 4' as labelled below. A split is skipped whenever it would
    have a non-positive width or height, so the number of results depends on how the two
    rectangles overlap.

    rect1
    +--------------------------------------------+
    |                  (split 1)                 |
    |..............+---------------+.............|
    |   (split 2)  |     rect2     |  (split 3)  |
    |..............+---------------+.............|
    |                  (split 4)                 |
    +--------------------------------------------+

    'split 1' and 'split 4' span the full width of 'rect1'. 'split 2' and 'split 3' are
    restricted to the vertical range shared by both rectangles.
    """

    # Far edges of both rectangles.
    outer_right = rect1.left + rect1.width
    outer_bottom = rect1.top + rect1.height
    inner_right = rect2.left + rect2.width
    inner_bottom = rect2.top + rect2.height

    if not are_intersecting(rect1, rect2):
        yield FloatRectangle(rect1.left, rect1.top, rect1.width, rect1.height)
        return

    # Vertical band shared by both rectangles; the left and right splits live inside it.
    band_top = max(rect1.top, rect2.top)
    band_bottom = min(outer_bottom, inner_bottom)
    band_height = band_bottom - band_top

    # 'split 1': full-width strip above 'rect2'.
    top_height = rect2.top - rect1.top
    if rect1.top <= rect2.top <= outer_bottom and top_height > 0:
        yield FloatRectangle(rect1.left, rect1.top, rect1.width, top_height)

    # 'split 2': strip to the left of 'rect2', within the shared band.
    left_width = rect2.left - rect1.left
    if (rect1.left <= rect2.left <= outer_right
            and left_width > 0 and band_height > 0):
        yield FloatRectangle(rect1.left, band_top, left_width, band_height)

    # 'split 3': strip to the right of 'rect2', within the shared band.
    right_width = outer_right - inner_right
    if (rect1.left <= inner_right <= outer_right
            and right_width > 0 and band_height > 0):
        yield FloatRectangle(inner_right, band_top, right_width, band_height)

    # 'split 4': full-width strip below 'rect2'.
    bottom_height = outer_bottom - inner_bottom
    if rect1.top <= inner_bottom <= outer_bottom and bottom_height > 0:
        yield FloatRectangle(rect1.left, inner_bottom, rect1.width,
                             bottom_height)
def fetch_boxes(arxiv_id: ArxivId, schema: str, version: Optional[int],
                types: List[str]) -> Optional[RegionsByPageAndType]:
    """
    Load the bounding boxes of a paper's entities from the database.

    Connections are set up for 'schema' before any query runs. When 'version' is None, the
    largest version index stored for the paper is used instead; if no such version exists,
    a warning is logged and None is returned.

    Only entities whose type is in 'types' are loaded. The result maps each
    (page, entity type) pair to a list with one frozenset of rectangles per entity, holding
    that entity's boxes on that page. The mapping is a defaultdict, so looking up a missing
    key produces an empty list.
    """

    setup_database_connections(schema)

    # Fall back to the most recent version of data in the database for the paper.
    if version is None:
        latest_index = (
            Version.select(fn.Max(Version.index))
            .join(Paper)
            .where(Paper.arxiv_id == arxiv_id)
            .scalar()
        )
        if latest_index is None:
            logging.warning(
                "There are no entities for paper %s in database schema %s",
                arxiv_id,
                schema,
            )
            return None
        version = int(latest_index)

    # One row is returned per bounding box, tagged with its entity's ID and type.
    selected_columns = (
        EntityModel.id,
        EntityModel.type,
        BoundingBoxModel.left,
        BoundingBoxModel.top,
        BoundingBoxModel.width,
        BoundingBoxModel.height,
        BoundingBoxModel.page,
    )
    query = (
        EntityModel.select(*selected_columns)
        .join(Paper)
        .switch(EntityModel)
        .join(BoundingBoxModel)
        .where(
            EntityModel.version == version,
            Paper.arxiv_id == arxiv_id,
            EntityModel.type << types,
        )
    )

    # Collect the boxes and the type of each entity, keyed by the entity's database ID.
    boxes_by_entity_db_id: Dict[str, List[BoundingBox]] = defaultdict(list)
    types_by_entity_db_id: Dict[str, str] = {}
    for row in query.dicts():
        entity_db_id = row["id"]
        box = BoundingBox(
            row["left"],
            row["top"],
            row["width"],
            row["height"],
            row["page"],
        )
        boxes_by_entity_db_id[entity_db_id].append(box)
        types_by_entity_db_id[entity_db_id] = row["type"]

    # Regroup each entity's boxes by page, then file them under (page, entity type).
    regions: RegionsByPageAndType = defaultdict(list)
    for entity_db_id, entity_boxes in boxes_by_entity_db_id.items():
        entity_type = types_by_entity_db_id[entity_db_id]
        for page, page_boxes in group_by_page(entity_boxes).items():
            regions[(page, entity_type)].append(
                frozenset(
                    FloatRectangle(b.left, b.top, b.width, b.height)
                    for b in page_boxes))

    return regions
Exemple #3
0
def fetch_boxes(arxiv_id: ArxivId, schema: str, version: Optional[int],
                types: List[str]) -> Optional[RegionsByPageAndType]:
    """
    Load the bounding boxes of a paper's entities from the database.

    Connections are set up for 'schema' before any query runs. When 'version' is None, the
    largest version index stored for the paper is used instead; if no such version exists,
    a warning is logged and None is returned.

    Only entities whose type is in 'types' are loaded. In addition, rows of type "symbol"
    are dropped when their aggregated entity data contains a "type" entry whose value is not
    in GOLD_SYMBOL_TYPES.

    The result maps each (page, entity type) pair to a list with one frozenset of rectangles
    per entity, holding that entity's boxes on that page. The mapping is a defaultdict, so
    looking up a missing key produces an empty list.
    """

    setup_database_connections(schema)

    # Fall back to the most recent version of data in the database for the paper.
    if version is None:
        latest_index = (
            Version.select(fn.Max(Version.index))
            .join(Paper)
            .where(Paper.arxiv_id == arxiv_id)
            .scalar()
        )
        if latest_index is None:
            logging.warning(
                "There are no entities for paper %s in database schema %s",
                arxiv_id,
                schema,
            )
            return None
        version = int(latest_index)

    # Entities are filtered in two places:
    # 1. In the 'where' clause of the query below.
    # 2. In the loop over the returned rows.
    # Prefer the 'where' clause: it keeps the conditions in one place and could be faster.
    # Rules that are awkward to express in a query (e.g., many interrelated conditions) can
    # instead be written as checks in the row loop.

    # These columns identify one bounding box of one entity. They are both selected and
    # used as the grouping key for the aggregated entity data.
    box_columns = (
        EntityModel.id,
        EntityModel.type,
        BoundingBoxModel.left,
        BoundingBoxModel.top,
        BoundingBoxModel.width,
        BoundingBoxModel.height,
        BoundingBoxModel.page,
    )
    # Aggregate the data of an entity into an array in which every element is a
    # dictionary of the form {"key": "...", "value": "..."}. Values are expected to be
    # strings.
    entity_data = fn.json_agg(
        fn.json_build_object("key", EntityDataModel.key, "value",
                             EntityDataModel.value)).alias("data")
    query = (
        EntityModel.select(*box_columns, entity_data)
        .join(Paper)
        .switch(EntityModel)
        .join(BoundingBoxModel)
        .switch(EntityModel)
        .join(EntityDataModel)
        .where(
            EntityModel.version == version,
            Paper.arxiv_id == arxiv_id,
            EntityModel.type << types,
        )
        .group_by(*box_columns)
    )

    # Collect the boxes and the type of each entity, keyed by the entity's database ID.
    boxes_by_entity_db_id: Dict[str, List[BoundingBox]] = defaultdict(list)
    types_by_entity_db_id: Dict[str, str] = {}
    for row in query.dicts():
        entity_type = row["type"]

        # Skip symbols that are tagged with a symbol type outside of the gold set.
        if entity_type == "symbol" and any(
                d["key"] == "type" and d["value"] not in GOLD_SYMBOL_TYPES
                for d in row["data"]):
            continue

        entity_db_id = row["id"]
        box = BoundingBox(
            row["left"],
            row["top"],
            row["width"],
            row["height"],
            row["page"],
        )
        boxes_by_entity_db_id[entity_db_id].append(box)
        types_by_entity_db_id[entity_db_id] = entity_type

    # Regroup each entity's boxes by page, then file them under (page, entity type).
    regions: RegionsByPageAndType = defaultdict(list)
    for entity_db_id, entity_boxes in boxes_by_entity_db_id.items():
        entity_type = types_by_entity_db_id[entity_db_id]
        for page, page_boxes in group_by_page(entity_boxes).items():
            regions[(page, entity_type)].append(
                frozenset(
                    FloatRectangle(b.left, b.top, b.width, b.height)
                    for b in page_boxes))

    return regions