def subtract(rect1: FloatRectangle, rect2: FloatRectangle) -> Iterator[FloatRectangle]:
    """
    Yield the rectangles that cover the part of 'rect1' left over once 'rect2' is removed.

    If the two rectangles do not intersect, a single copy of 'rect1' is yielded. Otherwise,
    up to four rectangles are yielded, in the order "split 1" through "split 4" as drawn
    below. A split is only yielded when it has a positive size, so the number of rectangles
    produced depends on how 'rect1' and 'rect2' overlap.

    rect1
    ----------------------------------------------
    |                                            |
    |                                   (split 1)|
    |.............------------------.............|
    |             |     rect2      |             |
    |    (split 2)|                |    (split 3)|
    |.............------------------.............|
    |                                   (split 4)|
    ----------------------------------------------
    """
    outer_right = rect1.left + rect1.width
    outer_bottom = rect1.top + rect1.height
    inner_right = rect2.left + rect2.width
    inner_bottom = rect2.top + rect2.height

    # Nothing is removed when the rectangles are disjoint: hand back a copy of 'rect1'.
    if not are_intersecting(rect1, rect2):
        yield FloatRectangle(rect1.left, rect1.top, rect1.width, rect1.height)
        return

    # Vertical extent shared by the left and right splits (the rows where both
    # rectangles are present).
    band_top = max(rect1.top, rect2.top)
    band_bottom = min(outer_bottom, inner_bottom)
    band_height = band_bottom - band_top

    # Split 1: full-width strip of 'rect1' above the top edge of 'rect2'.
    if rect1.top <= rect2.top <= outer_bottom:
        strip_height = rect2.top - rect1.top
        if strip_height > 0:
            yield FloatRectangle(rect1.left, rect1.top, rect1.width, strip_height)

    # Split 2: piece of 'rect1' to the left of 'rect2', limited to the shared band.
    if rect1.left <= rect2.left <= outer_right:
        left_width = rect2.left - rect1.left
        if left_width > 0 and band_height > 0:
            yield FloatRectangle(rect1.left, band_top, left_width, band_height)

    # Split 3: piece of 'rect1' to the right of 'rect2', limited to the shared band.
    if rect1.left <= inner_right <= outer_right:
        right_width = outer_right - inner_right
        if right_width > 0 and band_height > 0:
            yield FloatRectangle(inner_right, band_top, right_width, band_height)

    # Split 4: full-width strip of 'rect1' below the bottom edge of 'rect2'.
    if rect1.top <= inner_bottom <= outer_bottom:
        strip_height = outer_bottom - inner_bottom
        if strip_height > 0:
            yield FloatRectangle(rect1.left, inner_bottom, rect1.width, strip_height)
def fetch_boxes(arxiv_id: ArxivId, schema: str, version: Optional[int],
                types: List[str]) -> Optional[RegionsByPageAndType]:
    """
    Load the bounding boxes of a paper's entities from the database.

    Database connections are set up for 'schema' first. When 'version' is None, the
    largest version index recorded for the paper is used; if no version is found, a
    warning is logged and None is returned.

    Only entities whose type is in 'types' are fetched. The result maps each
    (page, entity type) pair to a list with one item per entity: a frozenset of the
    FloatRectangles that the entity occupies on that page.
    """
    setup_database_connections(schema)

    # Fall back to the most recent version of data stored for the paper.
    if version is None:
        latest_version_query = (
            Version.select(fn.Max(Version.index))
            .join(Paper)
            .where(Paper.arxiv_id == arxiv_id)
        )
        latest_version = latest_version_query.scalar()
        if latest_version is None:
            logging.warning(  # pylint: disable=logging-not-lazy
                "There are no entities for paper %s in database schema %s",
                arxiv_id,
                schema,
            )
            return None
        version = int(latest_version)

    # One row is returned per (entity, bounding box) pair.
    selection = EntityModel.select(
        EntityModel.id,
        EntityModel.type,
        BoundingBoxModel.left,
        BoundingBoxModel.top,
        BoundingBoxModel.width,
        BoundingBoxModel.height,
        BoundingBoxModel.page,
    )
    joined = selection.join(Paper).switch(EntityModel).join(BoundingBoxModel)
    rows = joined.where(
        EntityModel.version == version,
        Paper.arxiv_id == arxiv_id,
        EntityModel.type << types,
    ).dicts()

    # Collect the boxes and the type of each entity, keyed by the entity's database ID.
    entity_boxes: Dict[str, List[BoundingBox]] = defaultdict(list)
    entity_types: Dict[str, str] = {}
    for row in rows:
        entity_id = row["id"]
        box = BoundingBox(
            row["left"],
            row["top"],
            row["width"],
            row["height"],
            row["page"],
        )
        entity_boxes[entity_id].append(box)
        entity_types[entity_id] = row["type"]

    # Regroup: each entity contributes one set of rectangles per page it appears on.
    regions: RegionsByPageAndType = defaultdict(list)
    for entity_id, boxes in entity_boxes.items():
        entity_type = entity_types[entity_id]
        for page, page_boxes in group_by_page(boxes).items():
            rectangles = frozenset(
                FloatRectangle(box.left, box.top, box.width, box.height)
                for box in page_boxes
            )
            regions[(page, entity_type)].append(rectangles)

    return regions
def fetch_boxes(arxiv_id: ArxivId, schema: str, version: Optional[int],
                types: List[str]) -> Optional[RegionsByPageAndType]:
    """
    Load the bounding boxes of a paper's entities from the database.

    Database connections are set up for 'schema' first. When 'version' is None, the
    largest version index recorded for the paper is used; if no version is found, a
    warning is logged and None is returned.

    Only entities whose type is in 'types' are fetched. Rows of type "symbol" are
    additionally dropped when any of their data items with key "type" has a value
    outside of GOLD_SYMBOL_TYPES. The result maps each (page, entity type) pair to a
    list with one item per entity: a frozenset of the FloatRectangles that the entity
    occupies on that page.
    """
    setup_database_connections(schema)

    # Fall back to the most recent version of data stored for the paper.
    if version is None:
        latest_version_query = (
            Version.select(fn.Max(Version.index))
            .join(Paper)
            .where(Paper.arxiv_id == arxiv_id)
        )
        latest_version = latest_version_query.scalar()
        if latest_version is None:
            logging.warning(  # pylint: disable=logging-not-lazy
                "There are no entities for paper %s in database schema %s",
                arxiv_id,
                schema,
            )
            return None
        version = int(latest_version)

    # Entities are filtered in two places:
    #   1. In the 'where' clause of the query built below.
    #   2. In the loop over the returned rows.
    # Prefer adding conditions to the 'where' clause: it keeps the conditions in one
    # place and may be faster. Conditions that are awkward to express in the query
    # (e.g., many interrelated conditions) can instead be written as rules in the loop.
    row_columns = (
        EntityModel.id,
        EntityModel.type,
        BoundingBoxModel.left,
        BoundingBoxModel.top,
        BoundingBoxModel.width,
        BoundingBoxModel.height,
        BoundingBoxModel.page,
    )
    # All data for an entity is aggregated into one array, in which each item is a
    # dictionary of the form {"key": "...", "value": "..."}. All values will be of
    # type string.
    aggregated_data = fn.json_agg(
        fn.json_build_object(
            "key", EntityDataModel.key, "value", EntityDataModel.value
        )
    ).alias("data")

    joined = (
        EntityModel.select(*row_columns, aggregated_data)
        .join(Paper)
        .switch(EntityModel)
        .join(BoundingBoxModel)
        .switch(EntityModel)
        .join(EntityDataModel)
    )
    rows = (
        joined.where(
            EntityModel.version == version,
            Paper.arxiv_id == arxiv_id,
            EntityModel.type << types,
        )
        .group_by(*row_columns)
        .dicts()
    )

    # Collect the boxes and the type of each entity, keyed by the entity's database ID.
    entity_boxes: Dict[str, List[BoundingBox]] = defaultdict(list)
    entity_types: Dict[str, str] = {}
    for row in rows:
        entity_id = row["id"]
        entity_type = row["type"]

        if entity_type == "symbol":
            # Every data item is inspected; any "type" item with a value that is not
            # one of the gold symbol types disqualifies the row.
            disqualifying_items = [
                item
                for item in row["data"]
                if item["key"] == "type" and item["value"] not in GOLD_SYMBOL_TYPES
            ]
            if disqualifying_items:
                continue

        box = BoundingBox(
            row["left"],
            row["top"],
            row["width"],
            row["height"],
            row["page"],
        )
        entity_boxes[entity_id].append(box)
        entity_types[entity_id] = entity_type

    # Regroup: each entity contributes one set of rectangles per page it appears on.
    regions: RegionsByPageAndType = defaultdict(list)
    for entity_id, boxes in entity_boxes.items():
        entity_type = entity_types[entity_id]
        for page, page_boxes in group_by_page(boxes).items():
            rectangles = frozenset(
                FloatRectangle(box.left, box.top, box.width, box.height)
                for box in page_boxes
            )
            regions[(page, entity_type)].append(rectangles)

    return regions