def recursive_xy_divide(elems, avg_font_size):
    """
    Recursively group/divide the document by white stripes
    by projecting elements onto alternating axes as intervals.

    elems: page elements; must expose ``mentions``, ``segments`` and
        ``layout`` (with ``width`` and ``height``).
    avg_font_size: the minimum gap size between elements below
        which we consider interval continuous.

    Returns a ``(bboxes, tree)`` pair. ``tree`` is a nested list whose
    leaves are the lists of objects that could not be divided any
    further; ``bboxes`` holds one bounding box per leaf, in the order
    the leaves were reached. Both are empty when no object lies inside
    the page.
    """
    log = logging.getLogger(__name__)
    objects = list(elems.mentions)
    objects.extend(elems.segments)

    # A tree that is a list of its children; bboxes can be recursively
    # reconstructed from the leaves.
    bboxes = []

    def divide(objs, bbox, h_split=True, is_single=False):
        """
        Recursive wrapper for splitting a list of objects
        with bounding boxes.

        h_split: whether to split along y axis, otherwise
            we split along x axis.
        is_single: whether the parent split (on the other axis)
            produced only this one group.
        """
        if not objs:
            return []

        # Index of the bbox coordinate pair (axis, axis + 2) being split
        axis = 1 if h_split else 0
        intervals, groups = project_onto(objs, axis, avg_font_size)

        # Base case where we can not actually divide
        single_child = len(groups) == 1

        # Can not divide in both X and Y, stop
        if is_single and single_child:
            bboxes.append(bbox)
            return objs

        children = []
        for interval, group in zip(intervals, groups):
            # Create the bbox for the subgroup. Force a float array: an
            # all-integer page bbox would otherwise yield an integer
            # array and silently truncate fractional interval bounds.
            sub_bbox = np.array(bbox, dtype=float)
            sub_bbox[[axis, axis + 2]] = interval
            # Append the sub-document tree
            children.append(divide(group, sub_bbox, not h_split, single_child))
        return children

    full_page_bbox = (0, 0, elems.layout.width, elems.layout.height)

    # Filter out invalid objects
    objects = [o for o in objects if inside(full_page_bbox, o.bbox)]

    # Lazy %-style argument; the message needs a placeholder for it.
    log.info("avg_font_size for dividing: %s", avg_font_size)

    tree = divide(objects, full_page_bbox) if objects else []
    return bboxes, tree
def __init__(self, mentions, lines, region, min_cell_size=6.0):
    """
    Build the grid for ``region`` from its ruling ``lines`` and
    distribute ``mentions`` over the resulting cells.

    mentions: text elements to place into cells; each one is matched
        by its center point (``xc``, ``yc``).
    lines: ruling lines, split into vertical and horizontal ones.
    region: table area; supplies ``x0``, ``x1``, ``y0``, ``y1`` and
        ``bbox``.
    min_cell_size: threshold handed to ``_retain_centroids`` when
        closely clustered line coordinates are collapsed.
    """
    self.min_cell_size = min_cell_size

    vertical, horizontal = _split_vlines_hlines(lines)
    x_centers = [line.xc for line in vertical]
    y_centers = [line.yc for line in horizontal]

    # Collapse closely clustered lines. The region borders are always
    # added so the table has at least one mega column (and row).
    self.xs = _retain_centroids(x_centers + [region.x0, region.x1], min_cell_size)
    self.ys = _retain_centroids(y_centers + [region.y0, region.y1], min_cell_size)

    # Consecutive coordinates delimit the columns and rows
    self.xranges = list(zip(self.xs, self.xs[1:]))
    self.yranges = list(zip(self.ys, self.ys[1:]))
    self.num_cols = len(self.xranges)
    self.num_rows = len(self.yranges)

    # Grid contents: one object slot per (row, col), initially None
    grid = self._grid = np.full(
        [self.num_rows, self.num_cols], None, dtype=np.dtype(object)
    )

    # Record whether a particular cell boundary is present
    line_plane = Plane(region.bbox)
    line_plane.extend(lines)
    vbars, hbars = self._mark_grid_bounds(line_plane, region)

    # Establish cell regions, scanning row by row
    new_cells = []
    for row in range(self.num_rows):
        for col in range(self.num_cols):
            # Skip already marked cells
            if grid[row, col]:
                continue

            # No horizontal bar here: merge with the cell above
            if row > 0 and not hbars[row, col]:
                merged = grid[row - 1, col]
                grid[row, col] = merged
                merged.rowend = row + 1
                continue

            # No vertical bar here: merge with the cell to the left
            if col > 0 and not vbars[row, col]:
                merged = grid[row, col - 1]
                grid[row, col] = merged
                merged.colend = col + 1
                continue

            # Otherwise this position starts a new cell
            fresh = Cell([row, col])
            grid[row, col] = fresh
            new_cells.append(fresh)

    # Now get each cell's contents by using its boundary
    text_plane = Plane(region.bbox)
    text_plane.extend(mentions)
    for cell in new_cells:
        cell_bbox = (
            self.xs[cell.colstart],
            self.ys[cell.rowstart],
            self.xs[cell.colend],
            self.ys[cell.rowend],
        )
        # Keep mentions whose centers are inside the cell
        kept = []
        for mention in text_plane.find(cell_bbox):
            center = (mention.xc, mention.yc)
            if inside(cell_bbox, center + center):
                kept.append(mention)
        cell.texts = kept

    # TODO: provide HTML conversion here
    self.get_normalized_grid()