def find_neighbors_with_rsrcmgr(
    self, plane: Plane, ratio: float, rsrcmgr: PaperResourceManager
) -> List[Union[LTItem, LTText]]:
    """Collect nearby horizontal text lines that belong with this one.

    The search window is this object's bounding box grown vertically by
    ``ratio * self.height`` on both sides. An object found in that window is
    kept when all of the following hold:

    * it is an ``LTTextLineHorizontalExtended``;
    * ``maybe_classify(rsrcmgr)`` gives the same result for it as for
      ``self``;
    * either its height differs from ours by less than the tolerance and it
      passes ``is_font_similar`` and ``is_x_similar``, or the shared
      classification is one of ``LTAuthor``, ``LTPageMargin``,
      ``LTCitationBox`` or ``LTFooter`` (for which the geometric checks are
      not required).

    :param plane: spatial index queried with ``find``.
    :param ratio: multiplier applied to ``self.height`` to get the tolerance.
    :param rsrcmgr: resource manager forwarded to ``maybe_classify``.
    :return: the matching objects, in the order ``plane.find`` yields them.
    """
    tolerance = ratio * self.height
    search_box = (self.x0, self.y0 - tolerance, self.x1, self.y1 + tolerance)
    candidates = plane.find(search_box)
    own_class = self.maybe_classify(rsrcmgr)

    neighbors = []
    for candidate in candidates:
        if not isinstance(candidate, LTTextLineHorizontalExtended):
            continue
        if not (own_class == candidate.maybe_classify(rsrcmgr)):
            continue
        # Geometric/font similarity is tried first; the class whitelist is
        # only consulted when that fails.
        looks_alike = (
            abs(candidate.height - self.height) < tolerance
            and self.is_font_similar(candidate)
            and self.is_x_similar(candidate, tolerance)
        )
        if looks_alike or own_class in [
            LTAuthor,
            LTPageMargin,
            LTCitationBox,
            LTFooter,
        ]:
            neighbors.append(candidate)
    return neighbors
def __init__(self, mentions, lines, region, min_cell_size=6.0):
    """
    Build the cell grid for a table region and assign text to its cells.

    :param mentions: text objects placed into cells; each is read for
        ``xc``/``yc`` and indexed in a ``Plane``.
    :param lines: ruling lines; split into vertical and horizontal ones
        by ``_split_vlines_hlines``.
    :param region: the table area; its ``x0``/``x1``/``y0``/``y1`` are
        added as grid boundaries and its ``bbox`` sizes the planes.
    :param min_cell_size: spacing passed to ``_retain_centroids`` when
        thinning out boundary coordinates.
    """
    self.min_cell_size = min_cell_size

    vertical, horizontal = _split_vlines_hlines(lines)
    x_centers = [line.xc for line in vertical]
    y_centers = [line.yc for line in horizontal]

    # Thin out closely clustered boundaries. The region's own edges are
    # always included so the table has at least one column and one row.
    self.xs = _retain_centroids(
        x_centers + [region.x0, region.x1], min_cell_size
    )
    self.ys = _retain_centroids(
        y_centers + [region.y0, region.y1], min_cell_size
    )

    # Consecutive boundary pairs delimit the columns and rows.
    self.xranges = list(zip(self.xs, self.xs[1:]))
    self.yranges = list(zip(self.ys, self.ys[1:]))
    self.num_cols = len(self.xranges)
    self.num_rows = len(self.yranges)

    # One slot per grid position; merged positions share a Cell object.
    self._grid = np.full(
        (self.num_rows, self.num_cols), None, dtype=np.dtype(object)
    )
    grid = self._grid

    # Find out which cell boundaries are actually drawn.
    ruling_plane = Plane(region.bbox)
    ruling_plane.extend(lines)
    vbars, hbars = self._mark_grid_bounds(ruling_plane, region)

    # Walk the grid row by row, merging across missing boundaries.
    cells = []
    for row in range(self.num_rows):
        for col in range(self.num_cols):
            if grid[row, col]:
                continue  # position already taken

            if row > 0 and not hbars[row, col]:
                # No bar on top: extend the cell above downwards.
                above = grid[row - 1, col]
                grid[row, col] = above
                above.rowend = row + 1
                continue

            if col > 0 and not vbars[row, col]:
                # No bar on the left: extend the left cell rightwards.
                left = grid[row, col - 1]
                grid[row, col] = left
                left.colend = col + 1
                continue

            # Bounded on both sides: start a brand new cell.
            fresh = Cell([row, col])
            grid[row, col] = fresh
            cells.append(fresh)

    # Fill each distinct cell with the mentions centred inside it.
    mention_plane = Plane(region.bbox)
    mention_plane.extend(mentions)
    for cell in cells:
        bbox = (
            self.xs[cell.colstart],
            self.ys[cell.rowstart],
            self.xs[cell.colend],
            self.ys[cell.rowend],
        )
        contents = []
        for mention in mention_plane.find(bbox):
            # The centre point is repeated to form a zero-area box.
            if inside(bbox, (mention.xc, mention.yc) * 2):
                contents.append(mention)
        cell.texts = contents

    # TODO: provide HTML conversion here
    self.get_normalized_grid()
class Sheet1(object):
    """Accumulates table geometry and text, then slices it into rows.

    Row and column edges are collected as rounded coordinates mapped to the
    number of times each was seen; text is stored in a ``Plane`` so it can be
    looked up by bounding box when rows are extracted.
    """

    cells = None
    text_layout = None
    column_edges = None
    row_edges = None

    def __init__(self):
        self.cells = Plane()
        self.text_layout = Plane()
        # coordinate (rounded to 2 decimals) -> number of occurrences
        self.row_edges = {}
        self.column_edges = {}

    def add_cell(self, cell):
        """Add ``cell`` to the cell plane."""
        self.cells.add(cell)

    def add_text(self, cell_text):
        """Add ``cell_text`` to the text plane."""
        self.text_layout.add(cell_text)

    def add_column_edge(self, x_value):
        """Count one occurrence of a column edge at ``x_value``."""
        x = round(x_value, 2)
        self.column_edges[x] = 1 + self.column_edges.get(x, 0)

    def add_row_edge(self, y_value):
        """Count one occurrence of a row edge at ``y_value``."""
        y = round(y_value, 2)
        self.row_edges[y] = 1 + self.row_edges.get(y, 0)

    def add_line(self, bbox):
        """Record a line as a column edge (vertical) or row edge (horizontal).

        ``bbox`` is ``(x0, y0, x1, y1)``. A line that is neither vertical nor
        horizontal is reported with a warning on stdout and otherwise ignored.
        """
        if bbox[0] == bbox[2]:  # vertical line
            self.add_column_edge(bbox[0])
        elif bbox[1] == bbox[3]:  # horizontal line
            self.add_row_edge(bbox[1])
        else:
            # bbox is itself a tuple, so it must be wrapped: '%s' % bbox
            # would try to consume four arguments and raise TypeError.
            print('WARNING: non-orthogonal line found: %s' % (bbox,))

    def add_rect(self, bbox):
        """Record all four sides of a rectangle as column and row edges."""
        self.add_column_edge(bbox[0])
        self.add_column_edge(bbox[2])
        self.add_row_edge(bbox[1])
        self.add_row_edge(bbox[3])

    def add_ltcontainer(self, obj, page_y_offset):
        """Add a layout object, recursing into containers.

        Text lines become ``CellText`` entries, lines and rectangles become
        edges, other containers are walked child by child, and anything else
        is ignored.

        :param obj: layout object with ``x0``/``y0``/``x1``/``y1``.
        :param page_y_offset: amount added to the y coordinates before they
            are negated.
        """
        # NB: row indexes (y axis) are negative! y0 and y1 swap places so the
        # box stays ordered after negation.
        bbox = (
            round(obj.x0, 2),
            round(-(obj.y1 + page_y_offset), 2),
            round(obj.x1, 2),
            round(-(obj.y0 + page_y_offset), 2),
        )

        if isinstance(obj, LTTextLine):
            self.add_text(CellText(bbox, obj.get_text()))
        elif isinstance(obj, LTLine):
            self.add_line(bbox)
        elif isinstance(obj, LTRect):
            self.add_rect(bbox)
        elif isinstance(obj, LTContainer):
            for child in obj:
                self.add_ltcontainer(child, page_y_offset)

    def extract_rows(self):
        """Return the sheet's text as a list of rows of cell strings.

        Row and column bands are formed between consecutive sorted edges; an
        edge less than 1 unit past the previous band boundary is skipped. Each
        cell string joins, with single spaces, the stripped text of the lines
        found in the band intersection, ordered by ``y0``, keeping only lines
        whose left edge lies within the column band.

        :return: list of rows, each a list of strings.
        """
        row_bounds = sorted(self.row_edges)
        col_bounds = sorted(self.column_edges)

        rows = []
        # NB: row indexes (y axis) are negative!
        r0 = row_bounds[0] - 1 if row_bounds else 0
        for r1 in row_bounds:
            if r1 - r0 < 1:
                continue
            row = []
            c0 = 0
            for c1 in col_bounds:
                if c1 - c0 < 1:
                    continue
                # all text lines that intersect this cell, top to bottom
                lines = sorted(
                    self.text_layout.find((c0, r0, c1, r1)),
                    key=lambda line: line.y0,
                )
                # drop anything whose left edge is outside the cell
                row.append(' '.join(
                    t.text.strip() for t in lines if c0 <= t.x0 <= c1
                ))
                c0 = c1
            rows.append(row)
            r0 = r1
        return rows