def find_columns(self):
    """Detect columns in the part of the image still to be segmented.

    Crops the inverted image to the rectangle
    ``(0, 0, self.inverted.dim(0), self.topptr)``, filters its summed
    projection with ``high_pass_median`` and appends the columns picked
    by ``filter_columns`` to ``self.columns``.
    """
    source = self.inverted
    region = iulib.bytearray()
    iulib.extract_subimage(region, source, 0, 0, source.dim(0), self.topptr)

    # Collapse the cropped region into a 1-D profile by summing over axis 1.
    profile = iulib.numpy(region).sum(axis=1)
    filtered = high_pass_median(profile, 0.20)

    candidates = self.get_possible_columns(filtered)
    # The "columns" parameter falls back to 1 when it is not supplied.
    wanted = int(self._params.get("columns", 1))
    chosen = self.filter_columns(candidates, wanted)
    self.columns.extend(chosen)
def init(self):
    """Reset the segmentation state on receipt of the input."""
    source = self.inarray

    # Marks the region that remains to be segmented; it starts at the
    # top, i.e. at the extent of the input along dimension 1.
    self.topptr = source.dim(1)

    # Invert a copy, so the input array itself is never inverted.
    flipped = iulib.bytearray()
    flipped.copy(source)
    iulib.binary_invert(flipped)
    self.inverted = flipped
    self.calc_bounding_boxes()

    # Extracted line rectangles and detected columns, both empty to begin.
    self.textlines, self.columns = [], []
def remove_border(narray, average_char_height):
    """Try to remove anything in a likely border region.

    The image is summed along each axis and both projections are passed
    through ``high_pass_median`` with a threshold of
    ``5.0 / average_char_height``.  The image is then cropped to the span
    between the first and last non-zero entry of each filtered projection.

    ``average_char_height`` must be non-zero, since it is used as a divisor.

    Returns a new ``iulib.bytearray`` holding the cropped subimage.  When
    either filtered projection has no non-zero entries there is nothing to
    crop to, so an uncropped copy of ``narray`` is returned instead.
    """
    na = iulib.numpy(narray)
    threshold = 5.0 / average_char_height

    # Indices that survive the filter along each axis.
    vidx = high_pass_median(na.sum(axis=1), threshold).nonzero()[0]
    hidx = high_pass_median(na.sum(axis=0), threshold).nonzero()[0]

    cropped = iulib.bytearray()
    if len(vidx) == 0 or len(hidx) == 0:
        # Indexing [0] / [-1] below would raise IndexError (e.g. on a
        # blank image); hand back the image unchanged instead.
        cropped.copy(narray)
        return cropped

    iulib.extract_subimage(
        cropped, narray,
        int(vidx[0]), int(hidx[0]), int(vidx[-1]), int(hidx[-1]))
    return cropped
def find_lines(self):
    """Get lines in a section of the image.

    For each column in ``self.columns`` the inverted image is cropped to
    the column (from 0 to ``self.topptr``) and split into candidate line
    regions by ``get_lines_by_projection``, using the "highpass" parameter.
    One rectangle per retained region is then built from the character
    boxes overlapping that region and appended to ``self.textlines``.
    """
    for colrect in self.columns:
        newrect = Rectangle(colrect.x0, 0, colrect.x1, self.topptr)
        if newrect.area() < 1:
            continue

        portion = iulib.bytearray()
        iulib.extract_subimage(portion, self.inverted, *newrect.points())
        regions = get_lines_by_projection(
            portion, float(self._params.get("highpass")))

        # Discard regions shorter than avgheight + avgheight / 3.
        plines = []
        for bottom, top in regions:
            if (top - bottom) - self.avgheight < self.avgheight / 3:
                continue
            plines.append(Rectangle(colrect.x0, bottom, colrect.x1, top))

        # Character boxes within 10 units of this column, ordered by
        # descending y1.  A key function replaces the cmp-style comparator,
        # which only works on Python 2.
        charboxes = self.get_char_boxes(self.boxes)
        colboxes = [box for box in charboxes
                    if box.overlaps(colrect.grow(10, 10))]
        colboxes.sort(key=lambda box: box.y1)
        colboxes.reverse()

        # One accumulator rectangle per projected line; every character box
        # is included in each line region it overlaps.
        # NOTE(review): presumably include() expands the rectangle to
        # cover the box - confirm against Rectangle.
        clines = [Rectangle(0, 0, 0, 0) for _ in plines]
        for char in colboxes:
            for pline, cline in zip(plines, clines):
                if char.overlaps(pline):
                    cline.include(char)
        self.textlines.extend(clines)
def find_lines(self):
    """Get lines in a section of the image.

    For each column in ``self.columns`` the inverted image is cropped to
    the column (from 0 to ``self.topptr``) and split into candidate line
    regions by ``get_lines_by_projection``, using the "highpass" parameter.
    One rectangle per retained region is then built from the character
    boxes overlapping that region and appended to ``self.textlines``.
    """
    for colrect in self.columns:
        newrect = Rectangle(colrect.x0, 0, colrect.x1, self.topptr)
        if newrect.area() < 1:
            continue

        portion = iulib.bytearray()
        iulib.extract_subimage(portion, self.inverted, *newrect.points())
        regions = get_lines_by_projection(
            portion, float(self._params.get("highpass")))

        # Discard regions shorter than avgheight + avgheight / 3.
        plines = []
        for bottom, top in regions:
            if (top - bottom) - self.avgheight < self.avgheight / 3:
                continue
            plines.append(Rectangle(colrect.x0, bottom, colrect.x1, top))

        # Character boxes within 10 units of this column, ordered by
        # descending y1.  A key function replaces the cmp-style comparator,
        # which only works on Python 2.
        charboxes = self.get_char_boxes(self.boxes)
        colboxes = [box for box in charboxes
                    if box.overlaps(colrect.grow(10, 10))]
        colboxes.sort(key=lambda box: box.y1)
        colboxes.reverse()

        # One accumulator rectangle per projected line; every character box
        # is included in each line region it overlaps.
        # NOTE(review): presumably include() expands the rectangle to
        # cover the box - confirm against Rectangle.
        clines = [Rectangle(0, 0, 0, 0) for _ in plines]
        for char in colboxes:
            for pline, cline in zip(plines, clines):
                if char.overlaps(pline):
                    cline.include(char)
        self.textlines.extend(clines)