def process(self):
    """
    Performs the line segmentation.

    For every input file, each text region of the page is resolved to an
    image, passed to Tesseract, and every text line component reported is
    appended to the region as a TextLine (its box shifted by the region's
    own offset). The resulting PAGE document is then added to the
    workspace in the output file group.
    """
    with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        for n, input_file in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            image_url = page.imageFilename
            for region in page.get_TextRegion():
                log.debug("Detecting lines in %s with tesseract", region.id)
                region_points = region.get_Coords().points
                tessapi.SetImage(
                    self.workspace.resolve_image_as_pil(
                        image_url, polygon_from_points(region_points)))
                # boxes reported below are relative to the region image
                offset = xywh_from_points(region_points)
                components = tessapi.GetComponentImages(RIL.TEXTLINE, True)
                for line_no, component in enumerate(components):
                    box = component[1]
                    box['x'] += offset['x']
                    box['y'] += offset['y']
                    region.add_TextLine(
                        TextLineType(
                            id='%s_line%04d' % (region.id, line_no),
                            Coords=CoordsType(points_from_xywh(box))))
            file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                mimetype=MIMETYPE_PAGE,
                local_filename='%s/%s' % (self.output_file_grp, file_id),
                content=to_xml(pcgts).encode('utf-8'),
            )
def process(self):
    """
    Segment with ocropy

    For every input file, the page image is read as a binary image and
    inverted, a line segmentation is computed on it, and each resulting
    line is added as a TextLine to a single page-sized "dummy" TextRegion.
    The resulting PAGE document is added to the output file group.
    """
    for n, input_file in enumerate(self.input_files):
        log.info("INPUT FILE %i / %s", n, input_file)
        downloaded_file = self.workspace.download_file(input_file)
        log.info("downloaded_file %s", downloaded_file)
        pcgts = page_from_file(downloaded_file)
        page = pcgts.get_Page()
        page_width = page.get_imageWidth()
        page_height = page.get_imageHeight()
        # TODO binarized variant from get_AlternativeImage()
        image_url = page.imageFilename
        log.info("pcgts %s", pcgts)

        binary = 1 - ocrolib.read_image_binary(
            self.workspace.download_url(image_url))

        # a configured scale of 0 means: estimate it from the image
        scale = self.parameter['scale']
        if scale == 0:
            scale = psegutils.estimate_scale(binary)
        log.debug(binary)

        pseg = self.compute_segmentation(binary, scale)
        log.debug("pseg=%s", pseg)

        # TODO reading order / enumber
        #  log.debug("finding reading order")
        #  lines = psegutils.compute_lines(pseg, scale)
        #  order = psegutils.reading_order([l.bounds for l in lines])
        #  lsort = psegutils.topsort(order)

        extractor = ocrolib.RegionExtractor()
        extractor.setPageLines(pseg)

        # single region covering the whole page, holding all lines
        page_outline = "0,0 %s,0 %s,%s 0,%s" % (
            page_width, page_width, page_height, page_height)
        dummy_region = TextRegionType(
            id="dummy", Coords=CoordsType(points=page_outline))
        page.add_TextRegion(dummy_region)

        for lineno in range(1, extractor.length()):
            bbox = extractor.bbox(lineno)
            log.debug("id=%s bbox=%s", extractor.id(lineno), bbox)
            dummy_region.add_TextLine(
                TextLineType(
                    id=concat_padded("line", lineno),
                    Coords=CoordsType(points=points_from_y0x0y1x1(bbox))))

        file_id = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            mimetype=MIMETYPE_PAGE,
            local_filename="%s/%s.xml" % (self.output_file_grp, file_id),
            content=to_xml(pcgts))
def create_baseline(self, text_line: TextLineType) -> Optional[LineString]:
    """Build a LineString from the Baseline of ``text_line``.

    The baseline points are transformed with ``self.coords['transform']``
    and rounded to integers. Returns None (after logging an error where
    applicable) if the line has no baseline points, if the LineString
    cannot be constructed, or if the resulting geometry is not valid.
    """
    baseline = text_line.get_Baseline()
    if baseline is None or baseline.points is None:
        return None

    coordinates = transform_coordinates(
        np.array(polygon_from_points(baseline.points)),
        self.coords['transform'])
    try:
        line = LineString(np.round(coordinates).astype(np.int32))
    except ValueError as err:
        self.logger.error('Page "%s" @ %s/Baseline %s', self.page_id,
                          str(text_line.id), str(err))
        return None

    if line.is_valid:
        return line
    self.logger.error('Page "%s" @ %s/Baseline %s', self.page_id,
                      str(text_line.id), str(explain_validity(line)))
    return None
def test_alternative_image_additions():
    """An AlternativeImage can be added on every level from Page to Glyph."""
    pcgts = PcGtsType(pcGtsId="foo")
    assert pcgts.pcGtsId == 'foo'

    # act
    # build the hierarchy Page > TextRegion > TextLine > Word > Glyph
    page = PageType()
    region = TextRegionType()
    line = TextLineType()
    word = WordType()
    glyph = GlyphType()
    pcgts.set_Page(page)
    page.add_TextRegion(region)
    region.add_TextLine(line)
    line.add_Word(word)
    word.add_Glyph(glyph)

    # Page/, TextRegion/, TextLine/, Word/ and Glyph/AlternativeImage
    for element in (page, region, line, word, glyph):
        element.add_AlternativeImage(AlternativeImageType())
def process(self):
    """
    Segment with kraken

    For every input file, the page image is segmented with the configured
    parameters (optionally followed by script detection), and each
    resulting box is added as a TextLine to a single new TextRegion.
    The resulting PAGE document is added to the output file group.
    """
    log = getLogger('processor.KrakenSegment')
    params = self.parameter
    for n, input_file in enumerate(self.input_files):
        log.info("INPUT FILE %i / %s", n, input_file)
        downloaded_file = self.workspace.download_file(input_file)
        log.info("downloaded_file %s", downloaded_file)
        pcgts = page_from_file(downloaded_file)
        page = pcgts.get_Page()
        # TODO binarized variant from get_AlternativeImage()
        image_url = page.imageFilename
        log.info("pcgts %s", pcgts)

        im = self.workspace.resolve_image_as_pil(image_url)

        log.info('Segmenting')
        log.info('Params %s', params)
        res = segment(
            im,
            params['text_direction'],
            params['scale'],
            params['maxcolseps'],
            params['black_colseps'])
        if params['script_detect']:
            res = detect_scripts(im, res)

        # all detected lines go into one region without further attributes
        dummy_region = TextRegionType()
        page.add_TextRegion(dummy_region)
        for lineno, box in enumerate(res['boxes']):
            dummy_region.add_TextLine(
                TextLineType(
                    id=concat_padded("line", lineno),
                    Coords=CoordsType(points=points_from_x0y0x1y1(box))))

        file_id = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(
            self.output_file_grp,
            pageId=input_file.pageId,
            ID=file_id,
            mimetype=MIMETYPE_PAGE,
            local_filename="%s/%s.xml" % (self.output_file_grp, file_id),
            content=to_xml(pcgts).encode('utf-8'))
def test_alternativeImage(self):
    """An AlternativeImage can be added on every level from Page to Glyph."""
    pcgts = PcGtsType(pcGtsId="foo")
    self.assertEqual(pcgts.pcGtsId, 'foo')

    # build the hierarchy Page > TextRegion > TextLine > Word > Glyph
    page = PageType()
    region = TextRegionType()
    line = TextLineType()
    word = WordType()
    glyph = GlyphType()
    pcgts.set_Page(page)
    page.add_TextRegion(region)
    region.add_TextLine(line)
    line.add_Word(word)
    word.add_Glyph(glyph)

    # Page/, TextRegion/, TextLine/, Word/ and Glyph/AlternativeImage
    for element in (page, region, line, word, glyph):
        element.add_AlternativeImage(AlternativeImageType())
def process(self):
    """Performs (text) line segmentation with Tesseract on the workspace.

    Open and deserialize PAGE input files and their respective images,
    then iterate over the element hierarchy down to the (text) region level,
    and remove any existing TextLine elements (unless ``overwrite_lines``
    is False, in which case they are kept and newly detected lines are
    added next to them).

    Set up Tesseract to detect lines, and add each one to the region
    at the detected coordinates. Lines which cannot be fitted into their
    parent region are skipped.

    The DPI passed to Tesseract is taken from the ``dpi`` parameter if
    positive, else from the image meta-data, else it is left to Tesseract
    to estimate (``user_defined_dpi`` = 0). It is set anew for every page.

    Produce a new output file by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.TesserocrSegmentLine')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)

    overwrite_lines = self.parameter['overwrite_lines']

    with PyTessBaseAPI(psm=PSM.SINGLE_BLOCK,
                       path=TESSDATA_PREFIX) as tessapi:
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(
                self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()

            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id)
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info(
                    "Page '%s' images will use %d DPI from parameter override",
                    page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    # pixels per cm -> pixels per inch
                    dpi = round(dpi * 2.54)
                LOG.info(
                    "Page '%s' images will use %d DPI from image meta-data",
                    page_id, dpi)
            else:
                dpi = 0
                LOG.info(
                    "Page '%s' images will use DPI estimated from segmentation",
                    page_id)
            # The API instance is shared by all pages, so always (re)set
            # the variable: otherwise a page with unknown DPI (0) would
            # silently inherit the value of the previous page.
            tessapi.SetVariable('user_defined_dpi', str(dpi))

            # top-level text regions, plus text regions within table regions
            for region in itertools.chain.from_iterable(
                    [page.get_TextRegion()] + [
                        subregion.get_TextRegion()
                        for subregion in page.get_TableRegion()
                    ]):
                if region.get_TextLine():
                    if overwrite_lines:
                        LOG.info(
                            'removing existing TextLines in region "%s"',
                            region.id)
                        region.set_TextLine([])
                    else:
                        LOG.warning(
                            'keeping existing TextLines in region "%s"',
                            region.id)
                LOG.debug("Detecting lines in region '%s'", region.id)
                region_image, region_coords = self.workspace.image_from_segment(
                    region, page_image, page_coords)
                tessapi.SetImage(region_image)
                for line_no, component in enumerate(
                        tessapi.GetComponentImages(RIL.TEXTLINE,
                                                   True,
                                                   raw_image=True)):
                    line_id = '%s_line%04d' % (region.id, line_no)
                    # component[1] is the bounding box of the line
                    line_polygon = polygon_from_xywh(component[1])
                    # convert from region image to absolute coordinates
                    line_polygon = coordinates_for_segment(
                        line_polygon, region_image, region_coords)
                    line_polygon2 = polygon_for_parent(
                        line_polygon, region)
                    if line_polygon2 is not None:
                        line_polygon = line_polygon2
                    line_points = points_from_polygon(line_polygon)
                    if line_polygon2 is None:
                        # could happen due to rotation
                        LOG.info('Ignoring extant line: %s', line_points)
                        continue
                    region.add_TextLine(
                        TextLineType(id=line_id,
                                     Coords=CoordsType(line_points)))

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
def _process_segment(self, page_image, page, textregion, region_xywh,
                     page_id, input_file, n):
    """Detect text lines in ``page_image`` and add them to ``textregion``.

    The image is converted to an inverted array, its scale is taken from
    the ``scale`` parameter (or estimated if that is 0), and a line
    segmentation is computed. The lines are put into reading order, and
    for each one a TextLine with a rectangular outline is added to
    ``textregion``, along with a cleaned, cropped line image which is
    saved to the workspace and referenced as AlternativeImage.

    Nothing is added if the region already has lines and ``overwrite``
    is off, if the scale is implausible, or if there are more lines
    than ``maxlines``.

    ``page_id`` is used in log messages, as prefix of the line IDs, and
    as page ID of the saved images. ``region_xywh`` is the coordinate
    metadata belonging to ``page_image``. ``n`` becomes part of the image
    file IDs. ``page`` is not used here.
    """
    LOG = getLogger('OcrdAnybaseocrTextline')
    # check for existing text lines and whether to overwrite them
    if textregion.get_TextLine():
        if self.parameter['overwrite']:
            LOG.info('removing existing TextLines in region "%s"', page_id)
            textregion.set_TextLine([])
        else:
            LOG.warning('keeping existing TextLines in region "%s"',
                        page_id)
            return

    binary = ocrolib.pil2array(page_image)
    if len(binary.shape) > 2:
        # average over the channels
        binary = np.mean(binary, 2)
    # normalise and invert
    binary = np.array(1 - binary / np.amax(binary), 'B')

    if self.parameter['scale'] == 0:
        scale = psegutils.estimate_scale(binary)
    else:
        scale = self.parameter['scale']

    if np.isnan(
            scale) or scale > 1000.0 or scale < self.parameter['minscale']:
        LOG.warning(str(scale) + ": bad scale; skipping!\n")
        return

    segmentation = self.compute_segmentation(binary, scale)
    if np.amax(segmentation) > self.parameter['maxlines']:
        LOG.warning("too many lines %i; skipping!\n",
                    (np.amax(segmentation)))
        return

    # bring the lines into reading order
    lines = psegutils.compute_lines(segmentation, scale)
    order = psegutils.reading_order([l.bounds for l in lines])
    lsort = psegutils.topsort(order)
    lines = [lines[i] for i in lsort]

    cleaned = ocrolib.remove_noise(binary, self.parameter['noise'])

    for i, l in enumerate(lines):
        # l.bounds holds array slices in (row, column) order, as used for
        # cropping below; so the first slice is the vertical (y) extent
        # and the second the horizontal (x) extent, while polygons are
        # lists of [x, y] points.
        row_slice, col_slice = l.bounds[0], l.bounds[1]
        min_y, max_y = (row_slice.start, row_slice.stop)
        min_x, max_x = (col_slice.start, col_slice.stop)

        line_polygon = [[min_x, min_y], [max_x, min_y],
                        [max_x, max_y], [min_x, max_y]]
        # convert from image to absolute coordinates
        line_polygon = coordinates_for_segment(line_polygon, page_image,
                                               region_xywh)
        line_points = points_from_polygon(line_polygon)

        # crop, re-binarize and invert the line image
        img = cleaned[row_slice, col_slice]
        img = np.array(255 * (img > ocrolib.midrange(img)), 'B')
        img = 255 - img
        img = ocrolib.array2pil(img)

        file_id = make_file_id(input_file, self.output_file_grp)
        file_path = self.workspace.save_image_file(
            img,
            file_id + "_" + str(n) + "_" + str(i),
            page_id=page_id,
            file_grp=self.output_file_grp)
        ai = AlternativeImageType(filename=file_path,
                                  comments=region_xywh['features'])
        line_id = '%s_line%04d' % (page_id, i)
        line = TextLineType(custom='readingOrder {index:' + str(i) + ';}',
                            id=line_id,
                            Coords=CoordsType(line_points))
        line.add_AlternativeImage(ai)
        textregion.add_TextLine(line)
def process(self):
    """Performs (text) line segmentation with Tesseract on the workspace.

    Open and deserialize PAGE input files and their respective images,
    then iterate over the element hierarchy down to the (text) region level,
    and remove any existing TextLine elements (unless ``overwrite_lines``
    is False, in which case they are kept and newly detected lines are
    added next to them).

    Set up Tesseract to detect lines, and add each one to the region
    at the detected coordinates. Lines extending beyond their region are
    clipped to it (keeping the convex hull of the largest overlapping
    part); lines without any overlapping area are skipped.

    The DPI passed to Tesseract is taken from the ``dpi`` parameter if
    positive, else from the image meta-data, else it is left to Tesseract
    to estimate (``user_defined_dpi`` = 0). It is set anew for every page.

    Produce a new output file by serialising the resulting hierarchy.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)

    overwrite_lines = self.parameter['overwrite_lines']

    with PyTessBaseAPI(
            psm=PSM.SINGLE_BLOCK,
            path=TESSDATA_PREFIX
    ) as tessapi:
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()

            # add metadata about this operation and its runtime parameters:
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(type_="processingStep",
                                 name=self.ocrd_tool['steps'][0],
                                 value=TOOL,
                                 Labels=[LabelsType(
                                     externalModel="ocrd-tool",
                                     externalId="parameters",
                                     Label=[LabelType(type_=name,
                                                      value=self.parameter[name])
                                            for name in self.parameter.keys()])]))

            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id)
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    # pixels per cm -> pixels per inch
                    dpi = round(dpi * 2.54)
                LOG.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi)
            else:
                dpi = 0
                LOG.info("Page '%s' images will use DPI estimated from segmentation", page_id)
            # The API instance is shared by all pages, so always (re)set
            # the variable: otherwise a page with unknown DPI (0) would
            # silently inherit the value of the previous page.
            tessapi.SetVariable('user_defined_dpi', str(dpi))

            # top-level text regions, plus text regions within table regions
            for region in itertools.chain.from_iterable(
                    [page.get_TextRegion()] +
                    [subregion.get_TextRegion() for subregion in page.get_TableRegion()]):
                if region.get_TextLine():
                    if overwrite_lines:
                        LOG.info('removing existing TextLines in region "%s"', region.id)
                        region.set_TextLine([])
                    else:
                        LOG.warning('keeping existing TextLines in region "%s"', region.id)
                LOG.debug("Detecting lines in region '%s'", region.id)
                region_image, region_coords = self.workspace.image_from_segment(
                    region, page_image, page_coords)
                # region outline relative to the region image
                region_polygon = coordinates_of_segment(region, region_image, region_coords)
                region_poly = Polygon(region_polygon)
                tessapi.SetImage(region_image)
                for line_no, component in enumerate(tessapi.GetComponentImages(RIL.TEXTLINE, True, raw_image=True)):
                    line_id = '%s_line%04d' % (region.id, line_no)
                    # component[1] is the bounding box of the line
                    line_polygon = polygon_from_xywh(component[1])
                    line_poly = Polygon(line_polygon)
                    if not line_poly.within(region_poly):
                        # this could happen due to rotation
                        interline = line_poly.intersection(region_poly)
                        if interline.is_empty:
                            continue  # ignore this line
                        if hasattr(interline, 'geoms'):
                            # is (heterogeneous) GeometryCollection:
                            # keep only its largest part
                            interline = max(interline.geoms,
                                            key=lambda geom: geom.area)
                        if not interline.area:
                            # line merely touches the region in a point or
                            # along an edge; there is no polygon to keep
                            # (and the hull would have no exterior)
                            continue
                        line_poly = interline.convex_hull
                        line_polygon = line_poly.exterior.coords
                    # convert from region image to absolute coordinates
                    line_polygon = coordinates_for_segment(line_polygon, region_image, region_coords)
                    line_points = points_from_polygon(line_polygon)
                    region.add_TextLine(TextLineType(
                        id=line_id, Coords=CoordsType(line_points)))

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp,
                                            file_id + '.xml'),
                content=to_xml(pcgts))
def _process_element(self, element, ignore, image, coords, element_id,
                     file_id, page_id, zoom=1.0, rogroup=None):
    """Add PAGE layout elements by segmenting an image.

    Given a PageType, TableRegionType or TextRegionType ``element``, and
    a corresponding binarized PIL.Image object ``image`` with coordinate
    metadata ``coords``, run line segmentation with Ocropy.

    If operating on the full page (or table), then also detect horizontal
    and vertical separators, and aggregate the lines into text regions
    afterwards.

    Add the resulting sub-segments to the parent ``element``.

    If ``ignore`` is not empty, then first suppress all foreground components
    in any of those segments' coordinates during segmentation, and if also
    in full page/table mode, then combine all separators among them with the
    newly detected separators to guide region segmentation.

    ``element_id`` is used in log messages and as prefix for the IDs of all
    new segments. ``file_id`` and ``page_id`` are used for saving the derived
    (clipped) image. ``zoom`` is passed on to the checks and segmentation
    routines and scales the minimum contour areas. If ``rogroup`` is given,
    then new and ignored regions get appended to that reading order group.

    Returns None in all cases; results are annotated on ``element``.
    """
    LOG = getLogger('processor.OcropySegment')
    if not image.width or not image.height:
        LOG.warning("Skipping '%s' with zero size", element_id)
        return
    element_array = pil2array(image)
    # foreground mask, separator mask and label map of ignored segments
    # (builtin bool/int instead of the np.bool/np.int aliases,
    #  which do not exist anymore in current NumPy versions)
    element_bin = np.array(element_array <= midrange(element_array), bool)
    sep_bin = np.zeros_like(element_bin, bool)
    ignore_labels = np.zeros_like(element_bin, int)
    for i, segment in enumerate(ignore):
        LOG.debug('masking foreground of %s "%s" for "%s"',
                  type(segment).__name__[:-4], segment.id, element_id)
        # mark these segments (e.g. separator regions, tables, images)
        # for workflows where they have been detected already;
        # these will be:
        # - ignored during text line segmentation (but not h/v-line detection)
        # - kept and reading-ordered during region segmentation (but not seps)
        segment_polygon = coordinates_of_segment(segment, image, coords)
        # If segment_polygon lies outside of element (causing
        # negative/above-max indices), either fully or partially,
        # then this will silently ignore them. The caller does
        # not need to concern herself with this.
        if isinstance(segment, SeparatorRegionType):
            sep_bin[draw.polygon(segment_polygon[:, 1],
                                 segment_polygon[:, 0],
                                 sep_bin.shape)] = True
        ignore_labels[draw.polygon(segment_polygon[:, 1],
                                   segment_polygon[:, 0],
                                   ignore_labels.shape)] = i + 1  # mapped back for RO
    if isinstance(element, PageType):
        element_name = 'page'
        fullpage = True
        report = check_page(element_bin, zoom)
    elif isinstance(element, TableRegionType) or (
            # sole/congruent text region of a table region?
            element.id.endswith('_text') and
            isinstance(element.parent_object_, TableRegionType)):
        element_name = 'table'
        fullpage = True
        report = check_region(element_bin, zoom)
    else:
        element_name = 'region'
        fullpage = False
        report = check_region(element_bin, zoom)
    LOG.info('computing line segmentation for %s "%s"', element_name, element_id)
    # TODO: we should downscale if DPI is large enough to save time
    try:
        if report:
            # a non-empty check report is handled like a segmentation failure
            raise Exception(report)
        line_labels, hlines, vlines, images, colseps, scale = compute_segmentation(
            # suppress separators and ignored regions for textline estimation
            # but keep them for h/v-line detection (in fullpage mode):
            element_bin, seps=(sep_bin + ignore_labels) > 0,
            zoom=zoom, fullpage=fullpage,
            spread_dist=round(self.parameter['spread'] / zoom * 300 / 72),  # in pt
            # these are ignored when not in fullpage mode:
            maxcolseps=self.parameter['maxcolseps'],
            maxseps=self.parameter['maxseps'],
            maximages=self.parameter['maximages'] if element_name != 'table' else 0,
            csminheight=self.parameter['csminheight'],
            hlminwidth=self.parameter['hlminwidth'])
    except Exception as err:
        if isinstance(element, TextRegionType):
            LOG.error('Cannot line-segment region "%s": %s', element_id, err)
            # as a fallback, add a single text line comprising the whole region:
            element.add_TextLine(TextLineType(id=element_id + "_line",
                                              Coords=element.get_Coords()))
        else:
            LOG.error('Cannot line-segment %s "%s": %s', element_name, element_id, err)
        return

    LOG.info('Found %d text lines for %s "%s"',
             len(np.unique(line_labels)) - 1,
             element_name, element_id)
    # post-process line labels
    if isinstance(element, (PageType, TableRegionType)):
        # aggregate text lines to text regions
        try:
            # pass ignored regions as "line labels with initial assignment",
            # i.e. identical line and region labels
            # to detect their reading order among the others
            # (these cannot be split or grouped together with other regions)
            line_labels = np.where(line_labels,
                                   line_labels + len(ignore),
                                   ignore_labels)
            # suppress separators/images in fg and try to use for partitioning slices
            sepmask = np.maximum(np.maximum(hlines, vlines),
                                 np.maximum(sep_bin, images))
            region_labels = lines2regions(
                element_bin, line_labels,
                rlabels=ignore_labels,
                sepmask=np.maximum(sepmask, colseps),  # add bg
                # decide horizontal vs vertical cut when gaps of similar size
                prefer_vertical=not isinstance(element, TableRegionType),
                gap_height=self.parameter['gap_height'],
                gap_width=self.parameter['gap_width'],
                scale=scale, zoom=zoom)
            LOG.info('Found %d text regions for %s "%s"',
                     len(np.unique(region_labels)) - 1,
                     element_name, element_id)
        except Exception as err:
            LOG.error('Cannot region-segment %s "%s": %s',
                      element_name, element_id, err)
            # fallback: all new lines share a single region label,
            # ignored segments keep their own labels
            region_labels = np.where(line_labels > len(ignore),
                                     1 + len(ignore),
                                     line_labels)

        # prepare reading order group index
        if rogroup:
            if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):
                index = 0
                # start counting from largest existing index
                for elem in (rogroup.get_RegionRefIndexed() +
                             rogroup.get_OrderedGroupIndexed() +
                             rogroup.get_UnorderedGroupIndexed()):
                    if elem.index >= index:
                        index = elem.index + 1
            else:
                index = None
        # find contours around region labels (can be non-contiguous):
        region_no = 0
        for region_label in np.unique(region_labels):
            if not region_label:
                continue  # no bg
            region_mask = region_labels == region_label
            region_line_labels = line_labels * region_mask
            region_line_labels0 = np.setdiff1d(region_line_labels, [0])
            if not np.all(region_line_labels0 > len(ignore)):
                # existing region from `ignore` merely to be ordered
                # (no new region, no actual text lines)
                region_line_labels0 = np.intersect1d(region_line_labels0,
                                                     ignore_labels)
                assert len(region_line_labels0) == 1, \
                    "region label %d has both existing regions and new lines (%s)" % (
                        region_label, str(region_line_labels0))
                region = ignore[region_line_labels0[0] - 1]
                if rogroup and region.parent_object_ == element and not isinstance(
                        region, SeparatorRegionType):
                    index = page_add_to_reading_order(rogroup, region.id, index)
                LOG.debug('Region label %d is for ignored region "%s"',
                          region_label, region.id)
                continue
            # normal case: new lines inside new regions
            # remove binary-empty labels, and re-order locally
            order = morph.reading_order(region_line_labels)
            order[np.setdiff1d(region_line_labels0,
                               element_bin * region_line_labels)] = 0
            region_line_labels = order[region_line_labels]
            # avoid horizontal gaps
            region_line_labels = hmerge_line_seeds(element_bin,
                                                   region_line_labels,
                                                   scale,
                                                   seps=np.maximum(sepmask, colseps))
            region_mask |= region_line_labels > 0
            # find contours for region (can be non-contiguous)
            regions, _ = masks2polygons(region_mask * region_label, element_bin,
                                        '%s "%s"' % (element_name, element_id),
                                        min_area=6000 / zoom / zoom,
                                        simplify=ignore_labels * ~(sep_bin))
            # find contours for lines (can be non-contiguous)
            lines, _ = masks2polygons(region_line_labels, element_bin,
                                      'region "%s"' % element_id,
                                      min_area=640 / zoom / zoom)
            # create new lines in new regions (allocating by intersection)
            line_polys = [Polygon(polygon) for _, polygon in lines]
            for _, region_polygon in regions:
                # prepared geometry (still in image coordinates)
                # for the repeated intersection tests below
                region_poly = prep(Polygon(region_polygon))
                # convert back to absolute (page) coordinates:
                region_polygon = coordinates_for_segment(region_polygon, image, coords)
                region_polygon = polygon_for_parent(region_polygon, element)
                if region_polygon is None:
                    LOG.warning('Ignoring extant region contour for region label %d',
                                region_label)
                    continue
                # annotate result:
                region_no += 1
                region_id = element_id + "_region%04d" % region_no
                LOG.debug('Region label %d becomes ID "%s"', region_label, region_id)
                region = TextRegionType(
                    id=region_id, Coords=CoordsType(
                        points=points_from_polygon(region_polygon)))
                # find out which line (contours) belong to which region (contours)
                line_no = 0
                for i, line_poly in enumerate(line_polys):
                    if not region_poly.intersects(line_poly):  # .contains
                        continue
                    line_label, line_polygon = lines[i]
                    # convert back to absolute (page) coordinates:
                    line_polygon = coordinates_for_segment(line_polygon, image, coords)
                    line_polygon = polygon_for_parent(line_polygon, region)
                    if line_polygon is None:
                        LOG.warning('Ignoring extant line contour for region label %d line label %d',
                                    region_label, line_label)
                        continue
                    # annotate result:
                    line_no += 1
                    line_id = region_id + "_line%04d" % line_no
                    LOG.debug('Line label %d becomes ID "%s"', line_label, line_id)
                    line = TextLineType(
                        id=line_id, Coords=CoordsType(
                            points=points_from_polygon(line_polygon)))
                    region.add_TextLine(line)
                # if the region has received text lines, keep it
                if region.get_TextLine():
                    element.add_TextRegion(region)
                    LOG.info('Added region "%s" with %d lines for %s "%s"',
                             region_id, line_no, element_name, element_id)
                    if rogroup:
                        index = page_add_to_reading_order(rogroup, region.id, index)
        # add additional image/non-text regions from compute_segmentation
        # (e.g. drop-capitals or images) ...
        image_labels, num_images = morph.label(images)
        LOG.info('Found %d large non-text/image regions for %s "%s"',
                 num_images, element_name, element_id)
        # find contours around region labels (can be non-contiguous):
        image_polygons, _ = masks2polygons(image_labels, element_bin,
                                           '%s "%s"' % (element_name, element_id))
        for image_label, polygon in image_polygons:
            # convert back to absolute (page) coordinates:
            region_polygon = coordinates_for_segment(polygon, image, coords)
            region_polygon = polygon_for_parent(region_polygon, element)
            if region_polygon is None:
                LOG.warning('Ignoring extant region contour for image label %d',
                            image_label)
                continue
            # region_no keeps counting across text, image and separator regions
            region_no += 1
            # annotate result:
            region_id = element_id + "_image%04d" % region_no
            element.add_ImageRegion(ImageRegionType(
                id=region_id, Coords=CoordsType(
                    points=points_from_polygon(region_polygon))))
        # split rulers into separator regions:
        hline_labels, num_hlines = morph.label(hlines)
        vline_labels, num_vlines = morph.label(vlines)
        LOG.info('Found %d/%d h/v-lines for %s "%s"',
                 num_hlines, num_vlines, element_name, element_id)
        # find contours around region labels (can be non-contiguous):
        hline_polygons, _ = masks2polygons(hline_labels, element_bin,
                                           '%s "%s"' % (element_name, element_id))
        vline_polygons, _ = masks2polygons(vline_labels, element_bin,
                                           '%s "%s"' % (element_name, element_id))
        for _, polygon in hline_polygons + vline_polygons:
            # convert back to absolute (page) coordinates:
            region_polygon = coordinates_for_segment(polygon, image, coords)
            region_polygon = polygon_for_parent(region_polygon, element)
            if region_polygon is None:
                LOG.warning('Ignoring extant region contour for separator')
                continue
            # annotate result:
            region_no += 1
            region_id = element_id + "_sep%04d" % region_no
            element.add_SeparatorRegion(SeparatorRegionType(
                id=region_id, Coords=CoordsType(
                    points=points_from_polygon(region_polygon))))
        # annotate a text/image-separated image
        element_array[sepmask] = np.amax(element_array)  # clip to white/bg
        image_clipped = array2pil(element_array)
        file_path = self.workspace.save_image_file(
            image_clipped, file_id + '.IMG-CLIP',
            page_id=page_id,
            file_grp=self.output_file_grp)
        element.add_AlternativeImage(AlternativeImageType(
            filename=file_path, comments=coords['features'] + ',clipped'))
    else:
        # get mask from region polygon:
        region_polygon = coordinates_of_segment(element, image, coords)
        region_mask = np.zeros_like(element_bin, bool)
        region_mask[draw.polygon(region_polygon[:, 1],
                                 region_polygon[:, 0],
                                 region_mask.shape)] = True
        # ensure the new line labels do not extrude from the region:
        line_labels = line_labels * region_mask
        # find contours around labels (can be non-contiguous):
        line_polygons, _ = masks2polygons(line_labels, element_bin,
                                          'region "%s"' % element_id,
                                          min_area=640 / zoom / zoom)
        line_no = 0
        for line_label, polygon in line_polygons:
            # convert back to absolute (page) coordinates:
            line_polygon = coordinates_for_segment(polygon, image, coords)
            line_polygon = polygon_for_parent(line_polygon, element)
            if line_polygon is None:
                LOG.warning('Ignoring extant line contour for line label %d',
                            line_label)
                continue
            # annotate result:
            line_no += 1
            line_id = element_id + "_line%04d" % line_no
            element.add_TextLine(TextLineType(
                id=line_id, Coords=CoordsType(
                    points=points_from_polygon(line_polygon))))
        if not sep_bin.any():
            return  # no derived image
        # annotate a text/image-separated image
        element_array[sep_bin] = np.amax(element_array)  # clip to white/bg
        image_clipped = array2pil(element_array)
        file_path = self.workspace.save_image_file(
            image_clipped, file_id + '.IMG-CLIP',
            page_id=page_id,
            file_grp=self.output_file_grp)
        # update PAGE (reference the image file):
        element.add_AlternativeImage(AlternativeImageType(
            filename=file_path, comments=coords['features'] + ',clipped'))
def serialize_lines_in_marginal(self, marginal_region,
                                all_found_texline_polygons_marginals,
                                marginal_idx, page_coord,
                                all_box_coord_marginals, slopes_marginals,
                                counter):
    """Add the text lines of one marginal to ``marginal_region``.

    For each line polygon found for the marginal at ``marginal_idx``, a
    TextLine (with the next line ID from ``counter``) is added, and its
    Coords are set to the polygon's points as "x,y" pairs separated by
    blanks. Points are shifted by the page offset (``page_coord``) and,
    except for curved lines with a slope of at most 45, by the offset of
    the marginal's box, then divided by the scale factors. Only in
    non-curved mode are the coordinates clamped to be non-negative.
    """
    def xy_of(point):
        # a point is either flat (x, y) or wrapped once: ((x, y),)
        if len(point) == 2:
            return point[0], point[1]
        return point[0][0], point[0][1]

    for line_polygon in all_found_texline_polygons_marginals[marginal_idx]:
        coords = CoordsType()
        marginal_region.add_TextLine(
            TextLineType(id=counter.next_line_id, Coords=coords))
        pairs = []
        for point in line_polygon:
            pair = ''
            if not self.curved_line:
                px, py = xy_of(point)
                box = all_box_coord_marginals[marginal_idx]
                x = max(0, int((px + box[2] + page_coord[2]) / self.scale_x))
                y = max(0, int((py + box[0] + page_coord[0]) / self.scale_y))
                pair = str(x) + ',' + str(y)
            elif np.abs(slopes_marginals[marginal_idx]) <= 45:
                # no box offset in this case
                px, py = xy_of(point)
                x = int((px + page_coord[2]) / self.scale_x)
                y = int((py + page_coord[0]) / self.scale_y)
                pair = str(x) + ',' + str(y)
            elif np.abs(slopes_marginals[marginal_idx]) > 45:
                px, py = xy_of(point)
                box = all_box_coord_marginals[marginal_idx]
                x = int((px + box[2] + page_coord[2]) / self.scale_x)
                y = int((py + box[0] + page_coord[0]) / self.scale_y)
                pair = str(x) + ',' + str(y)
            pairs.append(pair)
        coords.set_points(' '.join(pairs))
def serialize_lines_in_region(self, text_region, all_found_texline_polygons,
                              region_idx, page_coord, all_box_coord, slopes,
                              counter):
    """
    Add one TextLine per detected text line polygon to a text region.

    Every polygon point is shifted by the page offset (and, depending on
    the mode, by the offset of the region's box), divided by the scale
    factors, truncated to an integer and clamped at zero before being
    written to the ``points`` attribute of the line's Coords.

    Args:
        text_region: region object receiving the lines through
            ``add_TextLine``.
        all_found_texline_polygons: per region, a sequence of line
            polygons; each polygon is a sequence of points given either
            as ``(x, y)`` or wrapped once more as ``((x, y),)``.
        region_idx: index of the region to serialize.
        page_coord: sequence whose item 0 is added to y and item 2 to x
            (presumably the top/left offset of the page crop -- confirm
            against the caller).
        all_box_coord: per region, a sequence whose item 0 is added to y
            and item 2 to x (presumably the top/left offset of the
            region's box -- confirm against the caller).
        slopes: per region, the slope compared against 45.
        counter: object whose ``next_line_id`` attribute supplies the id
            of each new line.
    """
    self.logger.debug('enter serialize_lines_in_region')
    line_polygons = all_found_texline_polygons[region_idx]
    if len(line_polygons) == 0:
        return

    region_bboxes = all_box_coord[region_idx]
    # Only curved lines with a slope of at most 45 are written without the
    # box offset. Every other case (straight lines, steeper slopes, and a
    # slope that compares false such as NaN) gets the box offset added, so
    # a point is always emitted.
    add_box_offset = not (self.curved_line
                          and np.abs(slopes[region_idx]) <= 45)

    for polygon in line_polygons:
        coords = CoordsType()
        textline = TextLineType(id=counter.next_line_id, Coords=coords)
        text_region.add_TextLine(textline)

        points = []
        for point in polygon:
            # Points may come wrapped in one extra dimension: ((x, y),)
            if len(point) != 2:
                point = point[0]
            x, y = point[0], point[1]
            if add_box_offset:
                x = x + region_bboxes[2]
                y = y + region_bboxes[0]
            # Clamp at zero in every mode: the straight-line case always
            # did, and a negative value is not a valid PAGE coordinate.
            x = max(0, int((x + page_coord[2]) / self.scale_x))
            y = max(0, int((y + page_coord[0]) / self.scale_y))
            points.append('%d,%d' % (x, y))
        coords.set_points(' '.join(points))
def _process_segment(self, page_image, page, region_xywh, page_id,
                     input_file, n):
    """
    Segment a page image into text lines and annotate them on the page.

    The image is normalised and inverted, segmented with
    ``compute_segmentation``, and the resulting lines are put into
    reading order. For each line a binarized crop is saved to the image
    file group and a TextLine with a rectangular outline and an
    AlternativeImage reference is appended to the last TextRegion of the
    page. If the page has no TextRegion, one covering the whole image is
    created first.

    Nothing is annotated (a warning is logged and the method returns) when
    the scale is NaN, above 1000, or below the ``minscale`` parameter, or
    when the segmentation has more labels than the ``maxlines`` parameter.

    Args:
        page_image: PIL image of the page.
        page: page object queried with ``get_TextRegion`` and extended
            with ``add_TextRegion``.
        region_xywh: dict whose ``'features'`` entry is extended in place
            with ``",textline"`` and used as comment of the line images.
        page_id: page identifier, used for saving images and in warnings.
        input_file: input file object; its ``ID`` is the basis of the
            line image file IDs.
        n: running number of the input file, used for the image file ID
            when it cannot be derived from ``input_file.ID``.
    """
    binary = ocrolib.pil2array(page_image)
    # Normalise to [0, 1] and invert.
    binary = np.array(1 - binary / np.amax(binary), 'B')

    regions = page.get_TextRegion()
    if not regions:
        # The array is indexed [row, column], so axis 0 is the vertical
        # (y) extent and axis 1 the horizontal (x) extent.
        max_y, max_x = binary.shape[0], binary.shape[1]
        textregion = TextRegionType(
            Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" %
                              (0, 0, max_x, 0, max_x, max_y, 0, max_y)))
        page.add_TextRegion(textregion)
    else:
        textregion = regions[-1]

    scale = self.parameter['scale']
    if scale == 0:
        scale = psegutils.estimate_scale(binary)
    if (np.isnan(scale) or scale > 1000.0
            or scale < self.parameter['minscale']):
        LOG.warning("%s: bad scale (%g); skipping\n", page_id, scale)
        return

    segmentation = self.compute_segmentation(binary, scale)
    line_count = np.amax(segmentation)
    if line_count > self.parameter['maxlines']:
        LOG.warning("%s: too many lines %i", page_id, line_count)
        return

    lines = psegutils.compute_lines(segmentation, scale)
    order = psegutils.reading_order([rec.bounds for rec in lines])
    lines = [lines[i] for i in psegutils.topsort(order)]

    region_xywh['features'] += ",textline"

    # The file ID prefix is the same for every line of this page.
    file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
    if file_id == input_file.ID:
        file_id = concat_padded(self.image_grp, n)

    for i, rec in enumerate(lines):
        # bounds is (row slice, column slice), the same order used to
        # index ``binary`` below: rows are y, columns are x.
        row_slice, col_slice = rec.bounds[0], rec.bounds[1]
        min_y, max_y = row_slice.start, row_slice.stop
        min_x, max_x = col_slice.start, col_slice.stop

        img = binary[row_slice, col_slice]
        img = np.array(255 * (img > ocrolib.midrange(img)), 'B')
        img = ocrolib.array2pil(img)

        file_path = self.workspace.save_image_file(
            img, file_id + "_" + str(i),
            page_id=page_id,
            file_grp=self.image_grp)
        ai = AlternativeImageType(filename=file_path,
                                  comments=region_xywh['features'])
        textline = TextLineType(
            Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" %
                              (min_x, min_y, max_x, min_y,
                               max_x, max_y, min_x, max_y)))
        textline.add_AlternativeImage(ai)
        textregion.add_TextLine(textline)