def _add_annotation(annotations, segment, imgid, catid, coords=None, mask=None):
    """Append one MS-COCO annotation record for the given PAGE segment.

    The segmentation is either the raw polygon (flattened COCO style) or,
    if ``mask`` is given, an RLE-encoded raster mask restricted to the
    segment's polygon outline (requires ``coords`` for conversion into
    image coordinates).
    """
    log = getLogger('processor.EvaluateSegmentation')
    segment_coords = segment.get_Coords()
    confidence = segment_coords.get_conf() or 1.0
    outline = polygon_from_points(segment_coords.points)
    # a polygon needs at least 3 points to enclose any area
    if len(outline) < 3:
        log.warning('ignoring segment "%s" with only %d points',
                    segment.id, len(outline))
        return
    bbox = xywh_from_polygon(outline)
    if mask is None:
        # COCO polygon segmentation: one flat [x0, y0, x1, y1, ...] list
        segmentation = np.array(outline).reshape(1, -1).tolist()
    else:
        # raster segmentation: rasterize polygon, intersect with mask, RLE-encode
        outline = coordinates_of_segment(segment, None, coords)
        rows, cols = draw.polygon(outline[:, 1], outline[:, 0], mask.shape)
        # pycocotools.mask wants Fortran-contiguous arrays
        rasterized = np.zeros(mask.shape, dtype=np.uint8, order='F')
        rasterized[rows, cols] = 1 * mask[rows, cols]
        segmentation = encodeMask(rasterized)
    record = {
        'segment_id': segment.id,  # non-standard string-valued in addition to 'id'
        'image_id': imgid,
        'category_id': catid,
        'segmentation': segmentation,
        'area': Polygon(outline).area,
        'bbox': [bbox['x'], bbox['y'], bbox['w'], bbox['h']],
        'score': confidence,
        'iscrowd': 0,
    }
    annotations.append(record)
def sanitize_page(self, page, page_id):
    """Shrink each text region to the convex area covered by its text lines.

    For every TextRegion of ``page``, rasterize all TextLine polygons into
    a binary mask, morphologically close them (using the median line height
    as scale), then find the outer contour and replace the region's
    coordinates with a simplified version of that contour (in absolute
    page coordinates). Regions with non-contiguous contours are skipped.
    """
    regions = page.get_TextRegion()
    page_image, page_coords, _ = self.workspace.image_from_page(
        page, page_id)
    for region in regions:
        LOG.info('Sanitizing region "%s"', region.id)
        lines = region.get_TextLine()
        if not lines:
            # FIX: without any lines, np.median of an empty array yields nan
            # and int(nan) raises — skip such regions (as the newer
            # RepairSegmentation variant of this method does)
            LOG.warning('Page "%s" region "%s" contains no textlines',
                        page_id, region.id)
            continue
        heights = []
        # get labels:
        region_mask = np.zeros((page_image.height, page_image.width),
                               dtype=np.uint8)
        for line in lines:
            line_polygon = coordinates_of_segment(line, page_image, page_coords)
            heights.append(xywh_from_polygon(line_polygon)['h'])
            region_mask[draw.polygon(line_polygon[:, 1],
                                     line_polygon[:, 0],
                                     region_mask.shape)] = 1
            region_mask[draw.polygon_perimeter(line_polygon[:, 1],
                                               line_polygon[:, 0],
                                               region_mask.shape)] = 1
        # estimate scale (median text line height):
        scale = int(np.median(np.array(heights)))
        # close labels:
        region_mask = np.pad(region_mask, scale)  # protect edges
        region_mask = filters.maximum_filter(region_mask, (scale, 1), origin=0)
        region_mask = filters.minimum_filter(region_mask, (scale, 1), origin=0)
        region_mask = region_mask[scale:-scale, scale:-scale]  # unprotect
        # find outer contour (parts):
        contours, _ = cv2.findContours(region_mask, cv2.RETR_EXTERNAL,
                                       cv2.CHAIN_APPROX_SIMPLE)
        # determine areas of parts:
        areas = [cv2.contourArea(contour) for contour in contours]
        total_area = sum(areas)
        if not total_area:
            # ignore if too small
            LOG.warning('Zero contour area in region "%s"', region.id)
            continue
        # pick contour and convert to absolute:
        region_polygon = None
        for i, contour in enumerate(contours):
            area = areas[i]
            if area / total_area < 0.1:
                LOG.warning('Ignoring contour %d too small (%d/%d) in region "%s"',
                            i, area, total_area, region.id)
                continue
            # simplify shape:
            polygon = cv2.approxPolyDP(contour, 2, False)[:, 0, ::]  # already ordered x,y
            if len(polygon) < 4:
                LOG.warning('Ignoring contour %d less than 4 points in region "%s"',
                            i, region.id)
                continue
            if region_polygon is not None:
                # more than one significant part — cannot represent as one polygon
                LOG.error('Skipping region "%s" due to non-contiguous contours',
                          region.id)
                region_polygon = None
                break
            region_polygon = coordinates_for_segment(polygon, page_image,
                                                     page_coords)
        if region_polygon is not None:
            LOG.info('Using new coordinates for region "%s"', region.id)
            region.get_Coords().points = points_from_polygon(region_polygon)
def test_xywh_from_polygon(self):
    """An axis-aligned unit square (scaled by 100) yields its own bbox."""
    square = [[100, 100], [200, 100], [200, 200], [100, 200]]
    expected = {'x': 100, 'y': 100, 'w': 100, 'h': 100}
    self.assertEqual(xywh_from_polygon(square), expected)
def process(self):
    """Extract page images and region descriptions (type and coordinates) from the workspace.
    
    Open and deserialize PAGE input files and their respective images,
    then iterate over the element hierarchy down to the region level.
    
    Get all regions with their types (region element class), sub-types
    (@type) and coordinates relative to the page (which depending on
    the workflow could already be cropped, deskewed, dewarped,
    binarized etc). Extract the image of the (cropped, deskewed,
    dewarped) page, both in binarized form (if available) and
    non-binarized form. In addition, create a new image with masks for
    all regions, color-coded by type. Create two JSON files with region
    types and coordinates: one (page-wise) in our custom format and one
    (global) in MS-COCO.
    
    The output file group may be given as a comma-separated list to
    separate these 3 page-level images. Write files as follows:
    * in the first (or only) output file group (directory):
      - ID + '.png': raw image of the (preprocessed) page
      - ID + '.json': region coordinates/classes (custom format)
    * in the second (or first) output file group (directory):
      - ID + '.bin.png': binarized image of the (preprocessed) page, if available
    * in the third (or first) output file group (directory):
      - ID + '.dbg.png': debug image
    
    In addition, write a file for all pages at once:
    * in the third (or first) output file group (directory):
      - output_file_grp + '.coco.json': region coordinates/classes (MS-COCO format)
      - output_file_grp + '.colordict.json': color definitions (as in PAGE viewer)
    
    (This is intended for training and evaluation of region segmentation models.)
    """
    # NOTE(review): LOG is presumably a module-level logger here — it is not
    # defined inside this method; verify against the file header.
    file_groups = self.output_file_grp.split(',')
    if len(file_groups) > 3:
        raise Exception(
            "at most 3 output file grps allowed (raw, [binarized, [mask]] image)"
        )
    # resolve the (up to 3) output fileGrps, falling back to the first one:
    if len(file_groups) > 2:
        dbg_image_grp = file_groups[2]
    else:
        dbg_image_grp = file_groups[0]
        LOG.info(
            "No output file group for debug images specified, falling back to output filegrp '%s'",
            dbg_image_grp)
    if len(file_groups) > 1:
        bin_image_grp = file_groups[1]
    else:
        bin_image_grp = file_groups[0]
        LOG.info(
            "No output file group for binarized images specified, falling back to output filegrp '%s'",
            bin_image_grp)
    self.output_file_grp = file_groups[0]
    # COCO: init data structures
    images = list()
    annotations = list()
    categories = list()
    # build one COCO category per CLASSES entry (color-coded mask classes);
    # entries of the form 'Super:name' become name with supercategory Super
    i = 0
    for cat, color in CLASSES.items():
        # COCO format does not allow alpha channel
        color = (int(color[0:2], 16), int(color[2:4], 16), int(color[4:6], 16))
        try:
            supercat, name = cat.split(':')
        except ValueError:
            name = cat
            supercat = ''
        categories.append({
            'id': i,
            'name': name,
            'supercategory': supercat,
            'source': 'PAGE',
            'color': color
        })
        i += 1
    i = 0
    # pylint: disable=attribute-defined-outside-init
    for n, input_file in enumerate(self.input_files):
        file_id = input_file.ID.replace(self.input_file_grp,
                                        self.output_file_grp)
        page_id = input_file.pageId or input_file.ID
        # COCO needs numeric image ids: strip the non-numeric surroundings
        # NOTE(review): this raises ValueError on page IDs without digits —
        # the newer variant of this processor wraps it in try/except.
        num_page_id = int(page_id.strip(page_id.strip("0123456789")))
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        ptype = page.get_type()
        metadata = pcgts.get_Metadata()  # ensured by from_file()
        # record this processing step (tool + parameters) in the PAGE metadata
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(externalModel="ocrd-tool",
                               externalId="parameters",
                               Label=[
                                   LabelType(type_=name,
                                             value=self.parameter[name])
                                   for name in self.parameter
                               ])
                ]))
        # raw (non-binarized) page image, optionally with alpha channel
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page,
            page_id,
            feature_filter='binarized',
            transparency=self.parameter['transparency'])
        # derive DPI (convert cm-based resolution to inch-based)
        if page_image_info.resolution != 1:
            dpi = page_image_info.resolution
            if page_image_info.resolutionUnit == 'cm':
                dpi = round(dpi * 2.54)
        else:
            dpi = None
        file_path = self.workspace.save_image_file(
            page_image,
            file_id,
            self.output_file_grp,
            page_id=page_id,
            mimetype=self.parameter['mimetype'])
        # also save the binarized variant, if one exists (best-effort)
        try:
            page_image_bin, _, _ = self.workspace.image_from_page(
                page,
                page_id,
                feature_selector='binarized',
                transparency=self.parameter['transparency'])
            self.workspace.save_image_file(page_image_bin,
                                           file_id + '.bin',
                                           bin_image_grp,
                                           page_id=page_id)
        except Exception as err:
            if err.args[0].startswith('Found no AlternativeImage'):
                LOG.warning(
                    'Page "%s" has no binarized images, skipping .bin',
                    page_id)
            else:
                raise
        # start the debug (mask) image with the background class color,
        # then paint the page frame (Border) over it
        page_image_dbg = Image.new(mode='RGBA',
                                   size=page_image.size,
                                   color='#' + CLASSES[''])
        if page.get_Border():
            polygon = coordinates_of_segment(page.get_Border(), page_image,
                                             page_coords).tolist()
            ImageDraw.Draw(page_image_dbg).polygon(
                list(map(tuple, polygon)), fill='#' + CLASSES['Border'])
        else:
            page_image_dbg.paste(
                '#' + CLASSES['Border'],
                (0, 0, page_image.width, page_image.height))
        # gather all region lists by (top-level) region class
        regions = dict()
        for name in CLASSES.keys():
            if not name or name == 'Border' or ':' in name:
                # no subtypes here
                continue
            regions[name] = getattr(page, 'get_' + name)()
        description = {'angle': page.get_orientation()}
        Neighbor = namedtuple('Neighbor', ['id', 'poly', 'type'])
        neighbors = []
        for rtype, rlist in regions.items():
            for region in rlist:
                # only these region classes carry a @type sub-type
                if rtype in ['TextRegion', 'ChartRegion', 'GraphicRegion']:
                    subrtype = region.get_type()
                else:
                    subrtype = None
                polygon = coordinates_of_segment(region, page_image,
                                                 page_coords)
                polygon2 = polygon.reshape(1, -1).tolist()  # COCO flat form
                polygon = polygon.tolist()
                xywh = xywh_from_polygon(polygon)
                # validate coordinates and check intersection with neighbours
                # (which would melt into another in the mask image):
                try:
                    poly = Polygon(polygon)
                    reason = ''
                except ValueError as err:
                    reason = err
                # NOTE(review): if Polygon() raised above, `poly` is unbound
                # here and `.is_valid` would raise NameError — confirm intent.
                if not poly.is_valid:
                    reason = explain_validity(poly)
                elif poly.is_empty:
                    reason = 'is empty'
                elif poly.bounds[0] < 0 or poly.bounds[1] < 0:
                    reason = 'is negative'
                elif poly.length < 4:
                    reason = 'has too few points'
                if reason:
                    LOG.error('Page "%s" region "%s" %s', page_id, region.id,
                              reason)
                    continue
                # warn about overlaps with previously seen regions
                poly_prep = prep(poly)
                for neighbor in neighbors:
                    if (rtype == neighbor.type
                            and poly_prep.intersects(neighbor.poly)
                            and poly.intersection(neighbor.poly).area > 0):
                        LOG.warning(
                            'Page "%s" region "%s" intersects neighbour "%s" (IoU: %.3f)',
                            page_id, region.id, neighbor.id,
                            poly.intersection(neighbor.poly).area / \
                            poly.union(neighbor.poly).area)
                    elif (rtype != neighbor.type
                          and poly_prep.within(neighbor.poly)):
                        LOG.warning(
                            'Page "%s" region "%s" within neighbour "%s" (IoU: %.3f)',
                            page_id, region.id, neighbor.id,
                            poly.area / neighbor.poly.area)
                neighbors.append(Neighbor(region.id, poly, rtype))
                area = poly.area
                # custom JSON description entry for this region
                description.setdefault('regions', []).append({
                    'type': rtype,
                    'subtype': subrtype,
                    'coords': polygon,
                    'area': area,
                    'features': page_coords['features'],
                    'DPI': dpi,
                    'region.ID': region.id,
                    'page.ID': page_id,
                    'page.type': ptype,
                    'file_grp': self.input_file_grp,
                    'METS.UID': self.workspace.mets.unique_identifier
                })
                # draw region:
                ImageDraw.Draw(page_image_dbg).polygon(
                    list(map(tuple, polygon)),
                    fill='#' +
                    CLASSES[(rtype + ':' + subrtype) if subrtype else rtype])
                # COCO: add annotations
                i += 1
                annotations.append({
                    'id': i,
                    'image_id': num_page_id,
                    # prefer the sub-type category, fall back to the class category
                    'category_id': next(
                        (cat['id'] for cat in categories
                         if cat['name'] == subrtype),
                        next((cat['id'] for cat in categories
                              if cat['name'] == rtype))),
                    'segmentation': polygon2,
                    'area': area,
                    'bbox': [xywh['x'], xywh['y'], xywh['w'], xywh['h']],
                    'iscrowd': 0
                })
        self.workspace.save_image_file(page_image_dbg,
                                       file_id + '.dbg',
                                       dbg_image_grp,
                                       page_id=page_id,
                                       mimetype=self.parameter['mimetype'])
        self.workspace.add_file(
            ID=file_id + '.json',
            file_grp=dbg_image_grp,
            pageId=page_id,
            local_filename=file_path.replace(
                MIME_TO_EXT[self.parameter['mimetype']], '.json'),
            mimetype='application/json',
            content=json.dumps(description))
        # COCO: add image
        images.append({
            # COCO does not allow string identifiers:
            # -> use numerical part of page_id
            'id': num_page_id,
            # all exported coordinates are relative to the cropped page:
            # -> use that for reference (instead of original page.imageFilename)
            'file_name': file_path,
            # -> use its size (instead of original page.imageWidth/page.imageHeight)
            'width': page_image.width,
            'height': page_image.height
        })
    # COCO: write result
    file_id = dbg_image_grp + '.coco.json'
    LOG.info('Writing COCO result file "%s" in "%s"', file_id, dbg_image_grp)
    self.workspace.add_file(ID=file_id,
                            file_grp=dbg_image_grp,
                            local_filename=os.path.join(
                                dbg_image_grp, file_id),
                            mimetype='application/json',
                            content=json.dumps({
                                'categories': categories,
                                'images': images,
                                'annotations': annotations
                            }))
    # write inverse colordict (for ocrd-segment-from-masks)
    file_id = dbg_image_grp + '.colordict.json'
    LOG.info('Writing colordict file "%s" in .', file_id)
    with open(file_id, 'w') as out:
        json.dump(
            dict(('#' + col, name) for name, col in CLASSES.items() if name),
            out)
def process(self):
    """Extract page images and region descriptions (type and coordinates) from the workspace.
    
    Open and deserialize PAGE input files and their respective images,
    then iterate over the element hierarchy down to the region level.
    
    Get all regions with their types (region element class), sub-types
    (@type) and coordinates relative to the page (which depending on
    the workflow could already be cropped, deskewed, dewarped,
    binarized etc). Extract the image of the (cropped, deskewed,
    dewarped) page, both in binarized form (if available) and raw form.
    For the latter, apply ``feature_filter`` (a comma-separated list of
    image features, cf. :py:func:`ocrd.workspace.Workspace.image_from_page`)
    to skip specific features when retrieving derived images.
    If ``transparency`` is true, then also add an alpha channel which is
    fully transparent outside of the mask.
    
    In addition, create a new (third) image with masks for each segment
    type in ``plot_segmasks``, color-coded by class according to
    ``colordict``.
    
    Create two JSON files with region types and coordinates: one
    (page-wise) in our custom format and one (global) in MS-COCO.
    
    The output file group may be given as a comma-separated list to
    separate these 3 kinds of images. If fewer than 3 fileGrps are
    specified, they will share the same fileGrp (and directory).
    In particular, write files as follows:
    * in the first (or only) output file group (directory):
      - ID + '.png': raw image of the page (preprocessed, but with ``feature_filter``)
      - ID + '.json': region coordinates/classes (custom format)
    * in the second (or only) output file group (directory):
      - ID + '.bin.png': binarized image of the (preprocessed) page, if available
    * in the third (or second or only) output file group (directory):
      - ID + '.pseg.png': mask image of page; contents depend on ``plot_segmasks``:
        1. if it contains `page`, fill page frame,
        2. if it contains `region`, fill region segmentation/classification,
        3. if it contains `line`, fill text line segmentation,
        4. if it contains `word`, fill word segmentation,
        5. if it contains `glyph`, fill glyph segmentation,
        where each follow-up layer and segment draws over the previous state,
        starting with a blank (white) image - unless ``plot_overlay`` is true,
        in which case each layer and segment is superimposed (alpha blended)
        onto the previous one, starting with the above raw image.
    
    In addition, write a file for all pages at once:
    * in the third (or first) output file group (directory):
      - output_file_grp + '.coco.json': region coordinates/classes (MS-COCO format)
      - output_file_grp + '.colordict.json': the used ``colordict``
    
    (This is intended for training and evaluation of region segmentation models.)
    """
    LOG = getLogger('processor.ExtractPages')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    file_groups = self.output_file_grp.split(',')
    if len(file_groups) > 3:
        raise Exception(
            "at most 3 output file grps allowed (raw, [binarized, [mask]] image)"
        )
    # resolve the (up to 3) output fileGrps, falling back to the first one:
    if len(file_groups) > 2:
        mask_image_grp = file_groups[2]
    else:
        mask_image_grp = file_groups[0]
        LOG.info(
            "No output file group for mask images specified, falling back to output filegrp '%s'",
            mask_image_grp)
    if len(file_groups) > 1:
        bin_image_grp = file_groups[1]
    else:
        bin_image_grp = file_groups[0]
        LOG.info(
            "No output file group for binarized images specified, falling back to output filegrp '%s'",
            bin_image_grp)
    self.output_file_grp = file_groups[0]
    classes = self.parameter['colordict']
    # COCO: init data structures
    images = list()
    annotations = list()
    categories = list()
    # build one COCO category per colordict entry;
    # entries of the form 'Super:name' become name with supercategory Super
    i = 0
    for cat, color in classes.items():
        # COCO format does not allow alpha channel
        color = (int(color[0:2], 16), int(color[2:4], 16), int(color[4:6], 16))
        try:
            supercat, name = cat.split(':')
        except ValueError:
            name = cat
            supercat = ''
        categories.append({
            'id': i,
            'name': name,
            'supercategory': supercat,
            'source': 'PAGE',
            'color': color
        })
        i += 1
    i = 0
    # pylint: disable=attribute-defined-outside-init
    for n, input_file in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        try:
            # separate non-numeric part of page ID to retain the numeric part
            num_page_id = int(page_id.strip(page_id.strip("0123456789")))
        except Exception:
            # fall back to the file's position as numeric COCO image id
            num_page_id = n
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        ptype = page.get_type()
        # raw page image (with requested feature_filter), optionally with alpha
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page,
            page_id,
            feature_filter=self.parameter['feature_filter'],
            transparency=self.parameter['transparency'])
        # derive DPI (convert cm-based resolution to inch-based)
        if page_image_info.resolution != 1:
            dpi = page_image_info.resolution
            if page_image_info.resolutionUnit == 'cm':
                dpi = round(dpi * 2.54)
        else:
            dpi = None
        file_id = make_file_id(input_file, self.output_file_grp)
        file_path = self.workspace.save_image_file(
            page_image,
            file_id,
            self.output_file_grp,
            page_id=page_id,
            mimetype=self.parameter['mimetype'])
        # also save the binarized variant, if one exists (best-effort)
        try:
            page_image_bin, _, _ = self.workspace.image_from_page(
                page,
                page_id,
                feature_selector='binarized',
                transparency=self.parameter['transparency'])
            self.workspace.save_image_file(page_image_bin,
                                           file_id + '.bin',
                                           bin_image_grp,
                                           page_id=page_id)
        except Exception as err:
            if err.args[0].startswith('Found no AlternativeImage'):
                LOG.warning(
                    'Page "%s" has no binarized images, skipping .bin',
                    page_id)
            else:
                raise
        # init multi-level mask output
        if self.parameter['plot_overlay']:
            page_image_segmask = page_image.convert('RGBA')
        else:
            page_image_segmask = Image.new(mode='RGBA',
                                           size=page_image.size,
                                           color='#FFFFFF00')
        neighbors = dict()
        for level in ['page', 'region', 'line', 'word', 'glyph']:
            neighbors[level] = list()
        # produce border mask plot, if necessary
        if page.get_Border():
            poly = segment_poly(page_id, page.get_Border(), page_coords)
        else:
            poly = Polygon(
                polygon_from_bbox(0, 0, page_image.width, page_image.height))
        if 'page' in self.parameter['plot_segmasks']:
            plot_segment(page_id, page.get_Border(), poly, 'Border', classes,
                         page_image_segmask, [],
                         self.parameter['plot_overlay'])
        # get regions and aggregate masks on all hierarchy levels
        description = {'angle': page.get_orientation()}
        regions = dict()
        for name in classes.keys():
            if not name or not name.endswith('Region'):
                # no region subtypes or non-region types here
                continue
            #regions[name] = getattr(page, 'get_' + name)()
            regions[name] = page.get_AllRegions(classes=name[:-6],
                                                order='reading-order')
        for rtype, rlist in regions.items():
            for region in rlist:
                # only these region classes carry a @type sub-type
                if rtype in ['TextRegion', 'ChartRegion', 'GraphicRegion']:
                    subrtype = region.get_type()
                else:
                    subrtype = None
                if subrtype:
                    rtype0 = rtype + ':' + subrtype
                else:
                    rtype0 = rtype
                poly = segment_poly(page_id, region, page_coords)
                # produce region mask plot, if necessary
                if 'region' in self.parameter['plot_segmasks']:
                    plot_segment(page_id, region, poly, rtype0, classes,
                                 page_image_segmask, neighbors['region'],
                                 self.parameter['plot_overlay'])
                # NOTE(review): the nested loops below re-bind `poly` for each
                # line/word/glyph, so the description/annotation added after
                # them uses the LAST such polygon, not the region's — confirm
                # this is intended when 'line'/'word'/'glyph' plotting is on.
                if rtype == 'TextRegion':
                    lines = region.get_TextLine()
                    for line in lines:
                        # produce line mask plot, if necessary
                        poly = segment_poly(page_id, line, page_coords)
                        if 'line' in self.parameter['plot_segmasks']:
                            plot_segment(page_id, line, poly, 'TextLine',
                                         classes, page_image_segmask,
                                         neighbors['line'],
                                         self.parameter['plot_overlay'])
                        words = line.get_Word()
                        for word in words:
                            # produce word mask plot, if necessary
                            poly = segment_poly(page_id, word, page_coords)
                            if 'word' in self.parameter['plot_segmasks']:
                                plot_segment(page_id, word, poly, 'Word',
                                             classes, page_image_segmask,
                                             neighbors['word'],
                                             self.parameter['plot_overlay'])
                            glyphs = word.get_Glyph()
                            for glyph in glyphs:
                                # produce glyph mask plot, if necessary
                                poly = segment_poly(page_id, glyph,
                                                    page_coords)
                                if 'glyph' in self.parameter['plot_segmasks']:
                                    plot_segment(
                                        page_id, glyph, poly, 'Glyph',
                                        classes, page_image_segmask,
                                        neighbors['glyph'],
                                        self.parameter['plot_overlay'])
                if not poly:
                    continue
                # FIX: np.int was removed in NumPy 1.24 — use builtin int
                polygon = np.array(poly.exterior, int)[:-1].tolist()
                xywh = xywh_from_polygon(polygon)
                area = poly.area
                # custom JSON description entry for this region
                description.setdefault('regions', []).append({
                    'type': rtype,
                    'subtype': subrtype,
                    'coords': polygon,
                    'area': area,
                    'features': page_coords['features'],
                    'DPI': dpi,
                    'region.ID': region.id,
                    'page.ID': page_id,
                    'page.type': ptype,
                    'file_grp': self.input_file_grp,
                    'METS.UID': self.workspace.mets.unique_identifier
                })
                # COCO: add annotations
                i += 1
                annotations.append({
                    'id': i,
                    'image_id': num_page_id,
                    # prefer the sub-type category, fall back to the class category
                    'category_id': next(
                        (cat['id'] for cat in categories
                         if cat['name'] == subrtype),
                        next((cat['id'] for cat in categories
                              if cat['name'] == rtype))),
                    # FIX: np.int was removed in NumPy 1.24 — use builtin int
                    'segmentation': np.array(poly.exterior,
                                             int)[:-1].reshape(1, -1).tolist(),
                    'area': area,
                    'bbox': [xywh['x'], xywh['y'], xywh['w'], xywh['h']],
                    'iscrowd': 0
                })
        if 'order' in self.parameter['plot_segmasks']:
            plot_order(page.get_ReadingOrder(), classes, page_image_segmask,
                       neighbors['region'], self.parameter['plot_overlay'])
        if self.parameter['plot_segmasks']:
            self.workspace.save_image_file(
                page_image_segmask,
                file_id + '.pseg',
                mask_image_grp,
                page_id=page_id,
                mimetype=self.parameter['mimetype'])
        self.workspace.add_file(
            ID=file_id + '.json',
            file_grp=mask_image_grp,
            pageId=input_file.pageId,
            local_filename=file_path.replace(
                MIME_TO_EXT[self.parameter['mimetype']], '.json'),
            mimetype='application/json',
            content=json.dumps(description))
        # COCO: add image
        images.append({
            # COCO does not allow string identifiers:
            # -> use numerical part of page_id
            'id': num_page_id,
            # all exported coordinates are relative to the cropped page:
            # -> use that for reference (instead of original page.imageFilename)
            'file_name': file_path,
            # -> use its size (instead of original page.imageWidth/page.imageHeight)
            'width': page_image.width,
            'height': page_image.height
        })
    # COCO: write result
    file_id = mask_image_grp + '.coco.json'
    LOG.info('Writing COCO result file "%s" in "%s"', file_id, mask_image_grp)
    self.workspace.add_file(ID=file_id,
                            file_grp=mask_image_grp,
                            local_filename=os.path.join(
                                mask_image_grp, file_id),
                            mimetype='application/json',
                            pageId=None,
                            content=json.dumps({
                                'categories': categories,
                                'images': images,
                                'annotations': annotations
                            }))
    # write inverse colordict (for ocrd-segment-from-masks)
    file_id = mask_image_grp + '.colordict.json'
    LOG.info('Writing colordict file "%s" in .', file_id)
    with open(file_id, 'w') as out:
        json.dump(dict((col, name) for name, col in classes.items() if name),
                  out)
def test_deskewing(plain_workspace):
    """Deskewing a non-rectangular region must preserve foreground and size
    both when computed on the fly and when reloaded via AlternativeImage."""
    #from ocrd_utils import initLogging, setOverrideLogLevel
    #setOverrideLogLevel('DEBUG')
    size = (3000, 4000)
    poly = [[1403, 2573], [1560, 2573], [1560, 2598], [2311, 2598],
            [2311, 2757], [2220, 2757], [2220, 2798], [2311, 2798],
            [2311, 2908], [1403, 2908]]
    xywh = xywh_from_polygon(poly)
    bbox = bbox_from_polygon(poly)
    skew = 4.625
    image = Image.new('L', size)
    image = polygon_mask(image, poly)
    #image.show(title='image')
    pixels = np.count_nonzero(np.array(image) > 0)
    name = 'foo0'
    assert plain_workspace.save_image_file(image, name, 'IMG')
    pcgts = page_from_file(next(plain_workspace.mets.find_files(ID=name)))
    page = pcgts.get_Page()
    region = TextRegionType(
        id='nonrect',
        Coords=CoordsType(points=points_from_polygon(poly)),
        orientation=-skew)
    page.add_TextRegion(region)
    page_image, page_coords, _ = plain_workspace.image_from_page(page, '')
    #page_image.show(title='page_image')
    assert list(image.getdata()) == list(page_image.getdata())
    assert np.all(page_coords['transform'] == np.eye(3))
    # without deskewing: cropped to the minimal bbox only
    reg_image, reg_coords = plain_workspace.image_from_segment(
        region, page_image, page_coords, feature_filter='deskewed', fill=0)
    assert list(image.crop(bbox).getdata()) == list(reg_image.getdata())
    assert reg_image.width == xywh['w'] == 908
    assert reg_image.height == xywh['h'] == 335
    assert reg_coords['transform'][0, 2] == -xywh['x']
    assert reg_coords['transform'][1, 2] == -xywh['y']
    # same fg after cropping to minimal bbox
    reg_pixels = np.count_nonzero(np.array(reg_image) > 0)
    assert pixels == reg_pixels
    # now with deskewing (test for size after recropping)
    reg_image, reg_coords = plain_workspace.image_from_segment(
        region, page_image, page_coords, fill=0)
    #reg_image.show(title='reg_image')
    assert reg_image.width == 932 > xywh['w']
    assert reg_image.height == 382 > xywh['h']
    assert reg_coords['transform'][0, 1] != 0
    assert reg_coords['transform'][1, 0] != 0
    assert 'deskewed' in reg_coords['features']
    # same fg after cropping to minimal bbox (roughly - due to aliasing)
    reg_pixels = np.count_nonzero(np.array(reg_image) > 0)
    assert np.abs(pixels - reg_pixels) / pixels < 0.005
    reg_array = np.array(reg_image) > 0
    # now via AlternativeImage
    path = plain_workspace.save_image_file(reg_image, region.id + '_img', 'IMG')
    region.add_AlternativeImage(
        AlternativeImageType(filename=path,
                             comments=reg_coords['features']))
    # capture warnings from crop_image: loading the AlternativeImage must not
    # trigger any recropping complaints
    logger_capture = FIFOIO(256)
    logger_handler = logging.StreamHandler(logger_capture)
    #logger_handler.setFormatter(logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT))
    logger = logging.getLogger('ocrd_utils.crop_image')
    logger.addHandler(logger_handler)
    reg_image2, reg_coords2 = plain_workspace.image_from_segment(
        region, page_image, page_coords, fill=0)
    #reg_image2.show(title='reg_image2')
    logger_output = logger_capture.getvalue()
    logger_capture.close()
    assert logger_output == ''
    assert reg_image2.width == reg_image.width
    assert reg_image2.height == reg_image.height
    assert np.allclose(reg_coords2['transform'], reg_coords['transform'])
    assert reg_coords2['features'] == reg_coords['features']
    # same fg after cropping to minimal bbox (roughly - due to aliasing)
    # FIX: must count pixels of reg_image2 (the reloaded image), not reg_image —
    # the original compared reg_image against itself, making the assert vacuous
    reg_pixels2 = np.count_nonzero(np.array(reg_image2) > 0)
    assert reg_pixels2 == reg_pixels
    reg_array2 = np.array(reg_image2) > 0
    assert 0.98 < np.sum(reg_array == reg_array2) / reg_array.size <= 1.0
def sanitize_page(self, page, page_id):
    """Shrink each text region to the hull polygon of its text lines.
    
    For every TextRegion of ``page`` (of any hierarchy level, in reading
    order), rasterize all TextLine polygons into a binary mask,
    morphologically close them (with a scale estimated from line heights
    and interline spacing), find the outer contour, simplify and validate
    it, and replace the region's coordinates with the result (converted
    back to absolute page coordinates). Regions without lines, with zero
    contour area, or with non-contiguous contours are left unchanged.
    """
    LOG = getLogger('processor.RepairSegmentation')
    regions = page.get_AllRegions(classes=['Text'])
    page_image, page_coords, _ = self.workspace.image_from_page(
        page, page_id)
    for region in regions:
        LOG.info('Sanitizing region "%s"', region.id)
        lines = region.get_TextLine()
        if not lines:
            # nothing to aggregate from — keep existing coordinates
            LOG.warning('Page "%s" region "%s" contains no textlines',
                        page_id, region.id)
            continue
        heights = []
        tops = []
        # get labels:
        region_mask = np.zeros((page_image.height, page_image.width),
                               dtype=np.uint8)
        for line in lines:
            line_polygon = coordinates_of_segment(line, page_image,
                                                  page_coords)
            line_xywh = xywh_from_polygon(line_polygon)
            heights.append(line_xywh['h'])
            tops.append(line_xywh['y'])
            # fill both interior and outline of each line polygon
            region_mask[draw.polygon(line_polygon[:, 1],
                                     line_polygon[:, 0],
                                     region_mask.shape)] = 1
            region_mask[draw.polygon_perimeter(line_polygon[:, 1],
                                               line_polygon[:, 0],
                                               region_mask.shape)] = 1
        # estimate scale (tallest line, or largest interline gap if larger):
        heights = np.array(heights)
        scale = int(np.max(heights))
        tops = np.array(tops)
        order = np.argsort(tops)
        heights = heights[order]
        tops = tops[order]
        if len(lines) > 1:
            # if interline spacing is larger than line height, use this
            bottoms = tops + heights
            deltas = tops[1:] - bottoms[:-1]
            scale = max(scale, int(np.max(deltas)))
        # close labels:
        region_mask = np.pad(region_mask, scale)  # protect edges
        region_mask = np.array(morphology.binary_closing(
            region_mask, np.ones((scale, 1))), dtype=np.uint8)
        region_mask = region_mask[scale:-scale, scale:-scale]  # unprotect
        # extend margins (to ensure simplified hull polygon is outside children):
        region_mask = filters.maximum_filter(region_mask, 3)  # 1px in each direction
        # find outer contour (parts):
        contours, _ = cv2.findContours(region_mask, cv2.RETR_EXTERNAL,
                                       cv2.CHAIN_APPROX_SIMPLE)
        # determine areas of parts:
        areas = [cv2.contourArea(contour) for contour in contours]
        total_area = sum(areas)
        if not total_area:
            # ignore if too small
            LOG.warning('Zero contour area in region "%s"', region.id)
            continue
        # pick contour and convert to absolute:
        region_polygon = None
        for i, contour in enumerate(contours):
            area = areas[i]
            if area / total_area < 0.1:
                # insignificant part (noise) — skip it
                LOG.warning(
                    'Ignoring contour %d too small (%d/%d) in region "%s"',
                    i, area, total_area, region.id)
                continue
            # simplify shape (until valid):
            # can produce invalid (self-intersecting) polygons:
            #polygon = cv2.approxPolyDP(contour, 2, False)[:, 0, ::] # already ordered x,y
            polygon = contour[:, 0, ::]  # already ordered x,y
            polygon = Polygon(polygon).simplify(1)
            # NOTE(review): make_valid may conceivably return a multi-part
            # geometry without .exterior — confirm the helper guarantees a
            # Polygon result.
            polygon = make_valid(polygon)
            polygon = polygon.exterior.coords[:-1]  # keep open
            if len(polygon) < 4:
                LOG.warning(
                    'Ignoring contour %d less than 4 points in region "%s"',
                    i, region.id)
                continue
            if region_polygon is not None:
                # more than one significant part — cannot represent as one polygon
                LOG.error(
                    'Skipping region "%s" due to non-contiguous contours',
                    region.id)
                region_polygon = None
                break
            region_polygon = coordinates_for_segment(
                polygon, page_image, page_coords)
        if region_polygon is not None:
            LOG.info('Using new coordinates for region "%s"', region.id)
            region.get_Coords().set_points(
                points_from_polygon(region_polygon))