def process_segment(self, segment, segment_mask, segment_polygon, neighbours,
                    background_image, parent_image, parent_coords, parent_bin,
                    page_id, file_id):
    """Clip foreground intruding from neighbours out of a segment's image.

    Mask ``segment_polygon`` within ``parent_image``, then, for every
    ``(neighbour, neighbour_mask)`` pair in ``neighbours``, find the
    connected components of ``parent_bin`` which overlap the neighbour's
    mask but lie exclusively within it as far as the segment's mask is
    concerned. Those "intruders" are...

    * subtracted from ``segment_mask`` **in place** (so the same pixels
      will not also be removed from the neighbour's side later on),
    * pasted over with ``background_image`` in the segment image, and
    * made transparent in the segment image (if its mode supports adding
      an alpha channel).

    Finally, crop the image to the bounding box of the polygon, save it
    under ``file_id + '.IMG-CLIP'`` in the output fileGrp, and reference it
    as a new AlternativeImage of ``segment`` (with the features inherited
    from ``parent_coords['features']`` plus ``clipped``).

    NOTE(review): the masks and ``parent_bin`` are presumably numeric
    (non-boolean) Numpy arrays of the parent image's shape -- confirm
    against the caller.

    Returns nothing.
    """
    LOG = getLogger('processor.OcropyClip')
    # initialize AlternativeImage@comments classes from parent, except
    # for those operations that can apply on multiple hierarchy levels:
    features = ','.join(
        [feature for feature in parent_coords['features'].split(',')
         if feature in ['binarized', 'grayscale_normalized',
                        'despeckled', 'dewarped']]) + ',clipped'
    # mask segment within parent image:
    segment_image = image_from_polygon(parent_image, segment_polygon)
    segment_bbox = bbox_from_polygon(segment_polygon)
    # union of the intruders of all neighbours; putalpha() _replaces_ the
    # alpha layer, so applying it once per neighbour would leave only the
    # last neighbour's intruders transparent
    all_intruders = None
    for neighbour, neighbour_mask in neighbours:
        if not np.any(segment_mask > neighbour_mask):
            LOG.info(
                'Ignoring enclosing neighbour "%s" of segment "%s" on page "%s"',
                neighbour.id, segment.id, page_id)
            continue
        # find connected components that (only) belong to the neighbour:
        intruders = segment_mask * morph.keep_marked(
            parent_bin, neighbour_mask > 0)  # overlaps neighbour
        intruders = morph.remove_marked(
            intruders, segment_mask > neighbour_mask)  # but exclusively
        num_intruders = np.count_nonzero(intruders)
        num_foreground = np.count_nonzero(segment_mask * parent_bin)
        if not num_intruders:
            continue
        LOG.debug(
            'segment "%s" vs neighbour "%s": suppressing %d of %d pixels on page "%s"',
            segment.id, neighbour.id, num_intruders, num_foreground, page_id)
        # suppress in segment_mask so these intruders can stay in the neighbours
        # (are not removed from both sides)
        segment_mask -= intruders
        # suppress in derived image result to be annotated
        clip_mask = array2pil(intruders)
        segment_image.paste(background_image, mask=clip_mask)  # suppress in raw image
        if all_intruders is None:
            all_intruders = intruders
        else:
            all_intruders = np.maximum(all_intruders, intruders)
    if (all_intruders is not None and
            segment_image.mode in ['RGB', 'L', 'RGBA', 'LA']):
        # for consumers that do not have to rely on our
        # guessed background color, but can cope with transparency:
        segment_image.putalpha(ImageOps.invert(array2pil(all_intruders)))
    # recrop segment into rectangle, just as image_from_segment would do
    # (and also clipping with background colour):
    segment_image = crop_image(segment_image, box=segment_bbox)
    # update METS (add the image file):
    file_path = self.workspace.save_image_file(
        segment_image,
        file_id=file_id + '.IMG-CLIP',
        page_id=page_id,
        file_grp=self.output_file_grp)
    # update PAGE (reference the image file):
    segment.add_AlternativeImage(
        AlternativeImageType(filename=file_path, comments=features))
def process_page(self, page, page_image, page_xywh, bounds, file_id, page_id):
    """Annotate the detected extent as page Border and save the cropped image.

    ``bounds`` is a ``(left, top, right, bottom)`` tuple relative to
    ``page_image``. If it is empty or inverted, or if nothing remains of it
    after fitting it into the parent, then an error is logged and nothing
    is changed. Otherwise, the extent is enlarged by the ``padding``
    parameter (clamped to the image size), converted to absolute
    coordinates, set as Border of ``page``, and the correspondingly cropped
    image is saved under ``file_id + '.IMG-CROP'`` and referenced as a new
    AlternativeImage (with ``page_xywh['features']`` extended by
    ``cropped``).
    """
    LOG = getLogger('processor.TesserocrCrop')
    x_min, y_min, x_max, y_max = bounds
    if x_min >= x_max or y_min >= y_max:
        LOG.error("Cannot find valid extent for page '%s'", page_id)
        return
    # grow the extent by the configured margin, but stay within the image:
    margin = self.parameter['padding']
    x_min = max(x_min - margin, 0)
    x_max = min(x_max + margin, page_image.width)
    y_min = max(y_min - margin, 0)
    y_max = min(y_max + margin, page_image.height)
    LOG.info("Padded page border: %i:%i,%i:%i", x_min, x_max, y_min, y_max)
    # convert from image-relative to absolute coordinates, then fit into parent:
    outline = polygon_for_parent(
        coordinates_for_segment(
            polygon_from_bbox(x_min, y_min, x_max, y_max),
            page_image, page_xywh),
        page)
    if outline is None:
        LOG.error("Ignoring extant border")
        return
    points = points_from_polygon(outline)
    border = BorderType(Coords=CoordsType(points))
    # fitting into the parent may have shrunk the extent,
    # so derive the crop box from the final border:
    relative_outline = coordinates_of_segment(border, page_image, page_xywh)
    crop_box = bbox_from_polygon(relative_outline)
    # PAGE: annotate the border
    page.set_Border(border)
    # METS: add the cropped image file
    cropped_image = crop_image(page_image, box=crop_box)
    page_xywh['features'] += ',cropped'
    image_path = self.workspace.save_image_file(
        cropped_image,
        file_id + '.IMG-CROP',
        page_id=page_id,
        file_grp=self.output_file_grp)
    # PAGE: reference the cropped image file
    alternative = AlternativeImageType(filename=image_path,
                                       comments=page_xywh['features'])
    page.add_AlternativeImage(alternative)
def image_from_segment(self, segment, parent_image, parent_coords,
                       fill='background', transparency=False,
                       feature_selector='', feature_filter=''):
    """Extract an image for a PAGE-XML hierarchy segment from its parent's image.

    Given...

     * ``parent_image``, a PIL.Image of the parent, with
     * ``parent_coords``, a dict with information about ``parent_image``:
       - ``transform``: a Numpy array with an affine transform which
         converts from absolute coordinates to those relative to the image,
         i.e. after applying all operations (starting with the original image)
       - ``angle``: the rotation/reflection angle applied to the image so far,
       - ``features``: the AlternativeImage @comments for the image, i.e.
         names of all operations that lead up to this result, and
     * ``segment``, a PAGE segment object logically contained in it
       (i.e. TextRegionType / TextLineType / WordType / GlyphType),

    ...extract the segment's corresponding PIL.Image, either from
    AlternativeImage (if it exists), or producing a new image via
    cropping from ``parent_image`` (otherwise).

    If ``feature_selector`` and/or ``feature_filter`` is given, then
    select/filter among the cropped ``parent_image`` and the available
    AlternativeImages the last one which contains all of the selected,
    but none of the filtered features (i.e. @comments classes), or
    raise an error.

    (Required and produced features need not be in the same order, so
    ``feature_selector`` is merely a mask specifying Boolean AND, and
    ``feature_filter`` is merely a mask specifying Boolean OR.
    Features are compared as whole comma-separated names, and an
    AlternativeImage without @comments counts as having no features.)

    Cropping uses a polygon mask (not just the bounding box rectangle).
    Areas outside the polygon will be filled according to ``fill``:

    - if ``background`` (the default),
      then fill with the median color of the image;
    - otherwise, use the given color, e.g. ``white`` or (255,255,255).

    Moreover, if ``transparency`` is true, and unless the image already
    has an alpha channel, then add an alpha channel which is fully opaque
    before cropping and rotating. (Thus, only the exposed areas will be
    transparent afterwards, for those that can interpret alpha channels).

    When cropping, compensate any @orientation angle annotated for the
    parent (from parent-level deskewing) by rotating the segment coordinates
    in an inverse transformation (i.e. translation to center, then passive
    rotation, and translation back).

    Regardless, if any @orientation angle is annotated for the segment
    (from segment-level deskewing), and the chosen image does not have
    the feature "deskewed" yet, and unless "deskewed" is being filtered,
    then rotate it - compensating for any previous ``angle``. (However,
    if @orientation is above the [-45°,45°] interval, then apply as much
    transposition as possible first, unless "rotated-90" / "rotated-180" /
    "rotated-270" is being filtered.)

    Return a tuple:

     * the extracted image,
     * a dictionary with information about the extracted image:
       - ``transform``: a Numpy array with an affine transform which
         converts from absolute coordinates to those relative to the image,
         i.e. after applying all parent operations, and then cropping to
         the segment's bounding box, and deskewing with the segment's
         orientation angle (if any)
       - ``angle``: the rotation/reflection angle applied to the image so far,
       - ``features``: the AlternativeImage @comments for the image, i.e.
         names of all operations that lead up to this result.

    (These can be used to create a new AlternativeImage, or passed down
    for calls on lower hierarchy levels.)

    Raise an ``Exception`` if the resulting image does not have all of the
    selected features, or still has any of the filtered features.

    Example:

     * get a raw (colored) but already deskewed and cropped image:

       ``image, xywh = workspace.image_from_segment(region,
             page_image, page_xywh,
             feature_selector='deskewed,cropped',
             feature_filter='binarized,grayscale_normalized')``
    """
    log = getLogger('ocrd.workspace.image_from_segment')
    # split the comma-separated constraints only once (ignoring empty names):
    selected = [feature for feature in feature_selector.split(',') if feature]
    filtered = [feature for feature in feature_filter.split(',') if feature]
    # note: We should mask overlapping neighbouring segments here,
    # but finding the right clipping rules can be difficult if operating
    # on the raw (non-binary) image data alone: for each intersection, it
    # must be decided which one of either segment or neighbour to assign,
    # e.g. an ImageRegion which properly contains our TextRegion should be
    # completely ignored, but an ImageRegion which is properly contained
    # in our TextRegion should be completely masked, while partial overlap
    # may be more difficult to decide. On the other hand, on the binary image,
    # we can use connected component analysis to mask foreground areas which
    # originate in the neighbouring regions. But that would introduce either
    # the assumption that the input has already been binarized, or a dependency
    # on some ad-hoc binarization method. Thus, it is preferable to use
    # a dedicated processor for this (which produces clipped AlternativeImage
    # or reduced polygon coordinates).

    # get polygon outline of segment relative to parent image:
    segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords)
    # get relative bounding box:
    segment_bbox = bbox_from_polygon(segment_polygon)
    # get size of the segment in the parent image after cropping
    # (i.e. possibly different from size before rotation at the parent, but
    #  also possibly different from size after rotation below/AlternativeImage):
    segment_xywh = xywh_from_bbox(*segment_bbox)
    # create a mask from the segment polygon:
    segment_image = image_from_polygon(parent_image, segment_polygon,
                                       fill=fill, transparency=transparency)
    # recrop into segment rectangle:
    segment_image = crop_image(segment_image, box=segment_bbox)
    # subtract offset from parent in affine coordinate transform:
    # (consistent with image cropping)
    segment_coords = {
        'transform': shift_coordinates(
            parent_coords['transform'],
            np.array([-segment_bbox[0],
                      -segment_bbox[1]]))
    }

    if 'orientation' in segment.__dict__:
        # region angle: PAGE @orientation is defined clockwise,
        # whereas PIL/ndimage rotation is in mathematical direction:
        segment_coords['angle'] = -(segment.get_orientation() or 0)
    else:
        segment_coords['angle'] = 0
    if segment_coords['angle']:
        # @orientation is always absolute; if higher levels
        # have already rotated, then we must compensate:
        angle = segment_coords['angle'] - parent_coords['angle']
        # map angle from (-180,180] to [0,360], and partition into multiples of 90;
        # but avoid unnecessary large remainders, i.e. split symmetrically:
        orientation = (angle + 45) % 360
        orientation = orientation - (orientation % 90)
        skew = (angle % 360) - orientation
        skew = 180 - (180 - skew) % 360  # map to [-45,45]
        log.debug("segment '%s' has orientation=%d skew=%.2f",
                  segment.id, orientation, skew)
    else:
        orientation = 0
        skew = 0
    segment_coords['angle'] = parent_coords['angle']  # nothing applied yet (depends on filters)

    if orientation and 'rotated-%d' % orientation not in filtered:
        # Transpose in affine coordinate transform:
        # (consistent with image transposition or AlternativeImage below)
        transposition = {
            90: Image.ROTATE_90,
            180: Image.ROTATE_180,
            270: Image.ROTATE_270
        }.get(orientation)  # no default
        segment_coords['transform'] = transpose_coordinates(
            segment_coords['transform'],
            transposition,
            np.array([0.5 * segment_xywh['w'],
                      0.5 * segment_xywh['h']]))
        segment_xywh['w'], segment_xywh['h'] = adjust_canvas_to_transposition(
            [segment_xywh['w'], segment_xywh['h']], transposition)
        segment_coords['angle'] += orientation
    if skew and 'deskewed' not in filtered:
        # Rotate around center in affine coordinate transform:
        # (consistent with image rotation or AlternativeImage below)
        segment_coords['transform'] = rotate_coordinates(
            segment_coords['transform'],
            skew,
            np.array([0.5 * segment_xywh['w'],
                      0.5 * segment_xywh['h']]))
        segment_coords['angle'] += skew

    # initialize AlternativeImage@comments classes from parent, except
    # for those operations that can apply on multiple hierarchy levels:
    segment_coords['features'] = ','.join(
        [feature for feature in parent_coords['features'].split(',')
         if feature in ['binarized', 'grayscale_normalized',
                        'despeckled', 'dewarped']])

    alternative_image = None
    alternative_images = segment.get_AlternativeImage()
    if alternative_images:
        # (e.g. from segment-level cropping, binarization, deskewing or despeckling)
        if feature_selector or feature_filter:
            alternative_image = None
            # search from the end, because by convention we always append,
            # and among multiple satisfactory images we want the most recent:
            for alternative_image in reversed(alternative_images):
                # @comments is optional, so guard against None;
                # compare whole feature names, not substrings:
                features = alternative_image.get_comments() or ''
                present = features.split(',')
                if (all(feature in present for feature in selected) and
                        not any(feature in present for feature in filtered)):
                    break
                else:
                    alternative_image = None
        else:
            alternative_image = alternative_images[-1]
            features = alternative_image.get_comments() or ''
        if alternative_image:
            log.debug("Using AlternativeImage %d (%s) for segment '%s'",
                      alternative_images.index(alternative_image) + 1,
                      features, segment.id)
            segment_image = self._resolve_image_as_pil(
                alternative_image.get_filename())
            segment_coords['features'] = features

    # transpose, if (still) necessary:
    if (orientation and
            'rotated-%d' % orientation not in segment_coords['features'].split(',') and
            'rotated-%d' % orientation not in filtered):
        log.info("Transposing %s for segment '%s' by %d°",
                 "AlternativeImage" if alternative_image else "image",
                 segment.id, orientation)
        segment_image = transpose_image(segment_image, {
            90: Image.ROTATE_90,
            180: Image.ROTATE_180,
            270: Image.ROTATE_270
        }.get(orientation))  # no default
        segment_coords['features'] += ',rotated-%d' % orientation
    if orientation and 'rotated-%d' % orientation not in filtered:
        # FIXME we should enforce consistency here (i.e. split into transposition
        #       and minimal rotation)
        if not (segment_image.width == segment_xywh['w'] and
                segment_image.height == segment_xywh['h']):
            log.error(
                'segment "%s" image (%s; %dx%d) has not been transposed properly (%dx%d) during rotation',
                segment.id, segment_coords['features'],
                segment_image.width, segment_image.height,
                segment_xywh['w'], segment_xywh['h'])
    # deskew, if (still) necessary:
    if (skew and
            'deskewed' not in segment_coords['features'].split(',') and
            'deskewed' not in filtered):
        log.info("Rotating %s for segment '%s' by %.2f°",
                 "AlternativeImage" if alternative_image else "image",
                 segment.id, skew)
        segment_image = rotate_image(segment_image, skew,
                                     fill=fill, transparency=transparency)
        segment_coords['features'] += ',deskewed'
    if skew and 'deskewed' not in filtered:
        # FIXME we should enforce consistency here (i.e. rotation always reshapes,
        #       and rescaling never happens)
        w_new, h_new = adjust_canvas_to_rotation(
            [segment_xywh['w'], segment_xywh['h']], skew)
        if not (w_new - 2 < segment_image.width < w_new + 2 and
                h_new - 2 < segment_image.height < h_new + 2):
            log.error(
                'segment "%s" image (%s; %dx%d) has not been reshaped properly (%dx%d) during rotation',
                segment.id, segment_coords['features'],
                segment_image.width, segment_image.height,
                w_new, h_new)
    else:
        # FIXME: currently unavoidable with line-level dewarping (which increases height)
        if not (segment_xywh['w'] - 2 < segment_image.width < segment_xywh['w'] + 2 and
                segment_xywh['h'] - 2 < segment_image.height < segment_xywh['h'] + 2):
            log.error(
                'segment "%s" image (%s; %dx%d) has not been cropped properly (%dx%d)',
                segment.id, segment_coords['features'],
                segment_image.width, segment_image.height,
                segment_xywh['w'], segment_xywh['h'])

    # verify constraints again:
    final_features = segment_coords['features'].split(',')
    if not all(feature in final_features for feature in selected):
        raise Exception(
            'Found no AlternativeImage that satisfies all requirements ' +
            'selector="%s" in segment "%s"' % (feature_selector, segment.id))
    if any(feature in final_features for feature in filtered):
        raise Exception(
            'Found no AlternativeImage that satisfies all requirements ' +
            'filter="%s" in segment "%s"' % (feature_filter, segment.id))
    segment_image.format = 'PNG'  # workaround for tesserocr#194
    return segment_image, segment_coords
def image_from_page(self, page, page_id,
                    fill='background', transparency=False,
                    feature_selector='', feature_filter=''):
    """Extract an image for a PAGE-XML page from the workspace.

    Given ``page``, a PAGE PageType object, extract its PIL.Image,
    either from its AlternativeImage (if it exists), or from its
    @imageFilename (otherwise). Also crop it, if a Border exists,
    and rotate it, if any @orientation angle is annotated.

    If ``feature_selector`` and/or ``feature_filter`` is given, then
    select/filter among the @imageFilename image and the available
    AlternativeImages the last one which contains all of the selected,
    but none of the filtered features (i.e. @comments classes), or
    raise an error.

    (Required and produced features need not be in the same order, so
    ``feature_selector`` is merely a mask specifying Boolean AND, and
    ``feature_filter`` is merely a mask specifying Boolean OR.
    Features are compared as whole comma-separated names, and an
    AlternativeImage without @comments counts as having no features.)

    If the chosen image does not have the feature "cropped" yet, but
    a Border exists, and unless "cropped" is being filtered, then crop it.
    Likewise, if the chosen image does not have the feature "deskewed" yet,
    but an @orientation angle is annotated, and unless "deskewed" is being
    filtered, then rotate it. (However, if @orientation is above the
    [-45°,45°] interval, then apply as much transposition as possible first,
    unless "rotated-90" / "rotated-180" / "rotated-270" is being filtered.)

    Cropping uses a polygon mask (not just the bounding box rectangle).
    Areas outside the polygon will be filled according to ``fill``:

    - if ``background`` (the default),
      then fill with the median color of the image;
    - otherwise, use the given color, e.g. ``white`` or (255,255,255).

    Moreover, if ``transparency`` is true, and unless the image already
    has an alpha channel, then add an alpha channel which is fully opaque
    before cropping and rotating. (Thus, only the exposed areas will be
    transparent afterwards, for those that can interpret alpha channels).

    Return a tuple:

     * the extracted image,
     * a dictionary with information about the extracted image:
       - ``transform``: a Numpy array with an affine transform which
         converts from absolute coordinates to those relative to the image,
         i.e. after cropping to the page's border / bounding box (if any)
         and deskewing with the page's orientation angle (if any)
       - ``angle``: the rotation/reflection angle applied to the image so far,
       - ``features``: the AlternativeImage @comments for the image, i.e.
         names of all operations that lead up to this result,
     * an OcrdExif instance associated with the original image.

    (The first two can be used to annotate a new AlternativeImage,
    or be passed down with ``image_from_segment``.)

    Raise an ``Exception`` if the resulting image does not have all of the
    selected features, or still has any of the filtered features.

    Example:

     * get a raw (colored) but already deskewed and cropped image:

       ``
       page_image, page_coords, page_image_info = workspace.image_from_page(
             page, page_id,
             feature_selector='deskewed,cropped',
             feature_filter='binarized,grayscale_normalized')
       ``
    """
    log = getLogger('ocrd.workspace.image_from_page')
    # split the comma-separated constraints only once (ignoring empty names):
    selected = [feature for feature in feature_selector.split(',') if feature]
    filtered = [feature for feature in feature_filter.split(',') if feature]
    page_image = self._resolve_image_as_pil(page.imageFilename)
    page_image_info = OcrdExif(page_image)
    page_coords = dict()
    # use identity as initial affine coordinate transform:
    page_coords['transform'] = np.eye(3)
    # interim bbox (updated with each change to the transform):
    page_bbox = [0, 0, page_image.width, page_image.height]
    page_xywh = {
        'x': 0,
        'y': 0,
        'w': page_image.width,
        'h': page_image.height
    }

    border = page.get_Border()
    # page angle: PAGE @orientation is defined clockwise,
    # whereas PIL/ndimage rotation is in mathematical direction:
    page_coords['angle'] = -(page.get_orientation() or 0)
    # map angle from (-180,180] to [0,360], and partition into multiples of 90;
    # but avoid unnecessary large remainders, i.e. split symmetrically:
    orientation = (page_coords['angle'] + 45) % 360
    orientation = orientation - (orientation % 90)
    skew = (page_coords['angle'] % 360) - orientation
    skew = 180 - (180 - skew) % 360  # map to [-45,45]
    page_coords['angle'] = 0  # nothing applied yet (depends on filters)
    log.debug("page '%s' has %s orientation=%d skew=%.2f",
              page_id, "border," if border else "", orientation, skew)

    # initialize AlternativeImage@comments classes as empty:
    page_coords['features'] = ''
    alternative_image = None
    alternative_images = page.get_AlternativeImage()
    if alternative_images:
        # (e.g. from page-level cropping, binarization, deskewing or despeckling)
        if feature_selector or feature_filter:
            alternative_image = None
            # search from the end, because by convention we always append,
            # and among multiple satisfactory images we want the most recent:
            for alternative_image in reversed(alternative_images):
                # @comments is optional, so guard against None;
                # compare whole feature names, not substrings:
                features = alternative_image.get_comments() or ''
                present = features.split(',')
                if (all(feature in present for feature in selected) and
                        not any(feature in present for feature in filtered)):
                    break
                else:
                    alternative_image = None
        else:
            alternative_image = alternative_images[-1]
            features = alternative_image.get_comments() or ''
        if alternative_image:
            log.debug("Using AlternativeImage %d (%s) for page '%s'",
                      alternative_images.index(alternative_image) + 1,
                      features, page_id)
            page_image = self._resolve_image_as_pil(
                alternative_image.get_filename())
            page_coords['features'] = features

    # adjust the coord transformation to the steps applied on the image,
    # and apply steps on the existing image in case it is missing there,
    # but traverse all steps (crop/reflect/rotate) in a particular order:
    # - existing image features take priority (in the order annotated),
    # - next is cropping (if necessary but not already applied),
    # - next is reflection (if necessary but not already applied),
    # - next is rotation (if necessary but not already applied).
    # This helps deal with arbitrary workflows (e.g. crop then deskew,
    # or deskew then crop), regardless of where images are generated.
    alternative_image_features = page_coords['features'].split(',')
    rotated = 'rotated-%d' % orientation
    missing_features = []
    if (border and
            'cropped' not in alternative_image_features and
            'cropped' not in filtered):
        missing_features.append('cropped')
    if (orientation and
            rotated not in alternative_image_features and
            rotated not in filtered):
        missing_features.append(rotated)
    if (skew and
            'deskewed' not in alternative_image_features and
            'deskewed' not in filtered):
        missing_features.append('deskewed')
    # not a feature to be added, but merely as a fallback position
    # to always enter loop at i == len(alternative_image_features)
    missing_features.append('_check')
    for i, feature in enumerate(alternative_image_features + missing_features):
        # image geometry vs feature consistency can only be checked
        # after all features on the existing AlternativeImage have
        # been adjusted for in the transform, and when there is a mismatch,
        # additional steps applied here would only repeat the respective
        # error message; so we only check once at the boundary between
        # existing and new features
        # FIXME we should check/enforce consistency when _adding_ AlternativeImage
        if (i == len(alternative_image_features) and
                not (page_xywh['w'] - 2 < page_image.width < page_xywh['w'] + 2 and
                     page_xywh['h'] - 2 < page_image.height < page_xywh['h'] + 2)):
            log.error(
                'page "%s" image (%s; %dx%d) has not been cropped properly (%dx%d)',
                page_id, page_coords['features'],
                page_image.width, page_image.height,
                page_xywh['w'], page_xywh['h'])
        # adjust transform to feature, possibly apply feature to image
        if feature == 'cropped':
            page_points = border.get_Coords().points
            log.debug(
                "Using explicitly set page border '%s' for page '%s'",
                page_points, page_id)
            # get polygon outline of page border:
            page_polygon = np.array(polygon_from_points(page_points),
                                    dtype=np.int32)
            page_polygon = transform_coordinates(page_polygon,
                                                 page_coords['transform'])
            page_polygon = np.round(page_polygon).astype(np.int32)
            page_bbox = bbox_from_polygon(page_polygon)
            # get size of the page after cropping but before rotation:
            page_xywh = xywh_from_bbox(*page_bbox)
            # subtract offset in affine coordinate transform:
            # (consistent with image cropping or AlternativeImage below)
            page_coords['transform'] = shift_coordinates(
                page_coords['transform'],
                np.array([-page_xywh['x'],
                          -page_xywh['y']]))
            # crop, if (still) necessary:
            if 'cropped' not in page_coords['features'].split(','):
                log.debug(
                    "Cropping %s for page '%s' to border",
                    "AlternativeImage" if alternative_image else "image",
                    page_id)
                # create a mask from the page polygon:
                page_image = image_from_polygon(page_image, page_polygon,
                                                fill=fill,
                                                transparency=transparency)
                # recrop into page rectangle:
                page_image = crop_image(page_image, box=page_bbox)
                page_coords['features'] += ',cropped'

        elif feature == 'rotated-%d' % orientation:
            # Transpose in affine coordinate transform:
            # (consistent with image transposition or AlternativeImage below)
            transposition = {
                90: Image.ROTATE_90,
                180: Image.ROTATE_180,
                270: Image.ROTATE_270
            }.get(orientation)  # no default
            page_coords['transform'] = transpose_coordinates(
                page_coords['transform'],
                transposition,
                np.array([0.5 * page_xywh['w'],
                          0.5 * page_xywh['h']]))
            (page_xywh['w'],
             page_xywh['h']) = adjust_canvas_to_transposition(
                 [page_xywh['w'], page_xywh['h']], transposition)
            page_coords['angle'] = orientation
            # transpose, if (still) necessary:
            if 'rotated-%d' % orientation not in page_coords['features'].split(','):
                log.info(
                    "Transposing %s for page '%s' by %d°",
                    "AlternativeImage" if alternative_image else "image",
                    page_id, orientation)
                page_image = transpose_image(
                    page_image, {
                        90: Image.ROTATE_90,
                        180: Image.ROTATE_180,
                        270: Image.ROTATE_270
                    }.get(orientation))  # no default
                page_coords['features'] += ',rotated-%d' % orientation

        elif feature == 'deskewed':
            # Rotate around center in affine coordinate transform:
            # (consistent with image rotation or AlternativeImage below)
            page_coords['transform'] = rotate_coordinates(
                page_coords['transform'],
                skew,
                np.array([0.5 * page_xywh['w'],
                          0.5 * page_xywh['h']]))
            page_coords['angle'] += skew
            # deskew, if (still) necessary:
            if 'deskewed' not in page_coords['features'].split(','):
                log.info(
                    "Rotating %s for page '%s' by %.2f°",
                    "AlternativeImage" if alternative_image else "image",
                    page_id, skew)
                page_image = rotate_image(page_image, skew,
                                          fill=fill,
                                          transparency=transparency)
                page_coords['features'] += ',deskewed'
            (page_xywh['w'],
             page_xywh['h']) = adjust_canvas_to_rotation(
                 [page_xywh['w'], page_xywh['h']], skew)

    # verify constraints again:
    final_features = page_coords['features'].split(',')
    if not all(feature in final_features for feature in selected):
        raise Exception(
            'Found no AlternativeImage that satisfies all requirements ' +
            'selector="%s" in page "%s"' % (feature_selector, page_id))
    if any(feature in final_features for feature in filtered):
        raise Exception(
            'Found no AlternativeImage that satisfies all requirements ' +
            'filter="%s" in page "%s"' % (feature_filter, page_id))
    page_image.format = 'PNG'  # workaround for tesserocr#194
    return page_image, page_coords, page_image_info
def process(self):
    """Performs page cropping with Tesseract on the workspace.

    Open and deserialize PAGE input files and their respective images.
    Set up Tesseract to detect text blocks on each page, and find
    the largest coordinate extent spanning all of them. Use this
    extent in defining a Border, and add that to the page.

    Moreover, crop the original image accordingly, and reference the
    resulting image file as AlternativeImage in the Page element.

    Add the new image file to the workspace along with the output fileGrp,
    and using a file ID with suffix ``.IMG-CROP`` along with further
    identification of the input element.

    Produce new output files by serialising the resulting hierarchy
    (for every input file, even if no valid extent could be found or
    nothing remains of it after fitting it into the parent; in those
    cases, an error is logged and the page is passed on unchanged).
    """
    LOG = getLogger('processor.TesserocrCrop')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)

    padding = self.parameter['padding']
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        # disable table detection here (tables count as text blocks),
        # because we do not want to risk confusing the spine with
        # a column separator and thus creeping into a neighbouring
        # page:
        tessapi.SetVariable("textord_tabfind_find_tables", "0")
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            # needed for the output PAGE file in any case (not just
            # when a valid extent was found), so derive it up front:
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts = page_from_file(
                self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()

            # warn of existing Border:
            border = page.get_Border()
            if border:
                left, top, right, bottom = bbox_from_points(
                    border.get_Coords().points)
                LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                            left, top, right, bottom)

            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id,
                # image must not have been cropped already,
                # abort if no such image can be produced:
                feature_filter='cropped')
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info(
                    "Page '%s' images will use %d DPI from parameter override",
                    page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
                LOG.info(
                    "Page '%s' images will use %d DPI from image meta-data",
                    page_id, dpi)
            else:
                dpi = 0
                LOG.info(
                    "Page '%s' images will use DPI estimated from segmentation",
                    page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))
                zoom = 300 / dpi
            else:
                zoom = 1

            # warn of existing segmentation:
            regions = page.get_TextRegion()
            if regions:
                min_x = page_image.width
                min_y = page_image.height
                max_x = 0
                max_y = 0
                for region in regions:
                    left, top, right, bottom = bbox_from_points(
                        region.get_Coords().points)
                    min_x = min(min_x, left)
                    min_y = min(min_y, top)
                    max_x = max(max_x, right)
                    max_y = max(max_y, bottom)
                LOG.warning(
                    'Ignoring extent from existing TextRegions: %i:%i,%i:%i',
                    min_x, max_x, min_y, max_y)

            LOG.debug("Cropping with Tesseract")
            tessapi.SetImage(page_image)
            # PSM.SPARSE_TEXT: get as much text as possible in no particular order
            # PSM.AUTO (default): includes tables (dangerous)
            tessapi.SetPageSegMode(tesserocr.PSM.SPARSE_TEXT)
            #
            # helper variables for saving the box coordinates
            #
            min_x = page_image.width
            min_y = page_image.height
            max_x = 0
            max_y = 0
            # iterate over all text blocks and compare their
            # bbox extent to the running min and max values
            for component in tessapi.GetComponentImages(
                    tesserocr.RIL.BLOCK, True):
                image, xywh, index, _ = component
                #
                # the region reference in the reading order element
                #
                ID = "region%04d" % index
                left, top, right, bottom = bbox_from_xywh(xywh)
                LOG.debug("Detected text region '%s': %i:%i,%i:%i",
                          ID, left, right, top, bottom)
                # filter region results:
                bin_bbox = image.getbbox()
                if not bin_bbox:
                    # this does happen!
                    LOG.info(
                        "Ignoring region '%s' because its binarization is empty",
                        ID)
                    continue
                width = bin_bbox[2] - bin_bbox[0]
                if width < 25 / zoom:
                    # we must be conservative here: page numbers are tiny regions, too!
                    LOG.info(
                        "Ignoring region '%s' because its width is too small (%d)",
                        ID, width)
                    continue
                height = bin_bbox[3] - bin_bbox[1]
                if height < 25 / zoom:
                    # we must be conservative here: page numbers are tiny regions, too!
                    LOG.debug(
                        "Ignoring region '%s' because its height is too small (%d)",
                        ID, height)
                    continue
                min_x = min(min_x, left)
                min_y = min(min_y, top)
                max_x = max(max_x, right)
                max_y = max(max_y, bottom)
                LOG.info("Updated page border: %i:%i,%i:%i",
                         min_x, max_x, min_y, max_y)

            #
            # set the identified page border
            #
            if min_x < max_x and min_y < max_y:
                # add padding:
                min_x = max(min_x - padding, 0)
                max_x = min(max_x + padding, page_image.width)
                min_y = max(min_y - padding, 0)
                max_y = min(max_y + padding, page_image.height)
                LOG.info("Padded page border: %i:%i,%i:%i",
                         min_x, max_x, min_y, max_y)
                polygon = polygon_from_bbox(min_x, min_y, max_x, max_y)
                polygon = coordinates_for_segment(polygon, page_image, page_xywh)
                polygon = polygon_for_parent(polygon, page)
                if polygon is None:
                    # nothing remains after fitting into the parent
                    # (same handling as in process_page):
                    LOG.error("Ignoring extant border")
                else:
                    border = BorderType(
                        Coords=CoordsType(points_from_polygon(polygon)))
                    # intersection with parent could have changed bbox,
                    # so recalculate:
                    bbox = bbox_from_polygon(
                        coordinates_of_segment(border, page_image, page_xywh))
                    # update PAGE (annotate border):
                    page.set_Border(border)
                    # update METS (add the image file):
                    page_image = crop_image(page_image, box=bbox)
                    page_xywh['features'] += ',cropped'
                    file_path = self.workspace.save_image_file(
                        page_image,
                        file_id + '.IMG-CROP',
                        page_id=input_file.pageId,
                        file_grp=self.output_file_grp)
                    # update PAGE (reference the image file):
                    page.add_AlternativeImage(
                        AlternativeImageType(filename=file_path,
                                             comments=page_xywh['features']))
            else:
                LOG.error("Cannot find valid extent for page '%s'", page_id)

            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp,
                                            file_id + '.xml'),
                content=to_xml(pcgts))
def test_deskewing(plain_workspace):
    """Check region image extraction with and without deskewing.

    Builds a synthetic page containing one non-rectangular polygon mask,
    annotates it as a text region with a non-zero ``orientation``, and then
    verifies ``image_from_segment`` in three ways:

    1. with ``feature_filter='deskewed'`` (plain crop to the bounding box),
    2. without the filter (deskewed, hence larger after recropping),
    3. via an ``AlternativeImage`` saved from step 2, which must reproduce
       the same size, transform and features without ``crop_image`` logging
       anything.
    """
    #from ocrd_utils import initLogging, setOverrideLogLevel
    #setOverrideLogLevel('DEBUG')
    size = (3000, 4000)
    poly = [[1403, 2573], [1560, 2573], [1560, 2598], [2311, 2598],
            [2311, 2757], [2220, 2757], [2220, 2798], [2311, 2798],
            [2311, 2908], [1403, 2908]]
    xywh = xywh_from_polygon(poly)
    bbox = bbox_from_polygon(poly)
    skew = 4.625
    image = Image.new('L', size)
    image = polygon_mask(image, poly)
    #image.show(title='image')
    pixels = np.count_nonzero(np.array(image) > 0)
    name = 'foo0'
    assert plain_workspace.save_image_file(image, name, 'IMG')
    pcgts = page_from_file(next(plain_workspace.mets.find_files(ID=name)))
    page = pcgts.get_Page()
    region = TextRegionType(
        id='nonrect',
        Coords=CoordsType(points=points_from_polygon(poly)),
        orientation=-skew)
    page.add_TextRegion(region)
    page_image, page_coords, _ = plain_workspace.image_from_page(page, '')
    #page_image.show(title='page_image')
    assert list(image.getdata()) == list(page_image.getdata())
    assert np.all(page_coords['transform'] == np.eye(3))
    # first without deskewing (plain crop to the polygon's bounding box)
    reg_image, reg_coords = plain_workspace.image_from_segment(
        region, page_image, page_coords, feature_filter='deskewed', fill=0)
    assert list(image.crop(bbox).getdata()) == list(reg_image.getdata())
    assert reg_image.width == xywh['w'] == 908
    assert reg_image.height == xywh['h'] == 335
    assert reg_coords['transform'][0, 2] == -xywh['x']
    assert reg_coords['transform'][1, 2] == -xywh['y']
    # same fg after cropping to minimal bbox
    reg_pixels = np.count_nonzero(np.array(reg_image) > 0)
    assert pixels == reg_pixels
    # now with deskewing (test for size after recropping)
    reg_image, reg_coords = plain_workspace.image_from_segment(
        region, page_image, page_coords, fill=0)
    #reg_image.show(title='reg_image')
    assert reg_image.width == 932 > xywh['w']
    assert reg_image.height == 382 > xywh['h']
    assert reg_coords['transform'][0, 1] != 0
    assert reg_coords['transform'][1, 0] != 0
    assert 'deskewed' in reg_coords['features']
    # same fg after cropping to minimal bbox (roughly - due to aliasing)
    reg_pixels = np.count_nonzero(np.array(reg_image) > 0)
    assert np.abs(pixels - reg_pixels) / pixels < 0.005
    reg_array = np.array(reg_image) > 0
    # now via AlternativeImage
    path = plain_workspace.save_image_file(reg_image, region.id + '_img', 'IMG')
    region.add_AlternativeImage(
        AlternativeImageType(filename=path, comments=reg_coords['features']))
    logger_capture = FIFOIO(256)
    logger_handler = logging.StreamHandler(logger_capture)
    #logger_handler.setFormatter(logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT))
    logger = logging.getLogger('ocrd_utils.crop_image')
    logger.addHandler(logger_handler)
    try:
        reg_image2, reg_coords2 = plain_workspace.image_from_segment(
            region, page_image, page_coords, fill=0)
        #reg_image2.show(title='reg_image2')
        logger_output = logger_capture.getvalue()
    finally:
        # detach before closing, so the (global) logger is not left with a
        # handler that writes into a closed stream during later tests
        logger.removeHandler(logger_handler)
        logger_capture.close()
    assert logger_output == ''
    assert reg_image2.width == reg_image.width
    assert reg_image2.height == reg_image.height
    assert np.allclose(reg_coords2['transform'], reg_coords['transform'])
    assert reg_coords2['features'] == reg_coords['features']
    # same fg after cropping to minimal bbox (roughly - due to aliasing);
    # this must count the image retrieved via AlternativeImage (reg_image2),
    # otherwise the comparison is a tautology
    reg_pixels2 = np.count_nonzero(np.array(reg_image2) > 0)
    assert np.abs(reg_pixels2 - reg_pixels) / reg_pixels < 0.005
    reg_array2 = np.array(reg_image2) > 0
    assert 0.98 < np.sum(reg_array == reg_array2) / reg_array.size <= 1.0
def process(self):
    """Resegment lines of the workspace.

    Open and deserialise PAGE input files and their respective images,
    then iterate over the element hierarchy down to the line level.

    Next, get each region image according to the layout annotation (from
    the alternative image of the region, or by cropping via coordinates
    into the higher-level image), and compute a new line segmentation
    from that (as a label mask).

    Then for each line within the region, find the label with the largest
    foreground area in the binarized image within the annotated polygon
    (or rectangle) of the line. Unless its relative area is too small, or
    its center is far off, convert that label's mask into a polygon outline,
    intersect with the old polygon, and find the contour of that segment.
    Annotate the result as new coordinates of the line.

    Add the new image file to the workspace along with the output fileGrp,
    and using a file ID with suffix ``.IMG-RESEG`` along with further
    identification of the input element.

    Produce a new output file by serialising the resulting hierarchy.

    Regions with no lines or only a single line are skipped with a warning,
    and so are regions for which the line segmentation fails (their lines
    are kept unchanged).
    """
    LOG = getLogger('processor.OcropyResegment')
    # This makes best sense for bad/coarse line segmentation, like current GT
    # or as postprocessing for bbox-only steps.
    # Most notably, it can convert rectangles to polygons (polygonalization).
    # It depends on a decent line segmentation from ocropy though. So it
    # _should_ ideally be run after deskewing (on the page or region level),
    # _must_ be run after binarization (on page or region level). Also, the
    # method's accuracy crucially depends on a good estimate of the images'
    # pixel density (at least if source input is not 300 DPI).
    threshold = self.parameter['min_fraction']
    margin = self.parameter['extend_margins']
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)

    for (n, input_file) in enumerate(self.input_files):
        LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
        file_id = make_file_id(input_file, self.output_file_grp)

        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
        page = pcgts.get_Page()

        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_selector='binarized')
        # derive the zoom factor relative to 300 DPI, preferring the
        # explicit parameter over the image meta-data:
        if self.parameter['dpi'] > 0:
            zoom = 300.0 / self.parameter['dpi']
        elif page_image_info.resolution != 1:
            dpi = page_image_info.resolution
            if page_image_info.resolutionUnit == 'cm':
                dpi *= 2.54
            LOG.info('Page "%s" uses %f DPI', page_id, dpi)
            zoom = 300.0 / dpi
        else:
            zoom = 1

        regions = page.get_AllRegions(classes=['Text'])
        if not regions:
            LOG.warning('Page "%s" contains no text regions', page_id)
        for region in regions:
            lines = region.get_TextLine()
            if not lines:
                LOG.warning('Page "%s" region "%s" contains no text lines',
                            page_id, region.id)
                continue
            if len(lines) == 1:
                LOG.warning('Page "%s" region "%s" contains only one line',
                            page_id, region.id)
                continue
            region_image, region_xywh = self.workspace.image_from_segment(
                region, page_image, page_xywh, feature_selector='binarized')
            region_array = pil2array(region_image)
            #region_array, _ = common.binarize(region_array, maxskew=0) # just in case still raw
            # use the builtin bool as dtype: the np.bool alias was
            # deprecated in NumPy 1.20 and removed in NumPy 1.24
            region_bin = np.array(region_array <= midrange(region_array),
                                  dtype=bool)
            report = check_region(region_bin, zoom)
            try:
                if report:
                    # route a failed plausibility check through the same
                    # fallback as a failed segmentation
                    raise Exception(report)
                region_labels, _, _, _, _, _ = compute_segmentation(
                    region_bin, zoom=zoom)
            except Exception as err:
                LOG.warning(
                    'Cannot line-segment page "%s" region "%s": %s',
                    page_id, region.id, err)
                # fallback option 1: borderclean
                # label margins vs interior, but with the interior
                # extended into the margin by its connected components
                # to remove noise from neighbouring regions:
                #region_labels = borderclean_bin(region_bin, margin=round(4/zoom)) + 1
                # too dangerous, because we risk losing dots from i or punctuation;
                # fallback option2: only extend_margins
                # instead, just provide a uniform label, so at least we get
                # to extend the polygon margins:
                #region_labels = np.ones_like(region_bin)
                # fallback option3: keep unchanged
                continue
            for line in lines:
                if line.get_AlternativeImage():
                    # get cropped line image:
                    line_image, line_xywh = self.workspace.image_from_segment(
                        line, region_image, region_xywh,
                        feature_selector='binarized')
                    LOG.debug("Using AlternativeImage (%s) for line '%s'",
                              line_xywh['features'], line.id)
                    # crop region arrays accordingly:
                    line_polygon = coordinates_of_segment(
                        line, region_image, region_xywh)
                    line_bbox = bbox_from_polygon(line_polygon)
                    line_labels = region_labels[line_bbox[1]:line_bbox[3],
                                                line_bbox[0]:line_bbox[2]]
                    line_bin = region_bin[line_bbox[1]:line_bbox[3],
                                          line_bbox[0]:line_bbox[2]]
                    # get polygon in relative (line) coordinates:
                    line_polygon = coordinates_of_segment(
                        line, line_image, line_xywh)
                    line_polygon = resegment(line_polygon, line_labels, line_bin,
                                             line.id,
                                             extend_margins=margin,
                                             threshold_relative=threshold)
                    if line_polygon is None:
                        continue # not good enough – keep
                    # convert back to absolute (page) coordinates:
                    line_polygon = coordinates_for_segment(
                        line_polygon, line_image, line_xywh)
                else:
                    # get polygon in relative (region) coordinates:
                    line_polygon = coordinates_of_segment(
                        line, region_image, region_xywh)
                    line_polygon = resegment(line_polygon, region_labels, region_bin,
                                             line.id,
                                             extend_margins=margin,
                                             threshold_relative=threshold)
                    if line_polygon is None:
                        continue # not good enough – keep
                    # convert back to absolute (page) coordinates:
                    line_polygon = coordinates_for_segment(
                        line_polygon, region_image, region_xywh)
                # annotate result:
                line.get_Coords().points = points_from_polygon(line_polygon)
                # create new image:
                line_image, line_xywh = self.workspace.image_from_segment(
                    line, region_image, region_xywh,
                    feature_selector='binarized')
                # update METS (add the image file):
                file_path = self.workspace.save_image_file(
                    line_image,
                    file_id=file_id + '_' + region.id + '_' + line.id + '.IMG-RESEG',
                    page_id=page_id,
                    file_grp=self.output_file_grp)
                # update PAGE (reference the image file):
                # NOTE(review): this annotates the region's features, not
                # line_xywh['features'] of the image actually saved -
                # confirm this is intended
                line.add_AlternativeImage(
                    AlternativeImageType(filename=file_path,
                                         comments=region_xywh['features']))

        # update METS (add the PAGE file):
        file_path = os.path.join(self.output_file_grp, file_id + '.xml')
        pcgts.set_pcGtsId(file_id)
        out = self.workspace.add_file(ID=file_id,
                                      file_grp=self.output_file_grp,
                                      pageId=input_file.pageId,
                                      local_filename=file_path,
                                      mimetype=MIMETYPE_PAGE,
                                      content=to_xml(pcgts))
        LOG.info('created file ID: %s, file_grp: %s, path: %s',
                 file_id, self.output_file_grp, out.local_filename)