def process(self):
    """Clip text regions / lines of the workspace at intersections with neighbours.

    Open and deserialise PAGE input files and their respective images,
    then iterate over the element hierarchy down to the requested
    ``level-of-operation``.

    Next, get each segment image according to the layout annotation (by cropping
    via coordinates into the higher-level image), as well as all its neighbours',
    binarize them (without deskewing), and make a connected component analysis.
    (Segments must not already have AlternativeImage annotated, otherwise they
    will be skipped.)

    Then, for each section of overlap with a neighbour, re-assign components
    which are only contained in the neighbour by clipping them to white
    (background), and export the (final) result as image file.

    Add the new image file to the workspace along with the output fileGrp,
    and using a file ID with suffix ``.IMG-CLIP`` along with further
    identification of the input element.

    Reference each new image in the AlternativeImage of the element.

    Produce a new output file by serialising the resulting hierarchy.
    """
    # This makes best sense for overlapping segmentation, like current GT
    # or Tesseract layout analysis. Most notably, it can suppress graphics
    # and separators within or across a region or line. It _should_ ideally
    # be run after binarization (on page level for region-level clipping,
    # and on the region level for line-level clipping), because the
    # connected component analysis after implicit binarization could be
    # suboptimal, and the explicit binarization after clipping could be,
    # too. However, region-level clipping _must_ be run before region-level
    # deskewing, because that would make segments incomensurable with their
    # neighbours.
    LOG = getLogger('processor.OcropyClip')
    level = self.parameter['level-of-operation']
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    for (n, input_file) in enumerate(self.input_files):
        LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
        page = pcgts.get_Page()
        # require an already-binarized page image (feature_selector)
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id,
            feature_selector='binarized')
        # derive a zoom factor relative to 300 DPI, from parameter override
        # or from the image metadata (falling back to 1):
        if self.parameter['dpi'] > 0:
            zoom = 300.0/self.parameter['dpi']
        elif page_image_info.resolution != 1:
            dpi = page_image_info.resolution
            if page_image_info.resolutionUnit == 'cm':
                dpi *= 2.54
            LOG.info('Page "%s" uses %f DPI', page_id, dpi)
            zoom = 300.0/dpi
        else:
            zoom = 1
        # text regions first (the only ones modified), then all other region
        # types (participating only as clipping neighbours):
        regions = list(page.get_TextRegion())
        num_texts = len(regions)
        regions += (
            page.get_AdvertRegion() +
            page.get_ChartRegion() +
            page.get_ChemRegion() +
            page.get_GraphicRegion() +
            page.get_ImageRegion() +
            page.get_LineDrawingRegion() +
            page.get_MathsRegion() +
            page.get_MusicRegion() +
            page.get_NoiseRegion() +
            page.get_SeparatorRegion() +
            page.get_TableRegion() +
            page.get_UnknownRegion())
        if not num_texts:
            LOG.warning('Page "%s" contains no text regions', page_id)
        # estimate the page background colour for filling clipped components:
        background = ImageStat.Stat(page_image)
        # workaround for Pillow#4925
        if len(background.bands) > 1:
            background = tuple(background.median)
        else:
            background = background.median[0]
        if level == 'region':
            # precompute page-level artefacts shared by all regions:
            background_image = Image.new(page_image.mode, page_image.size, background)
            page_array = pil2array(page_image)
            # binarize by thresholding at the midrange of the array values:
            page_bin = np.array(page_array <= midrange(page_array), np.uint8)
            # in absolute coordinates merely for comparison/intersection
            shapes = [Polygon(polygon_from_points(region.get_Coords().points))
                      for region in regions]
            # in relative coordinates for mask/cropping
            polygons = [coordinates_of_segment(region, page_image, page_coords)
                        for region in regions]
            for i, polygon in enumerate(polygons[num_texts:], num_texts):
                # for non-text regions, extend mask by 3 pixels in each direction
                # to ensure they do not leak components accidentally
                # (accounts for bad cropping of such regions in GT):
                polygon = Polygon(polygon).buffer(3).exterior.coords[:-1] # keep open
                polygons[i] = polygon
            masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8)
                     for polygon in polygons]
        for i, region in enumerate(regions):
            if i >= num_texts:
                break # keep non-text regions unchanged
            if level == 'region':
                if region.get_AlternativeImage():
                    # FIXME: This should probably be an exception (bad workflow configuration).
                    LOG.warning('Page "%s" region "%s" already contains image data: skipping',
                                page_id, region.id)
                    continue
                # prepared geometry speeds up the repeated intersection tests:
                shape = prep(shapes[i])
                # collect all regions overlapping this one (any type):
                neighbours = [(regionj, maskj) for shapej, regionj, maskj
                              in zip(shapes[:i] + shapes[i+1:],
                                     regions[:i] + regions[i+1:],
                                     masks[:i] + masks[i+1:])
                              if shape.intersects(shapej)]
                if neighbours:
                    self.process_segment(region, masks[i], polygons[i],
                                         neighbours, background_image,
                                         page_image, page_coords, page_bin,
                                         input_file.pageId, file_id + '_' + region.id)
                continue
            # level == 'line':
            lines = region.get_TextLine()
            if not lines:
                LOG.warning('Page "%s" region "%s" contains no text lines',
                            page_id, region.id)
                continue
            # require an already-binarized region image:
            region_image, region_coords = self.workspace.image_from_segment(
                region, page_image, page_coords,
                feature_selector='binarized')
            background_image = Image.new(region_image.mode, region_image.size, background)
            region_array = pil2array(region_image)
            region_bin = np.array(region_array <= midrange(region_array), np.uint8)
            # in absolute coordinates merely for comparison/intersection
            shapes = [Polygon(polygon_from_points(line.get_Coords().points))
                      for line in lines]
            # in relative coordinates for mask/cropping
            polygons = [coordinates_of_segment(line, region_image, region_coords)
                        for line in lines]
            masks = [pil2array(polygon_mask(region_image, polygon)).astype(np.uint8)
                     for polygon in polygons]
            for j, line in enumerate(lines):
                if line.get_AlternativeImage():
                    # FIXME: This should probably be an exception (bad workflow configuration).
                    LOG.warning('Page "%s" region "%s" line "%s" already contains image data: skipping',
                                page_id, region.id, line.id)
                    continue
                shape = prep(shapes[j])
                # collect all sibling lines overlapping this one:
                neighbours = [(linej, maskj) for shapej, linej, maskj
                              in zip(shapes[:j] + shapes[j+1:],
                                     lines[:j] + lines[j+1:],
                                     masks[:j] + masks[j+1:])
                              if shape.intersects(shapej)]
                if neighbours:
                    self.process_segment(line, masks[j], polygons[j],
                                         neighbours, background_image,
                                         region_image, region_coords, region_bin,
                                         input_file.pageId,
                                         file_id + '_' + region.id + '_' + line.id)
        # update METS (add the PAGE file):
        file_path = os.path.join(self.output_file_grp, file_id + '.xml')
        pcgts.set_pcGtsId(file_id)
        out = self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            local_filename=file_path,
            mimetype=MIMETYPE_PAGE,
            content=to_xml(pcgts))
        LOG.info('created file ID: %s, file_grp: %s, path: %s',
                 file_id, self.output_file_grp, out.local_filename)
def test_make_file_id_simple(self):
    """A plain numbered ID gets its fileGrp prefix swapped for the output group."""
    ocrd_file = create_ocrd_file('MAX', ID="MAX_0012")
    result = make_file_id(ocrd_file, 'FOO')
    self.assertEqual(result, 'FOO_0012')
def test_make_file_id_605(self):
    """https://github.com/OCR-D/core/pull/605"""
    mets = OcrdMets.empty_mets()
    # two files in different groups; the second binding of ``f`` is the one under test
    f = mets.add_file('GRP1', ID='FOO_0001', pageId='phys0001')
    f = mets.add_file('GRP2', ID='FOO_0002', pageId='phys0002')
    result = make_file_id(f, 'GRP2')
    self.assertEqual(result, 'GRP2_0001')
def process(self):
    """Detect font shapes via rule-based OCR with Tesseract on the workspace.

    Open and deserialise PAGE input files and their respective images,
    then iterate over the element hierarchy down to the line level.

    Set up Tesseract to recognise each word's image (either from
    AlternativeImage or cropping the bounding box rectangle and masking
    it from the polygon outline) in word mode and with the ``osd`` model.

    Query the result's font attributes and write them into the word
    element's ``TextStyle``.

    Produce new output files by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.TesserocrFontShape')
    LOG.debug("TESSDATA: %s, installed Tesseract models: %s", *get_languages())
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    model = self.parameter['model']
    # fail early if the requested model is not among the installed languages:
    if model not in get_languages()[1]:
        raise Exception("model " + model + " (needed for font style detection) is not installed")
    with PyTessBaseAPI(
            path=TESSDATA_PREFIX,
            #oem=OEM.TESSERACT_LSTM_COMBINED, # legacy required for OSD or WordFontAttributes!
            oem=OEM.TESSERACT_ONLY, # legacy required for OSD or WordFontAttributes!
            lang=model) as tessapi:
        LOG.info("Using model '%s' in %s for recognition at the word level",
                 model, get_languages()[0])
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id)
            # determine DPI: parameter override > image metadata > 0 (let
            # Tesseract estimate from segmentation):
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info("Page '%s' images will use %d DPI from parameter override",
                         page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
                LOG.info("Page '%s' images will use %d DPI from image meta-data",
                         page_id, dpi)
            else:
                dpi = 0
                LOG.info("Page '%s' images will use DPI estimated from segmentation",
                         page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))
            LOG.info("Processing page '%s'", page_id)
            regions = page.get_AllRegions(classes=['Text'])
            if not regions:
                LOG.warning("Page '%s' contains no text regions", page_id)
            else:
                self._process_regions(tessapi, regions, page_image, page_coords)
            # update METS (add the PAGE file):
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp, file_id + '.xml'),
                                    content=to_xml(pcgts))
def process(self):
    """Run page-level segmentation on the workspace, optionally with a deep model.

    If ``use_deeplr`` is set, load the Keras model from ``seg_weights``
    (exiting with an error if the file is missing) and fetch raw page
    images; otherwise require binarized/deskewed/cropped page images.
    Delegate to ``_process_segment`` at page level and serialise the
    resulting PAGE hierarchy.
    """
    LOG = getLogger('OcrdAnybaseocrTiseg')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    oplevel = self.parameter['operation_level']
    model = None
    if self.parameter['use_deeplr']:
        model_weights = self.parameter['seg_weights']
        if not Path(model_weights).is_file():
            LOG.error("""\
Segementation model weights file was not found at '%s'. Make sure the `seg_weights` parameter
points to the local model weights path.
""" % model_weights)
            # NOTE(review): sys.exit in a processor aborts the whole workflow;
            # an exception might be preferable — confirm intended behaviour.
            sys.exit(1)
        #model = resnet50_unet(n_classes=self.parameter['classes'], input_height=self.parameter['height'], input_width=self.parameter['width'])
        #model.load_weights(model_weights)
        model = load_model(model_weights)
        LOG.info('Segmentation Model loaded')
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        LOG.info("INPUT FILE %s", input_file.pageId or input_file.ID)
        if self.parameter['use_deeplr']:
            # deep model path: exclude derived images (raw input wanted)
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_filter='binarized,deskewed,cropped')
        else:
            # heuristic path: require binarized, deskewed and cropped images
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_selector='binarized,deskewed,cropped')
        if oplevel == 'page':
            self._process_segment(page_image, page, page_xywh, page_id, input_file, n, model)
        else:
            # only page level is supported; abort the whole loop otherwise
            LOG.warning('Operation level %s, but should be "page".', oplevel)
            break
        # update METS (add the PAGE file):
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            mimetype=MIMETYPE_PAGE,
            local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
            content=to_xml(pcgts).encode('utf-8'),
        )
def process(self):
    """Replace everything below the page level with another annotation.

    Open and deserialize PAGE input files from both input file groups,
    then go to the page hierarchy level.

    Replace all regions (and their reading order) from the page of
    the first input file group with all regions from the page of
    the second input file group. Keep page-level annotations unchanged
    (i.e. Border, orientation, type, AlternativeImage etc).

    If ``transform_coordinates`` is true, then also retrieve the
    coordinate transform of the (cropped, deskewed, dewarped) page
    from the first input fileGrp, and use it to adjust all segment
    coordinates from the second input fileGrp, accordingly.
    (This assumes both are consistent, i.e. the second input was
    derived from the first input via ``ocrd-segment-replace-original``
    or similar.)

    Produce a new output file by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.ReplacePage')
    assert_file_grp_cardinality(self.input_file_grp, 2, 'original, page')
    assert_file_grp_cardinality(self.output_file_grp, 1)
    adapt_coords = self.parameter['transform_coordinates']
    # collect input file tuples
    ifts = self.zip_input_files() # input file tuples
    # process input file tuples
    for n, ift in enumerate(ifts):
        input_file, page_file = ift
        if input_file is None or page_file is None:
            # incomplete pairing for this page: nothing to replace with
            continue
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        pcgts2 = page_from_file(self.workspace.download_file(page_file))
        page2 = pcgts2.get_Page()
        # adjust all coordinates (recursively)
        if adapt_coords:
            try:
                _, page_coords, _ = self.workspace.image_from_page(page, page_id)

                def adapt(segment):
                    # project the segment outline through the page's
                    # coordinate transform, then re-validate it:
                    coords = segment.get_Coords()
                    polygon = polygon_from_points(coords.points)
                    polygon = coordinates_for_segment(polygon, None, page_coords)
                    coords.set_points(points_from_polygon(polygon))
                    ensure_consistent(segment)

                # walk the full hierarchy: region > line > word > glyph
                for region in page2.get_AllRegions():
                    adapt(region)
                    if isinstance(region, TextRegionType):
                        for line in region.get_TextLine():
                            adapt(line)
                            for word in line.get_Word():
                                adapt(word)
                                for glyph in word.get_Glyph():
                                    adapt(glyph)
            except Exception:
                # FIX: was a bare `except:`, which would also swallow
                # KeyboardInterrupt/SystemExit and hide real bugs.
                LOG.error('invalid coordinates on page %s', page_id)
                continue
        # replace all regions
        page.set_ReadingOrder(page2.get_ReadingOrder())
        page.set_TextRegion(page2.get_TextRegion())
        page.set_ImageRegion(page2.get_ImageRegion())
        page.set_LineDrawingRegion(page2.get_LineDrawingRegion())
        page.set_GraphicRegion(page2.get_GraphicRegion())
        page.set_TableRegion(page2.get_TableRegion())
        page.set_ChartRegion(page2.get_ChartRegion())
        page.set_MapRegion(page2.get_MapRegion())
        page.set_SeparatorRegion(page2.get_SeparatorRegion())
        page.set_MathsRegion(page2.get_MathsRegion())
        page.set_ChemRegion(page2.get_ChemRegion())
        page.set_MusicRegion(page2.get_MusicRegion())
        page.set_AdvertRegion(page2.get_AdvertRegion())
        page.set_NoiseRegion(page2.get_NoiseRegion())
        page.set_UnknownRegion(page2.get_UnknownRegion())
        page.set_CustomRegion(page2.get_CustomRegion())
        # update METS (add the PAGE file):
        file_id = make_file_id(page_file, self.output_file_grp)
        out = self.workspace.add_file(ID=file_id,
                                      file_grp=self.output_file_grp,
                                      pageId=input_file.pageId,
                                      local_filename=os.path.join(
                                          self.output_file_grp, file_id + '.xml'),
                                      mimetype=MIMETYPE_PAGE,
                                      content=to_xml(pcgts))
        LOG.info('created file ID: %s, file_grp: %s, path: %s',
                 file_id, self.output_file_grp, out.local_filename)
def process(self):
    """Perform OCR post-correction with encoder-attention-decoder ANN on the workspace.

    Open and deserialise PAGE input files, then iterate over the element
    hierarchy down to the requested `textequiv_level`, making sequences of
    TextEquiv objects as lists of lines. Concatenate their string values,
    obeying rules of implicit whitespace, and map the string positions where
    the objects start.

    Next, transcode the input lines into output lines in parallel, and use
    the retrieved soft alignment scores to calculate hard alignment paths
    between input and output string via Viterbi decoding. Then use those
    to map back the start positions and overwrite each TextEquiv with its
    new content, paying special attention to whitespace:

    Distribute edits such that whitespace objects cannot become more than
    whitespace (or be deleted) and that non-whitespace objects must not
    start or end with whitespace (but may contain new whitespace in the
    middle).

    Subsequently, unless processing on the `line` level, make the Word
    segmentation consistent with that result again: merge around deleted
    whitespace tokens and split at whitespace inside non-whitespace tokens.

    Finally, make the levels above `textequiv_level` consistent with that
    textual result (via concatenation joined by whitespace).

    Produce new output files by serialising the resulting hierarchy.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    # Dragging Word/TextLine references along in all lists besides TextEquiv
    # is necessary because the generateDS version of the PAGE-XML model
    # has no references upwards in the hierarchy (from TextEquiv to containing
    # elements, from Glyph/Word/TextLine to Word/TextLine/TextRegion), and
    # its classes are not hashable.
    level = self.parameter['textequiv_level']
    for n, input_file in enumerate(self.input_files):
        self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page_id = input_file.pageId or input_file.ID # (PageType has no id)
        self.logger.info("Correcting text in page '%s' at the %s level", page_id, level)
        # annotate processing metadata:
        self.add_metadata(pcgts)
        # get textequiv references for all lines:
        # FIXME: conf with TextEquiv alternatives
        line_sequences = _page_get_line_sequences_at(level, pcgts)
        # concatenate to strings and get dict of start positions to refs:
        input_lines, conf, textequiv_starts, word_starts, textline_starts = (
            _line_sequences2string_sequences(
                self.s2s.mapping[0], line_sequences,
                charmap=self.parameter['charmap']))
        # correct string and get input-output alignment:
        # FIXME: split into self.batch_size chunks
        output_lines, output_probs, output_scores, alignments = (
            self.s2s.correct_lines(
                input_lines, conf,
                fast=self.parameter['fast_mode'],
                greedy=self.parameter['fast_mode']))
        # re-align (from alignment scores) and overwrite the textequiv references:
        for (input_line, output_line, output_prob,
             output_score, alignment,
             textequivs, words, textlines) in zip(
                 input_lines, output_lines, output_probs,
                 output_scores, alignments,
                 textequiv_starts, word_starts, textline_starts):
            self.logger.debug('"%s" -> "%s"',
                              input_line.rstrip('\n'),
                              output_line.rstrip('\n'))
            # convert soft scores (seen from output) to hard path (seen from input):
            #realignment = _alignment2path(alignment, len(input_line), len(output_line),
            #                              1. / self.s2s.voc_size)
            # create hard path via minimal edit distance:
            realignment, distance = _alignment_path(input_line, output_line)
            # overwrite TextEquiv references:
            new_sequence = _update_sequence(
                input_line, output_line, output_prob,
                output_score, realignment,
                textequivs, words, textlines)
            # update Word segmentation:
            if level != 'line':
                _resegment_sequence(new_sequence, level)
            self.logger.info(
                'corrected line with %d elements, ppl: %.3f, CER: %.1f%%',
                len(new_sequence), np.exp(output_score), distance * 100)
        # make higher levels consistent again:
        page_update_higher_textequiv_levels(level, pcgts)
        # write back result to new annotation:
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        file_path = os.path.join(self.output_file_grp, file_id + '.xml')
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                local_filename=file_path,
                                mimetype=MIMETYPE_PAGE,
                                content=to_xml(pcgts))
def process(self):
    """Extract word images and texts from the workspace.

    Open and deserialize PAGE input files and their respective images,
    then iterate over the element hierarchy down to the word level.

    Extract an image for each word (which depending on the workflow
    can already be deskewed, dewarped, binarized etc.), cropped to its
    minimal bounding box, and masked by the coordinate polygon outline.
    Apply ``feature_filter`` (a comma-separated list of image features,
    cf. :py:func:`ocrd.workspace.Workspace.image_from_page`) to skip
    specific features when retrieving derived images. If ``transparency``
    is true, then also add an alpha channel which is fully transparent
    outside of the mask.

    Create a JSON file with:

    * the IDs of the word and its parents,
    * the word's text content,
    * the word's coordinates relative to the line image,
    * the word's absolute coordinates,
    * the word's TextStyle (if any),
    * the word's @production (if any),
    * the word's @readingDirection (if any),
    * the word's @primaryScript (if any),
    * the word's @language (if any),
    * the word's AlternativeImage/@comments (features),
    * the parent textregion's @type,
    * the page's @type,
    * the page's DPI value.

    Create a plain text file for the text content, too.

    Write all files in the directory of the output file group, named like so:

    * ID + '.raw.png': word image (if the workflow provides raw images)
    * ID + '.bin.png': word image (if the workflow provides binarized images)
    * ID + '.nrm.png': word image (if the workflow provides grayscale-normalized images)
    * ID + '.json': word metadata.
    * ID + '.gt.txt': word text.

    (This is intended for training and evaluation of OCR models.)
    """
    LOG = getLogger('processor.ExtractWords')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    # pylint: disable=attribute-defined-outside-init
    for n, input_file in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id,
            feature_filter=self.parameter['feature_filter'],
            transparency=self.parameter['transparency'])
        # DPI from image metadata, or None if unknown:
        if page_image_info.resolution != 1:
            dpi = page_image_info.resolution
            if page_image_info.resolutionUnit == 'cm':
                dpi = round(dpi * 2.54)
        else:
            dpi = None
        ptype = page.get_type()
        # FIX: materialize the chain into a list — an itertools.chain object
        # is always truthy, so the emptiness check below never fired before.
        regions = list(itertools.chain.from_iterable(
            [page.get_TextRegion()] +
            [subregion.get_TextRegion()
             for subregion in page.get_TableRegion()]))
        if not regions:
            LOG.warning("Page '%s' contains no text regions", page_id)
        for region in regions:
            region_image, region_coords = self.workspace.image_from_segment(
                region, page_image, page_coords,
                feature_filter=self.parameter['feature_filter'],
                transparency=self.parameter['transparency'])
            rtype = region.get_type()
            lines = region.get_TextLine()
            if not lines:
                LOG.warning("Region '%s' contains no text lines", region.id)
            for line in lines:
                line_image, line_coords = self.workspace.image_from_segment(
                    line, region_image, region_coords,
                    feature_filter=self.parameter['feature_filter'],
                    transparency=self.parameter['transparency'])
                words = line.get_Word()
                if not words:
                    LOG.warning("Line '%s' contains no words", line.id)
                for word in words:
                    word_image, word_coords = self.workspace.image_from_segment(
                        word, line_image, line_coords,
                        feature_filter=self.parameter['feature_filter'],
                        transparency=self.parameter['transparency'])
                    # polygon relative to the word image, and absolute on the page:
                    lpolygon_rel = coordinates_of_segment(
                        word, word_image, word_coords).tolist()
                    lpolygon_abs = polygon_from_points(word.get_Coords().points)
                    ltext = word.get_TextEquiv()
                    if not ltext:
                        LOG.warning("Word '%s' contains no text content", word.id)
                        ltext = ''
                    else:
                        ltext = ltext[0].Unicode
                    # TextStyle inherits from the closest annotated ancestor:
                    lstyle = word.get_TextStyle() or line.get_TextStyle() or region.get_TextStyle()
                    if lstyle:
                        lstyle = {
                            'fontFamily': lstyle.fontFamily,
                            'fontSize': lstyle.fontSize,
                            'xHeight': lstyle.xHeight,
                            'kerning': lstyle.kerning,
                            'serif': lstyle.serif,
                            'monospace': lstyle.monospace,
                            'bold': lstyle.bold,
                            'italic': lstyle.italic,
                            'smallCaps': lstyle.smallCaps,
                            'letterSpaced': lstyle.letterSpaced,
                            'strikethrough': lstyle.strikethrough,
                            'underlined': lstyle.underlined,
                            'underlineStyle': lstyle.underlineStyle,
                            'subscript': lstyle.subscript,
                            'superscript': lstyle.superscript
                        }
                    lfeatures = word_coords['features']
                    description = {
                        'word.ID': word.id,
                        'text': ltext,
                        'style': lstyle,
                        'production': (
                            word.get_production() or
                            line.get_production() or
                            region.get_production()),
                        'readingDirection': (
                            word.get_readingDirection() or
                            line.get_readingDirection() or
                            region.get_readingDirection() or
                            page.get_readingDirection()),
                        'primaryScript': (
                            word.get_primaryScript() or
                            line.get_primaryScript() or
                            region.get_primaryScript() or
                            page.get_primaryScript()),
                        'language': (
                            word.get_language() or
                            line.get_primaryLanguage() or
                            region.get_primaryLanguage() or
                            page.get_primaryLanguage()),
                        'features': lfeatures,
                        'DPI': dpi,
                        'coords_rel': lpolygon_rel,
                        'coords_abs': lpolygon_abs,
                        'line.ID': line.id,
                        'region.ID': region.id,
                        'region.type': rtype,
                        'page.ID': page_id,
                        'page.type': ptype,
                        'file_grp': self.input_file_grp,
                        'METS.UID': self.workspace.mets.unique_identifier
                    }
                    # file suffix encodes which image features were applied:
                    if 'binarized' in lfeatures:
                        extension = '.bin'
                    elif 'grayscale_normalized' in lfeatures:
                        extension = '.nrm'
                    else:
                        extension = '.raw'
                    file_id = make_file_id(input_file, self.output_file_grp)
                    file_path = self.workspace.save_image_file(
                        word_image,
                        file_id + '_' + region.id + '_' + line.id + '_' + word.id + extension,
                        self.output_file_grp,
                        page_id=page_id,
                        mimetype=self.parameter['mimetype'])
                    file_path = file_path.replace(
                        extension + MIME_TO_EXT[self.parameter['mimetype']], '.json')
                    # FIX: was `json.dump(description, open(file_path, 'w'))` —
                    # leaked the file handle and used the platform default encoding.
                    with open(file_path, 'w', encoding='utf-8') as json_file:
                        json.dump(description, json_file)
                    file_path = file_path.replace('.json', '.gt.txt')
                    with open(file_path, 'wb') as f:
                        f.write((ltext + '\n').encode('utf-8'))
def process(self):
    """Deskew the regions of the workspace.

    Open and deserialise PAGE input files and their respective images,
    then iterate over the element hierarchy down to the TextRegion level.

    Next, for each file, crop each region image according to the layout
    annotation (via coordinates into the higher-level image, or from the
    alternative image), and determine the threshold for binarization and
    the deskewing angle of the region (up to ``maxskew``). Annotate the
    angle in the region.

    Add the new image file to the workspace along with the output fileGrp,
    and using a file ID with suffix ``.IMG-DESKEW`` along with further
    identification of the input element.

    Produce a new output file by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.OcropyDeskew')
    level = self.parameter['level-of-operation']
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    for (n, input_file) in enumerate(self.input_files):
        LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
        page = pcgts.get_Page()
        page_image, page_coords, _ = self.workspace.image_from_page(
            page, page_id,
            # image must not have been rotated already,
            # (we will overwrite @orientation anyway,)
            # abort if no such image can be produced:
            feature_filter='deskewed' if level == 'page' else '')
        if level == 'page':
            self._process_segment(page, page_image, page_coords,
                                  "page '%s'" % page_id, input_file.pageId,
                                  file_id)
        else:
            # choose the segments to deskew below page level:
            if level == 'table':
                regions = page.get_TableRegion()
            else: # region
                regions = page.get_AllRegions(classes=['Text'], order='reading-order')
            if not regions:
                LOG.warning('Page "%s" contains no text regions', page_id)
            for region in regions:
                # process region:
                region_image, region_coords = self.workspace.image_from_segment(
                    region, page_image, page_coords,
                    # image must not have been rotated already,
                    # (we will overwrite @orientation anyway,)
                    # abort if no such image can be produced:
                    feature_filter='deskewed')
                self._process_segment(region, region_image, region_coords,
                                      "region '%s'" % region.id, input_file.pageId,
                                      file_id + '_' + region.id)
        # update METS (add the PAGE file):
        file_path = os.path.join(self.output_file_grp, file_id + '.xml')
        pcgts.set_pcGtsId(file_id)
        out = self.workspace.add_file(ID=file_id,
                                      file_grp=self.output_file_grp,
                                      pageId=input_file.pageId,
                                      local_filename=file_path,
                                      mimetype=MIMETYPE_PAGE,
                                      content=to_xml(pcgts))
        LOG.info('created file ID: %s, file_grp: %s, path: %s',
                 file_id, self.output_file_grp, out.local_filename)
def process(self):
    """Resegment lines of the workspace.

    Open and deserialise PAGE input files and their respective images,
    then iterate over the element hierarchy down to the line level.

    Next, get each region image according to the layout annotation (from
    the alternative image of the region, or by cropping via coordinates
    into the higher-level image), and compute a new line segmentation
    from that (as a label mask).

    Then for each line within the region, find the label with the largest
    foreground area in the binarized image within the annotated polygon
    (or rectangle) of the line. Unless its relative area is too small,
    or its center is far off, convert that label's mask into a polygon
    outline, intersect with the old polygon, and find the contour of
    that segment. Annotate the result as new coordinates of the line.

    Add the new image file to the workspace along with the output fileGrp,
    and using a file ID with suffix ``.IMG-RESEG`` along with further
    identification of the input element.

    Produce a new output file by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.OcropyResegment')
    # This makes best sense for bad/coarse line segmentation, like current GT
    # or as postprocessing for bbox-only steps.
    # Most notably, it can convert rectangles to polygons (polygonalization).
    # It depends on a decent line segmentation from ocropy though. So it
    # _should_ ideally be run after deskewing (on the page or region level),
    # _must_ be run after binarization (on page or region level). Also, the
    # method's accuracy crucially depends on a good estimate of the images'
    # pixel density (at least if source input is not 300 DPI).
    threshold = self.parameter['min_fraction']
    margin = self.parameter['extend_margins']
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    for (n, input_file) in enumerate(self.input_files):
        LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
        page = pcgts.get_Page()
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_selector='binarized')
        # derive zoom factor (target is 300 DPI) from parameter or image metadata:
        if self.parameter['dpi'] > 0:
            zoom = 300.0 / self.parameter['dpi']
        elif page_image_info.resolution != 1:
            dpi = page_image_info.resolution
            if page_image_info.resolutionUnit == 'cm':
                dpi *= 2.54
            LOG.info('Page "%s" uses %f DPI', page_id, dpi)
            zoom = 300.0 / dpi
        else:
            zoom = 1
        regions = page.get_AllRegions(classes=['Text'])
        if not regions:
            LOG.warning('Page "%s" contains no text regions', page_id)
        for region in regions:
            lines = region.get_TextLine()
            if not lines:
                LOG.warning('Page "%s" region "%s" contains no text lines',
                            page_id, region.id)
                continue
            if len(lines) == 1:
                LOG.warning('Page "%s" region "%s" contains only one line',
                            page_id, region.id)
                continue
            region_image, region_xywh = self.workspace.image_from_segment(
                region, page_image, page_xywh, feature_selector='binarized')
            region_array = pil2array(region_image)
            #region_array, _ = common.binarize(region_array, maxskew=0) # just in case still raw
            # FIX: the deprecated alias np.bool was removed in NumPy 1.24;
            # use the builtin bool (semantically identical dtype):
            region_bin = np.array(region_array <= midrange(region_array), bool)
            report = check_region(region_bin, zoom)
            try:
                if report:
                    raise Exception(report)
                region_labels, _, _, _, _, _ = compute_segmentation(
                    region_bin, zoom=zoom)
            except Exception as err:
                LOG.warning('Cannot line-segment page "%s" region "%s": %s',
                            page_id, region.id, err)
                # fallback option 1: borderclean
                # label margins vs interior, but with the interior
                # extended into the margin by its connected components
                # to remove noise from neighbouring regions:
                #region_labels = borderclean_bin(region_bin, margin=round(4/zoom)) + 1
                # too dangerous, because we risk losing dots from i or punctuation;
                # fallback option2: only extend_margins
                # instead, just provide a uniform label, so at least we get
                # to extend the polygon margins:
                #region_labels = np.ones_like(region_bin)
                # fallback option3: keep unchanged
                continue
            for line in lines:
                if line.get_AlternativeImage():
                    # get cropped line image:
                    line_image, line_xywh = self.workspace.image_from_segment(
                        line, region_image, region_xywh, feature_selector='binarized')
                    LOG.debug("Using AlternativeImage (%s) for line '%s'",
                              line_xywh['features'], line.id)
                    # crop region arrays accordingly:
                    line_polygon = coordinates_of_segment(
                        line, region_image, region_xywh)
                    line_bbox = bbox_from_polygon(line_polygon)
                    line_labels = region_labels[line_bbox[1]:line_bbox[3],
                                                line_bbox[0]:line_bbox[2]]
                    line_bin = region_bin[line_bbox[1]:line_bbox[3],
                                          line_bbox[0]:line_bbox[2]]
                    # get polygon in relative (line) coordinates:
                    line_polygon = coordinates_of_segment(
                        line, line_image, line_xywh)
                    line_polygon = resegment(line_polygon, line_labels, line_bin,
                                             line.id, extend_margins=margin,
                                             threshold_relative=threshold)
                    if line_polygon is None:
                        continue # not good enough – keep
                    # convert back to absolute (page) coordinates:
                    line_polygon = coordinates_for_segment(
                        line_polygon, line_image, line_xywh)
                else:
                    # get polygon in relative (region) coordinates:
                    line_polygon = coordinates_of_segment(
                        line, region_image, region_xywh)
                    line_polygon = resegment(line_polygon, region_labels, region_bin,
                                             line.id, extend_margins=margin,
                                             threshold_relative=threshold)
                    if line_polygon is None:
                        continue # not good enough – keep
                    # convert back to absolute (page) coordinates:
                    line_polygon = coordinates_for_segment(
                        line_polygon, region_image, region_xywh)
                # annotate result:
                line.get_Coords().points = points_from_polygon(line_polygon)
                # create new image:
                line_image, line_xywh = self.workspace.image_from_segment(
                    line, region_image, region_xywh, feature_selector='binarized')
                # update METS (add the image file):
                file_path = self.workspace.save_image_file(
                    line_image,
                    file_id=file_id + '_' + region.id + '_' + line.id + '.IMG-RESEG',
                    page_id=page_id,
                    file_grp=self.output_file_grp)
                # update PAGE (reference the image file):
                line.add_AlternativeImage(AlternativeImageType(
                    filename=file_path, comments=region_xywh['features']))
        # update METS (add the PAGE file):
        file_path = os.path.join(self.output_file_grp, file_id + '.xml')
        pcgts.set_pcGtsId(file_id)
        out = self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            local_filename=file_path,
            mimetype=MIMETYPE_PAGE,
            content=to_xml(pcgts))
        LOG.info('created file ID: %s, file_grp: %s, path: %s',
                 file_id, self.output_file_grp, out.local_filename)
def process(self):
    """Performs page cropping with Tesseract on the workspace.

    Open and deserialize PAGE input files and their respective images.
    Set up Tesseract to detect text blocks on each page, and find the
    largest coordinate extent spanning all of them. Use this extent in
    defining a Border, and add that to the page.

    Moreover, crop the original image accordingly, and reference the
    resulting image file as AlternativeImage in the Page element.

    Add the new image file to the workspace along with the output fileGrp,
    and using a file ID with suffix ``.IMG-CROP`` along with further
    identification of the input element.

    Produce new output files by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.TesserocrCrop')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        # disable table detection here (tables count as text blocks),
        # because we do not want to risk confusing the spine with
        # a column separator and thus creeping into a neighbouring
        # page:
        tessapi.SetVariable("textord_tabfind_find_tables", "0")
        for (n, input_file) in enumerate(self.input_files):
            file_id = make_file_id(input_file, self.output_file_grp)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            # warn of existing Border:
            border = page.get_Border()
            if border:
                left, top, right, bottom = bbox_from_points(border.get_Coords().points)
                LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                            left, top, right, bottom)
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id,
                # image must not have been cropped already,
                # abort if no such image can be produced:
                feature_filter='cropped')
            # determine the effective DPI: parameter override wins over
            # image metadata; 0 means "let Tesseract estimate":
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info("Page '%s' images will use %d DPI from parameter override",
                         page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    # convert pixels-per-cm to pixels-per-inch:
                    dpi = round(dpi * 2.54)
                LOG.info("Page '%s' images will use %d DPI from image meta-data",
                         page_id, dpi)
            else:
                dpi = 0
                LOG.info("Page '%s' images will use DPI estimated from segmentation",
                         page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))
                # zoom factor relative to the 300 DPI reference:
                zoom = 300 / dpi
            else:
                zoom = 1
            bounds = self.estimate_bounds(page, page_image, tessapi, zoom)
            self.process_page(page, page_image, page_xywh, bounds,
                              file_id, input_file.pageId)
            # update METS (add the PAGE file):
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                content=to_xml(pcgts))
def process(self):
    """Performs region segmentation by reading mask images in pseudo-colour.

    Open and deserialize each PAGE input file (or generate from image input file)
    from the first input file group, as well as mask image file from the second.

    Then iterate over all connected (equally colored) mask segments and compute
    convex hull contours for them. Convert them to polygons, and look up their
    color value in ``colordict`` to instantiate the appropriate region types
    (optionally with subtype).

    Instantiate and annotate regions accordingly.

    Produce a new output file by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.ImportImageSegmentation')
    assert_file_grp_cardinality(self.input_file_grp, 2, 'base and mask')
    assert_file_grp_cardinality(self.output_file_grp, 1)
    colordict = self.parameter['colordict']
    # region classes with a predefined @type enumeration in PAGE:
    typedict = {"TextRegion": TextTypeSimpleType,
                "GraphicRegion": GraphicsTypeSimpleType,
                "ChartType": ChartTypeSimpleType}
    # collect input file tuples
    ifts = self.zip_input_files() # input file tuples
    # process input file tuples
    for ift in ifts:
        input_file, segmentation_file = ift
        LOG.info("processing page %s", input_file.pageId)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        # import mask image
        segmentation_filename = self.workspace.download_file(segmentation_file).local_filename
        with pushd_popd(self.workspace.directory):
            segmentation_pil = Image.open(segmentation_filename)
        has_alpha = segmentation_pil.mode == 'RGBA'
        if has_alpha:
            colorformat = "%08X"
        else:
            colorformat = "%06X"
            if segmentation_pil.mode != 'RGB':
                segmentation_pil = segmentation_pil.convert('RGB')
        # convert to array
        segmentation_array = np.array(segmentation_pil)
        # collapse 3 color channels
        # (dot product with per-channel place values packs each pixel
        #  into a single integer color code; the alpha factor is dropped
        #  for RGB images)
        segmentation_array = segmentation_array.dot(
            np.array([2**24, 2**16, 2**8, 1], np.uint32)[0 if has_alpha else 1:])
        # partition mapped colors vs background
        colors = np.unique(segmentation_array)
        bgcolors = []
        for i, color in enumerate(colors):
            colorname = colorformat % color
            if (colorname not in colordict or
                not colordict[colorname]):
                #raise Exception("Unknown color %s (not in colordict)" % colorname)
                LOG.info("Ignoring background color %s", colorname)
                bgcolors.append(i)
        # binary mask of all background pixels (each pixel belongs to
        # exactly one color, so the sum stays 0/1):
        background = np.zeros_like(segmentation_array, np.uint8)
        if bgcolors:
            for i in bgcolors:
                background += np.array(segmentation_array == colors[i], np.uint8)
            colors = np.delete(colors, bgcolors, 0)
        # iterate over mask for each mapped color/class
        regionno = 0
        for color in colors:
            # get region (sub)type
            colorname = colorformat % color
            classname = colordict[colorname]
            regiontype = None
            custom = None
            if ":" in classname:
                classname, regiontype = classname.split(":")
                if classname in typedict:
                    typename = membername(typedict[classname], regiontype)
                    if typename == regiontype:
                        # not predefined in PAGE: use other + custom
                        custom = "subtype:%s" % regiontype
                        regiontype = "other"
                else:
                    custom = "subtype:%s" % regiontype
            if classname + "Type" not in globals():
                raise Exception("Unknown class '%s' for color %s in colordict" % (classname, colorname))
            # resolve the generateDS class object by name:
            classtype = globals()[classname + "Type"]
            if classtype is BorderType:
                # mask from all non-background regions
                classmask = 1 - background
            else:
                # mask from current color/class
                classmask = np.array(segmentation_array == color, np.uint8)
            if not np.count_nonzero(classmask):
                continue
            # now get the contours and make polygons for them
            contours, _ = cv2.findContours(classmask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            for contour in contours:
                # (could also just take bounding boxes to avoid islands/inclusions...)
                area = cv2.contourArea(contour)
                # filter too small regions
                area_pct = area / np.prod(segmentation_array.shape) * 100
                if area < 100 and area_pct < 0.1:
                    LOG.warning('ignoring contour of only %.1f%% area for %s',
                                area_pct, classname)
                    continue
                LOG.info('found region %s:%s:%s with area %.1f%%',
                         classname, regiontype or '', custom or '', area_pct)
                # simplify shape
                poly = cv2.approxPolyDP(contour, 2, False)[:, 0, ::] # already ordered x,y
                if len(poly) < 4:
                    LOG.warning('ignoring contour of only %d points (area %.1f%%) for %s',
                                len(poly), area_pct, classname)
                    continue
                if classtype is BorderType:
                    # add Border
                    # (only one Border possible — stop after the first contour)
                    page.set_Border(BorderType(Coords=CoordsType(points=points_from_polygon(poly))))
                    break
                else:
                    # instantiate region
                    regionno += 1
                    region = classtype(id="region_%d" % regionno,
                                       type_=regiontype,
                                       custom=custom,
                                       Coords=CoordsType(points=points_from_polygon(poly)))
                    # add region
                    getattr(page, 'add_%s' % classname)(region)
        # update METS (add the PAGE file):
        file_id = make_file_id(input_file, self.output_file_grp)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts))
def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n):
    """Binarize one page image (ocropus-nlbin style) and annotate it as AlternativeImage.

    Normalizes the raw image, estimates the local whitelevel (unless the
    image is already effectively binarized), estimates low/high percentile
    thresholds, rescales, thresholds, and saves the result to the output
    fileGrp, referencing it on the page.
    """
    LOG = getLogger('OcrdAnybaseocrBinarizer')
    raw = ocrolib.pil2array(page_image)
    # collapse color channels to grayscale:
    if len(raw.shape) > 2:
        raw = np.mean(raw, 2)
    raw = raw.astype("float64")
    # perform image normalization
    image = raw - amin(raw)
    if amax(image) == amin(image):
        LOG.info("# image is empty: %s" % (page_id))
        return
    image /= amax(image)
    # check whether the image is already effectively binarized
    # (fraction of near-black plus near-white pixels):
    if self.parameter['gray']:
        extreme = 0
    else:
        extreme = (np.sum(image < 0.05) + np.sum(image > 0.95)) * 1.0 / np.prod(image.shape)
    if extreme > 0.95:
        comment = "no-normalization"
        flat = image
    else:
        comment = ""
        # if not, we need to flatten it by estimating the local whitelevel
        LOG.info("Flattening")
        m = interpolation.zoom(image, self.parameter['zoom'])
        m = filters.percentile_filter(m, self.parameter['perc'], size=(self.parameter['range'], 2))
        m = filters.percentile_filter(m, self.parameter['perc'], size=(2, self.parameter['range']))
        m = interpolation.zoom(m, 1.0 / self.parameter['zoom'])
        if self.parameter['debug'] > 0:
            clf()
            imshow(m, vmin=0, vmax=1)
            ginput(1, self.parameter['debug'])
        # NOTE(review): names suggest width/height, but these actually hold
        # the common (rows, cols) extent of image and whitelevel estimate
        w, h = minimum(array(image.shape), array(m.shape))
        flat = clip(image[:w, :h] - m[:w, :h] + 1, 0, 1)
        if self.parameter['debug'] > 0:
            clf()
            imshow(flat, vmin=0, vmax=1)
            ginput(1, self.parameter['debug'])
    # estimate low and high thresholds
    LOG.info("Estimating Thresholds")
    d0, d1 = flat.shape
    # ignore a border of `bignore` fraction on each side:
    o0, o1 = int(self.parameter['bignore'] * d0), int(self.parameter['bignore'] * d1)
    est = flat[o0:d0 - o0, o1:d1 - o1]
    if self.parameter['escale'] > 0:
        # by default, we use only regions that contain
        # significant variance; this makes the percentile
        # based low and high estimates more reliable
        e = self.parameter['escale']
        v = est - filters.gaussian_filter(est, e * 20.0)
        v = filters.gaussian_filter(v**2, e * 20.0)**0.5
        v = (v > 0.3 * amax(v))
        v = morphology.binary_dilation(v, structure=ones((int(e * 50), 1)))
        v = morphology.binary_dilation(v, structure=ones((1, int(e * 50))))
        if self.parameter['debug'] > 0:
            imshow(v)
            ginput(1, self.parameter['debug'])
        est = est[v]
    lo = stats.scoreatpercentile(est.ravel(), self.parameter['lo'])
    hi = stats.scoreatpercentile(est.ravel(), self.parameter['hi'])
    # rescale the image to get the gray scale image
    LOG.info("Rescaling")
    flat -= lo
    flat /= (hi - lo)
    flat = clip(flat, 0, 1)
    if self.parameter['debug'] > 0:
        imshow(flat, vmin=0, vmax=1)
        ginput(1, self.parameter['debug'])
    binarized = 1 * (flat > self.parameter['threshold'])
    # output the normalized grayscale and the thresholded images
    # print_info("%s lo-hi (%.2f %.2f) angle %4.1f %s" % (fname, lo, hi, angle, comment))
    LOG.info("%s lo-hi (%.2f %.2f) %s" % (page_id, lo, hi, comment))
    LOG.info("writing")
    if self.parameter['debug'] > 0 or self.parameter['show']:
        clf()
        gray()
        imshow(binarized)
        ginput(1, max(0.1, self.parameter['debug']))
    page_xywh['features'] += ',binarized'
    # convert the 0/1 mask to an 8-bit 0/255 image:
    # (assumes pylab-style names amin/amax/clip/array/ones/minimum etc.
    #  are star-imported at module level — TODO confirm)
    bin_array = array(255 * (binarized > ocrolib.midrange(binarized)), 'B')
    bin_image = ocrolib.array2pil(bin_array)
    # update METS (add the image file):
    file_id = make_file_id(input_file, self.output_file_grp)
    file_path = self.workspace.save_image_file(bin_image,
                                               file_id + '-IMG',
                                               page_id=page_id,
                                               file_grp=self.output_file_grp)
    # update PAGE (reference the image file):
    page.add_AlternativeImage(AlternativeImageType(filename=file_path, comments=page_xywh['features']))
def process(self):
    """Detect text lines (and recognise text) with sbb_textline_detector.

    Open and deserialise each PAGE input file and its raw image (filtering
    out cropped/binarized/grayscale-normalized derivatives), run the
    textline detector on the exported image in a temporary directory, then
    merge the resulting Border, ReadingOrder and TextRegions (translating
    all coordinates back into the original image's frame) into the input
    annotation, and serialise as a new PAGE output file.
    """
    log = getLogger('processor.OcrdSbbTextlineDetectorRecognize')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        log.info("INPUT FILE %i / %s", n, input_file)
        file_id = make_file_id(input_file, self.output_file_grp)
        # Process the files
        # FIX: idiomatic replacement for try/mkdir/except FileExistsError:
        os.makedirs(self.output_file_grp, exist_ok=True)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        page_image, page_coords, page_image_info = \
            self.workspace.image_from_page(
                page, page_id,
                feature_filter='cropped,binarized,grayscale_normalized')
        with tempfile.TemporaryDirectory() as tmp_dirname:
            # Save the image
            # FIX: mkstemp() returns an *open* OS-level file descriptor;
            # the original discarded it ([1]), leaking one fd per page.
            fd, image_file = tempfile.mkstemp(dir=tmp_dirname, suffix='.png')
            os.close(fd)
            page_image.save(image_file)
            # Segment the image
            model = self.parameter['model']
            x = textline_detector(image_file, tmp_dirname, file_id, model)
            x.run()
            # Read segmentation results
            tmp_filename = os.path.join(tmp_dirname, file_id) + '.xml'
            tmp_pcgts = ocrd_models.ocrd_page.parse(tmp_filename, silence=True)
            tmp_page = tmp_pcgts.get_Page()
        # Create a new PAGE file from the input file
        pcgts.set_pcGtsId(file_id)
        # Merge results → PAGE file
        # 1. Border
        if page.get_Border():
            log.warning("Removing existing page border")
        page.set_Border(None)
        # We need to translate the coordinates:
        text_border = adapt_coords(tmp_page.get_Border(), page, page_coords)
        if text_border is None:
            # intersection is empty (border outside of rotated original image)
            log.warning("new border would be empty, skipping")
        else:
            page.set_Border(text_border)
        # 2. ReadingOrder
        if page.get_ReadingOrder():
            log.warning("Removing existing regions' reading order")
        page.set_ReadingOrder(tmp_page.get_ReadingOrder())
        # 3. TextRegion
        # FIXME: what about table and image regions?
        if page.get_TextRegion():
            log.warning("Removing existing text regions")
        # We need to translate the coordinates:
        text_regions_new = []
        for text_region in tmp_page.get_TextRegion():
            text_region = adapt_coords(text_region, page, page_coords)
            if text_region is None:
                # intersection is empty (polygon outside of above border)
                log.warning("new text region polygon would be empty, skipping")
                continue
            text_regions_new.append(text_region)
            text_lines_new = []
            for text_line in text_region.get_TextLine():
                text_line = adapt_coords(text_line, text_region, page_coords)
                if text_line is None:
                    # intersection is empty (polygon outside of region)
                    log.warning("new text line polygon would be empty, skipping")
                    continue
                text_lines_new.append(text_line)
            text_region.set_TextLine(text_lines_new)
        page.set_TextRegion(text_regions_new)
        # Save metadata about this operation
        self.add_metadata(pcgts)
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=page_id,
            mimetype='application/vnd.prima.page+xml',
            local_filename=os.path.join(self.output_file_grp, file_id) + '.xml',
            content=ocrd_models.ocrd_page.to_xml(pcgts))
def process(self):
    """Performs region segmentation by reading from COCO annotations.

    Open and deserialize the COCO JSON file from the second input file group.
    (It lists region categories/subtypes, file names and segmentations for
    all pages.)

    Open and deserialize each PAGE input file (or generate from image input
    file) from the first input file group. Now find this page in COCO:
    - try to match the PAGE ``imageFilename`` or METS file path matches to
      some COCO ``file_name``, otherwise
    - try to match the numeric part of the METS physical page ID to some
      COCO ``id``, otherwise
    - skip with an error.

    Then create and add a region for each ``segmentation``, converting its
    polygon to coordinate points and its COCO category to a region type
    (and subtype), either for a PubLayNet classification or PAGE
    classification (as produced by ocrd-segment-extract-pages), as
    indicated by ``source``.

    Produce a new output file by serialising the resulting hierarchy.

    Afterwards, if there are still COCO images left unaccounted for (i.e.
    without corresponding input files), then show a warning.
    """
    LOG = getLogger('processor.ImportCOCOSegmentation')
    # Load JSON
    assert_file_grp_cardinality(self.input_file_grp, 2, 'base and COCO')
    # pylint: disable=attribute-defined-outside-init
    self.input_file_grp, coco_grp = self.input_file_grp.split(',')
    # pylint: disable=attribute-defined-outside-init
    if not self.input_files:
        LOG.warning('No input files to process')
        return
    if coco_grp in self.workspace.mets.file_groups:
        try:
            cocofile = next(
                f for f in self.workspace.mets.find_files(fileGrp=coco_grp)
                # if f.mimetype == 'application/json' and not f.pageId
                if not f.pageId)
        except StopIteration:
            raise Exception("no non-page-specific file in second file group (COCO file)",
                            coco_grp)
        cocofile = self.workspace.download_file(cocofile).local_filename
    elif os.path.isfile(coco_grp):
        # fallback: interpret the second group as a direct file path
        cocofile = coco_grp
    else:
        raise Exception("file not found in second file group (COCO file)", coco_grp)
    LOG.info('Loading COCO annotations from "%s" into memory...', cocofile)
    with open(cocofile, 'r') as inp:
        coco = json.load(inp)
    LOG.info('Loaded JSON for %d images with %d regions in %d categories',
             len(coco['images']), len(coco['annotations']), len(coco['categories']))
    coco_source = 'PubLayNet'
    # Convert to usable dicts
    # classes:
    categories = dict()
    subcategories = dict()
    for cat in coco['categories']:
        # FIX: standard COCO files have no 'source' key — cat['source']
        # would raise KeyError; use .get() instead:
        if cat.get('source') == 'PAGE':
            coco_source = 'PAGE'
        if 'supercategory' in cat and cat['supercategory']:
            categories[cat['id']] = cat['supercategory']
            subcategories[cat['id']] = cat['name']
        else:
            categories[cat['id']] = cat['name']
    # images and annotations:
    images_by_id = dict()
    images_by_filename = dict()
    for image in coco['images']:
        images_by_id[image['id']] = image
        images_by_filename[image['file_name']] = image
    for annotation in coco['annotations']:
        image = images_by_id[annotation['image_id']]
        regions = image.setdefault('regions', list())
        regions.append(annotation)
    del coco
    LOG.info('Converting %s annotations into PAGE-XML', coco_source)
    for n, input_file in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        # FIX: a page ID without any digit made int('') raise ValueError;
        # fall back to None (which matches no COCO image id) instead:
        digits = page_id.strip(page_id.strip("0123456789"))
        num_page_id = int(digits) if digits else None
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        # find COCO image
        if page.imageFilename in images_by_filename:
            image = images_by_filename[page.imageFilename]
        elif num_page_id in images_by_id:
            image = images_by_id[num_page_id]
        else:
            LOG.error('Page "%s" / file "%s" not found in COCO',
                      page_id, page.imageFilename)
            # todo: maybe we should at least write the (unchanged) output PAGE?
            continue
        if image['width'] != page.imageWidth:
            LOG.error('Page "%s" width %d does not match annotated width %d',
                      page_id, page.imageWidth, image['width'])
        if image['height'] != page.imageHeight:
            LOG.error('Page "%s" height %d does not match annotated height %d',
                      page_id, page.imageHeight, image['height'])
        # todo: remove existing segmentation first?
        # FIX: images without any annotation never got a 'regions' key —
        # image['regions'] would raise KeyError; default to empty:
        for region in image.get('regions', []):
            assert isinstance(region['segmentation'], list), \
                "importing RLE/mask segmentation not implemented"
            polygon = np.array(region['segmentation'])
            polygon = np.reshape(polygon, (polygon.shape[1] // 2, 2))
            coords = CoordsType(points=points_from_polygon(polygon))
            category = categories[region['category_id']]
            if region['category_id'] in subcategories:
                subcategory = subcategories[region['category_id']]
            else:
                subcategory = None
            region_id = 'r' + str(region['id'])
            LOG.info('Adding region %s:%s [area %d]',
                     category, subcategory or '', region['area'])
            if coco_source == 'PubLayNet':
                if category == 'text':
                    region_obj = TextRegionType(
                        id=region_id, Coords=coords,
                        type_=TextTypeSimpleType.PARAGRAPH)
                    page.add_TextRegion(region_obj)
                elif category == 'title':
                    region_obj = TextRegionType(
                        id=region_id, Coords=coords,
                        type_=TextTypeSimpleType.HEADING) # CAPTION?
                    page.add_TextRegion(region_obj)
                elif category == 'list':
                    region_obj = TextRegionType(
                        id=region_id, Coords=coords,
                        type_=TextTypeSimpleType.LISTLABEL) # OTHER?
                    page.add_TextRegion(region_obj)
                elif category == 'table':
                    region_obj = TableRegionType(id=region_id, Coords=coords)
                    page.add_TableRegion(region_obj)
                elif category == 'figure':
                    region_obj = ImageRegionType(id=region_id, Coords=coords)
                    page.add_ImageRegion(region_obj)
                else:
                    raise Exception('unknown region category: %s' % category)
            else: # 'PAGE'
                args = {'id': region_id, 'Coords': coords}
                if subcategory:
                    typedict = {"TextRegion": TextTypeSimpleType,
                                "GraphicRegion": GraphicsTypeSimpleType,
                                "ChartType": ChartTypeSimpleType}
                    if category in typedict:
                        subtype = membername(typedict[category], subcategory)
                        if subtype == subcategory:
                            # not predefined in PAGE: use other + custom
                            args['custom'] = "subtype:%s" % subcategory
                            args['type_'] = "other"
                        else:
                            args['type_'] = subcategory
                    else:
                        args['custom'] = "subtype:%s" % subcategory
                if category + 'Type' not in globals():
                    raise Exception('unknown region category: %s' % category)
                region_type = globals()[category + 'Type']
                if region_type is BorderType:
                    page.set_Border(BorderType(Coords=coords))
                else:
                    region_obj = region_type(**args)
                    getattr(page, 'add_%s' % category)(region_obj)
        # remove image from dicts
        images_by_id.pop(num_page_id, None)
        images_by_filename.pop(page.imageFilename, None)
        # update METS (add the PAGE file):
        file_id = make_file_id(input_file, self.output_file_grp)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts))
    # warn of remaining COCO images
    if images_by_filename and not self.page_id:
        LOG.warning('%d images remain unaccounted for after processing',
                    len(images_by_filename))
        if LOG.isEnabledFor(logging.DEBUG):
            for filename in images_by_filename:
                LOG.debug('not found in workspace: "%s"', filename)
def process(self):
    """Binarize with sbb_binarization.

    Open and deserialise each PAGE input file and its raw image, then —
    depending on ``operation_level`` — binarize the page image, each
    text/table region image, or each text line image with the SbbBinarizer
    model, save each result as an ``.IMG-BIN`` AlternativeImage in the
    output fileGrp, and serialise a new PAGE output file.
    """
    LOG = getLogger('processor.SbbBinarize')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    oplevel = self.parameter['operation_level']
    model_path = self.resolve_resource(self.parameter['model'])
    binarizer = SbbBinarizer(model_dir=model_path, logger=LOG)
    for n, input_file in enumerate(self.input_files):
        file_id = make_file_id(input_file, self.output_file_grp)
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        pcgts.set_pcGtsId(file_id)
        page = pcgts.get_Page()
        page_image, page_xywh, _ = self.workspace.image_from_page(
            page, page_id, feature_filter='binarized')
        if oplevel == 'page':
            LOG.info("Binarizing on 'page' level in page '%s'", page_id)
            bin_image = cv2pil(binarizer.run(image=pil2cv(page_image), use_patches=True))
            # update METS (add the image file):
            bin_image_path = self.workspace.save_image_file(
                bin_image,
                file_id + '.IMG-BIN',
                page_id=input_file.pageId,
                file_grp=self.output_file_grp)
            page.add_AlternativeImage(AlternativeImageType(
                filename=bin_image_path,
                comments='%s,binarized' % page_xywh['features']))
        elif oplevel == 'region':
            regions = page.get_AllRegions(['Text', 'Table'], depth=1)
            if not regions:
                LOG.warning("Page '%s' contains no text/table regions", page_id)
            for region in regions:
                region_image, region_xywh = self.workspace.image_from_segment(
                    region, page_image, page_xywh, feature_filter='binarized')
                region_image_bin = cv2pil(binarizer.run(
                    image=pil2cv(region_image), use_patches=True))
                region_image_bin_path = self.workspace.save_image_file(
                    region_image_bin,
                    "%s_%s.IMG-BIN" % (file_id, region.id),
                    page_id=input_file.pageId,
                    file_grp=self.output_file_grp)
                region.add_AlternativeImage(AlternativeImageType(
                    filename=region_image_bin_path,
                    comments='%s,binarized' % region_xywh['features']))
        elif oplevel == 'line':
            # FIX: the original paired each region id with the whole *list*
            # of its lines — so `line` was a list and image_from_segment was
            # called on it; flatten into one (region_id, line) pair per line:
            region_line_tuples = [
                (r.id, line)
                for r in page.get_AllRegions(['Text'], depth=0)
                for line in r.get_TextLine()]
            if not region_line_tuples:
                LOG.warning("Page '%s' contains no text lines", page_id)
            for region_id, line in region_line_tuples:
                line_image, line_xywh = self.workspace.image_from_segment(
                    line, page_image, page_xywh, feature_filter='binarized')
                line_image_bin = cv2pil(binarizer.run(
                    image=pil2cv(line_image), use_patches=True))
                line_image_bin_path = self.workspace.save_image_file(
                    line_image_bin,
                    "%s_%s_%s.IMG-BIN" % (file_id, region_id, line.id),
                    page_id=input_file.pageId,
                    file_grp=self.output_file_grp)
                line.add_AlternativeImage(AlternativeImageType(
                    filename=line_image_bin_path,
                    comments='%s,binarized' % line_xywh['features']))
        # update METS (add the PAGE file):
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=join(self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts))
def process(self):
    """Extract region images from the workspace.

    Open and deserialize PAGE input files and their respective images,
    then iterate over the element hierarchy down to the region level.

    Extract an image for each region (which depending on the workflow can
    already be deskewed, dewarped, binarized etc.), cropped to its minimal
    bounding box, and masked by the coordinate polygon outline. If
    ``transparency`` is true, then also add an alpha channel which is fully
    transparent outside of the mask.

    Create a JSON file with:
    * the IDs of the region and its parents,
    * the region's coordinates relative to the region image,
    * the region's absolute coordinates,
    * the (text) region's text content (if any),
    * the (text) region's TextStyle (if any),
    * the (text) region's @production (if any),
    * the (text) region's @readingDirection (if any),
    * the (text) region's @textLineOrder (if any),
    * the (text) region's @primaryScript (if any),
    * the (text) region's @primaryLanguage (if any),
    * the region's AlternativeImage/@comments (features),
    * the region's element class,
    * the region's @type,
    * the page's @type,
    * the page's DPI value.

    Write all files in the directory of the output file group, named like so:
    * ID + '.raw.png': region image (if the workflow provides raw images)
    * ID + '.bin.png': region image (if the workflow provides binarized images)
    * ID + '.nrm.png': region image (if the workflow provides grayscale-normalized images)
    * ID + '.json': region metadata.
    """
    LOG = getLogger('processor.ExtractRegions')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    # pylint: disable=attribute-defined-outside-init
    for n, input_file in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id, transparency=self.parameter['transparency'])
        if page_image_info.resolution != 1:
            dpi = page_image_info.resolution
            if page_image_info.resolutionUnit == 'cm':
                dpi = round(dpi * 2.54)
        else:
            dpi = None
        ptype = page.get_type()
        # collect all top-level region lists by class name:
        regions = dict()
        for name in CLASSES.keys():
            if not name or name == 'Border' or ':' in name:
                # no subtypes here
                continue
            regions[name] = getattr(page, 'get_' + name)()
        for rtype, rlist in regions.items():
            for region in rlist:
                description = {'region.ID': region.id, 'region.type': rtype}
                region_image, region_coords = self.workspace.image_from_segment(
                    region, page_image, page_coords,
                    transparency=self.parameter['transparency'])
                if rtype in ['TextRegion', 'ChartRegion', 'GraphicRegion']:
                    subrtype = region.get_type()
                else:
                    subrtype = None
                description['subtype'] = subrtype
                description['coords_rel'] = coordinates_of_segment(
                    region, region_image, region_coords).tolist()
                description['coords_abs'] = polygon_from_points(
                    region.get_Coords().points)
                # FIX: region classes are named 'TextRegion' (cf. the rtype
                # check above), never 'text' — the original condition could
                # not match, so the text metadata promised in the docstring
                # was never exported:
                if rtype == 'TextRegion':
                    rtext = region.get_TextEquiv()
                    if rtext:
                        description['region.text'] = rtext[0].Unicode
                    else:
                        description['region.text'] = ''
                    rstyle = region.get_TextStyle() or page.get_TextStyle()
                    if rstyle:
                        description['region.style'] = {
                            'fontFamily': rstyle.fontFamily,
                            'fontSize': rstyle.fontSize,
                            'xHeight': rstyle.xHeight,
                            'kerning': rstyle.kerning,
                            'serif': rstyle.serif,
                            'monospace': rstyle.monospace,
                            'bold': rstyle.bold,
                            'italic': rstyle.italic,
                            'smallCaps': rstyle.smallCaps,
                            'letterSpaced': rstyle.letterSpaced,
                            'strikethrough': rstyle.strikethrough,
                            'underlined': rstyle.underlined,
                            'underlineStyle': rstyle.underlineStyle,
                            'subscript': rstyle.subscript,
                            'superscript': rstyle.superscript
                        }
                    description['production'] = region.get_production()
                    description['readingDirection'] = (
                        region.get_readingDirection() or
                        page.get_readingDirection())
                    description['textLineOrder'] = (
                        region.get_textLineOrder() or
                        page.get_textLineOrder())
                    description['primaryScript'] = (
                        region.get_primaryScript() or
                        page.get_primaryScript())
                    description['primaryLanguage'] = (
                        region.get_primaryLanguage() or
                        page.get_primaryLanguage())
                description['features'] = region_coords['features']
                description['DPI'] = dpi
                description['page.ID'] = page_id
                description['page.type'] = ptype
                description['file_grp'] = self.input_file_grp
                description['METS.UID'] = self.workspace.mets.unique_identifier
                if 'binarized' in region_coords['features']:
                    extension = '.bin'
                elif 'grayscale_normalized' in region_coords['features']:
                    extension = '.nrm'
                else:
                    extension = '.raw'
                file_id = make_file_id(input_file, self.output_file_grp)
                file_path = self.workspace.save_image_file(
                    region_image,
                    file_id + '_' + region.id + extension,
                    self.output_file_grp,
                    pageId=input_file.pageId,
                    mimetype=self.parameter['mimetype'])
                # NOTE(review): ID=file_id + '.json' is identical for every
                # region on the same page — verify whether region.id should
                # be part of the METS file ID to avoid collisions
                self.workspace.add_file(
                    ID=file_id + '.json',
                    file_grp=self.output_file_grp,
                    local_filename=file_path.replace(
                        extension + MIME_TO_EXT[self.parameter['mimetype']], '.json'),
                    pageId=input_file.pageId,
                    mimetype='application/json',
                    content=json.dumps(description))
def process(self):
    """Performs deskewing of the page / region with Tesseract on the workspace.

    Open and deserialise PAGE input files and their respective images,
    then iterate over the element hierarchy down to the region level
    for all text and table regions.

    Set up Tesseract to recognise the region image's orientation, skew
    and script (with both OSD and AnalyseLayout). Rotate the image
    accordingly, and annotate the angle, readingDirection and textlineOrder.

    Create a corresponding image file, and reference it as AlternativeImage
    in the element. Add the new image file to the workspace with the fileGrp USE
    given in the second position of the output fileGrp, or ``OCR-D-IMG-DESKEW``,
    and an ID based on input file and input element.

    Produce a new output file by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.TesserocrDeskew')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    oplevel = self.parameter['operation_level']
    # one Tesseract API instance is shared across all pages of this run
    with PyTessBaseAPI(
            path=get_tessdata_path(),
            lang="osd",  # osd required for legacy init!
            oem=OEM.TESSERACT_LSTM_COMBINED,  # legacy required for OSD!
            psm=PSM.AUTO_OSD) as tessapi:
        if oplevel == 'line':
            # single lines have little text for OSD; lower the threshold
            tessapi.SetVariable("min_characters_to_try", "15")
        for n, input_file in enumerate(self.input_files):
            file_id = make_file_id(input_file, self.output_file_grp)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            pcgts.set_pcGtsId(file_id)
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id,
                # image must not have been rotated already,
                # (we will overwrite @orientation anyway,)
                # abort if no such image can be produced:
                feature_filter='deskewed' if oplevel == 'page' else '')
            # determine effective DPI: parameter override > image meta-data > unknown (0)
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info(
                    "Page '%s' images will use %d DPI from parameter override",
                    page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    # convert pixels-per-cm to pixels-per-inch
                    dpi = round(dpi * 2.54)
                LOG.info(
                    "Page '%s' images will use %d DPI from image meta-data",
                    page_id, dpi)
            else:
                dpi = 0
                LOG.info(
                    "Page '%s' images will use DPI estimated from segmentation",
                    page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))
            LOG.info("Deskewing on '%s' level in page '%s'", oplevel, page_id)
            if oplevel == 'page':
                self._process_segment(tessapi, page, page_image, page_xywh,
                                      "page '%s'" % page_id, input_file.pageId,
                                      file_id)
            else:
                regions = page.get_AllRegions(classes=['Text', 'Table'])
                if not regions:
                    LOG.warning("Page '%s' contains no text regions", page_id)
                for region in regions:
                    region_image, region_xywh = self.workspace.image_from_segment(
                        region, page_image, page_xywh,
                        # image must not have been rotated already,
                        # (we will overwrite @orientation anyway,)
                        # abort if no such image can be produced:
                        feature_filter='deskewed')
                    if oplevel == 'region':
                        self._process_segment(tessapi, region, region_image, region_xywh,
                                              "region '%s'" % region.id,
                                              input_file.pageId,
                                              file_id + '_' + region.id)
                    elif isinstance(region, TextRegionType):
                        # line level: only text regions carry lines
                        lines = region.get_TextLine()
                        if not lines:
                            LOG.warning(
                                "Page '%s' region '%s' contains no lines",
                                page_id, region.id)
                        for line in lines:
                            line_image, line_xywh = self.workspace.image_from_segment(
                                line, region_image, region_xywh)
                            self._process_segment(
                                tessapi, line, line_image, line_xywh,
                                "line '%s'" % line.id, input_file.pageId,
                                file_id + '_' + region.id + '_' + line.id)
            # serialise the (modified) PAGE hierarchy into the output fileGrp
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp, file_id + '.xml'),
                                    content=to_xml(pcgts))
def process(self):
    """Performs segmentation evaluation with Shapely on the workspace.

    Open and deserialize PAGE input files and their respective images,
    then iterate over the element hierarchy down to the region level.

    Return information on the plausibility of the segmentation into
    regions on the logging level.
    """
    LOG = getLogger('processor.RepairSegmentation')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    sanitize = self.parameter['sanitize']
    plausibilize = self.parameter['plausibilize']
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        #
        # validate segmentation (warn of children extending beyond their parents)
        #
        report = PageValidator.validate(ocrd_page=pcgts,
                                        page_textequiv_consistency='off',
                                        check_baseline=False)
        if not report.is_valid:
            # try to repair coordinate errors in place; re-add only the
            # errors that could not be repaired
            errors = report.errors
            report.errors = []
            for error in errors:
                if isinstance(
                        error,
                        (CoordinateConsistencyError, CoordinateValidityError)):
                    # locate the offending segment by its tag and ID
                    if error.tag == 'Page':
                        element = page.get_Border()
                    elif error.tag.endswith('Region'):
                        element = next(
                            (region for region in page.get_AllRegions()
                             if region.id == error.ID), None)
                    elif error.tag == 'TextLine':
                        element = next((line for region in page.get_AllRegions(
                            classes=['Text']) for line in region.get_TextLine()
                                        if line.id == error.ID), None)
                    elif error.tag == 'Word':
                        element = next((word for region in page.get_AllRegions(
                            classes=['Text']) for line in region.get_TextLine()
                                        for word in line.get_Word()
                                        if word.id == error.ID), None)
                    elif error.tag == 'Glyph':
                        element = next((glyph for region in page.get_AllRegions(
                            classes=['Text']) for line in region.get_TextLine()
                                        for word in line.get_Word()
                                        for glyph in word.get_Glyph()
                                        if glyph.id == error.ID), None)
                    else:
                        LOG.error(
                            "Unrepairable error for unknown segment type: %s",
                            str(error))
                        report.add_error(error)
                        continue
                    if not element:
                        LOG.error(
                            "Unrepairable error for unknown segment element: %s",
                            str(error))
                        report.add_error(error)
                        continue
                    if isinstance(error, CoordinateConsistencyError):
                        # child coords extend beyond parent: clip to parent
                        try:
                            ensure_consistent(element)
                        except Exception as e:
                            LOG.error(str(e))
                            report.add_error(error)
                            continue
                    else:
                        # invalid polygon: repair self-intersections etc.
                        ensure_valid(element)
                    LOG.warning("Fixed %s for %s '%s'",
                                error.__class__.__name__,
                                error.tag, error.ID)
        if not report.is_valid:
            LOG.warning(report.to_xml())
        #
        # plausibilize region segmentation (remove redundant text regions)
        #
        ro = page.get_ReadingOrder()
        if ro:
            rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup()
        else:
            rogroup = None
        mark_for_deletion = list()  # what regions get removed?
        mark_for_merging = dict()  # what regions get merged into which regions?
        # cover recursive region structure (but compare only at the same level)
        parents = list(
            set([
                region.parent_object_
                for region in page.get_AllRegions(classes=['Text'])
            ]))
        for parent in parents:
            regions = parent.get_TextRegion()
            # sort by area to ensure to arrive at a total ordering compatible
            # with the topological sort along containment/equivalence arcs
            # (so we can avoid substituting regions with superregions that have
            # themselves been substituted/deleted):
            RegionPolygon = namedtuple('RegionPolygon', ['region', 'polygon'])
            regionspolys = sorted([
                RegionPolygon(
                    region,
                    Polygon(polygon_from_points(region.get_Coords().points)))
                for region in regions
            ], key=lambda x: x.polygon.area)
            # pairwise comparison of all sibling regions
            for i in range(0, len(regionspolys)):
                for j in range(i + 1, len(regionspolys)):
                    region1 = regionspolys[i].region
                    region2 = regionspolys[j].region
                    poly1 = regionspolys[i].polygon
                    poly2 = regionspolys[j].polygon
                    LOG.debug('Comparing regions "%s" and "%s"',
                              region1.id, region2.id)
                    if poly1.almost_equals(poly2):
                        LOG.warning(
                            'Page "%s" region "%s" is almost equal to "%s" %s',
                            page_id, region2.id, region1.id,
                            '(removing)' if plausibilize else '')
                        mark_for_deletion.append(region2.id)
                    elif poly1.contains(poly2):
                        LOG.warning(
                            'Page "%s" region "%s" is within "%s" %s',
                            page_id, region2.id, region1.id,
                            '(removing)' if plausibilize else '')
                        mark_for_deletion.append(region2.id)
                    elif poly2.contains(poly1):
                        LOG.warning(
                            'Page "%s" region "%s" is within "%s" %s',
                            page_id, region1.id, region2.id,
                            '(removing)' if plausibilize else '')
                        mark_for_deletion.append(region1.id)
                    elif poly1.overlaps(poly2):
                        inter_poly = poly1.intersection(poly2)
                        union_poly = poly1.union(poly2)
                        LOG.debug(
                            'Page "%s" region "%s" overlaps "%s" by %f/%f',
                            page_id, region1.id, region2.id,
                            inter_poly.area / poly1.area,
                            inter_poly.area / poly2.area)
                        if union_poly.convex_hull.area >= poly1.area + poly2.area:
                            # skip this pair -- combined polygon encloses previously free segments
                            pass
                        elif inter_poly.area / poly2.area > self.parameter[
                                'plausibilize_merge_min_overlap']:
                            LOG.warning(
                                'Page "%s" region "%s" is almost within "%s" %s',
                                page_id, region2.id, region1.id,
                                '(merging)' if plausibilize else '')
                            mark_for_merging[region2.id] = region1
                        elif inter_poly.area / poly1.area > self.parameter[
                                'plausibilize_merge_min_overlap']:
                            LOG.warning(
                                'Page "%s" region "%s" is almost within "%s" %s',
                                page_id, region1.id, region2.id,
                                '(merging)' if plausibilize else '')
                            mark_for_merging[region1.id] = region2
                        # TODO: more merging cases...
                        #LOG.info('Intersection %i', poly1.intersects(poly2))
                        #LOG.info('Containment %i', poly1.contains(poly2))
                        #if poly1.intersects(poly2):
                        #    LOG.info('Area 1 %d', poly1.area)
                        #    LOG.info('Area 2 %d', poly2.area)
                        #    LOG.info('Area intersect %d', poly1.intersection(poly2).area)
            if plausibilize:
                # pass the regions sorted (see above)
                _plausibilize_group(regionspolys, rogroup, mark_for_deletion,
                                    mark_for_merging)
        #
        # sanitize region segmentation (shrink to hull of lines)
        #
        if sanitize:
            self.sanitize_page(page, page_id)
        # serialise the (modified) PAGE hierarchy into the output fileGrp
        file_id = make_file_id(input_file, self.output_file_grp)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts))
def process(self):
    """Dewarp the lines of the workspace.

    Open and deserialise PAGE input files and their respective images,
    then iterate over the element hierarchy down to the TextLine level.

    Next, get each line image according to the layout annotation (from
    the alternative image of the line, or by cropping via coordinates
    into the higher-level image), and dewarp it (without resizing).
    Export the result as an image file.

    Add the new image file to the workspace along with the output fileGrp,
    and using a file ID with suffix ``.IMG-DEWARP`` along with further
    identification of the input element.

    Reference each new image in the AlternativeImage of the element.

    Produce a new output file by serialising the resulting hierarchy.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    for (n, input_file) in enumerate(self.input_files):
        self.logger.info("INPUT FILE %i / %s", n,
                         input_file.pageId or input_file.ID)
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID  # (PageType has no id)
        page = pcgts.get_Page()
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id)
        # determine zoom factor relative to 300 DPI:
        # parameter override > image meta-data > assume 1.0
        if self.parameter['dpi'] > 0:
            zoom = 300.0 / self.parameter['dpi']
        elif page_image_info.resolution != 1:
            dpi = page_image_info.resolution
            if page_image_info.resolutionUnit == 'cm':
                # convert pixels-per-cm to pixels-per-inch
                dpi *= 2.54
            self.logger.info('Page "%s" uses %f DPI', page_id, dpi)
            zoom = 300.0 / dpi
        else:
            zoom = 1
        regions = page.get_AllRegions(classes=['Text'],
                                      order='reading-order')
        if not regions:
            self.logger.warning('Page "%s" contains no text regions', page_id)
        for region in regions:
            region_image, region_xywh = self.workspace.image_from_segment(
                region, page_image, page_xywh)
            lines = region.get_TextLine()
            if not lines:
                self.logger.warning('Region %s contains no text lines',
                                    region.id)
            for line in lines:
                line_image, line_xywh = self.workspace.image_from_segment(
                    line, region_image, region_xywh)
                self.logger.info(
                    "About to dewarp page '%s' region '%s' line '%s'",
                    page_id, region.id, line.id)
                try:
                    dew_image = dewarp(
                        line_image,
                        self.lnorm,
                        check=True,
                        max_neighbour=self.parameter['max_neighbour'],
                        zoom=zoom)
                except InvalidLine as err:
                    # unusable line: skip without output
                    self.logger.error('cannot dewarp line "%s": %s',
                                      line.id, err)
                    continue
                except InadequateLine as err:
                    self.logger.warning('cannot dewarp line "%s": %s',
                                        line.id, err)
                    # as a fallback, simply pad the image vertically
                    # (just as dewarping would do on average, so at least
                    # this line has similar margins as the others):
                    dew_image = padvert(line_image, self.parameter['range'])
                # update METS (add the image file):
                file_path = self.workspace.save_image_file(
                    dew_image,
                    file_id + '_' + region.id + '_' + line.id + '.IMG-DEWARP',
                    page_id=input_file.pageId,
                    file_grp=self.output_file_grp)
                # update PAGE (reference the image file):
                # (fixed: dropped unused `alternative_image` local that was
                # fetched via line.get_AlternativeImage() and never read)
                line.add_AlternativeImage(
                    AlternativeImageType(filename=file_path,
                                         comments=line_xywh['features'] +
                                         ',dewarped'))
        # update METS (add the PAGE file):
        file_path = os.path.join(self.output_file_grp, file_id + '.xml')
        pcgts.set_pcGtsId(file_id)
        out = self.workspace.add_file(ID=file_id,
                                      file_grp=self.output_file_grp,
                                      pageId=input_file.pageId,
                                      local_filename=file_path,
                                      mimetype=MIMETYPE_PAGE,
                                      content=to_xml(pcgts))
        self.logger.info('created file ID: %s, file_grp: %s, path: %s',
                         file_id, self.output_file_grp, out.local_filename)
def process(self):
    """Perform OCR recognition with Tesseract on the workspace.

    Open and deserialise PAGE input files and their respective images,
    then iterate over the element hierarchy down to the requested
    ``textequiv_level`` if it exists and ``overwrite_words`` is disabled,
    or to the line level otherwise. In the latter case,
    (remove any existing segmentation below the line level, and)
    create new segmentation below the line level if necessary.

    Set up Tesseract to recognise each segment's image (either from
    AlternativeImage or cropping the bounding box rectangle and masking
    it from the polygon outline) with the appropriate mode and ``model``.

    Put text and confidence results into the TextEquiv at
    ``textequiv_level``, removing any existing TextEquiv.

    Finally, make the higher levels consistent with these results by
    concatenation, ordered as appropriate for its readingDirection,
    textLineOrder, and ReadingOrder, and joined by whitespace, as
    appropriate for the respective level and Relation/join status.

    Produce new output files by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.TesserocrRecognize')
    LOG.debug("TESSDATA: %s, installed Tesseract models: %s", *get_languages())
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    maxlevel = self.parameter['textequiv_level']
    model = get_languages()[1][-1]  # last installed model
    if 'model' in self.parameter:
        model = self.parameter['model']
        # a '+' concatenation of models is allowed, but every
        # component must be installed:
        for sub_model in model.split('+'):
            if sub_model not in get_languages()[1]:
                raise Exception("configured model " + sub_model + " is not installed")
    with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi:
        LOG.info("Using model '%s' in %s for recognition at the %s level",
                 model, get_languages()[0], maxlevel)
        if maxlevel == 'glyph':
            # populate GetChoiceIterator() with LSTM models, too:
            tessapi.SetVariable("lstm_choice_mode", "2")  # aggregate symbols
            tessapi.SetVariable("lstm_choice_iterations", "15")  # squeeze out more best paths
        # TODO: maybe warn/raise when illegal combinations or characters not in the model unicharset?
        if self.parameter['char_whitelist']:
            tessapi.SetVariable("tessedit_char_whitelist", self.parameter['char_whitelist'])
        if self.parameter['char_blacklist']:
            tessapi.SetVariable("tessedit_char_blacklist", self.parameter['char_blacklist'])
        if self.parameter['char_unblacklist']:
            tessapi.SetVariable("tessedit_char_unblacklist", self.parameter['char_unblacklist'])
        # todo: determine relevancy of these variables:
        # tessapi.SetVariable("tessedit_single_match", "0")
        #
        # tessedit_load_sublangs
        # tessedit_preserve_min_wd_len 2
        # tessedit_prefer_joined_punct 0
        # tessedit_write_rep_codes 0
        # tessedit_parallelize 0
        # tessedit_zero_rejection 0
        # tessedit_zero_kelvin_rejection 0
        # tessedit_reject_mode 0
        # tessedit_use_reject_spaces 1
        # tessedit_fix_fuzzy_spaces 1
        # tessedit_char_blacklist
        # tessedit_char_whitelist
        # chs_leading_punct ('`"
        # chs_trailing_punct1 ).,;:?!
        # chs_trailing_punct2 )'`"
        # numeric_punctuation .,
        # unrecognised_char |
        # ok_repeated_ch_non_alphanum_wds -?*=
        # conflict_set_I_l_1 Il1[]
        # preserve_interword_spaces 0
        # tessedit_enable_dict_correction 0
        # tessedit_enable_bigram_correction 1
        # stopper_smallword_size 2
        # wordrec_max_join_chunks 4
        # suspect_space_level 100
        # suspect_short_words 2
        # language_model_ngram_on 0
        # language_model_ngram_order 8
        # language_model_min_compound_length 3
        # language_model_penalty_non_freq_dict_word 0.1
        # language_model_penalty_non_dict_word 0.15
        # language_model_penalty_punc 0.2
        # language_model_penalty_case 0.1
        # language_model_penalty_script 0.5
        # language_model_penalty_chartype 0.3
        # language_model_penalty_spacing 0.05
        # textord_max_noise_size 7
        # enable_noise_removal 1
        # classify_bln_numeric_mode 0
        # lstm_use_matrix 1
        # user_words_file
        # user_patterns_file
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id)
            # determine effective DPI: parameter override > image meta-data > unknown (0)
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                # (fixed typo: "paramter" -> "parameter")
                LOG.info("Page '%s' images will use %d DPI from parameter override",
                         page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
                LOG.info("Page '%s' images will use %d DPI from image meta-data",
                         page_id, dpi)
            else:
                dpi = 0
                LOG.info("Page '%s' images will use DPI estimated from segmentation",
                         page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))
            LOG.info("Processing page '%s'", page_id)
            # top-level text regions, plus text regions nested in tables;
            # materialise to a list: a bare itertools.chain iterator is
            # always truthy, so the emptiness check below would never fire
            regions = list(itertools.chain.from_iterable(
                [page.get_TextRegion()] +
                [subregion.get_TextRegion() for subregion in page.get_TableRegion()]))
            if not regions:
                LOG.warning("Page '%s' contains no text regions", page_id)
            else:
                self._process_regions(tessapi, regions, page_image, page_xywh)
            # propagate results up the hierarchy (line->region->page etc.)
            page_update_higher_textequiv_levels(maxlevel, pcgts)
            # serialise the (modified) PAGE hierarchy into the output fileGrp
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                content=to_xml(pcgts))
def process(self):
    """Segment pages into regions+lines, tables into cells+lines, or regions into lines.

    Open and deserialise PAGE input files and their respective images,
    then iterate over the element hierarchy down to the requested level.

    Depending on ``level-of-operation``, consider existing segments:
    - If ``overwrite_separators=True`` on ``page`` level, then
      delete any SeparatorRegions.
    - If ``overwrite_regions=True`` on ``page`` level, then
      delete any top-level TextRegions (along with ReadingOrder).
    - If ``overwrite_regions=True`` on ``table`` level, then
      delete any TextRegions in TableRegions (along with their OrderGroup).
    - If ``overwrite_lines=True`` on ``region`` level, then
      delete any TextLines in TextRegions.
    - If ``overwrite_order=True`` on ``page`` or ``table`` level, then
      delete the reading order OrderedGroup entry corresponding
      to the (page/table) segment.

    Next, get each element image according to the layout annotation (from
    the alternative image of the page/region, or by cropping via coordinates
    into the higher-level image) in binarized form, and represent it as an
    array with non-text regions and (remaining) text neighbours suppressed.

    Then compute a text line segmentation for that array (as a label mask).

    When ``level-of-operation`` is ``page`` or ``table``, this also entails
    detecting
    - up to ``maximages`` large foreground images,
    - up to ``maxseps`` foreground h/v-line separators and
    - up to ``maxcolseps`` background column separators
    before text line segmentation itself, as well as aggregating text lines
    to text regions afterwards.

    Text regions are detected via a hybrid variant recursive X-Y cut
    algorithm (RXYC): RXYC partitions the binarized image in top-down
    manner by detecting horizontal or vertical gaps. This implementation
    uses the bottom-up text line segmentation to guide the search, and
    also uses both pre-existing and newly detected separators to
    alternatively partition the respective boxes into non-rectangular parts.

    During line segmentation, suppress the foreground of all previously
    annotated regions (of any kind) and lines, except if just removed due
    to ``overwrite``. During region aggregation however, combine the
    existing separators with the new-found separators to guide the column
    search.

    All detected segments (both text line and text region) are sorted
    according to their reading order (assuming a top-to-bottom,
    left-to-right ordering). When ``level-of-operation`` is ``page``,
    prefer vertical (column-first) succession of regions. When it is
    ``table``, prefer horizontal (row-first) succession of cells.

    Then for each resulting segment label, convert its background mask into
    polygon outlines by finding the outer contours consistent with the
    element's polygon outline. Annotate the result by adding it as a new
    TextLine/TextRegion:
    - If ``level-of-operation`` is ``region``, then append the new lines to
      the parent region.
    - If it is ``table``, then append the new lines to their respective
      regions, and append the new regions to the parent table.
      (Also, create an OrderedGroup for it as the parent's RegionRef.)
    - If it is ``page``, then append the new lines to their respective
      regions, and append the new regions to the page.
      (Also, create an OrderedGroup for it in the ReadingOrder.)

    Produce a new output file by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.OcropySegment')
    # FIXME: allow passing a-priori info on reading order / textline order
    # (and then pass on as ``bt`` and ``rl``; however, there may be a mixture
    # of different scripts; also, vertical writing needs internal rotation
    # because our line segmentation only works for horizontal writing)
    overwrite_lines = self.parameter['overwrite_lines']
    overwrite_regions = self.parameter['overwrite_regions']
    overwrite_separators = self.parameter['overwrite_separators']
    overwrite_order = self.parameter['overwrite_order']
    oplevel = self.parameter['level-of-operation']
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    for (n, input_file) in enumerate(self.input_files):
        LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID  # (PageType has no id)
        page = pcgts.get_Page()
        # TODO: also allow grayscale_normalized (try/except?)
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_selector='binarized')
        # determine zoom factor relative to 300 DPI:
        # parameter override > image meta-data > assume 1.0
        if self.parameter['dpi'] > 0:
            zoom = 300.0 / self.parameter['dpi']
        elif page_image_info.resolution != 1:
            dpi = page_image_info.resolution
            if page_image_info.resolutionUnit == 'cm':
                dpi *= 2.54
            LOG.info('Page "%s" uses %f DPI', page_id, dpi)
            zoom = 300.0 / dpi
        else:
            zoom = 1
        # aggregate existing regions so their foreground can be ignored
        ignore = (page.get_ImageRegion() + page.get_LineDrawingRegion() +
                  page.get_GraphicRegion() + page.get_ChartRegion() +
                  page.get_MapRegion() + page.get_MathsRegion() +
                  page.get_ChemRegion() + page.get_MusicRegion() +
                  page.get_AdvertRegion() + page.get_NoiseRegion() +
                  page.get_UnknownRegion() + page.get_CustomRegion())
        if oplevel == 'page' and overwrite_separators:
            page.set_SeparatorRegion([])
        else:
            ignore.extend(page.get_SeparatorRegion())
        # prepare reading order
        reading_order = dict()
        ro = page.get_ReadingOrder()
        if ro:
            rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup()
            if rogroup:
                page_get_reading_order(reading_order, rogroup)
        # get segments to process / overwrite
        if oplevel == 'page':
            ignore.extend(page.get_TableRegion())
            regions = list(page.get_TextRegion())
            if regions:
                # page is already region-segmented
                if overwrite_regions:
                    LOG.info('removing existing TextRegions in page "%s"',
                             page_id)
                    # we could remove all other region types as well,
                    # but this is more flexible (for workflows with
                    # specialized separator/image/table detectors):
                    page.set_TextRegion([])
                    page.set_ReadingOrder(None)
                    ro = None
                else:
                    LOG.warning('keeping existing TextRegions in page "%s"',
                                page_id)
                    ignore.extend(regions)
            # create reading order if necessary
            if not ro or overwrite_order:
                ro = ReadingOrderType()
                page.set_ReadingOrder(ro)
            rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup()
            if not rogroup:
                # new top-level group
                rogroup = OrderedGroupType(id="reading-order")
                ro.set_OrderedGroup(rogroup)
            # go get TextRegions with TextLines (and SeparatorRegions):
            self._process_element(page, ignore, page_image, page_coords,
                                  page_id, file_id, zoom, rogroup=rogroup)
        elif oplevel == 'table':
            ignore.extend(page.get_TextRegion())
            regions = list(page.get_TableRegion())
            if not regions:
                LOG.warning('Page "%s" contains no table regions', page_id)
            for region in regions:
                subregions = region.get_TextRegion()
                if subregions:
                    # table is already cell-segmented
                    if overwrite_regions:
                        LOG.info('removing existing TextRegions in table "%s"',
                                 region.id)
                        region.set_TextRegion([])
                        roelem = reading_order.get(region.id)
                        # replace by empty group with same index and ref
                        # (which can then take the cells as subregions)
                        reading_order[region.id] = page_subgroup_in_reading_order(
                            roelem)
                    else:
                        LOG.warning('skipping table "%s" with existing TextRegions',
                                    region.id)
                        continue
                # TODO: also allow grayscale_normalized (try/except?)
                region_image, region_coords = self.workspace.image_from_segment(
                    region, page_image, page_coords,
                    feature_selector='binarized')
                # ignore everything but the current table region
                subignore = regions + ignore
                subignore.remove(region)
                # create reading order group if necessary
                roelem = reading_order.get(region.id)
                if not roelem:
                    LOG.warning(
                        "Page '%s' table region '%s' is not referenced in reading order (%s)",
                        page_id, region.id, "no target to add cells to")
                elif overwrite_order:
                    # replace by empty ordered group with same (index and) ref
                    # (which can then take the cells as subregions)
                    roelem = page_subgroup_in_reading_order(roelem)
                    reading_order[region.id] = roelem
                elif isinstance(roelem,
                                (OrderedGroupType, OrderedGroupIndexedType)):
                    LOG.warning(
                        "Page '%s' table region '%s' already has an ordered group (%s)",
                        page_id, region.id, "cells will be appended")
                elif isinstance(roelem,
                                (UnorderedGroupType, UnorderedGroupIndexedType)):
                    LOG.warning(
                        "Page '%s' table region '%s' already has an unordered group (%s)",
                        page_id, region.id, "cells will not be appended")
                    roelem = None
                else:
                    # replace regionRef(Indexed) by group with same index and ref
                    # (which can then take the cells as subregions)
                    roelem = page_subgroup_in_reading_order(roelem)
                    reading_order[region.id] = roelem
                # go get TextRegions with TextLines (and SeparatorRegions)
                self._process_element(region, subignore, region_image,
                                      region_coords, region.id,
                                      file_id + '_' + region.id, zoom,
                                      rogroup=roelem)
        else:  # 'region'
            regions = list(page.get_TextRegion())
            # besides top-level text regions, line-segment any table cells,
            # and for tables without any cells, add a pseudo-cell
            for region in page.get_TableRegion():
                subregions = region.get_TextRegion()
                if subregions:
                    regions.extend(subregions)
                else:
                    subregion = TextRegionType(
                        id=region.id + '_text',
                        Coords=region.get_Coords(),
                        # as if generated from parser:
                        parent_object_=region)
                    region.add_TextRegion(subregion)
                    regions.append(subregion)
            if not regions:
                LOG.warning('Page "%s" contains no text regions', page_id)
            for region in regions:
                if region.get_TextLine():
                    if overwrite_lines:
                        LOG.info(
                            'removing existing TextLines in page "%s" region "%s"',
                            page_id, region.id)
                        region.set_TextLine([])
                    else:
                        LOG.warning(
                            'keeping existing TextLines in page "%s" region "%s"',
                            page_id, region.id)
                        ignore.extend(region.get_TextLine())
                # TODO: also allow grayscale_normalized (try/except?)
                region_image, region_coords = self.workspace.image_from_segment(
                    region, page_image, page_coords,
                    feature_selector='binarized')
                # go get TextLines
                self._process_element(region, ignore, region_image,
                                      region_coords, region.id,
                                      file_id + '_' + region.id, zoom)
        # update METS (add the PAGE file):
        file_path = os.path.join(self.output_file_grp, file_id + '.xml')
        pcgts.set_pcGtsId(file_id)
        out = self.workspace.add_file(ID=file_id,
                                      file_grp=self.output_file_grp,
                                      pageId=input_file.pageId,
                                      local_filename=file_path,
                                      mimetype=MIMETYPE_PAGE,
                                      content=to_xml(pcgts))
        LOG.info('created file ID: %s, file_grp: %s, path: %s',
                 file_id, self.output_file_grp, out.local_filename)
def process(self):
    """Performs binarization of the region / line with Tesseract on the workspace.

    Open and deserialize PAGE input files and their respective images,
    then iterate over the element hierarchy down to the requested level.

    Set up Tesseract to recognize the segment image's layout, and get
    the binarized image. Create an image file, and reference it as
    AlternativeImage in the segment element.

    Add the new image file to the workspace along with the output fileGrp,
    and using a file ID with suffix ``.IMG-BIN`` along with further
    identification of the input element.

    Produce a new output file by serialising the resulting hierarchy.
    """
    # fixed: LOG was referenced below but never defined in this method
    # (every sibling process() defines its own logger the same way)
    LOG = getLogger('processor.TesserocrBinarize')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    oplevel = self.parameter['operation_level']
    with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        for n, input_file in enumerate(self.input_files):
            file_id = make_file_id(input_file, self.output_file_grp)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            # add metadata about this operation and its runtime parameters:
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))
            page_image, page_xywh, _ = self.workspace.image_from_page(
                page, page_id)
            LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id)
            regions = page.get_TextRegion() + page.get_TableRegion()
            if not regions:
                LOG.warning("Page '%s' contains no text regions", page_id)
            for region in regions:
                region_image, region_xywh = self.workspace.image_from_segment(
                    region, page_image, page_xywh)
                if oplevel == 'region':
                    tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
                    self._process_segment(tessapi, RIL.BLOCK, region,
                                          region_image, region_xywh,
                                          "region '%s'" % region.id,
                                          input_file.pageId,
                                          file_id + '_' + region.id)
                elif isinstance(region, TextRegionType):
                    # line level: only text regions carry lines
                    lines = region.get_TextLine()
                    if not lines:
                        LOG.warning(
                            "Page '%s' region '%s' contains no text lines",
                            page_id, region.id)
                    for line in lines:
                        line_image, line_xywh = self.workspace.image_from_segment(
                            line, region_image, region_xywh)
                        tessapi.SetPageSegMode(PSM.SINGLE_LINE)
                        self._process_segment(
                            tessapi, RIL.TEXTLINE, line, line_image,
                            line_xywh, "line '%s'" % line.id,
                            input_file.pageId,
                            file_id + '_' + region.id + '_' + line.id)
            # serialise the (modified) PAGE hierarchy into the output fileGrp
            # (fixed: dropped redundant second make_file_id() call with
            # identical arguments — file_id is already computed above)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, model):
    """Separate text from non-text pixels in ``page_image`` and save both parts.

    If ``model`` is given, classify pixels with the neural model (resizing
    the page to the model's fixed input size and back); otherwise use the
    morphological seed-fill heuristic. Save the non-text part (suffix
    ``_img``) and the binarized text part (suffix ``_txt``) as derived
    images in the output fileGrp, referencing each as AlternativeImage
    on ``page``.
    """
    LOG = getLogger('OcrdAnybaseocrTiseg')

    if model:
        # resize to the model's fixed input size and add a batch axis
        I = ocrolib.pil2array(
            page_image.resize((800, 1024), Image.ANTIALIAS))
        I = np.array(I)[np.newaxis, :, :, :]
        LOG.info('I shape %s', I.shape)
        if len(I.shape) < 3:
            # NOTE(review): after np.newaxis this is always 4-d, so this
            # only guards against a degenerate input — confirm intent
            LOG.error('Wrong input shape. Image should have 3 channels')

        # prediction: per-pixel class map (0=bg, 1=text, 2=image)
        out = model.predict(I)
        out = out.reshape((2048, 1600, 3)).argmax(axis=2)

        # build inverted (white background) masks per class
        text_part = np.ones(out.shape)
        text_part[np.where(out == 1)] = 0

        image_part = np.ones(out.shape)
        image_part[np.where(out == 2)] = 0

        image_part = array(255 * image_part, 'B')
        image_part = ocrolib.array2pil(image_part)

        text_part = array(255 * text_part, 'B')
        text_part = ocrolib.array2pil(text_part)

        # resize both masks back to the original page size
        text_part = text_part.resize(page_image.size, Image.BICUBIC)
        image_part = image_part.resize(page_image.size, Image.BICUBIC)
    else:
        I = ocrolib.pil2array(page_image)

        if len(I.shape) > 2:
            I = np.mean(I, 2)
        # normalize and invert (ink becomes high values)
        I = 1 - I / I.max()
        rows, cols = I.shape

        # Generate Mask and Seed Images
        Imask, Iseed = self.pixMorphSequence_mask_seed_fill_holes(I)

        # Iseedfill: Union of Mask and Seed Images
        Iseedfill = self.pixSeedfillBinary(Imask, Iseed)

        # Dilation of Iseedfill
        mask = ones((3, 3))
        Iseedfill = ndimage.binary_dilation(Iseedfill, mask)

        # Expansion of Iseedfill to become equal in size of I
        Iseedfill = self.expansion(Iseedfill, (rows, cols))

        # Write Text and Non-Text images
        image_part = array((1 - I * Iseedfill), dtype=int)
        text_part = array((1 - I * (1 - Iseedfill)), dtype=int)

        # FIX: the original referenced an undefined name (`img_part`) and
        # then thresholded the already-converted PIL text image; each
        # array must be thresholded against its own midrange *before*
        # conversion to PIL.
        bin_array = array(255 * (text_part > ocrolib.midrange(text_part)), 'B')
        text_part = ocrolib.array2pil(bin_array)
        bin_array = array(255 * (image_part > ocrolib.midrange(image_part)), 'B')
        image_part = ocrolib.array2pil(bin_array)

    file_id = make_file_id(input_file, self.output_file_grp)
    # save the non-text (image) part first, marked as such
    file_path = self.workspace.save_image_file(
        image_part,
        file_id + "_img",
        page_id=page_id,
        file_grp=self.output_file_grp,
    )
    page.add_AlternativeImage(
        AlternativeImageType(filename=file_path,
                             comments=page_xywh['features'] + ',non_text'))

    page_xywh['features'] += ',clipped'
    # then the text part, carrying the updated feature set
    file_path = self.workspace.save_image_file(
        text_part,
        file_id + "_txt",
        page_id=page_id,
        file_grp=self.output_file_grp,
    )
    page.add_AlternativeImage(
        AlternativeImageType(filename=file_path,
                             comments=page_xywh['features']))