def _split_word_at_glyph(word, glyph):
    """Split `word` into two new words left and right of `glyph`.

    The glyph itself is dropped: the left word gets the glyphs before it,
    the right word the glyphs after it. Both new words inherit language
    and TextStyle from `word`; TextEquiv is left empty (it will be
    overwritten by page_update_higher_textequiv_levels).

    Returns the pair ``(prev_, next_)`` of new WordType objects.
    """
    prev_ = WordType(id=word.id + '_l')
    next_ = WordType(id=word.id + '_r')
    xywh_glyph = xywh_from_points(glyph.get_Coords().points)
    xywh_word = xywh_from_points(word.get_Coords().points)
    # left part: from the word's left edge up to the glyph's left edge
    xywh_prev = xywh_word.copy()
    xywh_prev.update({'w': xywh_glyph['x'] - xywh_word['x']})
    prev_.set_Coords(CoordsType(points=points_from_xywh(xywh_prev)))
    # right part: starts at the glyph's RIGHT edge (x + w, not x - w),
    # and its width must exclude both the left part and the glyph itself,
    # so that prev_ + glyph + next_ exactly tile the original word box
    xywh_next = xywh_word.copy()
    xywh_next.update({
        'x': xywh_glyph['x'] + xywh_glyph['w'],
        'w': xywh_word['w'] - xywh_prev['w'] - xywh_glyph['w']
    })
    next_.set_Coords(CoordsType(points=points_from_xywh(xywh_next)))
    if word.get_language():
        prev_.set_language(word.get_language())
        next_.set_language(word.get_language())
    if word.get_TextStyle():
        prev_.set_TextStyle(word.get_TextStyle())
        next_.set_TextStyle(word.get_TextStyle())
    # distribute the glyphs around the split point (the glyph at `pos`
    # is intentionally discarded):
    glyphs = word.get_Glyph()
    pos = glyphs.index(glyph)
    prev_.set_Glyph(glyphs[0:pos])
    next_.set_Glyph(glyphs[pos + 1:])
    # TextEquiv: will be overwritten by page_update_higher_textequiv_levels
    return prev_, next_
def _process_existing_glyphs(self, glyphs, tessapi):
    """Recognize each pre-segmented glyph individually with Tesseract.

    For every glyph, restricts the API to the glyph's bounding box in
    SINGLE_CHAR mode, clears any pre-existing TextEquiv, and annotates
    one TextEquiv per symbol choice from the ChoiceIterator, until the
    confidence gap or the number of choices exceeds the configured
    thresholds.
    """
    for glyph in glyphs:
        # NOTE(review): message says "in word" but logs the glyph id — confirm intended
        log.debug("Recognizing glyph in word '%s'", glyph.id)
        glyph_xywh = xywh_from_points(glyph.get_Coords().points)
        # order matters: rectangle and page-segmentation mode must be set
        # before querying recognition results below
        tessapi.SetRectangle(glyph_xywh['x'], glyph_xywh['y'],
                             glyph_xywh['w'], glyph_xywh['h'])
        tessapi.SetPageSegMode(PSM.SINGLE_CHAR)
        if glyph.get_TextEquiv():
            # replace any previous annotation with our fresh results:
            log.warning("Glyph '%s' already contained text results", glyph.id)
            glyph.set_TextEquiv([])
        #glyph_text = tessapi.GetUTF8Text().rstrip("\n\f")
        # AllWordConfidences returns percentages; normalize to [0,1]
        glyph_conf = tessapi.AllWordConfidences()
        glyph_conf = glyph_conf[0] / 100.0 if glyph_conf else 0.0
        #log.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf)
        result_it = tessapi.GetIterator()
        if not result_it or result_it.Empty(RIL.SYMBOL):
            log.error("No glyph here")
            continue
        choice_it = result_it.GetChoiceIterator()
        for (choice_no, choice) in enumerate(choice_it):
            alternative_text = choice.GetUTF8Text()
            alternative_conf = choice.Confidence() / 100
            #log.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf)
            # stop adding alternatives once they fall too far below the best
            # confidence, or once there are too many of them:
            if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF or
                    choice_no > CHOICE_THRESHOLD_NUM):
                break
            # todo: consider SymbolIsSuperscript (TextStyle), SymbolIsDropcap (RelationType) etc
            glyph.add_TextEquiv(
                TextEquivType(index=choice_no,
                              Unicode=alternative_text,
                              conf=alternative_conf))
def _process_existing_words(self, words, maxlevel, tessapi):
    """Recognize text for pre-segmented words with Tesseract.

    For every word, restricts the API to the word's bounding box in
    SINGLE_WORD mode. If `maxlevel` is 'word', annotates a single
    TextEquiv and stops there; otherwise descends to the glyph level,
    either via existing glyph segments or via Tesseract's own layout.
    """
    for word in words:
        log.debug("Recognizing text in word '%s'", word.id)
        word_xywh = xywh_from_points(word.get_Coords().points)
        # order matters: rectangle and page-segmentation mode must be set
        # before querying recognition results below
        tessapi.SetRectangle(word_xywh['x'], word_xywh['y'],
                             word_xywh['w'], word_xywh['h'])
        tessapi.SetPageSegMode(PSM.SINGLE_WORD)
        if maxlevel == 'word':
            word_text = tessapi.GetUTF8Text().rstrip("\n\f")
            # AllWordConfidences returns percentages; normalize to [0,1]
            word_conf = tessapi.AllWordConfidences()
            word_conf = word_conf[0] / 100.0 if word_conf else 0.0
            if word.get_TextEquiv():
                # replace any previous annotation with our fresh results:
                log.warning("Word '%s' already contained text results", word.id)
                word.set_TextEquiv([])
            # todo: consider WordFontAttributes (TextStyle) etc (if not word.get_TextStyle())
            word.add_TextEquiv(
                TextEquivType(Unicode=word_text, conf=word_conf))
            continue # next word (to avoid indentation below)
        ## glyph level:
        glyphs = word.get_Glyph()
        if glyphs:
            ## external glyph layout:
            log.warning("Word '%s' contains glyphs already, recognition might be suboptimal",
                        word.id)
            self._process_existing_glyphs(glyphs, tessapi)
        else:
            ## internal glyph layout:
            tessapi.Recognize()
            self._process_glyphs_in_word(word, tessapi.GetIterator())
def _process_lines(self, textlines, maxlevel, tessapi):
    """Recognize text for pre-segmented lines with Tesseract.

    For every line, restricts the API to the line's bounding box in
    SINGLE_LINE mode. If `maxlevel` is 'line', annotates a single
    TextEquiv and stops there; otherwise descends to the word level,
    either via existing word segments or via Tesseract's own layout.
    """
    for line in textlines:
        log.debug("Recognizing text in line '%s'", line.id)
        line_xywh = xywh_from_points(line.get_Coords().points)
        # log.debug("xywh: %s", line_xywh)
        tessapi.SetRectangle(line_xywh['x'], line_xywh['y'],
                             line_xywh['w'], line_xywh['h'])
        tessapi.SetPageSegMode(
            PSM.SINGLE_LINE
        )  # RAW_LINE fails with Tesseract 3 models and is worse with Tesseract 4 models
        if maxlevel == 'line':
            line_text = tessapi.GetUTF8Text().rstrip("\n\f")
            # MeanTextConf returns a percentage; normalize to [0,1]
            line_conf = tessapi.MeanTextConf(
            ) / 100.0  # iterator scores are arithmetic averages, too
            if line.get_TextEquiv():
                # replace any previous annotation with our fresh results:
                log.warning("Line '%s' already contained text results", line.id)
                line.set_TextEquiv([])
            # todo: consider BlankBeforeWord, SetLineSeparator
            line.add_TextEquiv(
                TextEquivType(Unicode=line_text, conf=line_conf))
            continue # next line (to avoid indentation below)
        ## word, or glyph level:
        words = line.get_Word()
        if words:
            ## external word layout:
            log.warning("Line '%s' contains words already, recognition might be suboptimal",
                        line.id)
            self._process_existing_words(words, maxlevel, tessapi)
        else:
            ## internal word and glyph layout:
            tessapi.Recognize()
            self._process_words_in_line(line, maxlevel, tessapi.GetIterator())
def test_xywh_from_points(self):
    """An axis-aligned rectangle yields its top-left corner and extent."""
    points = '100,100 200,100 200,200 100,200'
    expected = dict(x=100, y=100, w=100, h=100)
    self.assertEqual(xywh_from_points(points), expected)
def process(self):
    """
    Performs word segmentation within the existing text lines.

    For each input PAGE file, crops every text line from the page image,
    lets Tesseract (SINGLE_LINE mode) find word components inside it,
    converts the component boxes back into absolute page coordinates,
    and adds them as Word elements. Writes the result as a new PAGE file
    into the output file group.
    """
    with PyTessBaseAPI(
            psm=PSM.SINGLE_LINE,
            path=TESSDATA_PREFIX,
    ) as tessapi:
        for (n, input_file) in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))
            image_url = pcgts.get_Page().imageFilename
            for region in pcgts.get_Page().get_TextRegion():
                for line in region.get_TextLine():
                    log.debug("Detecting words in line '%s'", line.id)
                    # crop the line polygon from the page image:
                    image = self.workspace.resolve_image_as_pil(
                        image_url,
                        polygon_from_points(line.get_Coords().points))
                    tessapi.SetImage(image)
                    # component coordinates are relative to the crop;
                    # shift them back by the line's top-left corner:
                    offset = xywh_from_points(line.get_Coords().points)
                    for (word_no, component) in enumerate(
                            tessapi.GetComponentImages(RIL.WORD, True)):
                        word_id = '%s_word%04d' % (line.id, word_no)
                        word_xywh = component[1]
                        word_xywh['x'] += offset['x']
                        word_xywh['y'] += offset['y']
                        line.add_Word(
                            WordType(id=word_id,
                                     Coords=CoordsType(
                                         points_from_xywh(word_xywh))))
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                local_filename='%s/%s' % (self.output_file_grp, ID),
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts).encode('utf-8'),
            )
def _split_word_at_space(word):
    """Split a word into two at the first space of its first TextEquiv.

    The word's bounding box is partitioned horizontally in proportion to
    the character position of the space; the text (minus the space) and
    confidence are distributed onto the two halves. Language and
    TextStyle are inherited; Glyphs are not copied (irrelevant at this
    processing level). Returns the pair ``(left, right)``.
    """
    left = WordType(id=word.id + '_l')
    right = WordType(id=word.id + '_r')
    box = xywh_from_points(word.get_Coords().points)
    textequiv = word.get_TextEquiv()[0]
    pos = textequiv.Unicode.index(" ")
    fract = pos / len(textequiv.Unicode)
    # left half: proportional share of the width, same origin:
    box_left = box.copy()
    box_left['w'] = box['w'] * fract
    left.set_Coords(CoordsType(points=points_from_xywh(box_left)))
    # right half: starts where the left half ends, takes the rest:
    box_right = box.copy()
    box_right['x'] = box['x'] + box['w'] * fract
    box_right['w'] = box['w'] * (1 - fract)
    right.set_Coords(CoordsType(points=points_from_xywh(box_right)))
    for part in (left, right):
        if word.get_language():
            part.set_language(word.get_language())
        if word.get_TextStyle():
            part.set_TextStyle(word.get_TextStyle())
    # Glyphs: irrelevant at this processing level
    left.set_TextEquiv([TextEquivType(Unicode=textequiv.Unicode[0:pos],
                                      conf=textequiv.conf)])
    right.set_TextEquiv([TextEquivType(Unicode=textequiv.Unicode[pos + 1:],
                                       conf=textequiv.conf)])
    return left, right
def process(self):
    """
    Performs the line segmentation.

    For each input PAGE file, crops every text region from the page
    image, lets Tesseract find text line components inside it, converts
    the component boxes back into absolute page coordinates, and adds
    them as TextLine elements. Writes the result as a new PAGE file into
    the output file group.
    """
    with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        for (n, input_file) in enumerate(self.input_files):
            pcgts = page_from_file(
                self.workspace.download_file(input_file))
            image_url = pcgts.get_Page().imageFilename
            for region in pcgts.get_Page().get_TextRegion():
                log.debug("Detecting lines in %s with tesseract", region.id)
                # crop the region polygon from the page image:
                image = self.workspace.resolve_image_as_pil(
                    image_url,
                    polygon_from_points(region.get_Coords().points))
                tessapi.SetImage(image)
                # component coordinates are relative to the crop;
                # shift them back by the region's top-left corner:
                offset = xywh_from_points(region.get_Coords().points)
                for (line_no, component) in enumerate(
                        tessapi.GetComponentImages(RIL.TEXTLINE, True)):
                    line_id = '%s_line%04d' % (region.id, line_no)
                    line_xywh = component[1]
                    line_xywh['x'] += offset['x']
                    line_xywh['y'] += offset['y']
                    line_points = points_from_xywh(line_xywh)
                    region.add_TextLine(
                        TextLineType(id=line_id,
                                     Coords=CoordsType(line_points)))
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                mimetype=MIMETYPE_PAGE,
                local_filename='%s/%s' % (self.output_file_grp, ID),
                content=to_xml(pcgts).encode('utf-8'),
            )
def test_xywh_from_points_unordered(self):
    """Points given in arbitrary order still yield the overall bounding box."""
    points = '500,500 100,100 200,100 200,200 100,200'
    expected = dict(x=100, y=100, w=400, h=400)
    self.assertEqual(xywh_from_points(points), expected)
def _process_regions(self, regions, maxlevel, tessapi):
    """Recognize text for pre-segmented regions with Tesseract.

    If `maxlevel` is 'region', restricts the API to the region's
    bounding box in SINGLE_BLOCK mode and annotates a single TextEquiv;
    otherwise descends into the region's existing text lines.
    """
    for region in regions:
        log.debug("Recognizing text in region '%s'", region.id)
        # todo: determine if and how this can still be used for region classification:
        # result_it = tessapi.GetIterator()
        # if not result_it or result_it.Empty(RIL.BLOCK)
        # ptype = result_it.BlockType()
        # PT.UNKNOWN
        # PT.FLOWING_TEXT
        # PT.HEADING_TEXT
        # PT.PULLOUT_TEXT
        # PT.EQUATION
        # PT.TABLE
        # PT.VERTICAL_TEXT
        # PT.CAPTION_TEXT
        # PT.HORZ_LINE
        # PT.VERT_LINE
        # PT.NOISE
        # PT.COUNT
        # ...
        if maxlevel == 'region':
            region_xywh = xywh_from_points(region.get_Coords().points)
            tessapi.SetRectangle(region_xywh['x'], region_xywh['y'],
                                 region_xywh['w'], region_xywh['h'])
            tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
            region_text = tessapi.GetUTF8Text().rstrip("\n\f")
            # MeanTextConf returns a percentage; normalize to [0,1]
            region_conf = tessapi.MeanTextConf(
            ) / 100.0  # iterator scores are arithmetic averages, too
            if region.get_TextEquiv():
                # replace any previous annotation with our fresh results:
                log.warning("Region '%s' already contained text results",
                            region.id)
                region.set_TextEquiv([])
            # todo: consider SetParagraphSeparator
            region.add_TextEquiv(
                TextEquivType(Unicode=region_text, conf=region_conf))
            continue # next region (to avoid indentation below)
        ## line, word, or glyph level:
        textlines = region.get_TextLine()
        if not textlines:
            log.warning("Region '%s' contains no text lines", region.id)
        else:
            self._process_lines(textlines, maxlevel, tessapi)
def _merge_words(prev_, next_):
    """Merge two adjacent words into one new word.

    The merged word's id concatenates both ids with a dot, its Coords
    cover the union of both bounding boxes, and its glyph lists are
    concatenated. Language and TextStyle are inherited from the first
    word only. The first TextEquiv strings are concatenated and their
    confidences multiplied. Returns the merged WordType.
    """
    merged = WordType(id=prev_.id + '.' + next_.id)
    # union bounding box from the concatenated point lists:
    joint_points = prev_.get_Coords().points + ' ' + next_.get_Coords().points
    merged.set_Coords(
        CoordsType(points=points_from_xywh(xywh_from_points(joint_points))))
    language = prev_.get_language()
    if language:
        merged.set_language(language)
    style = prev_.get_TextStyle()
    if style:
        merged.set_TextStyle(style)
    if prev_.get_Glyph() or next_.get_Glyph():
        merged.set_Glyph(prev_.get_Glyph() + next_.get_Glyph())
    # seed with the first word's text, or an empty full-confidence stub:
    first_equivs = prev_.get_TextEquiv()
    merged.set_TextEquiv(
        first_equivs if first_equivs else [TextEquivType(Unicode='', conf=1.0)])
    # append the second word's text, multiplying confidences:
    second_equivs = next_.get_TextEquiv()
    if second_equivs:
        joint_equiv = merged.get_TextEquiv()[0]
        extra_equiv = second_equivs[0]
        joint_equiv.Unicode += extra_equiv.Unicode
        if joint_equiv.conf and extra_equiv.conf:
            joint_equiv.conf *= extra_equiv.conf
    return merged
def image_from_segment(workspace, segment, parent_image, parent_xywh):
    """Extract a segment image from its parent's image.

    Given a PIL.Image of the parent, `parent_image`, and its absolute
    coordinates, `parent_xywh`, and a PAGE segment (TextRegion / TextLine /
    Word / Glyph) object logically contained in it, `segment`, extract its
    PIL.Image from AlternativeImage (if it exists), or via cropping from
    `parent_image`.

    When cropping, respect any orientation angle annotated for the parent
    (from parent-level deskewing) by compensating the segment coordinates
    in an inverse transformation (translation to center, rotation,
    re-translation). Also, mind the difference between annotated and
    actual size of the parent (usually from deskewing), by a respective
    offset into the image. Cropping uses a polygon mask (not just the
    rectangle).

    When cropping, respect any orientation angle annotated for the
    segment (from segment-level deskewing) by rotating the cropped image,
    respectively.

    If the resulting segment image is larger than the bounding box of
    `segment`, pass down the segment's box coordinates with an offset of
    half the width/height difference.

    Return the extracted image, and the absolute coordinates of the
    segment's bounding box (for passing down).
    """
    segment_xywh = xywh_from_points(segment.get_Coords().points)
    # not all PAGE segment types have an @orientation attribute:
    if 'orientation' in segment.__dict__:
        # angle: PAGE orientation is defined clockwise,
        # whereas PIL/ndimage rotation is in mathematical direction:
        segment_xywh['angle'] = -(segment.get_orientation() or 0)
    alternative_image = segment.get_AlternativeImage()
    if alternative_image:
        # (e.g. from segment-level cropping, binarization, deskewing or despeckling)
        # prefer the most recent AlternativeImage (last in the list):
        LOG.debug("Using AlternativeImage %d (%s) for segment '%s'",
                  len(alternative_image),
                  alternative_image[-1].get_comments(), segment.id)
        segment_image = workspace.resolve_image_as_pil(
            alternative_image[-1].get_filename())
    else:
        # get polygon outline of segment relative to parent image:
        segment_polygon = coordinates_of_segment(segment, parent_image,
                                                 parent_xywh)
        # create a mask from the segment polygon:
        segment_image = image_from_polygon(parent_image, segment_polygon)
        # recrop into segment rectangle:
        segment_image = crop_image(
            segment_image,
            box=(segment_xywh['x'] - parent_xywh['x'],
                 segment_xywh['y'] - parent_xywh['y'],
                 segment_xywh['x'] - parent_xywh['x'] + segment_xywh['w'],
                 segment_xywh['y'] - parent_xywh['y'] + segment_xywh['h']))
        # note: We should mask overlapping neighbouring segments here,
        # but finding the right clipping rules can be difficult if operating
        # on the raw (non-binary) image data alone: for each intersection, it
        # must be decided which one of either segment or neighbour to assign,
        # e.g. an ImageRegion which properly contains our TextRegion should be
        # completely ignored, but an ImageRegion which is properly contained
        # in our TextRegion should be completely masked, while partial overlap
        # may be more difficult to decide. On the other hand, on the binary image,
        # we can use connected component analysis to mask foreground areas which
        # originate in the neighbouring regions. But that would introduce either
        # the assumption that the input has already been binarized, or a dependency
        # on some ad-hoc binarization method. Thus, it is preferable to use
        # a dedicated processor for this (which produces clipped AlternativeImage
        # or reduced polygon coordinates).
    # apply the segment's own deskewing angle (if any) to the cropped image:
    if 'angle' in segment_xywh and segment_xywh['angle']:
        LOG.info("About to rotate segment '%s' by %.2f°", segment.id,
                 segment_xywh['angle'])
        segment_image = segment_image.rotate(
            segment_xywh['angle'],
            expand=True,
            #resample=Image.BILINEAR,
            fillcolor='white')
    # subtract offset from any increase in binary region size over source:
    segment_xywh['x'] -= round(
        0.5 * max(0, segment_image.width - segment_xywh['w']))
    segment_xywh['y'] -= round(
        0.5 * max(0, segment_image.height - segment_xywh['h']))
    return segment_image, segment_xywh
def image_from_page(workspace, page, page_id):
    """Extract the Page image from the workspace.

    Given a PageType object, `page`, extract its PIL.Image from
    AlternativeImage if it exists. Otherwise extract the PIL.Image from
    imageFilename and crop it if a Border exists. Otherwise just return it.

    When cropping, respect any orientation angle annotated for the page
    (from page-level deskewing) by rotating the cropped image,
    respectively.

    If the resulting page image is larger than the bounding box of
    `page`, pass down the page's box coordinates with an offset of half
    the width/height difference.

    Return the extracted image, and the absolute coordinates of the
    page's bounding box / border (for passing down), and an OcrdExif
    instance associated with the original image.
    """
    page_image = workspace.resolve_image_as_pil(page.imageFilename)
    page_image_info = OcrdExif(page_image)
    # default: the full image extent, starting at the origin
    page_xywh = {'x': 0, 'y': 0,
                 'w': page_image.width, 'h': page_image.height}
    # region angle: PAGE orientation is defined clockwise,
    # whereas PIL/ndimage rotation is in mathematical direction:
    page_xywh['angle'] = -(page.get_orientation() or 0)
    # FIXME: remove PrintSpace here as soon as GT abides by the PAGE standard:
    border = page.get_Border() or page.get_PrintSpace()
    if border:
        page_points = border.get_Coords().points
        LOG.debug("Using explictly set page border '%s' for page '%s'",
                  page_points, page_id)
        page_xywh = xywh_from_points(page_points)
    alternative_image = page.get_AlternativeImage()
    if alternative_image:
        # (e.g. from page-level cropping, binarization, deskewing or despeckling)
        # assumes implicit cropping (i.e. page_xywh has been applied already)
        # prefer the most recent AlternativeImage (last in the list):
        LOG.debug("Using AlternativeImage %d (%s) for page '%s'",
                  len(alternative_image),
                  alternative_image[-1].get_comments(), page_id)
        page_image = workspace.resolve_image_as_pil(
            alternative_image[-1].get_filename())
    elif border:
        # get polygon outline of page border:
        page_polygon = np.array(polygon_from_points(page_points))
        # create a mask from the page polygon:
        page_image = image_from_polygon(page_image, page_polygon)
        # recrop into page rectangle:
        page_image = crop_image(page_image,
                                box=(page_xywh['x'], page_xywh['y'],
                                     page_xywh['x'] + page_xywh['w'],
                                     page_xywh['y'] + page_xywh['h']))
    # apply the page's deskewing angle (if any) to the image:
    if 'angle' in page_xywh and page_xywh['angle']:
        LOG.info("About to rotate page '%s' by %.2f°", page_id,
                 page_xywh['angle'])
        page_image = page_image.rotate(
            page_xywh['angle'],
            expand=True,
            #resample=Image.BILINEAR,
            fillcolor='white')
    # subtract offset from any increase in binary region size over source:
    page_xywh['x'] -= round(0.5 * max(0, page_image.width - page_xywh['w']))
    page_xywh['y'] -= round(0.5 * max(0, page_image.height - page_xywh['h']))
    return page_image, page_xywh, page_image_info