def _process_region(self, it, region, rogroup, region_image, region_coords):
    LOG = getLogger('processor.TesserocrSegmentTable')
    # equivalent to GetComponentImages with raw_image=True,
    # (which would also give raw coordinates),
    # except we are also interested in the iterator's BlockType() here,
    index = 0
    if rogroup:
        for elem in (rogroup.get_RegionRefIndexed() +
                     rogroup.get_OrderedGroupIndexed() +
                     rogroup.get_UnorderedGroupIndexed()):
            if elem.index >= index:
                index = elem.index + 1
    while it and not it.Empty(RIL.BLOCK):
        bbox = it.BoundingBox(RIL.BLOCK)
        polygon = polygon_from_x0y0x1y1(bbox)
        polygon = coordinates_for_segment(polygon, region_image, region_coords)
        points = points_from_polygon(polygon)
        coords = CoordsType(points=points)
        # if xywh['w'] < 30 or xywh['h'] < 30:
        #     LOG.info('Ignoring too small region: %s', points)
        #     it.Next(RIL.BLOCK)
        #     continue
        #
        # add the region reference in the reading order element
        # (but ignore non-text regions entirely)
        ID = region.id + "_%04d" % index
        subregion = TextRegionType(id=ID, Coords=coords,
                                   type=TextTypeSimpleType.PARAGRAPH)
        block_type = it.BlockType()
        if block_type == PT.FLOWING_TEXT:
            pass
        elif block_type == PT.HEADING_TEXT:
            subregion.set_type(TextTypeSimpleType.HEADING)
        elif block_type == PT.PULLOUT_TEXT:
            subregion.set_type(TextTypeSimpleType.FLOATING)
        elif block_type == PT.CAPTION_TEXT:
            subregion.set_type(TextTypeSimpleType.CAPTION)
        elif block_type == PT.VERTICAL_TEXT:
            subregion.set_orientation(90.0)
        else:
            it.Next(RIL.BLOCK)
            continue
        LOG.info("Detected cell '%s': %s (%s)",
                 ID, points, membername(PT, block_type))
        region.add_TextRegion(subregion)
        if rogroup:
            rogroup.add_RegionRefIndexed(
                RegionRefIndexedType(regionRef=ID, index=index))
        #
        # iterator increment
        #
        index += 1
        it.Next(RIL.BLOCK)
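# The cell-type switch above can also be read as a lookup table; a minimal
# sketch for illustration only (CELL_TYPE_MAP is a hypothetical name, the PT
# and TextTypeSimpleType members are the same ones used above):
#
# CELL_TYPE_MAP = {
#     PT.FLOWING_TEXT: TextTypeSimpleType.PARAGRAPH,
#     PT.HEADING_TEXT: TextTypeSimpleType.HEADING,
#     PT.PULLOUT_TEXT: TextTypeSimpleType.FLOATING,
#     PT.CAPTION_TEXT: TextTypeSimpleType.CAPTION,
# }
#
# PT.VERTICAL_TEXT keeps the PARAGRAPH type but gets a 90.0 @orientation;
# any other block type is skipped without creating a cell region.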
def _process_words_in_line(self, result_it, line, line_xywh):
    LOG = getLogger('processor.TesserocrRecognize')
    if not result_it or result_it.Empty(RIL.WORD):
        LOG.warning("No text in line '%s'", line.id)
        return
    # iterate until IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
    word_no = 0
    while result_it and not result_it.Empty(RIL.WORD):
        word_id = '%s_word%04d' % (line.id, word_no)
        LOG.debug("Decoding text in word '%s'", word_id)
        bbox = result_it.BoundingBox(RIL.WORD)
        # convert to absolute coordinates:
        polygon = coordinates_for_segment(polygon_from_x0y0x1y1(bbox),
                                          None, line_xywh) - self.parameter['padding']
        polygon2 = polygon_for_parent(polygon, line)
        if polygon2 is not None:
            polygon = polygon2
        points = points_from_polygon(polygon)
        word = WordType(id=word_id, Coords=CoordsType(points))
        if polygon2 is None:
            # could happen due to rotation
            LOG.info('Ignoring extant word: %s', points)
        else:
            line.add_Word(word)
        # todo: determine if font attributes available for word level will work with LSTM models
        word_attributes = result_it.WordFontAttributes()
        if word_attributes:
            word_style = TextStyleType(
                fontSize=word_attributes['pointsize']
                if 'pointsize' in word_attributes else None,
                fontFamily=word_attributes['font_name']
                if 'font_name' in word_attributes else None,
                bold=word_attributes['bold']
                if 'bold' in word_attributes else None,
                italic=word_attributes['italic']
                if 'italic' in word_attributes else None,
                underlined=word_attributes['underlined']
                if 'underlined' in word_attributes else None,
                monospace=word_attributes['monospace']
                if 'monospace' in word_attributes else None,
                serif=word_attributes['serif']
                if 'serif' in word_attributes else None)
            word.set_TextStyle(word_style)  # (or somewhere in custom attribute?)
        # add word annotation unconditionally (i.e. even for glyph level):
        word.add_TextEquiv(TextEquivType(
            Unicode=result_it.GetUTF8Text(RIL.WORD),
            conf=result_it.Confidence(RIL.WORD) / 100))
        if self.parameter['textequiv_level'] != 'word':
            self._process_glyphs_in_word(result_it, word, line_xywh)
        if result_it.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
            break
        else:
            word_no += 1
            result_it.Next(RIL.WORD)
def _process_glyphs_in_word(self, result_it, word, word_xywh):
    LOG = getLogger('processor.TesserocrRecognize')
    if not result_it or result_it.Empty(RIL.SYMBOL):
        LOG.debug("No glyph in word '%s'", word.id)
        return
    # iterate until IsAtFinalElement(RIL.WORD, RIL.SYMBOL):
    glyph_no = 0
    while result_it and not result_it.Empty(RIL.SYMBOL):
        glyph_id = '%s_glyph%04d' % (word.id, glyph_no)
        LOG.debug("Decoding text in glyph '%s'", glyph_id)
        # glyph_text = result_it.GetUTF8Text(RIL.SYMBOL)  # equals first choice?
        glyph_conf = result_it.Confidence(RIL.SYMBOL) / 100  # equals first choice?
        #LOG.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf)
        bbox = result_it.BoundingBox(RIL.SYMBOL)
        # convert to absolute coordinates:
        polygon = coordinates_for_segment(polygon_from_x0y0x1y1(bbox),
                                          None, word_xywh) - self.parameter['padding']
        polygon2 = polygon_for_parent(polygon, word)
        if polygon2 is not None:
            polygon = polygon2
        points = points_from_polygon(polygon)
        glyph = GlyphType(id=glyph_id, Coords=CoordsType(points))
        if polygon2 is None:
            # could happen due to rotation
            LOG.info('Ignoring extant glyph: %s', points)
        else:
            word.add_Glyph(glyph)
        choice_it = result_it.GetChoiceIterator()
        for (choice_no, choice) in enumerate(choice_it):
            alternative_text = choice.GetUTF8Text()
            alternative_conf = choice.Confidence() / 100
            #LOG.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf)
            if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF or
                    choice_no > CHOICE_THRESHOLD_NUM):
                break
            # todo: consider SymbolIsSuperscript (TextStyle), SymbolIsDropcap (RelationType) etc
            glyph.add_TextEquiv(TextEquivType(
                index=choice_no,
                Unicode=alternative_text,
                conf=alternative_conf))
        if result_it.IsAtFinalElement(RIL.WORD, RIL.SYMBOL):
            break
        else:
            glyph_no += 1
            result_it.Next(RIL.SYMBOL)
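# The choice loop above is an n-best pruning of Tesseract's per-glyph
# alternatives: keep alternatives only while they stay within a confidence
# margin of the best choice (CHOICE_THRESHOLD_CONF) and within a maximum rank
# (CHOICE_THRESHOLD_NUM), both module-level constants defined elsewhere.
# A minimal standalone sketch of that rule (hypothetical helper, illustrative
# default thresholds):

def _pruned_choices(choices, best_conf, max_drop=0.2, max_rank=6):
    """Yield (rank, text, conf) until the margin or rank threshold is exceeded."""
    for rank, (text, conf) in enumerate(choices):
        if best_conf - conf > max_drop or rank > max_rank:
            break
        yield rank, text, conf

# e.g. list(_pruned_choices([('e', 0.95), ('c', 0.90), ('o', 0.40)], 0.95))
# == [(0, 'e', 0.95), (1, 'c', 0.90)] -- the third alternative falls outside
# the 0.2 confidence margin and is dropped.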
def process(self):
    """
    Performs the recognition.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)

    self._init_calamari()

    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        log.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))

        page = pcgts.get_Page()
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id)

        for region in pcgts.get_Page().get_TextRegion():
            region_image, region_xywh = self.workspace.image_from_segment(
                region, page_image, page_xywh)

            textlines = region.get_TextLine()
            log.info("About to recognize %i lines of region '%s'",
                     len(textlines), region.id)
            for (line_no, line) in enumerate(textlines):
                log.debug("Recognizing line '%s' in region '%s'",
                          line.id, region.id)

                line_image, line_coords = self.workspace.image_from_segment(
                    line, region_image, region_xywh)
                line_image_np = np.array(line_image, dtype=np.uint8)

                raw_results = list(self.predictor.predict_raw(
                    [line_image_np], progress_bar=False))[0]
                for i, p in enumerate(raw_results):
                    p.prediction.id = "fold_{}".format(i)

                prediction = self.voter.vote_prediction_result(raw_results)
                prediction.id = "voted"

                # Build line text on our own
                #
                # Calamari does whitespace post-processing on prediction.sentence,
                # while it does not do the same on prediction.positions.
                # Do it on our own to have consistency.
                #
                # XXX Check Calamari's built-in post-processing on prediction.sentence

                def _sort_chars(p):
                    """Filter and sort chars of prediction p"""
                    chars = p.chars
                    chars = [c for c in chars if c.char]
                    # XXX Note that omission probabilities are not normalized?!
                    chars = [c for c in chars
                             if c.probability >= self.parameter['glyph_conf_cutoff']]
                    chars = sorted(chars, key=lambda k: k.probability, reverse=True)
                    return chars

                def _drop_leading_spaces(positions):
                    return list(itertools.dropwhile(
                        lambda p: _sort_chars(p)[0].char == " ", positions))

                def _drop_trailing_spaces(positions):
                    return list(reversed(_drop_leading_spaces(reversed(positions))))

                def _drop_double_spaces(positions):
                    def _drop_double_spaces_generator(positions):
                        last_was_space = False
                        for p in positions:
                            if p.chars[0].char == " ":
                                if not last_was_space:
                                    yield p
                                last_was_space = True
                            else:
                                yield p
                                last_was_space = False
                    return list(_drop_double_spaces_generator(positions))

                positions = prediction.positions
                positions = _drop_leading_spaces(positions)
                positions = _drop_trailing_spaces(positions)
                positions = _drop_double_spaces(positions)
                positions = list(positions)

                line_text = ''.join(_sort_chars(p)[0].char for p in positions)
                if line_text != prediction.sentence:
                    log.warning("Our own line text is not the same as Calamari's: '%s' != '%s'",
                                line_text, prediction.sentence)

                # Delete existing results
                if line.get_TextEquiv():
                    log.warning("Line '%s' already contained text results", line.id)
                line.set_TextEquiv([])
                if line.get_Word():
                    log.warning("Line '%s' already contained word segmentation", line.id)
                line.set_Word([])

                # Save line results
                line_conf = prediction.avg_char_probability
                line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])

                # Save word results
                #
                # Calamari OCR does not provide word positions, so we infer word
                # positions from a. text segmentation and b. the glyph positions.
                # This is necessary because the PAGE XML format enforces a strict
                # hierarchy of lines > words > glyphs.

                def _words(s):
                    """Split words based on spaces and include spaces as 'words'"""
                    spaces = None
                    word = ''
                    for c in s:
                        if c == ' ' and spaces is True:
                            word += c
                        elif c != ' ' and spaces is False:
                            word += c
                        else:
                            if word:
                                yield word
                            word = c
                            spaces = (c == ' ')
                    yield word

                if self.parameter['textequiv_level'] in ['word', 'glyph']:
                    word_no = 0
                    i = 0

                    for word_text in _words(line_text):
                        word_length = len(word_text)
                        if not all(c == ' ' for c in word_text):
                            word_positions = positions[i:i + word_length]
                            word_start = word_positions[0].global_start
                            word_end = word_positions[-1].global_end

                            polygon = polygon_from_x0y0x1y1(
                                [word_start, 0, word_end, line_image.height])
                            points = points_from_polygon(
                                coordinates_for_segment(polygon, None, line_coords))
                            # XXX Crop to line polygon?

                            word = WordType(id='%s_word%04d' % (line.id, word_no),
                                            Coords=CoordsType(points))
                            word.add_TextEquiv(TextEquivType(Unicode=word_text))

                            if self.parameter['textequiv_level'] == 'glyph':
                                for glyph_no, p in enumerate(word_positions):
                                    glyph_start = p.global_start
                                    glyph_end = p.global_end

                                    polygon = polygon_from_x0y0x1y1(
                                        [glyph_start, 0, glyph_end, line_image.height])
                                    points = points_from_polygon(
                                        coordinates_for_segment(polygon, None, line_coords))

                                    glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no),
                                                      Coords=CoordsType(points))

                                    # Add predictions (= TextEquivs)
                                    char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                    for char_index, char in enumerate(_sort_chars(p),
                                                                      start=char_index_start):
                                        glyph.add_TextEquiv(TextEquivType(
                                            Unicode=char.char,
                                            index=char_index,
                                            conf=char.probability))

                                    word.add_Glyph(glyph)

                            line.add_Word(word)
                            word_no += 1

                        i += word_length

        _page_update_higher_textequiv_levels('line', pcgts)

        # Add metadata about this operation and its runtime parameters:
        metadata = pcgts.get_Metadata()  # ensured by from_file()
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[LabelsType(
                    externalModel="ocrd-tool",
                    externalId="parameters",
                    Label=[LabelType(type_=name, value=self.parameter[name])
                           for name in self.parameter.keys()])]))

        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            mimetype=MIMETYPE_PAGE,
            local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
            content=to_xml(pcgts))
def process(self):
    """
    Perform text recognition with Calamari on the workspace.

    If ``textequiv_level`` is ``word`` or ``glyph``, then additionally create
    word / glyph level segments by splitting at white space characters / glyph
    boundaries. In the case of ``glyph``, add all alternative character
    hypotheses down to ``glyph_conf_cutoff`` confidence threshold.
    """
    log = getLogger('processor.CalamariRecognize')

    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)

    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        log.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))

        page = pcgts.get_Page()
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_selector=self.features)

        for region in page.get_AllRegions(classes=['Text']):
            region_image, region_coords = self.workspace.image_from_segment(
                region, page_image, page_coords, feature_selector=self.features)

            textlines = region.get_TextLine()
            log.info("About to recognize %i lines of region '%s'",
                     len(textlines), region.id)

            line_images_np = []
            line_coordss = []
            for line in textlines:
                log.debug("Recognizing line '%s' in region '%s'",
                          line.id, region.id)

                line_image, line_coords = self.workspace.image_from_segment(
                    line, region_image, region_coords, feature_selector=self.features)
                if ('binarized' not in line_coords['features'] and
                        'grayscale_normalized' not in line_coords['features'] and
                        self.network_input_channels == 1):
                    # We cannot use a feature selector for this since we don't
                    # know whether the model expects (has been trained on)
                    # binarized or grayscale images; but raw images are likely
                    # always inadequate:
                    log.warning("Using raw image for line '%s' in region '%s'",
                                line.id, region.id)

                line_image = line_image if all(line_image.size) else [[0]]
                line_image_np = np.array(line_image, dtype=np.uint8)
                line_images_np.append(line_image_np)
                line_coordss.append(line_coords)

            raw_results_all = self.predictor.predict_raw(
                line_images_np, progress_bar=False)

            for line, line_coords, raw_results in zip(
                    textlines, line_coordss, raw_results_all):

                for i, p in enumerate(raw_results):
                    p.prediction.id = "fold_{}".format(i)

                prediction = self.voter.vote_prediction_result(raw_results)
                prediction.id = "voted"

                # Build line text on our own
                #
                # Calamari does whitespace post-processing on prediction.sentence,
                # while it does not do the same on prediction.positions.
                # Do it on our own to have consistency.
                #
                # XXX Check Calamari's built-in post-processing on prediction.sentence

                def _sort_chars(p):
                    """Filter and sort chars of prediction p"""
                    chars = p.chars
                    chars = [c for c in chars if c.char]
                    # XXX Note that omission probabilities are not normalized?!
                    chars = [c for c in chars
                             if c.probability >= self.parameter['glyph_conf_cutoff']]
                    chars = sorted(chars, key=lambda k: k.probability, reverse=True)
                    return chars

                def _drop_leading_spaces(positions):
                    return list(itertools.dropwhile(
                        lambda p: _sort_chars(p)[0].char == " ", positions))

                def _drop_trailing_spaces(positions):
                    return list(reversed(_drop_leading_spaces(reversed(positions))))

                def _drop_double_spaces(positions):
                    def _drop_double_spaces_generator(positions):
                        last_was_space = False
                        for p in positions:
                            if p.chars[0].char == " ":
                                if not last_was_space:
                                    yield p
                                last_was_space = True
                            else:
                                yield p
                                last_was_space = False
                    return list(_drop_double_spaces_generator(positions))

                positions = prediction.positions
                positions = _drop_leading_spaces(positions)
                positions = _drop_trailing_spaces(positions)
                positions = _drop_double_spaces(positions)
                positions = list(positions)

                line_text = ''.join(_sort_chars(p)[0].char for p in positions)
                if line_text != prediction.sentence:
                    log.warning("Our own line text is not the same as Calamari's: '%s' != '%s'",
                                line_text, prediction.sentence)

                # Delete existing results
                if line.get_TextEquiv():
                    log.warning("Line '%s' already contained text results", line.id)
                line.set_TextEquiv([])
                if line.get_Word():
                    log.warning("Line '%s' already contained word segmentation", line.id)
                line.set_Word([])

                # Save line results
                line_conf = prediction.avg_char_probability
                line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])

                # Save word results
                #
                # Calamari OCR does not provide word positions, so we infer word
                # positions from a. text segmentation and b. the glyph positions.
                # This is necessary because the PAGE XML format enforces a strict
                # hierarchy of lines > words > glyphs.

                def _words(s):
                    """Split words based on spaces and include spaces as 'words'"""
                    spaces = None
                    word = ''
                    for c in s:
                        if c == ' ' and spaces is True:
                            word += c
                        elif c != ' ' and spaces is False:
                            word += c
                        else:
                            if word:
                                yield word
                            word = c
                            spaces = (c == ' ')
                    yield word

                if self.parameter['textequiv_level'] in ['word', 'glyph']:
                    word_no = 0
                    i = 0

                    for word_text in _words(line_text):
                        word_length = len(word_text)
                        if not all(c == ' ' for c in word_text):
                            word_positions = positions[i:i + word_length]
                            word_start = word_positions[0].global_start
                            word_end = word_positions[-1].global_end

                            polygon = polygon_from_x0y0x1y1(
                                [word_start, 0, word_end, line_image.height])
                            points = points_from_polygon(
                                coordinates_for_segment(polygon, None, line_coords))
                            # XXX Crop to line polygon?

                            word = WordType(id='%s_word%04d' % (line.id, word_no),
                                            Coords=CoordsType(points))
                            word.add_TextEquiv(TextEquivType(Unicode=word_text))

                            if self.parameter['textequiv_level'] == 'glyph':
                                for glyph_no, p in enumerate(word_positions):
                                    glyph_start = p.global_start
                                    glyph_end = p.global_end

                                    polygon = polygon_from_x0y0x1y1(
                                        [glyph_start, 0, glyph_end, line_image.height])
                                    points = points_from_polygon(
                                        coordinates_for_segment(polygon, None, line_coords))

                                    glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no),
                                                      Coords=CoordsType(points))

                                    # Add predictions (= TextEquivs)
                                    char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                    for char_index, char in enumerate(_sort_chars(p),
                                                                      start=char_index_start):
                                        glyph.add_TextEquiv(TextEquivType(
                                            Unicode=char.char,
                                            index=char_index,
                                            conf=char.probability))

                                    word.add_Glyph(glyph)

                            line.add_Word(word)
                            word_no += 1

                        i += word_length

        _page_update_higher_textequiv_levels('line', pcgts)

        # Add metadata about this operation and its runtime parameters:
        self.add_metadata(pcgts)

        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            mimetype=MIMETYPE_PAGE,
            local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
            content=to_xml(pcgts))
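# For reference, the _words() helper above splits a line into alternating
# word and whitespace tokens, so that slices of `positions` can be mapped
# back onto glyphs. Traced by hand (illustration only, not part of the
# processor):
#
#   list(_words('Hello  world'))  ->  ['Hello', '  ', 'world']
#   list(_words(' x'))            ->  [' ', 'x']
#
# Pure-whitespace tokens are not turned into WordType elements, but their
# length still advances the running index i into positions.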
def test_polygon_from_x0y0x1y1(self):
    self.assertEqual(
        polygon_from_x0y0x1y1([100, 100, 200, 200]),
        [[100, 100], [200, 100], [200, 200], [100, 200]])
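# For context, a minimal sketch of what polygon_from_x0y0x1y1 is expected to do,
# consistent with the assertion above (the real helper is provided by ocrd_utils;
# this is only an illustration): turn an [x0, y0, x1, y1] bounding box into its
# four corner points in clockwise order, starting at the top left.

def _polygon_from_x0y0x1y1_sketch(bbox):
    x0, y0, x1, y1 = bbox
    return [[x0, y0], [x1, y0], [x1, y1], [x0, y1]]

assert _polygon_from_x0y0x1y1_sketch([100, 100, 200, 200]) == \
    [[100, 100], [200, 100], [200, 200], [100, 200]]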
def _process_page(self, it, page, page_image, page_coords, page_id):
    LOG = getLogger('processor.TesserocrSegmentRegion')
    # equivalent to GetComponentImages with raw_image=True,
    # (which would also give raw coordinates),
    # except we are also interested in the iterator's BlockType() here,
    # and its BlockPolygon()
    index = 0
    ro = page.get_ReadingOrder()
    if not ro:
        ro = ReadingOrderType()
        page.set_ReadingOrder(ro)
    og = ro.get_OrderedGroup()
    if og:
        # start counting from largest existing index
        for elem in (og.get_RegionRefIndexed() +
                     og.get_OrderedGroupIndexed() +
                     og.get_UnorderedGroupIndexed()):
            if elem.index >= index:
                index = elem.index + 1
    else:
        # new top-level group
        og = OrderedGroupType(id="reading-order")
        ro.set_OrderedGroup(og)
    while it and not it.Empty(RIL.BLOCK):
        # (padding will be passed to both BoundingBox and GetImage)
        # (actually, Tesseract honours padding only on the left and bottom,
        # whereas right and top are increased less!)
        bbox = it.BoundingBox(RIL.BLOCK, padding=self.parameter['padding'])
        # sometimes these polygons are not planar, which causes
        # PIL.ImageDraw.Draw.polygon (and likely others as well)
        # to misbehave; however, PAGE coordinate semantics prohibit
        # multi-path polygons!
        # (probably a bug in Tesseract itself, cf. tesseract#2826):
        if self.parameter['crop_polygons']:
            polygon = it.BlockPolygon()
        else:
            polygon = polygon_from_x0y0x1y1(bbox)
        polygon = coordinates_for_segment(polygon, page_image, page_coords)
        polygon2 = polygon_for_parent(polygon, page)
        if polygon2 is not None:
            polygon = polygon2
        points = points_from_polygon(polygon)
        coords = CoordsType(points=points)
        if polygon2 is None:
            LOG.info('Ignoring extant region: %s', points)
            it.Next(RIL.BLOCK)
            continue
        # if xywh['w'] < 30 or xywh['h'] < 30:
        #     LOG.info('Ignoring too small region: %s', points)
        #     it.Next(RIL.BLOCK)
        #     continue
        # region_image_bin = it.GetBinaryImage(RIL.BLOCK)
        # if not region_image_bin.getbbox():
        #     LOG.info('Ignoring binary-empty region: %s', points)
        #     it.Next(RIL.BLOCK)
        #     continue
        #
        # add the region reference in the reading order element
        # (will be removed again if Separator/Noise region below)
        ID = "region%04d" % index
        og.add_RegionRefIndexed(
            RegionRefIndexedType(regionRef=ID, index=index))
        #
        # region type switch
        #
        block_type = it.BlockType()
        if block_type in [PT.FLOWING_TEXT,
                          PT.HEADING_TEXT,
                          PT.PULLOUT_TEXT,
                          PT.CAPTION_TEXT,
                          # TABLE is contained in PTIsTextType, but
                          # it is a bad idea to create a TextRegion
                          # for it (better set `find_tables` False):
                          # PT.TABLE,
                          # will also get a 90° @orientation
                          # (but that can be overridden by deskew/OSD):
                          PT.VERTICAL_TEXT]:
            region = TextRegionType(id=ID, Coords=coords,
                                    type=TextTypeSimpleType.PARAGRAPH)
            if block_type == PT.VERTICAL_TEXT:
                region.set_orientation(90.0)
            elif block_type == PT.HEADING_TEXT:
                region.set_type(TextTypeSimpleType.HEADING)
            elif block_type == PT.PULLOUT_TEXT:
                region.set_type(TextTypeSimpleType.FLOATING)
            elif block_type == PT.CAPTION_TEXT:
                region.set_type(TextTypeSimpleType.CAPTION)
            page.add_TextRegion(region)
            if self.parameter['sparse_text']:
                region.set_type(TextTypeSimpleType.OTHER)
                region.add_TextLine(TextLineType(id=region.id + '_line',
                                                 Coords=coords))
        elif block_type in [PT.FLOWING_IMAGE,
                            PT.HEADING_IMAGE,
                            PT.PULLOUT_IMAGE]:
            region = ImageRegionType(id=ID, Coords=coords)
            page.add_ImageRegion(region)
        elif block_type in [PT.HORZ_LINE,
                            PT.VERT_LINE]:
            region = SeparatorRegionType(id=ID, Coords=coords)
            page.add_SeparatorRegion(region)
            # undo appending in ReadingOrder
            og.set_RegionRefIndexed(og.get_RegionRefIndexed()[:-1])
        elif block_type in [PT.INLINE_EQUATION,
                            PT.EQUATION]:
            region = MathsRegionType(id=ID, Coords=coords)
            page.add_MathsRegion(region)
        elif block_type == PT.TABLE:
            # without API access to StructuredTable we cannot
            # do much for a TableRegionType (i.e. nrows, ncols,
            # coordinates of cells for recursive regions etc),
            # but this can be achieved afterwards by segment-table
            region = TableRegionType(id=ID, Coords=coords)
            page.add_TableRegion(region)
        else:
            region = NoiseRegionType(id=ID, Coords=coords)
            page.add_NoiseRegion(region)
            # undo appending in ReadingOrder
            og.set_RegionRefIndexed(og.get_RegionRefIndexed()[:-1])
        LOG.info("Detected region '%s': %s (%s)",
                 ID, points, membername(PT, block_type))
        #
        # iterator increment
        #
        index += 1
        it.Next(RIL.BLOCK)
    if (not og.get_RegionRefIndexed() and
            not og.get_OrderedGroupIndexed() and
            not og.get_UnorderedGroupIndexed()):
        # schema forbids empty OrderedGroup
        ro.set_OrderedGroup(None)
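# The net effect on the PAGE document, sketched for a page with two detected
# text blocks (element and attribute names as in the PAGE schema; region IDs
# are examples): a single top-level OrderedGroup whose RegionRefIndexed
# entries mirror the order in which Tesseract's block iterator visited the
# regions, minus any Separator/Noise entries that were un-appended again.
#
#   <ReadingOrder>
#     <OrderedGroup id="reading-order">
#       <RegionRefIndexed index="0" regionRef="region0000"/>
#       <RegionRefIndexed index="1" regionRef="region0001"/>
#     </OrderedGroup>
#   </ReadingOrder>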