def _split_word_at_space(word):
    """Split a Word containing a space into two Words at that space.

    The word's bounding box is divided horizontally in proportion to the
    character position of the (first) space; the space character itself
    is dropped from the text. Language and TextStyle are copied to both
    halves. Returns the (left, right) WordType pair.
    """
    left = WordType(id=word.id + '_l')
    right = WordType(id=word.id + '_r')
    xywh = xywh_from_points(word.get_Coords().points)
    textequiv = word.get_TextEquiv()[0]
    pos = textequiv.Unicode.index(" ")
    fract = pos / len(textequiv.Unicode)
    # left half keeps the original origin with proportionally scaled width
    xywh_left = dict(xywh, w=xywh['w'] * fract)
    left.set_Coords(CoordsType(points=points_from_xywh(xywh_left)))
    # right half starts where the left half ends
    xywh_right = dict(xywh,
                      x=xywh['x'] + xywh['w'] * fract,
                      w=xywh['w'] * (1 - fract))
    right.set_Coords(CoordsType(points=points_from_xywh(xywh_right)))
    if word.get_language():
        left.set_language(word.get_language())
        right.set_language(word.get_language())
    if word.get_TextStyle():
        left.set_TextStyle(word.get_TextStyle())
        right.set_TextStyle(word.get_TextStyle())
    # Glyphs: irrelevant at this processing level
    left.set_TextEquiv([TextEquivType(Unicode=textequiv.Unicode[0:pos],
                                      conf=textequiv.conf)])
    right.set_TextEquiv([TextEquivType(Unicode=textequiv.Unicode[pos + 1:],
                                       conf=textequiv.conf)])
    return left, right
def _split_word_at_glyph(word, glyph):
    """Split a Word into two Words around one of its Glyphs.

    The glyph itself is discarded: the left part covers everything up to
    the glyph's left edge, the right part everything after the glyph's
    right edge. Language, TextStyle and the remaining Glyphs are
    distributed to the two halves. Returns the (left, right) pair.
    """
    prev_ = WordType(id=word.id + '_l')
    next_ = WordType(id=word.id + '_r')
    xywh_glyph = xywh_from_points(glyph.get_Coords().points)
    xywh_word = xywh_from_points(word.get_Coords().points)
    xywh_prev = xywh_word.copy()
    xywh_prev.update({'w': xywh_glyph['x'] - xywh_word['x']})
    prev_.set_Coords(CoordsType(points=points_from_xywh(xywh_prev)))
    xywh_next = xywh_word.copy()
    # BUG FIX: the right part begins at the glyph's *right* edge (x + w),
    # not to its left (x - w) as before; its width is the word width minus
    # the left part and minus the glyph itself, so it ends exactly at the
    # word's right edge instead of overlapping the glyph.
    xywh_next.update({
        'x': xywh_glyph['x'] + xywh_glyph['w'],
        'w': xywh_word['w'] - xywh_prev['w'] - xywh_glyph['w']
    })
    next_.set_Coords(CoordsType(points=points_from_xywh(xywh_next)))
    if word.get_language():
        prev_.set_language(word.get_language())
        next_.set_language(word.get_language())
    if word.get_TextStyle():
        prev_.set_TextStyle(word.get_TextStyle())
        next_.set_TextStyle(word.get_TextStyle())
    glyphs = word.get_Glyph()
    pos = glyphs.index(glyph)
    prev_.set_Glyph(glyphs[0:pos])
    next_.set_Glyph(glyphs[pos + 1:])
    # TextEquiv: will be overwritten by page_update_higher_textequiv_levels
    return prev_, next_
def process(self):
    """
    Performs the word segmentation.

    For each TextLine of each TextRegion on each input page, runs
    tesseract in single-line mode to detect word components and adds
    them as Word elements (with absolute coordinates) to the line,
    then writes the result as a new PAGE file to the output file group.
    (BUG FIX: the docstring previously claimed "line segmentation",
    but this processor detects RIL.WORD components within lines.)
    """
    with PyTessBaseAPI(
            psm=PSM.SINGLE_LINE,
            path=TESSDATA_PREFIX,
    ) as tessapi:
        for (n, input_file) in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))
            image_url = pcgts.get_Page().imageFilename
            for region in pcgts.get_Page().get_TextRegion():
                for line in region.get_TextLine():
                    log.debug("Detecting words in line '%s'", line.id)
                    image = self.workspace.resolve_image_as_pil(
                        image_url, polygon_from_points(line.get_Coords().points))
                    tessapi.SetImage(image)
                    # component coordinates are relative to the line image,
                    # so shift them by the line's own offset
                    offset = xywh_from_points(line.get_Coords().points)
                    for (word_no, component) in enumerate(
                            tessapi.GetComponentImages(RIL.WORD, True)):
                        word_id = '%s_word%04d' % (line.id, word_no)
                        word_xywh = component[1]
                        word_xywh['x'] += offset['x']
                        word_xywh['y'] += offset['y']
                        line.add_Word(WordType(
                            id=word_id,
                            Coords=CoordsType(points_from_xywh(word_xywh))))
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                local_filename='%s/%s' % (self.output_file_grp, ID),
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts).encode('utf-8'),
            )
def process(self):
    """
    Performs the line segmentation.

    For each TextRegion on each input page, runs tesseract to detect
    textline components, converts their (region-relative) coordinates
    to absolute page coordinates, adds them as TextLine elements, and
    writes the result as a new PAGE file to the output file group.
    """
    with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        for (n, input_file) in enumerate(self.input_files):
            pcgts = page_from_file(
                self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            image_url = page.imageFilename
            for region in page.get_TextRegion():
                log.debug("Detecting lines in %s with tesseract", region.id)
                region_polygon = polygon_from_points(region.get_Coords().points)
                tessapi.SetImage(self.workspace.resolve_image_as_pil(
                    image_url, region_polygon))
                # components are relative to the region image; shift back
                offset = xywh_from_points(region.get_Coords().points)
                components = tessapi.GetComponentImages(RIL.TEXTLINE, True)
                for (line_no, component) in enumerate(components):
                    line_xywh = component[1]
                    line_xywh['x'] += offset['x']
                    line_xywh['y'] += offset['y']
                    region.add_TextLine(TextLineType(
                        id='%s_line%04d' % (region.id, line_no),
                        Coords=CoordsType(points_from_xywh(line_xywh))))
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                mimetype=MIMETYPE_PAGE,
                local_filename='%s/%s' % (self.output_file_grp, ID),
                content=to_xml(pcgts).encode('utf-8'),
            )
def test_points_from_xywh(self):
    # a 100x100 box at (100, 100) yields its four corners clockwise
    box = {'x': 100, 'y': 100, 'w': 100, 'h': 100}
    expected = '100,100 200,100 200,200 100,200'
    self.assertEqual(points_from_xywh(box), expected)
def process(self):
    """
    Performs the cropping.

    Detects all blocks on each page with tesseract, accumulates their
    corner points into one bounding box, and sets that box as the page
    Border, then writes the result as a new PAGE file to the output
    file group.
    """
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        for (n, input_file) in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))
            image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename)
            log.debug("Cropping with tesseract")
            tessapi.SetImage(image)
            #
            # helper variables for saving the box coordinates
            # (start inverted so the first point initializes them)
            #
            min_x = image.width
            min_y = image.height
            max_x = 0
            max_y = 0
            # iterate over all boxes and compare their extent
            # to the min and max values
            for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
                points, index = points_from_xywh(component[1]), component[2]
                #
                # the region reference in the reading order element
                #
                ID = "region%04d" % index
                log.debug("Detected region '%s': %s", ID, points)
                for pair in points.split(' '):
                    x, y = (int(pair.split(',')[0]), int(pair.split(',')[1]))
                    # BUG FIX: the original if/elif chain updated at most
                    # one bound per point (e.g. max_x was only considered
                    # when y >= min_y), yielding a wrong border; each of
                    # the four bounds must be updated independently.
                    min_x = min(min_x, x)
                    min_y = min(min_y, y)
                    max_x = max(max_x, x)
                    max_y = max(max_y, y)
                log.debug("Updated page border: %i,%i %i,%i %i,%i %i,%i",
                          min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y)
            #
            # set the identified page border
            #
            brd = BorderType(Coords=CoordsType(
                "%i,%i %i,%i %i,%i %i,%i" % (
                    min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y)))
            pcgts.get_Page().set_Border(brd)
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                mimetype=MIMETYPE_PAGE,
                local_filename='%s/%s' % (self.output_file_grp, ID),
                content=to_xml(pcgts).encode('utf-8'),
            )
def process(self):
    """
    Performs the region segmentation.

    Detects all blocks on each page with tesseract, adds each as a
    TextRegion with a matching RegionRefIndexed entry in the reading
    order, and writes the result as a new PAGE file to the output
    file group.
    """
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        for (n, input_file) in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))
            image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename)
            log.debug("Detecting regions with tesseract")
            tessapi.SetImage(image)
            # PERF: the ReadingOrder/OrderedGroup get-or-create was
            # re-executed for every detected block; hoist it to a single
            # lazy initialization (still created only if any block exists).
            og = None
            for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
                points, index = points_from_xywh(component[1]), component[2]
                #
                # the region reference in the reading order element
                #
                ID = "region%04d" % index
                log.debug("Detected region '%s': %s", ID, points)
                if og is None:
                    # <pg:ReadingOrder>
                    ro = pcgts.get_Page().get_ReadingOrder()
                    if ro is None:
                        ro = ReadingOrderType()
                        pcgts.get_Page().set_ReadingOrder(ro)
                    # <pg:OrderedGroup>
                    og = ro.get_OrderedGroup()
                    if og is None:
                        og = OrderedGroupType(id="reading-order")
                        ro.set_OrderedGroup(og)
                # <pg:RegionRefIndexed>
                og.add_RegionRefIndexed(RegionRefIndexedType(regionRef=ID, index=index))
                #
                # text region
                #
                pcgts.get_Page().add_TextRegion(
                    TextRegionType(id=ID, Coords=CoordsType(points=points)))
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                mimetype=MIMETYPE_PAGE,
                local_filename='%s/%s' % (self.output_file_grp, ID),
                content=to_xml(pcgts).encode('utf-8'),
            )
def _merge_words(prev_, next_):
    """Merge two adjacent Words into a single Word.

    The merged Word gets the bounding box of both coordinate sets,
    language/TextStyle from the first word, the concatenation of both
    Glyph lists and TextEquiv strings, and the product of both
    confidences (when both are set). Returns the new WordType.
    """
    merged = WordType(id=prev_.id + '.' + next_.id)
    combined = prev_.get_Coords().points + ' ' + next_.get_Coords().points
    merged.set_Coords(CoordsType(
        points=points_from_xywh(xywh_from_points(combined))))
    # metadata is inherited from the first word only
    if prev_.get_language():
        merged.set_language(prev_.get_language())
    if prev_.get_TextStyle():
        merged.set_TextStyle(prev_.get_TextStyle())
    if prev_.get_Glyph() or next_.get_Glyph():
        merged.set_Glyph(prev_.get_Glyph() + next_.get_Glyph())
    # start from the first word's TextEquiv, falling back to an empty,
    # fully-confident one, then append the second word's text
    if prev_.get_TextEquiv():
        merged.set_TextEquiv(prev_.get_TextEquiv())
    else:
        merged.set_TextEquiv([TextEquivType(Unicode='', conf=1.0)])
    if next_.get_TextEquiv():
        target = merged.get_TextEquiv()[0]
        source = next_.get_TextEquiv()[0]
        target.Unicode += source.Unicode
        if target.conf and source.conf:
            target.conf *= source.conf
    return merged