Example #1
0
 def _process_region(self, it, region, rogroup, region_image,
                     region_coords):
     """Add one text subregion ("cell") to ``region`` per text block of ``it``.

     Walks the layout iterator at ``RIL.BLOCK`` level. Every block whose
     type is one of the text types (flowing, heading, pull-out, caption
     or vertical text) becomes a ``TextRegionType`` child of ``region``;
     blocks of any other type are skipped without consuming an index.

     Args:
         it: layout iterator offering ``Empty``, ``BoundingBox``,
             ``BlockType`` and ``Next`` (presumably a tesserocr page
             iterator -- confirm against the caller). May be falsy,
             in which case nothing is done.
         region: parent region; its ``id`` prefixes the new cell IDs
             and it receives the cells via ``add_TextRegion``.
         rogroup: reading order group to which a ``RegionRefIndexed``
             is added for every new cell, or a falsy value to skip
             reading order handling altogether.
         region_image: image handed to ``coordinates_for_segment``
             together with ``region_coords`` for converting the block's
             bounding box polygon.
         region_coords: coordinate context of ``region_image``
             (see ``region_image``).
     """
     LOG = getLogger('processor.TesserocrSegmentTable')
     # This is equivalent to GetComponentImages with raw_image=True
     # (which would also give raw coordinates), except that the
     # iterator's BlockType() is needed here as well.
     #
     # Numbering continues right after the largest index already
     # present in the reading order group (or starts at zero).
     index = 0
     if rogroup:
         existing_refs = (rogroup.get_RegionRefIndexed() +
                          rogroup.get_OrderedGroupIndexed() +
                          rogroup.get_UnorderedGroupIndexed())
         index = max([index] + [ref.index + 1 for ref in existing_refs])
     while it and not it.Empty(RIL.BLOCK):
         # bounding box of the block, converted via the region image
         # and its coordinate context
         outline = coordinates_for_segment(
             polygon_from_x0y0x1y1(it.BoundingBox(RIL.BLOCK)),
             region_image, region_coords)
         points = points_from_polygon(outline)
         coords = CoordsType(points=points)
         ID = region.id + "_%04d" % index
         # every cell starts out as a paragraph; the block type
         # may refine that (or disqualify the block) below
         cell = TextRegionType(id=ID,
                               Coords=coords,
                               type=TextTypeSimpleType.PARAGRAPH)
         block_type = it.BlockType()
         if block_type == PT.VERTICAL_TEXT:
             cell.set_orientation(90.0)
         elif block_type == PT.HEADING_TEXT:
             cell.set_type(TextTypeSimpleType.HEADING)
         elif block_type == PT.PULLOUT_TEXT:
             cell.set_type(TextTypeSimpleType.FLOATING)
         elif block_type == PT.CAPTION_TEXT:
             cell.set_type(TextTypeSimpleType.CAPTION)
         elif block_type != PT.FLOWING_TEXT:
             # non-text regions are ignored entirely
             # (neither added to the region nor to the reading order)
             it.Next(RIL.BLOCK)
             continue
         LOG.info("Detected cell '%s': %s (%s)", ID, points,
                  membername(PT, block_type))
         region.add_TextRegion(cell)
         if rogroup:
             # reference the new cell in the reading order element
             rogroup.add_RegionRefIndexed(
                 RegionRefIndexedType(regionRef=ID, index=index))
         # advance both the cell counter and the iterator
         index += 1
         it.Next(RIL.BLOCK)
Example #2
0
 def _process_page(self, it, page, page_image, page_coords, page_id):
     """Add one region to ``page`` for every block of the iterator ``it``.

     Walks the layout iterator at ``RIL.BLOCK`` level, converts each
     block's outline via ``coordinates_for_segment`` and
     ``polygon_for_parent``, and adds a region whose class depends on
     the iterator's ``BlockType()``: text, image, separator, maths,
     table, or (for everything else) noise. All regions except
     separators and noise are also referenced in the page's top-level
     ``OrderedGroup`` of the reading order, which is created if missing
     and removed again if it ends up empty.

     Reads ``self.parameter['padding']``, ``['crop_polygons']`` and
     ``['sparse_text']``.

     Args:
         it: layout iterator offering ``Empty``, ``BoundingBox``,
             ``BlockPolygon``, ``BlockType`` and ``Next`` (presumably
             a tesserocr page iterator -- confirm against the caller).
             May be falsy, in which case no regions are added.
         page: page element that receives the regions and the
             reading order.
         page_image: image handed to ``coordinates_for_segment``
             together with ``page_coords`` for converting polygons.
         page_coords: coordinate context of ``page_image``.
         page_id: not used by this method.
     """
     LOG = getLogger('processor.TesserocrSegmentRegion')
     # equivalent to GetComponentImages with raw_image=True,
     # (which would also give raw coordinates),
     # except we are also interested in the iterator's BlockType() here,
     # and its BlockPolygon()
     index = 0
     ro = page.get_ReadingOrder()
     if not ro:
         ro = ReadingOrderType()
         page.set_ReadingOrder(ro)
     og = ro.get_OrderedGroup()
     if og:
         # start counting from largest existing index
         for elem in (og.get_RegionRefIndexed() +
                      og.get_OrderedGroupIndexed() +
                      og.get_UnorderedGroupIndexed()):
             if elem.index >= index:
                 index = elem.index + 1
     else:
         # new top-level group
         og = OrderedGroupType(id="reading-order")
         ro.set_OrderedGroup(og)
     while it and not it.Empty(RIL.BLOCK):
         # (padding will be passed to both BoundingBox and GetImage)
         # (actually, Tesseract honours padding only on the left and bottom,
         #  whereas right and top are increased less!)
         bbox = it.BoundingBox(RIL.BLOCK, padding=self.parameter['padding'])
         # sometimes these polygons are not planar, which causes
         # PIL.ImageDraw.Draw.polygon (and likely others as well)
         # to misbehave; however, PAGE coordinate semantics prohibit
         # multi-path polygons!
         # (probably a bug in Tesseract itself, cf. tesseract#2826):
         if self.parameter['crop_polygons']:
             polygon = it.BlockPolygon()
         else:
             polygon = polygon_from_x0y0x1y1(bbox)
         polygon = coordinates_for_segment(polygon, page_image, page_coords)
         polygon2 = polygon_for_parent(polygon, page)
         if polygon2 is None:
             # no usable polygon w.r.t. the parent: skip this block
             # (the message shows the polygon before that adjustment)
             LOG.info('Ignoring extant region: %s',
                      points_from_polygon(polygon))
             it.Next(RIL.BLOCK)
             continue
         points = points_from_polygon(polygon2)
         coords = CoordsType(points=points)
         ID = "region%04d" % index
         # Separator and Noise regions do not belong into the
         # reading order; every other region type gets referenced
         in_reading_order = True
         #
         # region type switch
         #
         block_type = it.BlockType()
         if block_type in (
                 PT.FLOWING_TEXT,
                 PT.HEADING_TEXT,
                 PT.PULLOUT_TEXT,
                 PT.CAPTION_TEXT,
                 # TABLE is contained in PTIsTextType, but
                 # it is a bad idea to create a TextRegion
                 # for it (better set `find_tables` False):
                 # PT.TABLE,
                 # will also get a 90° @orientation
                 # (but that can be overridden by deskew/OSD):
                 PT.VERTICAL_TEXT
         ):
             region = TextRegionType(id=ID,
                                     Coords=coords,
                                     type=TextTypeSimpleType.PARAGRAPH)
             if block_type == PT.VERTICAL_TEXT:
                 region.set_orientation(90.0)
             elif block_type == PT.HEADING_TEXT:
                 region.set_type(TextTypeSimpleType.HEADING)
             elif block_type == PT.PULLOUT_TEXT:
                 region.set_type(TextTypeSimpleType.FLOATING)
             elif block_type == PT.CAPTION_TEXT:
                 region.set_type(TextTypeSimpleType.CAPTION)
             page.add_TextRegion(region)
             if self.parameter['sparse_text']:
                 # in sparse mode, the type is overridden and the region
                 # gets a single line with the very same coordinates
                 region.set_type(TextTypeSimpleType.OTHER)
                 region.add_TextLine(
                     TextLineType(id=region.id + '_line', Coords=coords))
         elif block_type in (
                 PT.FLOWING_IMAGE, PT.HEADING_IMAGE, PT.PULLOUT_IMAGE
         ):
             region = ImageRegionType(id=ID, Coords=coords)
             page.add_ImageRegion(region)
         elif block_type in (PT.HORZ_LINE, PT.VERT_LINE):
             region = SeparatorRegionType(id=ID, Coords=coords)
             page.add_SeparatorRegion(region)
             in_reading_order = False
         elif block_type in (PT.INLINE_EQUATION, PT.EQUATION):
             region = MathsRegionType(id=ID, Coords=coords)
             page.add_MathsRegion(region)
         elif block_type == PT.TABLE:
             # without API access to StructuredTable we cannot
             # do much for a TableRegionType (i.e. nrows, ncols,
             # coordinates of cells for recursive regions etc),
             # but this can be achieved afterwards by segment-table
             region = TableRegionType(id=ID, Coords=coords)
             page.add_TableRegion(region)
         else:
             region = NoiseRegionType(id=ID, Coords=coords)
             # the region must be passed explicitly, otherwise
             # it would never be attached to the page
             page.add_NoiseRegion(region)
             in_reading_order = False
         if in_reading_order:
             # add the region reference in the reading order element
             og.add_RegionRefIndexed(
                 RegionRefIndexedType(regionRef=ID, index=index))
         LOG.info("Detected region '%s': %s (%s)", ID, points,
                  membername(PT, block_type))
         #
         # iterator increment
         # (the index advances for all regions, whether or not they
         #  are in the reading order, so that IDs stay unique)
         #
         index += 1
         it.Next(RIL.BLOCK)
     if (not og.get_RegionRefIndexed() and not og.get_OrderedGroupIndexed()
             and not og.get_UnorderedGroupIndexed()):
         # schema forbids empty OrderedGroup
         ro.set_OrderedGroup(None)