Ejemplo n.º 1
0
    def process(self):
        """
        Segment every input page into text lines with ocropy.

        For each input file the PAGE document is loaded, the page image is
        read as a binary array and flipped (``1 - binary``), a segmentation
        is computed from it, and one ``TextLine`` per extracted line is
        attached to a single page-sized ``TextRegion`` with id "dummy".
        The resulting PAGE document is added to the workspace under the
        output file group.
        """
        for n, input_file in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            downloaded_file = self.workspace.download_file(input_file)
            log.info("downloaded_file %s", downloaded_file)
            pcgts = page_from_file(downloaded_file)
            page = pcgts.get_Page()
            page_width = page.get_imageWidth()
            page_height = page.get_imageHeight()
            # TODO binarized variant from get_AlternativeImage()
            image_url = page.imageFilename
            log.info("pcgts %s", pcgts)

            image_path = self.workspace.download_url(image_url)
            binary = 1 - ocrolib.read_image_binary(image_path)

            # a configured scale of 0 means: estimate it from the image
            scale = self.parameter['scale']
            if scale == 0:
                scale = psegutils.estimate_scale(binary)
            log.debug(binary)

            pseg = self.compute_segmentation(binary, scale)
            log.debug("pseg=%s", pseg)

            # TODO reading order / enumber
            #  log.debug("finding reading order")
            #  lines = psegutils.compute_lines(pseg, scale)
            #  order = psegutils.reading_order([l.bounds for l in lines])
            #  lsort = psegutils.topsort(order)

            regions = ocrolib.RegionExtractor()
            regions.setPageLines(pseg)

            # a single region spanning the full page holds all the lines
            page_outline = "0,0 %s,0 %s,%s 0,%s" % (
                page_width, page_width, page_height, page_height)
            dummyRegion = TextRegionType(
                id="dummy", Coords=CoordsType(points=page_outline))
            page.add_TextRegion(dummyRegion)

            # index 0 is skipped -- presumably the background label
            # (NOTE(review): confirm against ocrolib.RegionExtractor)
            for lineno in range(1, regions.length()):
                line_label = regions.id(lineno)
                line_bbox = regions.bbox(lineno)
                log.debug("id=%s bbox=%s", line_label, line_bbox)
                line_coords = CoordsType(
                    points=points_from_y0x0y1x1(line_bbox))
                dummyRegion.add_TextLine(
                    TextLineType(id=concat_padded("line", lineno),
                                 Coords=line_coords))

            ID = concat_padded(self.output_file_grp, n)
            output_path = "%s/%s.xml" % (self.output_file_grp, ID)
            self.workspace.add_file(ID=ID,
                                    file_grp=self.output_file_grp,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=output_path,
                                    content=to_xml(pcgts))
Ejemplo n.º 2
0
 def _process_region(self, it, region, rogroup, region_image,
                     region_coords):
     """Add the text blocks found by iterator ``it`` as sub-regions of ``region``.

     Walks ``it`` at ``RIL.BLOCK`` level. Every block whose type is flowing,
     heading, pull-out, caption or vertical text becomes a ``TextRegionType``
     child of ``region`` and, if ``rogroup`` is given, also gets a
     ``RegionRefIndexedType`` entry there. Blocks of any other type are
     skipped.

     Args:
         it: layout iterator providing ``Empty``, ``BoundingBox``,
             ``BlockType`` and ``Next``; a falsy value means nothing to do.
         region: parent region; its ``id`` is the prefix of the new ids.
         rogroup: reading-order group to extend, or a falsy value.
         region_image: forwarded to ``coordinates_for_segment``.
         region_coords: forwarded to ``coordinates_for_segment``.
     """
     LOG = getLogger('processor.TesserocrSegmentTable')
     # equivalent to GetComponentImages with raw_image=True,
     # (which would also give raw coordinates),
     # except we are also interested in the iterator's BlockType() here,

     # continue numbering after the highest index already in the group
     index = 0
     if rogroup:
         for elem in (rogroup.get_RegionRefIndexed() +
                      rogroup.get_OrderedGroupIndexed() +
                      rogroup.get_UnorderedGroupIndexed()):
             if elem.index >= index:
                 index = elem.index + 1
     while it and not it.Empty(RIL.BLOCK):
         bbox = it.BoundingBox(RIL.BLOCK)
         polygon = polygon_from_x0y0x1y1(bbox)
         polygon = coordinates_for_segment(polygon, region_image,
                                           region_coords)
         points = points_from_polygon(polygon)
         coords = CoordsType(points=points)
         #
         # add the region reference in the reading order element
         # (but ignore non-text regions entirely)
         ID = region.id + "_%04d" % index
         subregion = TextRegionType(id=ID, Coords=coords)
         # Set the default type through the setter rather than a ``type=``
         # constructor keyword: the generated PAGE classes presumably name
         # that parameter ``type_``, so ``type=`` would not be applied
         # (NOTE(review): confirm against the TextRegionType constructor).
         subregion.set_type(TextTypeSimpleType.PARAGRAPH)
         block_type = it.BlockType()
         if block_type == PT.FLOWING_TEXT:
             pass
         elif block_type == PT.HEADING_TEXT:
             subregion.set_type(TextTypeSimpleType.HEADING)
         elif block_type == PT.PULLOUT_TEXT:
             subregion.set_type(TextTypeSimpleType.FLOATING)
         elif block_type == PT.CAPTION_TEXT:
             subregion.set_type(TextTypeSimpleType.CAPTION)
         elif block_type == PT.VERTICAL_TEXT:
             subregion.set_orientation(90.0)
         else:
             # not a text block: advance without consuming an index
             it.Next(RIL.BLOCK)
             continue
         LOG.info("Detected cell '%s': %s (%s)", ID, points,
                  membername(PT, block_type))
         region.add_TextRegion(subregion)
         if rogroup:
             rogroup.add_RegionRefIndexed(
                 RegionRefIndexedType(regionRef=ID, index=index))
         #
         # iterator increment
         #
         index += 1
         it.Next(RIL.BLOCK)
    def _process_segment(self, page_image, page, page_xywh, page_id,
                         input_file, n, mrcnn_model, class_names):
        """Detect blocks with ``mrcnn_model`` and add them to ``page``.

        Each detection is cropped out of the page image, saved into the
        image file group, and added to ``page`` as a ``TextRegionType``
        whose ``type_`` is the detected class name and whose
        ``AlternativeImage`` references the saved crop.

        Args:
            page_image: PIL image of the page.
            page: PAGE page object receiving the new regions.
            page_xywh: dict whose ``'features'`` string is extended with
                ``',blksegmented'``.
            page_id: page identifier passed on when saving the crops.
            input_file: input file entry; its ``ID`` seeds the crop file ids.
            n: running number used as fallback for the crop file id.
            mrcnn_model: model providing ``detect``.
            class_names: sequence mapping class ids to region type names.
        """
        img_array = ocrolib.pil2array(page_image)
        results = mrcnn_model.detect([img_array], verbose=1)
        r = results[0]

        page_xywh['features'] += ',blksegmented'

        # image arrays are indexed (row, column), so axis 1 is the width
        width = img_array.shape[1]

        # the base file id does not depend on the individual detection
        file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.image_grp, n)

        for i, roi in enumerate(r['rois']):
            # Mask R-CNN boxes are (y1, x1, y2, x2); the crop below relies
            # on the same row-first layout
            min_y, min_x, max_y, max_x = roi
            class_id = r['class_ids'][i]

            # small post-processing for class 2 (paragraph, per the original
            # comment) so the first/last letters of a line are not cut off:
            # widen the box horizontally where it still fits into the image
            if class_id == 2:
                if min_x - 5 > 0:
                    min_x -= 5
                if max_x + 10 < width:
                    max_x += 10

            # extract from points and img_array
            region_img = ocrolib.array2pil(
                img_array[min_y:max_y, min_x:max_x])

            file_path = self.workspace.save_image_file(region_img,
                                                       file_id + "_" + str(i),
                                                       page_id=page_id,
                                                       file_grp=self.image_grp)

            ai = AlternativeImageType(filename=file_path,
                                      comments=page_xywh['features'])
            # PAGE points are "x,y" pairs, listed clockwise from top-left
            coords = CoordsType(
                points="%i,%i %i,%i %i,%i %i,%i" %
                (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y))
            textregion = TextRegionType(Coords=coords,
                                        type_=class_names[class_id])
            textregion.add_AlternativeImage(ai)
            page.add_TextRegion(textregion)
Ejemplo n.º 4
0
 def render_text_region(self, text_region: TextRegionType) -> None:
     """Render every line, word and glyph of ``text_region``, depth first.

     Each element is passed to ``self.render_type`` before its children
     are fetched and visited.
     """
     def visit(node, child_getters):
         # render the node itself, then descend one level if there is one
         self.render_type(node)
         if child_getters:
             for child in getattr(node, child_getters[0])():
                 visit(child, child_getters[1:])

     for text_line in text_region.get_TextLine():
         visit(text_line, ("get_Word", "get_Glyph"))
Ejemplo n.º 5
0
    def process(self):
        """
        Segment every input page into text lines with kraken.

        For each input file the PAGE document is loaded, the page image is
        resolved as a PIL image and passed to ``segment`` (optionally
        followed by ``detect_scripts``). Every resulting box becomes a
        ``TextLine`` inside one new, empty ``TextRegion``; the document is
        then added to the workspace as UTF-8 encoded XML.
        """
        log = getLogger('processor.KrakenSegment')
        for n, input_file in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            downloaded_file = self.workspace.download_file(input_file)
            log.info("downloaded_file %s", downloaded_file)
            pcgts = page_from_file(downloaded_file)
            page = pcgts.get_Page()
            # TODO binarized variant from get_AlternativeImage()
            image_url = page.imageFilename
            log.info("pcgts %s", pcgts)

            im = self.workspace.resolve_image_as_pil(image_url)

            log.info('Segmenting')
            log.info('Params %s', self.parameter)
            params = self.parameter
            res = segment(im,
                          params['text_direction'],
                          params['scale'],
                          params['maxcolseps'],
                          params['black_colseps'])
            if params['script_detect']:
                res = detect_scripts(im, res)

            # all lines are collected in one anonymous region
            dummyRegion = TextRegionType()
            page.add_TextRegion(dummyRegion)
            for lineno, box in enumerate(res['boxes']):
                line_coords = CoordsType(points=points_from_x0y0x1y1(box))
                dummyRegion.add_TextLine(
                    TextLineType(id=concat_padded("line", lineno),
                                 Coords=line_coords))

            ID = concat_padded(self.output_file_grp, n)
            output_path = "%s/%s.xml" % (self.output_file_grp, ID)
            self.workspace.add_file(self.output_file_grp,
                                    pageId=input_file.pageId,
                                    ID=ID,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=output_path,
                                    content=to_xml(pcgts).encode('utf-8'))
Ejemplo n.º 6
0
def test_alternative_image_additions():
    """An AlternativeImage can be attached at every level of the PAGE tree."""
    pcgts = PcGtsType(pcGtsId="foo")
    assert pcgts.pcGtsId == 'foo'

    # act
    # Page/AlternativeImage
    page = PageType()
    pcgts.set_Page(page)
    page.add_AlternativeImage(AlternativeImageType())

    # TextRegion, TextLine, Word, Glyph: each one is created, attached to
    # the element one level above it, and then given an AlternativeImage
    nesting = (
        (TextRegionType, 'add_TextRegion'),
        (TextLineType, 'add_TextLine'),
        (WordType, 'add_Word'),
        (GlyphType, 'add_Glyph'),
    )
    parent = page
    for element_class, adder_name in nesting:
        element = element_class()
        getattr(parent, adder_name)(element)
        element.add_AlternativeImage(AlternativeImageType())
        parent = element
Ejemplo n.º 7
0
    def process(self):
        """
        Performs the region segmentation.

        For every input file, the page image is handed to tesseract and
        each block-level component is recorded twice in the PAGE document:
        as a ``RegionRefIndexed`` in the "reading-order" ordered group
        (both created on first use) and as a ``TextRegion`` with the
        component's coordinates. The document is then added to the
        workspace as UTF-8 encoded XML.
        """
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            for n, input_file in enumerate(self.input_files):
                local_file = self.workspace.download_file(input_file)
                pcgts = page_from_file(local_file)
                page = pcgts.get_Page()
                image = self.workspace.resolve_image_as_pil(page.imageFilename)
                log.debug("Detecting regions with tesseract")
                tessapi.SetImage(image)
                components = tessapi.GetComponentImages(tesserocr.RIL.BLOCK,
                                                        True)
                for component in components:
                    # entry 1 is the box, entry 2 the block index
                    points = points_from_xywh(component[1])
                    index = component[2]

                    #
                    # the region reference in the reading order element
                    #
                    ID = "region%04d" % index
                    log.debug("Detected region '%s': %s", ID, points)
                    # <pg:ReadingOrder>
                    ro = page.get_ReadingOrder()
                    if ro is None:
                        ro = ReadingOrderType()
                        page.set_ReadingOrder(ro)
                    # <pg:OrderedGroup>
                    og = ro.get_OrderedGroup()
                    if og is None:
                        og = OrderedGroupType(id="reading-order")
                        ro.set_OrderedGroup(og)
                    # <pg:RegionRefIndexed>
                    region_ref = RegionRefIndexedType(regionRef=ID,
                                                      index=index)
                    og.add_RegionRefIndexed(region_ref)

                    #
                    #  text region
                    #
                    text_region = TextRegionType(
                        id=ID, Coords=CoordsType(points=points))
                    page.add_TextRegion(text_region)

                ID = concat_padded(self.output_file_grp, n)
                output_path = '%s/%s' % (self.output_file_grp, ID)
                self.workspace.add_file(
                    ID=ID,
                    file_grp=self.output_file_grp,
                    mimetype=MIMETYPE_PAGE,
                    local_filename=output_path,
                    content=to_xml(pcgts).encode('utf-8'),
                )
        def add_region(region: RectSegment, index: int, region_type: str):
            """Add ``region`` to the enclosing ``page`` under id ``region<index>``.

            ``region_type`` "text" yields a TextRegion, "image" an
            ImageRegion, and any other value a NoiseRegion.
            """
            from ocrd_utils import coordinates_for_segment, points_from_polygon
            outline = coordinates_for_segment(polygon_from_segment(region),
                                              page_image, page_coords)
            coords = CoordsType(points=points_from_polygon(outline))
            indexed_id = "region%04d" % index

            if region_type == "text":
                page.add_TextRegion(
                    TextRegionType(id=indexed_id, Coords=coords))
                return
            if region_type == "image":
                page.add_ImageRegion(
                    ImageRegionType(id=indexed_id, Coords=coords))
                return
            # everything that is neither text nor image is stored as noise
            page.add_NoiseRegion(
                NoiseRegionType(id=indexed_id, Coords=coords))
Ejemplo n.º 9
0
 def add_region(region: Segment, index: int, type: str):
     """Add the rectangle ``region`` to the enclosing ``page``.

     The region gets the id ``region<index>`` (zero-padded to four digits).
     ``type`` "text" yields a TextRegion, "image" an ImageRegion, and any
     other value a NoiseRegion.

     Args:
         region: object with ``x_start``, ``y_start``, ``x_end`` and
             ``y_end`` attributes (assumed to be integral pixel values).
         index: running number used to build the region id.
         type: region kind, see above.
     """
     indexed_id = "region%04d" % index
     # PAGE expects points as space-separated "x,y" pairs; walk the corners
     # around the rectangle so the outline does not cross itself
     corners = (
         (region.x_start, region.y_start),
         (region.x_end, region.y_start),
         (region.x_end, region.y_end),
         (region.x_start, region.y_end),
     )
     points = " ".join("%d,%d" % corner for corner in corners)
     coords = CoordsType(points=points)
     if type == "text":
         page.add_TextRegion(
             TextRegionType(id=indexed_id, Coords=coords))
     elif type == "image":
         page.add_ImageRegion(
             ImageRegionType(id=indexed_id, Coords=coords))
     else:
         page.add_NoiseRegion(
             NoiseRegionType(id=indexed_id, Coords=coords))
Ejemplo n.º 10
0
    def _process_page(page, page_image, page_xywh, pageId, file_id):
        """Segment ``page_image`` and add the detected regions to ``page``.

        Regions are numbered ``region0000``, ``region0001``, ... in the order
        the segmentator returns them. Prediction 1 yields a TextRegion,
        prediction 2 an ImageRegion, and any other value a NoiseRegion.

        Args:
            page: PAGE page object receiving the regions.
            page_image: page image; converted with ``np.asarray``.
            page_xywh: not used here (see TODO below).
            pageId: not used here.
            file_id: not used here.
        """
        settings = SegmentationSettings(debug=False,
                                        enable_preprocessing=False)
        # TODO: does this still need to be cropped or do we not need page_xywh?
        #       Same for points below
        #       page_image[page_xywh["x"]:page_xywh["w"], page_xywh["y"]:page_xywh["h"]]
        regions, classification = Segmentator(settings).segmentate_image(
            np.asarray(page_image))

        def _page_points(ring):
            # PAGE expects space-separated integer "x,y" pairs. The outline
            # is implicitly closed, so a repeated end point is dropped
            # (presumably ``ring`` is a shapely exterior, which repeats
            # its first point -- NOTE(review): confirm).
            outline = list(ring)
            if len(outline) > 1 and outline[0] == outline[-1]:
                outline = outline[:-1]
            return " ".join("%d,%d" % (round(pt[0]), round(pt[1]))
                            for pt in outline)

        for count, (region, prediction) in enumerate(
                zip(regions, classification)):
            ID = "region%04d" % count
            coords = CoordsType(points=_page_points(region.exterior.coords))
            # FIXME: these are not all types in the model, also check if they match
            if prediction == 1:
                page.add_TextRegion(TextRegionType(id=ID, Coords=coords))
            elif prediction == 2:
                page.add_ImageRegion(ImageRegionType(id=ID, Coords=coords))
            else:
                page.add_NoiseRegion(NoiseRegionType(id=ID, Coords=coords))
Ejemplo n.º 11
0
 def test_alternativeImage(self):
     """An AlternativeImage can be attached at every level of the PAGE tree."""
     pcgts = PcGtsType(pcGtsId="foo")
     self.assertEqual(pcgts.pcGtsId, 'foo')

     # Page/AlternativeImage
     page = PageType()
     pcgts.set_Page(page)
     page.add_AlternativeImage(AlternativeImageType())

     # TextRegion, TextLine, Word, Glyph: create each element, hang it
     # below the previous one, then give it an AlternativeImage
     parent = page
     for element_class, adder_name in (
             (TextRegionType, 'add_TextRegion'),
             (TextLineType, 'add_TextLine'),
             (WordType, 'add_Word'),
             (GlyphType, 'add_Glyph'),
     ):
         element = element_class()
         getattr(parent, adder_name)(element)
         element.add_AlternativeImage(AlternativeImageType())
         parent = element
Ejemplo n.º 12
0
    def build_pagexml_full_layout(
            self, found_polygons_text_region, found_polygons_text_region_h,
            page_coord, order_of_texts, id_of_texts,
            all_found_texline_polygons, all_found_texline_polygons_h,
            all_box_coord, all_box_coord_h, found_polygons_text_region_img,
            found_polygons_tables, found_polygons_drop_capitals,
            found_polygons_marginals, all_found_texline_polygons_marginals,
            all_box_coord_marginals, slopes, slopes_h, slopes_marginals,
            cont_page, polygons_lines_to_be_written_in_xml):
        """Build the PAGE document for the full layout and return it.

        Uses ``self.pcgts`` if set, otherwise creates a new document. The
        page border and the reading order are written first; then regions
        are added in this order, all drawing their ids from one counter:
        paragraphs, headers, marginalia (each with their text lines), drop
        capitals, image regions, separator regions and table regions.
        """
        self.logger.debug('enter build_pagexml_full_layout')

        # create the file structure
        if self.pcgts:
            pcgts = self.pcgts
        else:
            pcgts = create_page_xml(self.image_filename, self.height_org,
                                    self.width_org)
        page = pcgts.get_Page()
        border_points = self.calculate_page_coords(cont_page)
        page.set_Border(BorderType(Coords=CoordsType(points=border_points)))

        counter = EynollahIdCounter()
        # ids for marginalia in the reading order start after the text ids
        _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts))
        id_of_marginalia = [
            _counter_marginals.next_region_id for _ in found_polygons_marginals
        ]
        xml_reading_order(page, order_of_texts, id_of_marginalia)

        def coords_of(polygon, offset=page_coord):
            # wrap a polygon (shifted by ``offset``) into a Coords element
            return CoordsType(
                points=self.calculate_polygon_coords(polygon, offset))

        for mm, polygon in enumerate(found_polygons_text_region):
            region_id = counter.next_region_id
            textregion = TextRegionType(id=region_id,
                                        type_='paragraph',
                                        Coords=coords_of(polygon))
            page.add_TextRegion(textregion)
            self.serialize_lines_in_region(textregion,
                                           all_found_texline_polygons, mm,
                                           page_coord, all_box_coord, slopes,
                                           counter)

        self.logger.debug('len(found_polygons_text_region_h) %s',
                          len(found_polygons_text_region_h))
        for mm, polygon in enumerate(found_polygons_text_region_h):
            region_id = counter.next_region_id
            textregion = TextRegionType(id=region_id,
                                        type_='header',
                                        Coords=coords_of(polygon))
            page.add_TextRegion(textregion)
            self.serialize_lines_in_region(textregion,
                                           all_found_texline_polygons_h, mm,
                                           page_coord, all_box_coord_h,
                                           slopes_h, counter)

        for mm, polygon in enumerate(found_polygons_marginals):
            region_id = counter.next_region_id
            marginal = TextRegionType(id=region_id,
                                      type_='marginalia',
                                      Coords=coords_of(polygon))
            page.add_TextRegion(marginal)
            self.serialize_lines_in_marginal(
                marginal, all_found_texline_polygons_marginals, mm, page_coord,
                all_box_coord_marginals, slopes_marginals, counter)

        for polygon in found_polygons_drop_capitals:
            region_id = counter.next_region_id
            page.add_TextRegion(
                TextRegionType(id=region_id,
                               type_='drop-capital',
                               Coords=coords_of(polygon)))

        for polygon in found_polygons_text_region_img:
            region_id = counter.next_region_id
            page.add_ImageRegion(
                ImageRegionType(id=region_id, Coords=coords_of(polygon)))

        # separator polygons are written with a zero offset instead of
        # page_coord
        # NOTE(review): these are added as separator regions but built with
        # ImageRegionType -- check whether a separator region type is meant
        for polygon in polygons_lines_to_be_written_in_xml:
            region_id = counter.next_region_id
            page.add_SeparatorRegion(
                ImageRegionType(id=region_id,
                                Coords=coords_of(polygon, [0, 0, 0, 0])))

        for polygon in found_polygons_tables:
            region_id = counter.next_region_id
            page.add_TableRegion(
                TableRegionType(id=region_id, Coords=coords_of(polygon)))

        return pcgts
Ejemplo n.º 13
0
    def _process_segment(self, page_image, page, page_xywh, page_id,
                         input_file, mask, dpi):
        LOG = getLogger('processor.AnybaseocrBlockSegmenter')
        # check for existing text regions and whether to overwrite them
        if page.get_TextRegion() or page.get_TableRegion():
            if self.parameter['overwrite']:
                LOG.info('removing existing text/table regions in page "%s"',
                         page_id)
                page.set_TextRegion([])
            else:
                LOG.warning('keeping existing text/table regions in page "%s"',
                            page_id)
        # check if border exists
        border_polygon = None
        if page.get_Border():
            border_coords = page.get_Border().get_Coords()
            border_points = polygon_from_points(border_coords.get_points())
            border_polygon = Polygon(border_points)

        LOG.info('detecting regions on page "%s"', page_id)
        img_array = ocrolib.pil2array(page_image)
        if len(img_array.shape) <= 2:
            img_array = np.stack((img_array, ) * 3, axis=-1)
        # convert to incidence matrix
        class_ids = np.array([[
            1 if category in self.parameter['active_classes'] else 0
            for category in CLASS_NAMES
        ]],
                             dtype=np.int32)
        results = self.mrcnn_model.detect([img_array],
                                          verbose=0,
                                          active_class_ids=class_ids)
        r = results[0]
        LOG.info('found %d candidates on page "%s"', len(r['rois']), page_id)

        th = self.parameter['th']
        # check for existing semgentation mask
        # this code executes only when the workflow had tiseg run before with use_deeplr=true
        if mask:
            mask = ocrolib.pil2array(mask)
            mask = mask // 255
            mask = 1 - mask
            # multiply all the bounding box part with 2
            for i in range(len(r['rois'])):

                min_y, min_x, max_y, max_x = r['rois'][i]
                mask[min_y:max_y, min_x:max_x] *= i + 2

            # check for left over pixels and add them to the bounding boxes
            pixel_added = True

            while pixel_added:

                pixel_added = False
                left_over = np.where(mask == 1)
                for y, x in zip(left_over[0], left_over[1]):
                    local_mask = mask[y - th:y + th, x - th:x + th]
                    candidates = np.where(local_mask > 1)
                    candidates = [k for k in zip(candidates[0], candidates[1])]
                    if len(candidates) > 0:
                        pixel_added = True
                        # find closest pixel with x>1
                        candidates.sort(key=lambda j: np.sqrt((j[0] - th)**2 +
                                                              (j[1] - th)**2))
                        index = local_mask[candidates[0]] - 2

                        # add pixel to mask/bbox
                        # y,x to bbox with index
                        if y < r['rois'][index][0]:
                            r['rois'][index][0] = y

                        elif y > r['rois'][index][2]:
                            r['rois'][index][2] = y

                        if x < r['rois'][index][1]:
                            r['rois'][index][1] = x

                        elif x > r['rois'][index][3]:
                            r['rois'][index][3] = x

                        # update the mask
                        mask[y, x] = index + 2

        for i in range(len(r['rois'])):
            class_id = r['class_ids'][i]
            if class_id >= len(CLASS_NAMES):
                raise Exception(
                    'Unexpected class id %d - model does not match' % class_id)

        # find hull contours on masks
        if self.parameter['use_masks']:
            r.setdefault('polygons', list())
            # estimate glyph scale (roughly)
            scale = int(dpi / 6)
            scale = scale + (scale + 1) % 2  # odd
            for i in range(len(r['rois'])):
                mask = r['masks'][:, :, i]
                mask = cv2.dilate(mask.astype(np.uint8),
                                  np.ones((scale, scale), np.uint8)) > 0
                # close mask until we have a single outer contour
                contours = None
                for _ in range(10):
                    mask = cv2.morphologyEx(
                        mask.astype(np.uint8), cv2.MORPH_CLOSE,
                        np.ones((scale, scale), np.uint8)) > 0
                    contours, _ = cv2.findContours(mask.astype(np.uint8),
                                                   cv2.RETR_EXTERNAL,
                                                   cv2.CHAIN_APPROX_SIMPLE)
                    if len(contours) == 1:
                        break
                r['polygons'].append(Polygon(
                    contours[0][:, 0, :]))  # already in x,y order

        # to reduce overlaps, apply IoU-based non-maximum suppression
        # (and other post-processing against overlaps) across classes,
        # but not on the raw pixels, but the smoothed hull polygons
        LOG.info('post-processing detections on page "%s"', page_id)
        worse = []
        if self.parameter['post_process']:
            active = True

            def _merge_rois(i, j):
                """merges i into j"""
                nonlocal r, active
                r['rois'][j][0] = min(r['rois'][i][0], r['rois'][j][0])
                r['rois'][j][1] = min(r['rois'][i][1], r['rois'][j][1])
                r['rois'][j][2] = max(r['rois'][i][2], r['rois'][j][2])
                r['rois'][j][3] = max(r['rois'][i][3], r['rois'][j][3])
                r['polygons'][j] = r['polygons'][i].union(r['polygons'][j])
                #r['scores'][j] = max(r['scores'][i], r['scores'][i])
                active = True

            # find overlapping pairs
            while active:
                active = False
                for i in range(len(r["class_ids"])):
                    if i in worse:
                        continue
                    for j in range(i + 1, len(r['class_ids'])):
                        if j in worse:
                            continue
                        iclass = r['class_ids'][i]
                        jclass = r['class_ids'][j]
                        iname = CLASS_NAMES[iclass]
                        jname = CLASS_NAMES[jclass]
                        if (iname == 'drop-capital') != (jname
                                                         == 'drop-capital'):
                            # ignore drop-capital overlapping with others
                            continue
                        # rs todo: lower priority for footnote?
                        if (r['rois'][i][1] > r['rois'][j][3]
                                or r['rois'][i][3] < r['rois'][j][1]
                                or r['rois'][i][0] > r['rois'][j][2]
                                or r['rois'][i][2] < r['rois'][j][0]):
                            # no overlap (cut)
                            continue
                        iscore = r['scores'][i]
                        jscore = r['scores'][j]
                        if not self.parameter['use_masks']:
                            LOG.debug(
                                "roi %d[%s] overlaps roi %d[%s] and %s (replacing)",
                                i, iname, j, jname,
                                "looses" if iscore < jscore else "wins")
                            if iscore < jscore:
                                worse.append(i)
                                break
                            else:
                                worse.append(j)
                                continue
                        # compare masks
                        ipoly = r['polygons'][i]
                        jpoly = r['polygons'][j]
                        isize = ipoly.area
                        jsize = jpoly.area
                        inter = ipoly.intersection(jpoly).area
                        union = ipoly.union(jpoly).area
                        # LOG.debug("%d/%d %dpx/%dpx shared %dpx overall %dpx",
                        #           i, j, isize, jsize, inter, union)
                        if inter / isize > self.parameter['min_share_drop']:
                            LOG.debug(
                                "roi %d[%s] contains roi %d[%s] (replacing)",
                                j, jname, i, iname)
                            worse.append(i)
                            break
                        elif inter / jsize > self.parameter['min_share_drop']:
                            LOG.debug(
                                "roi %d[%s] contains roi %d[%s] (replacing)",
                                i, iname, j, jname)
                            worse.append(j)
                        elif inter / union > self.parameter['min_iou_drop']:
                            LOG.debug(
                                "roi %d[%s] heavily overlaps roi %d[%s] and %s (replacing)",
                                i, iname, j, jname,
                                "looses" if iscore < jscore else "wins")
                            if iscore < jscore:
                                worse.append(i)
                                break
                            else:
                                worse.append(j)
                        elif inter / isize > self.parameter['min_share_merge']:
                            LOG.debug("roi %d[%s] covers roi %d[%s] (merging)",
                                      j, jname, i, iname)
                            worse.append(i)
                            _merge_rois(i, j)
                            break
                        elif inter / jsize > self.parameter['min_share_merge']:
                            LOG.debug("roi %d[%s] covers roi %d[%s] (merging)",
                                      i, iname, j, jname)
                            worse.append(j)
                            _merge_rois(j, i)
                        elif inter / union > self.parameter['min_iou_merge']:
                            LOG.debug(
                                "roi %d[%s] slightly overlaps roi %d[%s] and %s (merging)",
                                i, iname, j, jname,
                                "looses" if iscore < jscore else "wins")
                            if iscore < jscore:
                                worse.append(i)
                                _merge_rois(i, j)
                                break
                            else:
                                worse.append(j)
                                _merge_rois(j, i)

        # define reading order on basis of coordinates
        partial_order = np.zeros((len(r['rois']), len(r['rois'])), np.uint8)
        for i, (min_y_i, min_x_i, max_y_i, max_x_i) in enumerate(r['rois']):
            for j, (min_y_j, min_x_j, max_y_j,
                    max_x_j) in enumerate(r['rois']):
                if min_x_i < max_x_j and max_x_i > min_x_j:
                    # xoverlaps
                    if min_y_i < min_y_j:
                        partial_order[i, j] = 1
                else:
                    min_y = min(min_y_i, min_y_j)
                    max_y = max(max_y_i, max_y_j)
                    min_x = min(min_x_i, min_x_j)
                    max_x = max(max_x_i, max_x_j)
                    if next(
                        (False
                         for (min_y_k, min_x_k, max_y_k, max_x_k) in r['rois']
                         if (min_y_k < max_y and max_y_k > min_y
                             and min_x_k < max_x and max_x_k > min_x)), True):
                        # no k in between
                        if ((min_y_j + max_y_j) / 2 < min_y_i
                                and (min_y_i + max_y_i) / 2 > max_y_j):
                            # vertically unrelated
                            partial_order[j, i] = 1
                        elif max_x_i < min_x_j:
                            partial_order[i, j] = 1

        def _topsort(po):
            """Order node indices so that predecessors come first.

            ``po`` is a square matrix; a nonzero entry ``po[a, b]`` is read as
            "``a`` precedes ``b``". Nodes are visited in index order, each one
            being emitted only after all of its predecessors (depth-first).
            Cycles do not cause infinite recursion, because a node is marked
            as visited before its predecessors are explored.

            Returns a list of plain ``int`` indices covering every row of ``po``.
            """
            # Use the builtin ``bool``: the ``np.bool`` alias was deprecated in
            # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
            visited = np.zeros(po.shape[0], bool)
            result = list()

            def _visit(k):
                if visited[k]:
                    return
                visited[k] = True
                # column k holds the nodes that must precede k
                for pred in np.nonzero(po[:, k])[0]:
                    # normalise NumPy integers so the result holds plain ints
                    _visit(int(pred))
                result.append(k)

            for k in range(po.shape[0]):
                _visit(k)
            return result

        reading_order = _topsort(partial_order)

        # Creating Reading Order object in PageXML
        order_group = OrderedGroupType(caption="Regions reading order",
                                       id=page_id)
        reading_order_object = ReadingOrderType()
        reading_order_object.set_OrderedGroup(order_group)
        page.set_ReadingOrder(reading_order_object)

        for i in range(len(r['rois'])):
            width, height, _ = img_array.shape
            min_y, min_x, max_y, max_x = r['rois'][i]
            score = r['scores'][i]
            class_id = r['class_ids'][i]
            class_name = CLASS_NAMES[class_id]
            if i in worse:
                LOG.debug(
                    "Ignoring instance %d[%s] overlapping better/larger neighbour",
                    i, class_name)
                continue

            if self.parameter['use_masks']:
                region_polygon = r['polygons'][i].exterior.coords[:-1]
            else:
                region_polygon = polygon_from_bbox(
                    max(min_x - 5, 0) if class_name == 'paragraph' else min_x,
                    min_y,
                    min(max_x +
                        10, width) if class_name == 'paragraph' else max_x,
                    max_y)

            # convert to absolute coordinates
            region_polygon = coordinates_for_segment(region_polygon,
                                                     page_image, page_xywh)
            # intersect with parent and plausibilize
            cut_region_polygon = Polygon(region_polygon)
            if border_polygon:
                cut_region_polygon = border_polygon.intersection(
                    cut_region_polygon)
            if cut_region_polygon.is_empty:
                LOG.warning('region %d does not intersect page frame', i)
                continue
            if not cut_region_polygon.is_valid:
                LOG.warning('region %d has invalid polygon', i)
                continue
            region_polygon = cut_region_polygon.exterior.coords[:-1]
            region_coords = CoordsType(points_from_polygon(region_polygon),
                                       conf=score)
            read_order = reading_order.index(i)
            region_args = {
                'custom': 'readingOrder {index:' + str(read_order) + ';}',
                'id': 'region%04d' % i,
                'Coords': region_coords
            }
            if class_name == 'image':
                image_region = ImageRegionType(**region_args)
                page.add_ImageRegion(image_region)
            elif class_name == 'table':
                table_region = TableRegionType(**region_args)
                page.add_TableRegion(table_region)
            elif class_name == 'graphics':
                graphic_region = GraphicRegionType(**region_args)
                page.add_GraphicRegion(graphic_region)
            else:
                region_args['type_'] = class_name
                textregion = TextRegionType(**region_args)
                page.add_TextRegion(textregion)
            order_index = reading_order.index(i)
            regionRefIndex = RegionRefIndexedType(index=order_index,
                                                  regionRef=region_args['id'])
            order_group.add_RegionRefIndexed(regionRefIndex)
            LOG.info('added %s region on page "%s"', class_name, page_id)
    def process(self):
        """Performs region segmentation by reading from COCO annotations.

        Open and deserialize the COCO JSON file from the second input file group.
        (It lists region categories/subtypes, file names and segmentations for all pages.)

        Open and deserialize each PAGE input file (or generate from image input file)
        from the first input file group. Now find this page in COCO:
        - try to match the PAGE ``imageFilename`` to some COCO ``file_name``,
          otherwise
        - try to match the numeric part of the METS physical page ID to some
          COCO ``id``, otherwise
        - skip with an error.

        Then create and add a region for each ``segmentation``, converting its polygon
        to coordinate points and its COCO category to a region type (and subtype),
        either for a PubLayNet classification or PAGE classification (as produced by
        ocrd-segment-extract-pages), as indicated by ``source``.

        Produce a new output file by serialising the resulting hierarchy.

        Afterwards, if there are still COCO images left unaccounted for (i.e. without
        corresponding input files), then show a warning.

        Raises:
            Exception: if no COCO file can be located via the second file group,
                or if an annotation uses a category that cannot be mapped to
                a region type.
        """
        LOG = getLogger('processor.ImportCOCOSegmentation')
        # Load JSON
        assert_file_grp_cardinality(self.input_file_grp, 2, 'base and COCO')
        # pylint: disable=attribute-defined-outside-init
        self.input_file_grp, coco_grp = self.input_file_grp.split(',')
        if not self.input_files:
            LOG.warning('No input files to process')
            return
        if coco_grp in self.workspace.mets.file_groups:
            try:
                # NOTE: the mimetype is not checked; the first file of the
                # group that is not bound to a page is taken as the COCO file
                cocofile = next(
                    f for f in self.workspace.mets.find_files(fileGrp=coco_grp)
                    if not f.pageId)
            except StopIteration:
                raise Exception(
                    "no non-page-specific file in second file group (COCO file)",
                    coco_grp) from None
            cocofile = self.workspace.download_file(cocofile).local_filename
        elif os.path.isfile(coco_grp):
            # second "file group" may also be a plain path to the JSON file
            cocofile = coco_grp
        else:
            raise Exception("file not found in second file group (COCO file)",
                            coco_grp)

        LOG.info('Loading COCO annotations from "%s" into memory...', cocofile)
        # JSON is UTF-8 by specification; do not depend on the locale encoding
        with open(cocofile, 'r', encoding='utf-8') as inp:
            coco = json.load(inp)
        LOG.info('Loaded JSON for %d images with %d regions in %d categories',
                 len(coco['images']), len(coco['annotations']),
                 len(coco['categories']))
        coco_source = 'PubLayNet'
        # Convert to usable dicts
        # classes:
        categories = dict()
        subcategories = dict()
        for cat in coco['categories']:
            # 'source' is optional: categories without it (plain COCO with
            # only id/name/supercategory) must not raise a KeyError
            if cat.get('source') == 'PAGE':
                coco_source = 'PAGE'
            if cat.get('supercategory'):
                categories[cat['id']] = cat['supercategory']
                subcategories[cat['id']] = cat['name']
            else:
                categories[cat['id']] = cat['name']
        # images and annotations:
        images_by_id = dict()
        images_by_filename = dict()
        for image in coco['images']:
            images_by_id[image['id']] = image
            images_by_filename[image['file_name']] = image
        for annotation in coco['annotations']:
            image = images_by_id[annotation['image_id']]
            regions = image.setdefault('regions', list())
            regions.append(annotation)
        del coco

        # PubLayNet text-like categories and the PAGE text type they map to
        publaynet_text_types = {
            'text': TextTypeSimpleType.PARAGRAPH,
            'title': TextTypeSimpleType.HEADING,  # CAPTION?
            'list': TextTypeSimpleType.LISTLABEL,  # OTHER?
        }
        # PAGE region categories which have a predefined subtype enumeration;
        # keys must be region names, because the same category string is used
        # below for the ``<category>Type`` class lookup and ``add_<category>``
        page_subtypes = {
            "TextRegion": TextTypeSimpleType,
            "GraphicRegion": GraphicsTypeSimpleType,
            "ChartRegion": ChartTypeSimpleType,
        }

        LOG.info('Converting %s annotations into PAGE-XML', coco_source)
        for n, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            try:
                num_page_id = int(page_id.strip(page_id.strip("0123456789")))
            except ValueError:
                # no usable numeric part: matching by file name is still possible
                num_page_id = None
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()

            # find COCO image
            if page.imageFilename in images_by_filename:
                image = images_by_filename[page.imageFilename]
            elif num_page_id is not None and num_page_id in images_by_id:
                image = images_by_id[num_page_id]
            else:
                LOG.error('Page "%s" / file "%s" not found in COCO', page_id,
                          page.imageFilename)
                # todo: maybe we should at least write the (unchanged) output PAGE?
                continue
            if image['width'] != page.imageWidth:
                LOG.error(
                    'Page "%s" width %d does not match annotated width %d',
                    page_id, page.imageWidth, image['width'])
            if image['height'] != page.imageHeight:
                LOG.error(
                    'Page "%s" height %d does not match annotated height %d',
                    page_id, page.imageHeight, image['height'])

            # todo: remove existing segmentation first?
            # images without any annotation never received a 'regions' entry
            for region in image.get('regions', []):
                assert isinstance(
                    region['segmentation'],
                    list), "importing RLE/mask segmentation not implemented"
                polygon = np.array(region['segmentation'])
                # flat [x1, y1, x2, y2, ...] of the (single) outline -> (n, 2)
                polygon = np.reshape(polygon, (polygon.shape[1] // 2, 2))
                coords = CoordsType(points=points_from_polygon(polygon))
                category = categories[region['category_id']]
                subcategory = subcategories.get(region['category_id'])
                region_id = 'r' + str(region['id'])
                LOG.info('Adding region %s:%s [area %d]', category, subcategory
                         or '', region['area'])
                if coco_source == 'PubLayNet':
                    if category in publaynet_text_types:
                        region_obj = TextRegionType(
                            id=region_id,
                            Coords=coords,
                            type_=publaynet_text_types[category])
                        page.add_TextRegion(region_obj)
                    elif category == 'table':
                        region_obj = TableRegionType(id=region_id,
                                                     Coords=coords)
                        page.add_TableRegion(region_obj)
                    elif category == 'figure':
                        region_obj = ImageRegionType(id=region_id,
                                                     Coords=coords)
                        page.add_ImageRegion(region_obj)
                    else:
                        raise Exception('unknown region category: %s' %
                                        category)
                else:  # 'PAGE'
                    args = {'id': region_id, 'Coords': coords}
                    if subcategory:
                        if category in page_subtypes:
                            subtype = membername(page_subtypes[category],
                                                 subcategory)
                            if subtype == subcategory:
                                # not predefined in PAGE: use other + custom
                                args['custom'] = "subtype:%s" % subcategory
                                args['type_'] = "other"
                            else:
                                args['type_'] = subcategory
                        else:
                            args['custom'] = "subtype:%s" % subcategory
                    if category + 'Type' not in globals():
                        raise Exception('unknown region category: %s' %
                                        category)
                    region_type = globals()[category + 'Type']
                    if region_type is BorderType:
                        page.set_Border(BorderType(Coords=coords))
                    else:
                        region_obj = region_type(**args)
                        getattr(page, 'add_%s' % category)(region_obj)
            # remove the matched image from both dicts, using its own keys
            # (the page may have been matched by file name or by number, so
            # the page's keys are not necessarily the image's keys)
            images_by_id.pop(image['id'], None)
            images_by_filename.pop(image['file_name'], None)

            file_id = make_file_id(input_file, self.output_file_grp)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))

        # warn of remaining COCO images
        if images_by_filename and not self.page_id:
            LOG.warning('%d images remain unaccounted for after processing',
                        len(images_by_filename))
            if LOG.isEnabledFor(logging.DEBUG):
                for filename in images_by_filename:
                    LOG.debug('not found in workspace: "%s"', filename)
Ejemplo n.º 15
0
    def build_pagexml_no_full_layout(
            self, found_polygons_text_region, page_coord, order_of_texts,
            id_of_texts, all_found_texline_polygons, all_box_coord,
            found_polygons_text_region_img, found_polygons_marginals,
            all_found_texline_polygons_marginals, all_box_coord_marginals,
            slopes, slopes_marginals, cont_page,
            polygons_lines_to_be_written_in_xml, found_polygons_tables):
        """Build the PAGE document for a result without full layout classes.

        Reuses ``self.pcgts`` when it is set, otherwise creates a new document
        via ``create_page_xml``. Sets the page border from ``cont_page`` and
        then adds, in this order: paragraph text regions (with their lines),
        marginalia text regions (with their lines), image regions, separator
        regions and table regions. All regions draw their IDs from one shared
        ``EynollahIdCounter``. A reading order is written only when there is
        at least one text region.

        ``id_of_texts`` is accepted but not used in this method.

        Returns the (new or reused) ``pcgts`` object.
        """
        self.logger.debug('enter build_pagexml_no_full_layout')

        def _points_string(contour, shift_x=0, shift_y=0):
            # Serialise contour[k, 0, (x|y)] as "x,y x,y ...": shift each
            # value, divide by the scale and truncate to int.
            pairs = []
            for k in range(len(contour)):
                x_val = int((contour[k, 0, 0] + shift_x) / self.scale_x)
                y_val = int((contour[k, 0, 1] + shift_y) / self.scale_y)
                pairs.append(str(x_val) + ',' + str(y_val))
            return ' '.join(pairs)

        # create the file structure
        pcgts = self.pcgts
        if not pcgts:
            pcgts = create_page_xml(self.image_filename, self.height_org,
                                    self.width_org)
        page = pcgts.get_Page()
        border_coords = CoordsType(
            points=self.calculate_page_coords(cont_page))
        page.set_Border(BorderType(Coords=border_coords))

        counter = EynollahIdCounter()
        if len(found_polygons_text_region):
            # marginalia IDs continue after the ordered text regions
            marginal_ids = EynollahIdCounter(region_idx=len(order_of_texts))
            id_of_marginalia = [
                marginal_ids.next_region_id for _ in found_polygons_marginals
            ]
            xml_reading_order(page, order_of_texts, id_of_marginalia)

        # main text regions
        for idx, text_contour in enumerate(found_polygons_text_region):
            region_id = counter.next_region_id
            region_points = self.calculate_polygon_coords(
                text_contour, page_coord)
            textregion = TextRegionType(id=region_id,
                                        type_='paragraph',
                                        Coords=CoordsType(points=region_points))
            page.add_TextRegion(textregion)
            self.serialize_lines_in_region(textregion,
                                           all_found_texline_polygons, idx,
                                           page_coord, all_box_coord, slopes,
                                           counter)

        # marginalia
        for idx, marginal_contour in enumerate(found_polygons_marginals):
            region_id = counter.next_region_id
            region_points = self.calculate_polygon_coords(
                marginal_contour, page_coord)
            marginal = TextRegionType(id=region_id,
                                      type_='marginalia',
                                      Coords=CoordsType(points=region_points))
            page.add_TextRegion(marginal)
            self.serialize_lines_in_marginal(
                marginal, all_found_texline_polygons_marginals, idx,
                page_coord, all_box_coord_marginals, slopes_marginals, counter)

        # images: shifted by page_coord[2] (x) and page_coord[0] (y)
        for image_contour in found_polygons_text_region_img:
            img_region = ImageRegionType(id=counter.next_region_id,
                                         Coords=CoordsType())
            page.add_ImageRegion(img_region)
            img_region.get_Coords().set_points(
                _points_string(image_contour, page_coord[2], page_coord[0]))

        # separators: only scaled, no shift applied
        for separator_contour in polygons_lines_to_be_written_in_xml:
            sep_hor = SeparatorRegionType(id=counter.next_region_id,
                                          Coords=CoordsType())
            page.add_SeparatorRegion(sep_hor)
            sep_hor.get_Coords().set_points(_points_string(separator_contour))

        # tables: shifted like images
        for table_contour in found_polygons_tables:
            tab_region = TableRegionType(id=counter.next_region_id,
                                         Coords=CoordsType())
            page.add_TableRegion(tab_region)
            tab_region.get_Coords().set_points(
                _points_string(table_contour, page_coord[2], page_coord[0]))

        return pcgts
Ejemplo n.º 16
0
    def _process_element(self,
                         element,
                         ignore,
                         image,
                         coords,
                         element_id,
                         file_id,
                         page_id,
                         zoom=1.0,
                         rogroup=None):
        """Add PAGE layout elements by segmenting an image.

        Given a PageType, TableRegionType or TextRegionType ``element``, and
        a corresponding binarized PIL.Image object ``image`` with coordinate
        metadata ``coords``, run line segmentation with Ocropy.
        
        If operating on the full page (or table), then also detect horizontal
        and vertical separators, and aggregate the lines into text regions
        afterwards.
        
        Add the resulting sub-segments to the parent ``element``.
        
        If ``ignore`` is not empty, then first suppress all foreground components
        in any of those segments' coordinates during segmentation, and if also
        in full page/table mode, then combine all separators among them with the
        newly detected separators to guide region segmentation.
        """
        LOG = getLogger('processor.OcropySegment')
        if not image.width or not image.height:
            LOG.warning("Skipping '%s' with zero size", element_id)
            return
        element_array = pil2array(image)
        element_bin = np.array(element_array <= midrange(element_array),
                               np.bool)
        sep_bin = np.zeros_like(element_bin, np.bool)
        ignore_labels = np.zeros_like(element_bin, np.int)
        for i, segment in enumerate(ignore):
            LOG.debug('masking foreground of %s "%s" for "%s"',
                      type(segment).__name__[:-4], segment.id, element_id)
            # mark these segments (e.g. separator regions, tables, images)
            # for workflows where they have been detected already;
            # these will be:
            # - ignored during text line segmentation (but not h/v-line detection)
            # - kept and reading-ordered during region segmentation (but not seps)
            segment_polygon = coordinates_of_segment(segment, image, coords)
            # If segment_polygon lies outside of element (causing
            # negative/above-max indices), either fully or partially,
            # then this will silently ignore them. The caller does
            # not need to concern herself with this.
            if isinstance(segment, SeparatorRegionType):
                sep_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:,
                                                                            0],
                                     sep_bin.shape)] = True
            ignore_labels[draw.polygon(
                segment_polygon[:, 1], segment_polygon[:, 0],
                ignore_labels.shape)] = i + 1  # mapped back for RO
        if isinstance(element, PageType):
            element_name = 'page'
            fullpage = True
            report = check_page(element_bin, zoom)
        elif isinstance(element, TableRegionType) or (
                # sole/congruent text region of a table region?
                element.id.endswith('_text')
                and isinstance(element.parent_object_, TableRegionType)):
            element_name = 'table'
            fullpage = True
            report = check_region(element_bin, zoom)
        else:
            element_name = 'region'
            fullpage = False
            report = check_region(element_bin, zoom)
        LOG.info('computing line segmentation for %s "%s"', element_name,
                 element_id)
        # TODO: we should downscale if DPI is large enough to save time
        try:
            if report:
                raise Exception(report)
            line_labels, hlines, vlines, images, colseps, scale = compute_segmentation(
                # suppress separators and ignored regions for textline estimation
                # but keep them for h/v-line detection (in fullpage mode):
                element_bin,
                seps=(sep_bin + ignore_labels) > 0,
                zoom=zoom,
                fullpage=fullpage,
                spread_dist=round(self.parameter['spread'] / zoom * 300 /
                                  72),  # in pt
                # these are ignored when not in fullpage mode:
                maxcolseps=self.parameter['maxcolseps'],
                maxseps=self.parameter['maxseps'],
                maximages=self.parameter['maximages']
                if element_name != 'table' else 0,
                csminheight=self.parameter['csminheight'],
                hlminwidth=self.parameter['hlminwidth'])
        except Exception as err:
            if isinstance(element, TextRegionType):
                LOG.error('Cannot line-segment region "%s": %s', element_id,
                          err)
                # as a fallback, add a single text line comprising the whole region:
                element.add_TextLine(
                    TextLineType(id=element_id + "_line",
                                 Coords=element.get_Coords()))
            else:
                LOG.error('Cannot line-segment %s "%s": %s', element_name,
                          element_id, err)
            return

        LOG.info('Found %d text lines for %s "%s"',
                 len(np.unique(line_labels)) - 1, element_name, element_id)
        # post-process line labels
        if isinstance(element, (PageType, TableRegionType)):
            # aggregate text lines to text regions
            try:
                # pass ignored regions as "line labels with initial assignment",
                # i.e. identical line and region labels
                # to detect their reading order among the others
                # (these cannot be split or grouped together with other regions)
                line_labels = np.where(line_labels, line_labels + len(ignore),
                                       ignore_labels)
                # suppress separators/images in fg and try to use for partitioning slices
                sepmask = np.maximum(np.maximum(hlines, vlines),
                                     np.maximum(sep_bin, images))
                region_labels = lines2regions(
                    element_bin,
                    line_labels,
                    rlabels=ignore_labels,
                    sepmask=np.maximum(sepmask, colseps),  # add bg
                    # decide horizontal vs vertical cut when gaps of similar size
                    prefer_vertical=not isinstance(element, TableRegionType),
                    gap_height=self.parameter['gap_height'],
                    gap_width=self.parameter['gap_width'],
                    scale=scale,
                    zoom=zoom)
                LOG.info('Found %d text regions for %s "%s"',
                         len(np.unique(region_labels)) - 1, element_name,
                         element_id)
            except Exception as err:
                LOG.error('Cannot region-segment %s "%s": %s', element_name,
                          element_id, err)
                region_labels = np.where(line_labels > len(ignore),
                                         1 + len(ignore), line_labels)

            # prepare reading order group index
            if rogroup:
                if isinstance(rogroup,
                              (OrderedGroupType, OrderedGroupIndexedType)):
                    index = 0
                    # start counting from largest existing index
                    for elem in (rogroup.get_RegionRefIndexed() +
                                 rogroup.get_OrderedGroupIndexed() +
                                 rogroup.get_UnorderedGroupIndexed()):
                        if elem.index >= index:
                            index = elem.index + 1
                else:
                    index = None
            # find contours around region labels (can be non-contiguous):
            region_no = 0
            for region_label in np.unique(region_labels):
                if not region_label:
                    continue  # no bg
                region_mask = region_labels == region_label
                region_line_labels = line_labels * region_mask
                region_line_labels0 = np.setdiff1d(region_line_labels, [0])
                if not np.all(region_line_labels0 > len(ignore)):
                    # existing region from `ignore` merely to be ordered
                    # (no new region, no actual text lines)
                    region_line_labels0 = np.intersect1d(
                        region_line_labels0, ignore_labels)
                    assert len(region_line_labels0) == 1, \
                        "region label %d has both existing regions and new lines (%s)" % (
                            region_label, str(region_line_labels0))
                    region = ignore[region_line_labels0[0] - 1]
                    if rogroup and region.parent_object_ == element and not isinstance(
                            region, SeparatorRegionType):
                        index = page_add_to_reading_order(
                            rogroup, region.id, index)
                    LOG.debug('Region label %d is for ignored region "%s"',
                              region_label, region.id)
                    continue
                # normal case: new lines inside new regions
                # remove binary-empty labels, and re-order locally
                order = morph.reading_order(region_line_labels)
                order[np.setdiff1d(region_line_labels0,
                                   element_bin * region_line_labels)] = 0
                region_line_labels = order[region_line_labels]
                # avoid horizontal gaps
                region_line_labels = hmerge_line_seeds(element_bin,
                                                       region_line_labels,
                                                       scale,
                                                       seps=np.maximum(
                                                           sepmask, colseps))
                region_mask |= region_line_labels > 0
                # find contours for region (can be non-contiguous)
                regions, _ = masks2polygons(
                    region_mask * region_label,
                    element_bin,
                    '%s "%s"' % (element_name, element_id),
                    min_area=6000 / zoom / zoom,
                    simplify=ignore_labels * ~(sep_bin))
                # find contours for lines (can be non-contiguous)
                lines, _ = masks2polygons(region_line_labels,
                                          element_bin,
                                          'region "%s"' % element_id,
                                          min_area=640 / zoom / zoom)
                # create new lines in new regions (allocating by intersection)
                line_polys = [Polygon(polygon) for _, polygon in lines]
                for _, region_polygon in regions:
                    region_poly = prep(Polygon(region_polygon))
                    # convert back to absolute (page) coordinates:
                    region_polygon = coordinates_for_segment(
                        region_polygon, image, coords)
                    region_polygon = polygon_for_parent(
                        region_polygon, element)
                    if region_polygon is None:
                        LOG.warning(
                            'Ignoring extant region contour for region label %d',
                            region_label)
                        continue
                    # annotate result:
                    region_no += 1
                    region_id = element_id + "_region%04d" % region_no
                    LOG.debug('Region label %d becomes ID "%s"', region_label,
                              region_id)
                    region = TextRegionType(
                        id=region_id,
                        Coords=CoordsType(
                            points=points_from_polygon(region_polygon)))
                    # find out which line (contours) belong to which region (contours)
                    line_no = 0
                    for i, line_poly in enumerate(line_polys):
                        if not region_poly.intersects(line_poly):  # .contains
                            continue
                        line_label, line_polygon = lines[i]
                        # convert back to absolute (page) coordinates:
                        line_polygon = coordinates_for_segment(
                            line_polygon, image, coords)
                        line_polygon = polygon_for_parent(line_polygon, region)
                        if line_polygon is None:
                            LOG.warning(
                                'Ignoring extant line contour for region label %d line label %d',
                                region_label, line_label)
                            continue
                        # annotate result:
                        line_no += 1
                        line_id = region_id + "_line%04d" % line_no
                        LOG.debug('Line label %d becomes ID "%s"', line_label,
                                  line_id)
                        line = TextLineType(
                            id=line_id,
                            Coords=CoordsType(
                                points=points_from_polygon(line_polygon)))
                        region.add_TextLine(line)
                    # if the region has received text lines, keep it
                    if region.get_TextLine():
                        element.add_TextRegion(region)
                        LOG.info('Added region "%s" with %d lines for %s "%s"',
                                 region_id, line_no, element_name, element_id)
                        if rogroup:
                            index = page_add_to_reading_order(
                                rogroup, region.id, index)
            # add additional image/non-text regions from compute_segmentation
            # (e.g. drop-capitals or images) ...
            image_labels, num_images = morph.label(images)
            LOG.info('Found %d large non-text/image regions for %s "%s"',
                     num_images, element_name, element_id)
            # find contours around region labels (can be non-contiguous):
            image_polygons, _ = masks2polygons(
                image_labels, element_bin,
                '%s "%s"' % (element_name, element_id))
            for image_label, polygon in image_polygons:
                # convert back to absolute (page) coordinates:
                region_polygon = coordinates_for_segment(
                    polygon, image, coords)
                region_polygon = polygon_for_parent(region_polygon, element)
                if region_polygon is None:
                    LOG.warning(
                        'Ignoring extant region contour for image label %d',
                        image_label)
                    continue
                region_no += 1
                # annotate result:
                region_id = element_id + "_image%04d" % region_no
                element.add_ImageRegion(
                    ImageRegionType(
                        id=region_id,
                        Coords=CoordsType(
                            points=points_from_polygon(region_polygon))))
            # split rulers into separator regions:
            hline_labels, num_hlines = morph.label(hlines)
            vline_labels, num_vlines = morph.label(vlines)
            LOG.info('Found %d/%d h/v-lines for %s "%s"', num_hlines,
                     num_vlines, element_name, element_id)
            # find contours around region labels (can be non-contiguous):
            hline_polygons, _ = masks2polygons(
                hline_labels, element_bin,
                '%s "%s"' % (element_name, element_id))
            vline_polygons, _ = masks2polygons(
                vline_labels, element_bin,
                '%s "%s"' % (element_name, element_id))
            for _, polygon in hline_polygons + vline_polygons:
                # convert back to absolute (page) coordinates:
                region_polygon = coordinates_for_segment(
                    polygon, image, coords)
                region_polygon = polygon_for_parent(region_polygon, element)
                if region_polygon is None:
                    LOG.warning('Ignoring extant region contour for separator')
                    continue
                # annotate result:
                region_no += 1
                region_id = element_id + "_sep%04d" % region_no
                element.add_SeparatorRegion(
                    SeparatorRegionType(
                        id=region_id,
                        Coords=CoordsType(
                            points=points_from_polygon(region_polygon))))
            # annotate a text/image-separated image
            element_array[sepmask] = np.amax(element_array)  # clip to white/bg
            image_clipped = array2pil(element_array)
            file_path = self.workspace.save_image_file(
                image_clipped,
                file_id + '.IMG-CLIP',
                page_id=page_id,
                file_grp=self.output_file_grp)
            element.add_AlternativeImage(
                AlternativeImageType(filename=file_path,
                                     comments=coords['features'] + ',clipped'))
        else:
            # get mask from region polygon:
            region_polygon = coordinates_of_segment(element, image, coords)
            region_mask = np.zeros_like(element_bin, np.bool)
            region_mask[draw.polygon(region_polygon[:, 1], region_polygon[:,
                                                                          0],
                                     region_mask.shape)] = True
            # ensure the new line labels do not extrude from the region:
            line_labels = line_labels * region_mask
            # find contours around labels (can be non-contiguous):
            line_polygons, _ = masks2polygons(line_labels,
                                              element_bin,
                                              'region "%s"' % element_id,
                                              min_area=640 / zoom / zoom)
            line_no = 0
            for line_label, polygon in line_polygons:
                # convert back to absolute (page) coordinates:
                line_polygon = coordinates_for_segment(polygon, image, coords)
                line_polygon = polygon_for_parent(line_polygon, element)
                if line_polygon is None:
                    LOG.warning(
                        'Ignoring extant line contour for line label %d',
                        line_label)
                    continue
                # annotate result:
                line_no += 1
                line_id = element_id + "_line%04d" % line_no
                element.add_TextLine(
                    TextLineType(
                        id=line_id,
                        Coords=CoordsType(
                            points=points_from_polygon(line_polygon))))
            if not sep_bin.any():
                return  # no derived image
            # annotate a text/image-separated image
            element_array[sep_bin] = np.amax(element_array)  # clip to white/bg
            image_clipped = array2pil(element_array)
            file_path = self.workspace.save_image_file(
                image_clipped,
                file_id + '.IMG-CLIP',
                page_id=page_id,
                file_grp=self.output_file_grp)
            # update PAGE (reference the image file):
            element.add_AlternativeImage(
                AlternativeImageType(filename=file_path,
                                     comments=coords['features'] + ',clipped'))
Ejemplo n.º 17
0
    def _process_segment(self, page_image, page, region_xywh, page_id,
                         input_file, n):
        """Split a page image into text lines and annotate them on ``page``.

        Convert ``page_image`` into an inverted byte array, compute a line
        segmentation with ``self.compute_segmentation`` and order the lines
        with ``psegutils.reading_order`` / ``psegutils.topsort``.  For every
        line, save its cropped image into ``self.image_grp`` and append a
        ``TextLineType`` (referencing that image as AlternativeImage) to a
        text region of ``page``: the last existing one, or a newly added one
        spanning the whole array if the page has none.

        Nothing is annotated (early return with a warning) when the scale is
        NaN, above 1000 or below ``minscale``, or when the highest
        segmentation label exceeds ``maxlines``.

        Args:
            page_image: image of the page, as accepted by ``ocrolib.pil2array``.
            page: PAGE element receiving the text region / text lines.
            region_xywh: dict whose ``'features'`` entry is extended in place
                with ``",textline"`` and used as the images' comments.
            page_id: page identifier (used for saved images and in warnings).
            input_file: input file entry; its ``ID`` is used to derive the
                IDs of the saved line images.
            n: running number, used for the image ID when ``input_file.ID``
                does not contain ``self.input_file_grp``.
        """
        binary = ocrolib.pil2array(page_image)
        # scale by the maximum, invert, and store as unsigned bytes
        binary = np.array(1 - binary / np.amax(binary), 'B')
        if not page.get_TextRegion():
            # no region yet: create one covering the whole array
            # NOTE(review): x is taken from shape[0] and y from shape[1];
            # confirm this matches the axis order returned by pil2array.
            min_x, max_x = (0, binary.shape[0])
            min_y, max_y = (0, binary.shape[1])
            textregion = TextRegionType(
                Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" %
                                  (min_x, min_y, max_x, min_y, max_x, max_y,
                                   min_x, max_y)))
            page.add_TextRegion(textregion)
        else:
            textregion = page.get_TextRegion()[-1]
        # NOTE(review): writes to a fixed file name in the working directory;
        # looks like debugging output - confirm whether it is still needed.
        ocrolib.write_image_binary("test.bin.png", binary)
        if self.parameter['scale'] == 0:
            scale = psegutils.estimate_scale(binary)
        else:
            scale = self.parameter['scale']
        if np.isnan(
                scale) or scale > 1000.0 or scale < self.parameter['minscale']:
            # (formerly referenced an undefined name and raised NameError)
            LOG.warning("%s: bad scale (%g); skipping\n", page_id, scale)
            return

        segmentation = self.compute_segmentation(binary, scale)
        if np.amax(segmentation) > self.parameter['maxlines']:
            # (formerly passed a single tuple for two placeholders)
            LOG.warning("%s: too many lines %i", page_id,
                        np.amax(segmentation))
            return
        lines = psegutils.compute_lines(segmentation, scale)
        order = psegutils.reading_order([line.bounds for line in lines])
        lsort = psegutils.topsort(order)

        # renumber the labels so that they conform to the specs
        # NOTE(review): the renumbered segmentation is not used below.
        nlabels = np.amax(segmentation) + 1
        renumber = np.zeros(nlabels, 'i')
        for i, v in enumerate(lsort):
            renumber[lines[v].label] = 0x010000 + (i + 1)
        segmentation = renumber[segmentation]

        lines = [lines[i] for i in lsort]
        # NOTE(review): the denoised array is not used below.
        cleaned = ocrolib.remove_noise(binary, self.parameter['noise'])
        region_xywh['features'] += ",textline"

        # the base ID of the line images does not depend on the line
        file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.image_grp, n)

        for i, line_obj in enumerate(lines):
            bounds = line_obj.bounds
            crop = binary[bounds[0], bounds[1]]
            ocrolib.write_image_binary("test.bin.png", crop)
            # NOTE(review): bounds[0] slices the first array axis but is
            # used as the x range here (and bounds[1] as y); confirm.
            min_x, max_x = (bounds[0].start, bounds[0].stop)
            min_y, max_y = (bounds[1].start, bounds[1].stop)

            # threshold the crop at its mid-range into a 0/255 image
            img = np.array(255 * (crop > ocrolib.midrange(crop)), 'B')
            img = ocrolib.array2pil(img)

            file_path = self.workspace.save_image_file(img,
                                                       file_id + "_" + str(i),
                                                       page_id=page_id,
                                                       file_grp=self.image_grp)
            ai = AlternativeImageType(filename=file_path,
                                      comments=region_xywh['features'])
            line = TextLineType(
                Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" %
                                  (min_x, min_y, max_x, min_y, max_x, max_y,
                                   min_x, max_y)))
            line.add_AlternativeImage(ai)
            textregion.add_TextLine(line)
Ejemplo n.º 18
0
def test_deskewing(plain_workspace):
    """Check cropping/deskewing of a non-rectangular, skewed region.

    Draws a polygon mask on an empty page image, annotates it as a
    TextRegion with an ``orientation``, and checks the image and coordinate
    transform returned by ``image_from_segment`` in three ways: with
    ``deskewed`` filtered out, with deskewing, and again after the deskewed
    result has been attached as an AlternativeImage (which must not cause
    any log output from ``ocrd_utils.crop_image``).

    Args:
        plain_workspace: workspace fixture used for saving and loading images.
    """
    #from ocrd_utils import initLogging, setOverrideLogLevel
    #setOverrideLogLevel('DEBUG')
    size = (3000, 4000)
    poly = [[1403, 2573], [1560, 2573], [1560, 2598], [2311, 2598],
            [2311, 2757], [2220, 2757], [2220, 2798], [2311, 2798],
            [2311, 2908], [1403, 2908]]
    xywh = xywh_from_polygon(poly)
    bbox = bbox_from_polygon(poly)
    skew = 4.625
    image = Image.new('L', size)
    image = polygon_mask(image, poly)
    #image.show(title='image')
    pixels = np.count_nonzero(np.array(image) > 0)
    name = 'foo0'
    assert plain_workspace.save_image_file(image, name, 'IMG')
    pcgts = page_from_file(next(plain_workspace.mets.find_files(ID=name)))
    page = pcgts.get_Page()
    region = TextRegionType(
        id='nonrect',
        Coords=CoordsType(points=points_from_polygon(poly)),
        orientation=-skew)
    page.add_TextRegion(region)
    page_image, page_coords, _ = plain_workspace.image_from_page(page, '')
    #page_image.show(title='page_image')
    assert list(image.getdata()) == list(page_image.getdata())
    assert np.all(page_coords['transform'] == np.eye(3))
    # first without deskewing (plain crop to the bounding box)
    reg_image, reg_coords = plain_workspace.image_from_segment(
        region, page_image, page_coords, feature_filter='deskewed', fill=0)
    assert list(image.crop(bbox).getdata()) == list(reg_image.getdata())
    assert reg_image.width == xywh['w'] == 908
    assert reg_image.height == xywh['h'] == 335
    assert reg_coords['transform'][0, 2] == -xywh['x']
    assert reg_coords['transform'][1, 2] == -xywh['y']
    # same fg after cropping to minimal bbox
    reg_pixels = np.count_nonzero(np.array(reg_image) > 0)
    assert pixels == reg_pixels
    # now with deskewing (test for size after recropping)
    reg_image, reg_coords = plain_workspace.image_from_segment(region,
                                                               page_image,
                                                               page_coords,
                                                               fill=0)
    #reg_image.show(title='reg_image')
    assert reg_image.width == 932 > xywh['w']
    assert reg_image.height == 382 > xywh['h']
    assert reg_coords['transform'][0, 1] != 0
    assert reg_coords['transform'][1, 0] != 0
    assert 'deskewed' in reg_coords['features']
    # same fg after cropping to minimal bbox (roughly - due to aliasing)
    reg_pixels = np.count_nonzero(np.array(reg_image) > 0)
    assert np.abs(pixels - reg_pixels) / pixels < 0.005
    reg_array = np.array(reg_image) > 0
    # now via AlternativeImage
    path = plain_workspace.save_image_file(reg_image, region.id + '_img',
                                           'IMG')
    region.add_AlternativeImage(
        AlternativeImageType(filename=path, comments=reg_coords['features']))
    logger_capture = FIFOIO(256)
    logger_handler = logging.StreamHandler(logger_capture)
    #logger_handler.setFormatter(logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT))
    logger = logging.getLogger('ocrd_utils.crop_image')
    logger.addHandler(logger_handler)
    try:
        reg_image2, reg_coords2 = plain_workspace.image_from_segment(
            region, page_image, page_coords, fill=0)
        #reg_image2.show(title='reg_image2')
        logger_output = logger_capture.getvalue()
    finally:
        # detach the handler before closing its stream, so that later
        # records on this logger are not written to a closed stream
        logger.removeHandler(logger_handler)
        logger_capture.close()
    assert logger_output == ''
    assert reg_image2.width == reg_image.width
    assert reg_image2.height == reg_image.height
    assert np.allclose(reg_coords2['transform'], reg_coords['transform'])
    assert reg_coords2['features'] == reg_coords['features']
    # same fg after cropping to minimal bbox (roughly - due to aliasing)
    # NOTE(review): this counts reg_image again (not reg_image2), so the
    # following assertion is trivially true; confirm whether reg_image2 was
    # intended and whether exact equality would hold for it.
    reg_pixels2 = np.count_nonzero(np.array(reg_image) > 0)
    assert reg_pixels2 == reg_pixels
    reg_array2 = np.array(reg_image2) > 0
    assert 0.98 < np.sum(reg_array == reg_array2) / reg_array.size <= 1.0
Ejemplo n.º 19
0
 def _process_page(self, it, page, page_image, page_coords, page_id):
     """Annotate the blocks of a Tesseract layout iterator as PAGE regions.

     Walk ``it`` on ``RIL.BLOCK`` level. For each block, take its outline
     (``BlockPolygon`` if the ``crop_polygons`` parameter is set, otherwise
     the padded bounding box), convert it to page coordinates, fit it into
     the page via ``polygon_for_parent`` and add a region of the type
     matching the block type to ``page``. Blocks for which
     ``polygon_for_parent`` returns None are skipped.

     Each region also gets a ``RegionRefIndexed`` in the top-level
     OrderedGroup of the reading order (created if missing, otherwise
     continued after the largest existing index); for separator and noise
     regions that reference is removed again. An OrderedGroup left without
     any members is removed at the end.

     Args:
         it: layout iterator (may be falsy, in which case no block is read).
         page: PAGE element to add regions and reading order to.
         page_image: image the iterator's coordinates refer to.
         page_coords: coordinate information belonging to ``page_image``.
         page_id: page identifier (not used in this method).
     """
     LOG = getLogger('processor.TesserocrSegmentRegion')
     # equivalent to GetComponentImages with raw_image=True,
     # (which would also give raw coordinates),
     # except we are also interested in the iterator's BlockType() here,
     # and its BlockPolygon()
     index = 0
     ro = page.get_ReadingOrder()
     if not ro:
         ro = ReadingOrderType()
         page.set_ReadingOrder(ro)
     og = ro.get_OrderedGroup()
     if og:
         # start counting from largest existing index
         for elem in (og.get_RegionRefIndexed() +
                      og.get_OrderedGroupIndexed() +
                      og.get_UnorderedGroupIndexed()):
             if elem.index >= index:
                 index = elem.index + 1
     else:
         # new top-level group
         og = OrderedGroupType(id="reading-order")
         ro.set_OrderedGroup(og)
     while it and not it.Empty(RIL.BLOCK):
         # (padding will be passed to both BoundingBox and GetImage)
         # (actually, Tesseract honours padding only on the left and bottom,
         #  whereas right and top are increased less!)
         bbox = it.BoundingBox(RIL.BLOCK, padding=self.parameter['padding'])
         # sometimes these polygons are not planar, which causes
         # PIL.ImageDraw.Draw.polygon (and likely others as well)
         # to misbehave; however, PAGE coordinate semantics prohibit
         # multi-path polygons!
         # (probably a bug in Tesseract itself, cf. tesseract#2826):
         if self.parameter['crop_polygons']:
             polygon = it.BlockPolygon()
         else:
             polygon = polygon_from_x0y0x1y1(bbox)
         polygon = coordinates_for_segment(polygon, page_image, page_coords)
         polygon2 = polygon_for_parent(polygon, page)
         if polygon2 is not None:
             polygon = polygon2
         # (points of the unfitted polygon are still needed for the message)
         points = points_from_polygon(polygon)
         coords = CoordsType(points=points)
         if polygon2 is None:
             LOG.info('Ignoring extant region: %s', points)
             it.Next(RIL.BLOCK)
             continue
         # if xywh['w'] < 30 or xywh['h'] < 30:
         #     LOG.info('Ignoring too small region: %s', points)
         #     it.Next(RIL.BLOCK)
         #     continue
         # region_image_bin = it.GetBinaryImage(RIL.BLOCK)
         # if not region_image_bin.getbbox():
         #     LOG.info('Ignoring binary-empty region: %s', points)
         #     it.Next(RIL.BLOCK)
         #     continue
         #
         # add the region reference in the reading order element
         # (will be removed again if Separator/Noise region below)
         ID = "region%04d" % index
         og.add_RegionRefIndexed(
             RegionRefIndexedType(regionRef=ID, index=index))
         #
         # region type switch
         #
         block_type = it.BlockType()
         if block_type in [
                 PT.FLOWING_TEXT,
                 PT.HEADING_TEXT,
                 PT.PULLOUT_TEXT,
                 PT.CAPTION_TEXT,
                 # TABLE is contained in PTIsTextType, but
                 # it is a bad idea to create a TextRegion
                 # for it (better set `find_tables` False):
                 # PT.TABLE,
                 # will also get a 90° @orientation
                 # (but that can be overridden by deskew/OSD):
                 PT.VERTICAL_TEXT
         ]:
             region = TextRegionType(id=ID,
                                     Coords=coords,
                                     type=TextTypeSimpleType.PARAGRAPH)
             if block_type == PT.VERTICAL_TEXT:
                 region.set_orientation(90.0)
             elif block_type == PT.HEADING_TEXT:
                 region.set_type(TextTypeSimpleType.HEADING)
             elif block_type == PT.PULLOUT_TEXT:
                 region.set_type(TextTypeSimpleType.FLOATING)
             elif block_type == PT.CAPTION_TEXT:
                 region.set_type(TextTypeSimpleType.CAPTION)
             page.add_TextRegion(region)
             if self.parameter['sparse_text']:
                 # in sparse mode, each region gets one line of the same shape
                 region.set_type(TextTypeSimpleType.OTHER)
                 region.add_TextLine(
                     TextLineType(id=region.id + '_line', Coords=coords))
         elif block_type in [
                 PT.FLOWING_IMAGE, PT.HEADING_IMAGE, PT.PULLOUT_IMAGE
         ]:
             region = ImageRegionType(id=ID, Coords=coords)
             page.add_ImageRegion(region)
         elif block_type in [PT.HORZ_LINE, PT.VERT_LINE]:
             region = SeparatorRegionType(id=ID, Coords=coords)
             page.add_SeparatorRegion(region)
             # undo appending in ReadingOrder
             og.set_RegionRefIndexed(og.get_RegionRefIndexed()[:-1])
         elif block_type in [PT.INLINE_EQUATION, PT.EQUATION]:
             region = MathsRegionType(id=ID, Coords=coords)
             page.add_MathsRegion(region)
         elif block_type == PT.TABLE:
             # without API access to StructuredTable we cannot
             # do much for a TableRegionType (i.e. nrows, ncols,
             # coordinates of cells for recursive regions etc),
             # but this can be achieved afterwards by segment-table
             region = TableRegionType(id=ID, Coords=coords)
             page.add_TableRegion(region)
         else:
             region = NoiseRegionType(id=ID, Coords=coords)
             # (the region itself must be passed, as for all other types)
             page.add_NoiseRegion(region)
             # undo appending in ReadingOrder
             og.set_RegionRefIndexed(og.get_RegionRefIndexed()[:-1])
         LOG.info("Detected region '%s': %s (%s)", ID, points,
                  membername(PT, block_type))
         #
         # iterator increment
         #
         index += 1
         it.Next(RIL.BLOCK)
     if (not og.get_RegionRefIndexed() and not og.get_OrderedGroupIndexed()
             and not og.get_UnorderedGroupIndexed()):
         # schema forbids empty OrderedGroup
         ro.set_OrderedGroup(None)
Ejemplo n.º 20
0
def convert(cocofile, directory):
    """Convert MS-COCO JSON to METS/PAGE XML files.

    Load JSON ``cocofile`` (in MS-COCO format)
    and chdir to ``directory`` (which it refers to).

    Start a METS file mets.xml with references to
    the image files (under fileGrp ``OCR-D-IMG``)
    and their corresponding PAGE-XML annotations
    (under fileGrp ``OCR-D-GT-SEG-BLOCK``), as
    parsed from ``cocofile`` and written using
    the same basename.

    Images without any annotation get a PAGE file without regions.

    Args:
        cocofile: open file object to read the JSON from.
        directory: path of the directory holding the image files; also
            used as suffix of the METS unique identifier.

    Raises:
        Exception: if an annotation has a category name other than
            ``text``, ``title``, ``list``, ``table`` or ``figure``.
    """
    # categories which become TextRegions, with their region type
    text_types = {
        'text': TextTypeSimpleType.PARAGRAPH,
        'title': TextTypeSimpleType.HEADING,  # CAPTION?
        'list': TextTypeSimpleType.LISTLABEL,  # OTHER?
    }
    resolver = Resolver()
    with pushd_popd(directory):
        workspace = resolver.workspace_from_nothing('.')
        # https://github.com/ibm-aur-nlp/PubLayNet
        workspace.mets.unique_identifier = 'ocrd_PubLayNet_' + directory
        coco = json.load(cocofile)
        LOG.info('Loaded JSON for %d images with %d regions in %d categories',
                 len(coco['images']), len(coco['annotations']),
                 len(coco['categories']))
        categories = {cat['id']: cat['name'] for cat in coco['categories']}
        images = {image['id']: image for image in coco['images']}
        # attach each annotation to the image it belongs to
        for annotation in coco['annotations']:
            image = images[annotation['image_id']]
            image.setdefault('regions', list()).append(annotation)
        del coco
        LOG.info('Parsing annotations into PAGE-XML')
        for image in images.values():
            page_id = 'p' + str(image['id'])
            file_base, file_ext = os.path.splitext(image['file_name'])
            filename = file_base + '.xml'
            image_file = workspace.add_file('OCR-D-IMG',
                                            ID='OCR-D-IMG_' + page_id,
                                            pageId=page_id,
                                            mimetype=EXT_TO_MIME[file_ext],
                                            local_filename=image['file_name'])
            LOG.info('Added page %s file %s of type %s', image_file.pageId,
                     image_file.local_filename, image_file.mimetype)
            pcgts = page_from_image(image_file)
            pcgts.set_pcGtsId(page_id)
            page = pcgts.get_Page()
            assert page.imageWidth == image['width']
            assert page.imageHeight == image['height']
            # the key is only present if some annotation referred to the image
            regions = image.get('regions', [])
            for region in regions:
                # flat [[x0, y0, x1, y1, ...]] list becomes an (n, 2) array
                polygon = np.array(region['segmentation'])
                polygon = np.reshape(polygon, (polygon.shape[1] // 2, 2))
                coords = CoordsType(points=points_from_polygon(polygon))
                category = categories[region['category_id']]
                region_id = 'r' + str(region['id'])
                if category in text_types:
                    page.add_TextRegion(
                        TextRegionType(id=region_id,
                                       Coords=coords,
                                       type_=text_types[category]))
                elif category == 'table':
                    page.add_TableRegion(
                        TableRegionType(id=region_id, Coords=coords))
                elif category == 'figure':
                    page.add_ImageRegion(
                        ImageRegionType(id=region_id, Coords=coords))
                else:
                    raise Exception('unknown image category: %s' % category)
            page_file = workspace.add_file('OCR-D-GT-SEG-BLOCK',
                                           ID='OCR-D-GT-SEG-BLOCK_' + page_id,
                                           pageId=page_id,
                                           mimetype=MIMETYPE_PAGE,
                                           local_filename=filename,
                                           content=to_xml(pcgts))
            LOG.info('Added page %s file %s with %d regions', page_file.pageId,
                     page_file.local_filename, len(regions))
        LOG.info('All done')
        workspace.save_mets()
Ejemplo n.º 21
0
    def process(self):
        """Segment pages into regions+lines, tables into cells+lines, or regions into lines.
        
        Open and deserialise PAGE input files and their respective images,
        then iterate over the element hierarchy down to the requested level.
        
        Depending on ``level-of-operation``, consider existing segments:
        - If ``overwrite_separators=True`` on ``page`` level, then
          delete any SeparatorRegions.
        - If ``overwrite_regions=True`` on ``page`` level, then
          delete any top-level TextRegions (along with ReadingOrder).
        - If ``overwrite_regions=True`` on ``table`` level, then
          delete any TextRegions in TableRegions (along with their OrderGroup).
        - If ``overwrite_lines=True`` on ``region`` level, then
          delete any TextLines in TextRegions.
        - If ``overwrite_order=True`` on ``page`` or ``table`` level, then
          delete the reading order OrderedGroup entry corresponding
          to the (page/table) segment.
        
        Next, get each element image according to the layout annotation (from
        the alternative image of the page/region, or by cropping via coordinates
        into the higher-level image) in binarized form, and represent it as an array
        with non-text regions and (remaining) text neighbours suppressed.
        
        Then compute a text line segmentation for that array (as a label mask).
        When ``level-of-operation`` is ``page`` or ``table``, this also entails
        detecting
        - up to ``maximages`` large foreground images,
        - up to ``maxseps`` foreground h/v-line separators and
        - up to ``maxcolseps`` background column separators
        before text line segmentation itself, as well as aggregating text lines
        to text regions afterwards.
        
        Text regions are detected via a hybrid variant recursive X-Y cut algorithm
        (RXYC): RXYC partitions the binarized image in top-down manner by detecting
        horizontal or vertical gaps. This implementation uses the bottom-up text line
        segmentation to guide the search, and also uses both pre-existing and newly
        detected separators to alternatively partition the respective boxes into
        non-rectangular parts.
        
        During line segmentation, suppress the foreground of all previously annotated
        regions (of any kind) and lines, except if just removed due to ``overwrite``.
        During region aggregation however, combine the existing separators with the
        new-found separators to guide the column search.
        
        All detected segments (both text line and text region) are sorted according
        to their reading order (assuming a top-to-bottom, left-to-right ordering).
        When ``level-of-operation`` is ``page``, prefer vertical (column-first)
        succession of regions. When it is ``table``, prefer horizontal (row-first)
        succession of cells.
        
        Then for each resulting segment label, convert its background mask into
        polygon outlines by finding the outer contours consistent with the element's
        polygon outline. Annotate the result by adding it as a new TextLine/TextRegion:
        - If ``level-of-operation`` is ``region``, then append the new lines to the
          parent region.
        - If it is ``table``, then append the new lines to their respective regions,
          and append the new regions to the parent table.
          (Also, create an OrderedGroup for it as the parent's RegionRef.)
        - If it is ``page``, then append the new lines to their respective regions,
          and append the new regions to the page.
          (Also, create an OrderedGroup for it in the ReadingOrder.)
        
        Produce a new output file by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.OcropySegment')
        # FIXME: allow passing a-priori info on reading order / textline order
        # (and then pass on as ``bt`` and ``rl``; however, there may be a mixture
        #  of different scripts; also, vertical writing needs internal rotation
        #  because our line segmentation only works for horizontal writing)
        overwrite_lines = self.parameter['overwrite_lines']
        overwrite_regions = self.parameter['overwrite_regions']
        overwrite_separators = self.parameter['overwrite_separators']
        overwrite_order = self.parameter['overwrite_order']
        oplevel = self.parameter['level-of-operation']

        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for (n, input_file) in enumerate(self.input_files):
            LOG.info("INPUT FILE %i / %s", n, input_file.pageId
                     or input_file.ID)
            file_id = make_file_id(input_file, self.output_file_grp)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID  # (PageType has no id)
            page = pcgts.get_Page()

            # TODO: also allow grayscale_normalized (try/except?)
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_selector='binarized')
            if self.parameter['dpi'] > 0:
                zoom = 300.0 / self.parameter['dpi']
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi *= 2.54
                LOG.info('Page "%s" uses %f DPI', page_id, dpi)
                zoom = 300.0 / dpi
            else:
                zoom = 1

            # aggregate existing regions so their foreground can be ignored
            ignore = (page.get_ImageRegion() + page.get_LineDrawingRegion() +
                      page.get_GraphicRegion() + page.get_ChartRegion() +
                      page.get_MapRegion() + page.get_MathsRegion() +
                      page.get_ChemRegion() + page.get_MusicRegion() +
                      page.get_AdvertRegion() + page.get_NoiseRegion() +
                      page.get_UnknownRegion() + page.get_CustomRegion())
            if oplevel == 'page' and overwrite_separators:
                page.set_SeparatorRegion([])
            else:
                ignore.extend(page.get_SeparatorRegion())
            # prepare reading order
            reading_order = dict()
            ro = page.get_ReadingOrder()
            if ro:
                rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup()
                if rogroup:
                    page_get_reading_order(reading_order, rogroup)

            # get segments to process / overwrite
            if oplevel == 'page':
                ignore.extend(page.get_TableRegion())
                regions = list(page.get_TextRegion())
                if regions:
                    # page is already region-segmented
                    if overwrite_regions:
                        LOG.info('removing existing TextRegions in page "%s"',
                                 page_id)
                        # we could remove all other region types as well,
                        # but this is more flexible (for workflows with
                        # specialized separator/image/table detectors):
                        page.set_TextRegion([])
                        page.set_ReadingOrder(None)
                        ro = None
                    else:
                        LOG.warning(
                            'keeping existing TextRegions in page "%s"',
                            page_id)
                        ignore.extend(regions)
                # create reading order if necessary
                if not ro or overwrite_order:
                    ro = ReadingOrderType()
                    page.set_ReadingOrder(ro)
                rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup()
                if not rogroup:
                    # new top-level group
                    rogroup = OrderedGroupType(id="reading-order")
                    ro.set_OrderedGroup(rogroup)
                # go get TextRegions with TextLines (and SeparatorRegions):
                self._process_element(page,
                                      ignore,
                                      page_image,
                                      page_coords,
                                      page_id,
                                      file_id,
                                      input_file.pageId,
                                      zoom,
                                      rogroup=rogroup)
                if (not rogroup.get_RegionRefIndexed()
                        and not rogroup.get_OrderedGroupIndexed()
                        and not rogroup.get_UnorderedGroupIndexed()):
                    # schema forbids empty OrderedGroup
                    ro.set_OrderedGroup(None)
            elif oplevel == 'table':
                ignore.extend(page.get_TextRegion())
                regions = list(page.get_TableRegion())
                if not regions:
                    LOG.warning('Page "%s" contains no table regions', page_id)
                for region in regions:
                    subregions = region.get_TextRegion()
                    if subregions:
                        # table is already cell-segmented
                        if overwrite_regions:
                            LOG.info(
                                'removing existing TextRegions in table "%s"',
                                region.id)
                            region.set_TextRegion([])
                            roelem = reading_order.get(region.id)
                            # replace by empty group with same index and ref
                            # (which can then take the cells as subregions)
                            reading_order[
                                region.id] = page_subgroup_in_reading_order(
                                    roelem)
                        else:
                            LOG.warning(
                                'skipping table "%s" with existing TextRegions',
                                region.id)
                            continue
                    # TODO: also allow grayscale_normalized (try/except?)
                    region_image, region_coords = self.workspace.image_from_segment(
                        region,
                        page_image,
                        page_coords,
                        feature_selector='binarized')
                    # ignore everything but the current table region
                    subignore = regions + ignore
                    subignore.remove(region)
                    # create reading order group if necessary
                    roelem = reading_order.get(region.id)
                    if not roelem:
                        LOG.warning(
                            "Page '%s' table region '%s' is not referenced in reading order (%s)",
                            page_id, region.id, "no target to add cells to")
                    elif overwrite_order:
                        # replace by empty ordered group with same (index and) ref
                        # (which can then take the cells as subregions)
                        roelem = page_subgroup_in_reading_order(roelem)
                        reading_order[region.id] = roelem
                    elif isinstance(
                            roelem,
                        (OrderedGroupType, OrderedGroupIndexedType)):
                        LOG.warning(
                            "Page '%s' table region '%s' already has an ordered group (%s)",
                            page_id, region.id, "cells will be appended")
                    elif isinstance(
                            roelem,
                        (UnorderedGroupType, UnorderedGroupIndexedType)):
                        LOG.warning(
                            "Page '%s' table region '%s' already has an unordered group (%s)",
                            page_id, region.id, "cells will not be appended")
                        roelem = None
                    else:
                        # replace regionRef(Indexed) by group with same index and ref
                        # (which can then take the cells as subregions)
                        roelem = page_subgroup_in_reading_order(roelem)
                        reading_order[region.id] = roelem
                    # go get TextRegions with TextLines (and SeparatorRegions)
                    self._process_element(region,
                                          subignore,
                                          region_image,
                                          region_coords,
                                          region.id,
                                          file_id + '_' + region.id,
                                          input_file.pageId,
                                          zoom,
                                          rogroup=roelem)
            else:  # 'region'
                regions = list(page.get_TextRegion())
                # besides top-level text regions, line-segment any table cells,
                # and for tables without any cells, add a pseudo-cell
                for region in page.get_TableRegion():
                    subregions = region.get_TextRegion()
                    if subregions:
                        regions.extend(subregions)
                    else:
                        subregion = TextRegionType(
                            id=region.id + '_text',
                            Coords=region.get_Coords(),
                            # as if generated from parser:
                            parent_object_=region)
                        region.add_TextRegion(subregion)
                        regions.append(subregion)
                if not regions:
                    LOG.warning('Page "%s" contains no text regions', page_id)
                for region in regions:
                    if region.get_TextLine():
                        if overwrite_lines:
                            LOG.info(
                                'removing existing TextLines in page "%s" region "%s"',
                                page_id, region.id)
                            region.set_TextLine([])
                        else:
                            LOG.warning(
                                'keeping existing TextLines in page "%s" region "%s"',
                                page_id, region.id)
                            ignore.extend(region.get_TextLine())
                    # TODO: also allow grayscale_normalized (try/except?)
                    region_image, region_coords = self.workspace.image_from_segment(
                        region,
                        page_image,
                        page_coords,
                        feature_selector='binarized')
                    # if the region images have already been clipped against their neighbours specifically,
                    # then we don't need to suppress all neighbours' foreground generally here
                    if 'clipped' in region_coords['features'].split(','):
                        ignore = []
                    # go get TextLines
                    self._process_element(region, ignore, region_image,
                                          region_coords, region.id,
                                          file_id + '_' + region.id,
                                          input_file.pageId, zoom)

            # update METS (add the PAGE file):
            file_path = os.path.join(self.output_file_grp, file_id + '.xml')
            pcgts.set_pcGtsId(file_id)
            out = self.workspace.add_file(ID=file_id,
                                          file_grp=self.output_file_grp,
                                          pageId=input_file.pageId,
                                          local_filename=file_path,
                                          mimetype=MIMETYPE_PAGE,
                                          content=to_xml(pcgts))
            LOG.info('created file ID: %s, file_grp: %s, path: %s', file_id,
                     self.output_file_grp, out.local_filename)
Ejemplo n.º 22
0
    def _process_segment(self, page_image, page, page_xywh, page_id,
                         input_file, n, mrcnn_model, class_names, mask):
        """Detect layout blocks on a page image and add them to ``page`` as regions.

        Runs ``mrcnn_model.detect`` on the page image, optionally grows the
        detected boxes with the help of ``mask``, trims boxes that overlap
        boxes of earlier detections, derives a reading order from the box
        coordinates and finally adds one Image/Table/Graphic/Text region per
        detection to ``page``, together with a ``ReadingOrder``. A crop of
        every added region is stored via ``self.workspace.save_image_file``.

        Detections whose polygon does not intersect the page Border (if the
        page has one) are dropped; they get neither a region nor a
        reading-order reference. Pages without a Border are supported (the
        detection polygons are then used unclipped).

        Args:
            page_image: page image; converted with ``ocrolib.pil2array``.
            page: PAGE page element; queried and modified in place.
            page_xywh: coordinate description handed to
                ``coordinates_for_segment``.
            page_id: identifier used in log messages, as the ID of the
                reading-order group and as prefix of the region IDs.
            input_file: input file handed to ``make_file_id``.
            n: not used by this method; kept for interface compatibility.
            mrcnn_model: object whose ``detect([array], verbose=1)`` returns a
                list of result dicts with the keys ``'rois'`` and
                ``'class_ids'``.
            class_names: lookup from class id to the ``type_`` of the region.
            mask: optional segmentation mask image; when truthy, left-over
                mask pixels are merged into the nearest detected box.

        Returns:
            None. Returns early (without detecting anything) if the page
            already has TextRegions and the ``overwrite`` parameter is false.
        """
        LOG = getLogger('OcrdAnybaseocrBlockSegmenter')
        # check for existing text regions and whether to overwrite them
        if page.get_TextRegion():
            if not self.parameter['overwrite']:
                LOG.warning('keeping existing TextRegions in page "%s"',
                            page_id)
                return
            LOG.info('removing existing TextRegions in page "%s"', page_id)
            page.set_TextRegion([])

        # the page border (if any) is used to clip every detected region
        border = None
        if page.get_Border():
            border_coords = page.get_Border().get_Coords()
            border = Polygon(polygon_from_points(border_coords.get_points()))

        img_array = ocrolib.pil2array(page_image)
        if len(img_array.shape) <= 2:
            # replicate a single-channel image into three identical channels
            img_array = np.stack((img_array, ) * 3, axis=-1)
        results = mrcnn_model.detect([img_array], verbose=1)
        r = results[0]
        # aliases only: boxes are still modified in place inside ``r``
        rois = r['rois']
        class_ids = r['class_ids']
        # NOTE(review): the first and third roi entries are used as indices
        # into the first (row) axis of the image array below, so the local
        # names ``min_x``/``max_x`` hold row positions and ``min_y``/``max_y``
        # column positions. The names are kept as they were.

        th = self.parameter['th']
        # check for an existing segmentation mask
        # (this code executes only when use_deeplr is set to True in ocrd-tool.json)
        if mask:
            mask = ocrolib.pil2array(mask)
            mask = mask // 255
            mask = 1 - mask
            # label the mask pixels inside detection i with the value i + 2
            # NOTE(review): where detections overlap, the labels are multiplied
            # with each other and no longer identify a single box - confirm
            # whether overlapping detections can reach this point.
            for i in range(len(rois)):
                min_x, min_y, max_x, max_y = rois[i][:4]
                mask[min_x:max_x, min_y:max_y] *= i + 2

            # merge left-over pixels (still labelled 1) into the closest box,
            # repeating until a full pass adds nothing anymore
            pixel_added = True
            while pixel_added:
                pixel_added = False
                left_over = np.where(mask == 1)
                for x, y in zip(left_over[0], left_over[1]):
                    # clamp the window start: a negative slice start would
                    # wrap around to the far end of the array
                    x0 = max(x - th, 0)
                    y0 = max(y - th, 0)
                    local_mask = mask[x0:x + th, y0:y + th]
                    found = np.where(local_mask > 1)
                    candidates = list(zip(found[0], found[1]))
                    if not candidates:
                        continue
                    pixel_added = True
                    # closest labelled pixel, measured from the position of
                    # (x, y) inside the (possibly clamped) window
                    nearest = min(
                        candidates,
                        key=lambda j, cx=x - x0, cy=y - y0:
                        (j[0] - cx)**2 + (j[1] - cy)**2)
                    index = local_mask[nearest] - 2

                    # extend the box with that label so it contains (x, y)
                    if x < rois[index][0]:
                        rois[index][0] = x
                    elif x > rois[index][2]:
                        rois[index][2] = x

                    if y < rois[index][1]:
                        rois[index][1] = y
                    elif y > rois[index][3]:
                        rois[index][3] = y

                    # update the mask
                    mask[x, y] = index + 2

        # resolve overlaps: trim each box against the (untrimmed) boxes of all
        # earlier detections; detections of class 5 are never trimmed
        seen_boxes = {}  # class id -> boxes of earlier detections
        for i in range(len(rois)):
            min_x, min_y, max_x, max_y = rois[i][:4]
            region_bbox = [min_y, min_x, max_y, max_x]

            if class_ids[i] != 5:
                for earlier in seen_boxes.values():
                    for bbox in earlier:
                        # extents along the other axis must overlap
                        if not (bbox[0] <= region_bbox[0] <= bbox[2]
                                or bbox[0] <= region_bbox[2] <= bbox[2]
                                or (region_bbox[0] <= bbox[0]
                                    and region_bbox[2] >= bbox[2])):
                            continue
                        # far edge lies inside the earlier box: pull it back
                        if bbox[1] <= region_bbox[3] <= bbox[3]:
                            rois[i][2] = bbox[1] - 1
                        # near edge lies inside the earlier box: push it out
                        if bbox[1] <= region_bbox[1] <= bbox[3]:
                            rois[i][0] = bbox[3] + 1

            seen_boxes.setdefault(class_ids[i], []).append(region_bbox)

        width = img_array.shape[0]

        def padded_box(i):
            """Return detection ``i`` as ``(min_y, min_x, max_y, max_x)``."""
            min_x, min_y, max_x, max_y = rois[i][:4]
            if class_ids[i] == 2:
                # NOTE(review): kept exactly as found. Both adjustments modify
                # ``min_y`` and both compare against the first image dimension;
                # this looks unintended (``> 0`` / ``max_y += 10``?) - confirm.
                if (min_y - 5) > width:
                    min_y -= 5
                if (max_y + 10) < width:
                    min_y += 10
            return (min_y, min_x, max_y, max_x)

        boxes = [padded_box(i) for i in range(len(rois))]

        # define reading order on basis of coordinates
        reading_order = sorted(boxes, key=lambda box: (box[1], box[0]))
        # The list is deliberately re-ordered while being traversed by index:
        # a box is moved behind every later box that intersects its extension
        # up to coordinate 0. Statement order matters here.
        for i in range(len(reading_order)):
            min_y, min_x, max_y, max_x = reading_order[i]
            min_y = 0
            i_poly = Polygon([[min_x, min_y], [max_x, min_y], [max_x, max_y],
                              [min_x, max_y]])
            for j in range(i + 1, len(reading_order)):
                min_y, min_x, max_y, max_x = reading_order[j]
                j_poly = Polygon([[min_x, min_y], [max_x, min_y],
                                  [max_x, max_y], [min_x, max_y]])
                inter = i_poly.intersection(j_poly)
                if inter:
                    reading_order.insert(j + 1, reading_order[i])
                    del reading_order[i]

        # create the reading order group and the regions in one pass, so that
        # every reading-order reference points to a region that really exists
        order_group = OrderedGroupType(caption="Regions reading order",
                                       id=page_id)
        special_regions = {
            15: (ImageRegionType, page.add_ImageRegion),
            16: (TableRegionType, page.add_TableRegion),
            17: (GraphicRegionType, page.add_GraphicRegion),
        }
        file_id = make_file_id(input_file, self.output_file_grp)

        for i, box in enumerate(boxes):
            min_y, min_x, max_y, max_x = box

            # polygon points are given in the axis order that was introduced
            # "to resolve flipped coordinates"
            region_polygon = Polygon([[min_y, min_x], [max_y, min_x],
                                      [max_y, max_x], [min_y, max_x]])
            if border is not None:
                region_polygon = border.intersection(region_polygon)
            if region_polygon.is_empty:
                continue

            # outline without the closing point that repeats the first one
            xs, ys = region_polygon.exterior.coords.xy
            outline = list(zip(list(xs), list(ys)))[:-1]
            outline = coordinates_for_segment(outline, page_image, page_xywh)
            region_points = points_from_polygon(outline)

            read_order = reading_order.index(box)
            region_id = '%s_region%04d' % (page_id, i)
            order_group.add_RegionRefIndexed(
                RegionRefIndexedType(index=read_order, regionRef=region_id))

            # store the crop of the detection (the returned path is not needed)
            region_img = ocrolib.array2pil(img_array[min_x:max_x, min_y:max_y])
            self.workspace.save_image_file(region_img,
                                           file_id + "_" + str(i),
                                           page_id=page_id,
                                           file_grp=self.output_file_grp)

            # classes 15/16/17 become Image/Table/Graphic regions,
            # everything else becomes a TextRegion
            region_type, add_region = special_regions.get(
                int(class_ids[i]), (TextRegionType, page.add_TextRegion))
            add_region(
                region_type(
                    custom='readingOrder {index:' + str(read_order) + ';}',
                    id=region_id,
                    Coords=CoordsType(region_points),
                    type_=class_names[class_ids[i]]))

        reading_order_object = ReadingOrderType()
        reading_order_object.set_OrderedGroup(order_group)
        page.set_ReadingOrder(reading_order_object)
Ejemplo n.º 23
0
    def process(self):
        """Deskew, normalise and threshold the image of every input page.

        For each input file the page image is read as grayscale, rotated by an
        estimated skew angle (only if the ``maxskew`` parameter is positive),
        rescaled between two percentile levels (``lo``/``hi``) and thresholded
        (``threshold``). The binary result is written to ``base + ".ds.png"``
        (``base`` being the first value returned by ``ocrolib.allsplitext``
        for the image filename), the angle is stored as ``orientation`` of a
        new TextRegion on the page, and an entry is added to the output file
        group.
        """
        for n, input_file in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))
            fname = pcgts.get_Page().imageFilename
            img = self.workspace.resolve_image_as_pil(fname)
            param = self.parameter
            base, _ = ocrolib.allsplitext(fname)

            # progress messages are printed only when 'parallel' is below 2
            verbose = param['parallel'] < 2
            if verbose:
                print_info("=== %s " % (fname))
            flat = ocrolib.read_image_gray(img.filename)

            # estimate skew angle and rotate
            angle = 0
            max_skew = param['maxskew']
            if max_skew > 0:
                if verbose:
                    print_info("estimating skew angle")
                rows, cols = flat.shape
                skip_rows = int(param['bignore'] * rows)
                skip_cols = int(param['bignore'] * cols)
                # invert and shift so that the minimum becomes zero
                flat = amax(flat) - flat
                flat -= amin(flat)
                # the estimate ignores a margin of 'bignore' on every side
                inner = flat[skip_rows:rows - skip_rows,
                             skip_cols:cols - skip_cols]
                steps = int(2 * max_skew * param['skewsteps'])
                angle = self.estimate_skew_angle(
                    inner, linspace(-max_skew, max_skew, steps + 1))
                flat = interpolation.rotate(flat,
                                            angle,
                                            mode='constant',
                                            reshape=0)
                # undo the inversion
                flat = amax(flat) - flat

            # estimate low and high thresholds
            if verbose:
                print_info("estimating thresholds")
            rows, cols = flat.shape
            skip_rows = int(param['bignore'] * rows)
            skip_cols = int(param['bignore'] * cols)
            est = flat[skip_rows:rows - skip_rows, skip_cols:cols - skip_cols]
            escale = param['escale']
            if escale > 0:
                # by default, only areas with significant local variance are
                # used; this makes the percentile based low and high
                # estimates more reliable
                sigma = escale * 20.0
                v = est - filters.gaussian_filter(est, sigma)
                v = filters.gaussian_filter(v**2, sigma)**0.5
                v = (v > 0.3 * amax(v))
                extent = int(escale * 50)
                v = morphology.binary_dilation(v,
                                               structure=ones((extent, 1)))
                v = morphology.binary_dilation(v,
                                               structure=ones((1, extent)))
                if param['debug'] > 0:
                    imshow(v)
                    ginput(1, param['debug'])
                est = est[v]
            samples = est.ravel()
            lo = stats.scoreatpercentile(samples, param['lo'])
            hi = stats.scoreatpercentile(samples, param['hi'])

            # rescale the image to get the gray scale image
            if verbose:
                print_info("rescaling")
            flat -= lo
            flat /= (hi - lo)
            flat = clip(flat, 0, 1)
            if param['debug'] > 0:
                imshow(flat, vmin=0, vmax=1)
                ginput(1, param['debug'])
            deskewed = 1 * (flat > param['threshold'])

            # output the thresholded image
            print_info("%s lo-hi (%.2f %.2f) angle %4.1f" %
                       (pcgts.get_Page().imageFilename, lo, hi, angle))
            if verbose:
                print_info("writing")
            output_path = base + ".ds.png"
            ocrolib.write_image_binary(output_path, deskewed)

            # record the detected angle on the page
            pcgts.get_Page().add_TextRegion(TextRegionType(orientation=angle))

            # NOTE(review): the entry is declared as image/png, but its content
            # is the serialized PAGE document - confirm that this is intended.
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(ID=ID,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype="image/png",
                                    url=output_path,
                                    local_filename='%s/%s' %
                                    (self.output_file_grp, ID),
                                    content=to_xml(pcgts).encode('utf-8'))