Ejemplo n.º 1
0
 def process_page(self, page, page_image, page_xywh, bounds, file_id, page_id):
     """Annotate the page frame given by ``bounds`` and save the cropped image.

     ``bounds`` is a ``(left, top, right, bottom)`` box. It is enlarged by
     the ``padding`` parameter (limited to the size of ``page_image``),
     passed through ``coordinates_for_segment`` and ``polygon_for_parent``,
     and set as the Border of ``page``. The image is then cropped to the
     box of that border, saved as ``file_id + '.IMG-CROP'`` in the output
     fileGrp, and referenced as an AlternativeImage of ``page``. The
     ``features`` entry of ``page_xywh`` gets ``',cropped'`` appended.

     Nothing is annotated (an error is logged instead) when ``bounds`` is
     empty or when ``polygon_for_parent`` yields no polygon.
     """
     LOG = getLogger('processor.TesserocrCrop')
     x0, y0, x1, y1 = bounds
     if x0 >= x1 or y0 >= y1:
         LOG.error("Cannot find valid extent for page '%s'", page_id)
         return
     # grow the box by the padding, but stay inside the image:
     pad = self.parameter['padding']
     x0, x1 = max(x0 - pad, 0), min(x1 + pad, page_image.width)
     y0, y1 = max(y0 - pad, 0), min(y1 + pad, page_image.height)
     LOG.info("Padded page border: %i:%i,%i:%i", x0, x1, y0, y1)
     outline = polygon_for_parent(
         coordinates_for_segment(
             polygon_from_bbox(x0, y0, x1, y1), page_image, page_xywh),
         page)
     if outline is None:
         LOG.error("Ignoring extant border")
         return
     border = BorderType(Coords=CoordsType(points_from_polygon(outline)))
     # the polygon may differ from the padded box after the parent
     # intersection, so derive the crop box from the border itself:
     crop_box = bbox_from_polygon(
         coordinates_of_segment(border, page_image, page_xywh))
     # PAGE: annotate the border
     page.set_Border(border)
     # METS: add the cropped image file
     cropped = crop_image(page_image, box=crop_box)
     page_xywh['features'] += ',cropped'
     image_path = self.workspace.save_image_file(
         cropped,
         file_id + '.IMG-CROP',
         page_id=page_id,
         file_grp=self.output_file_grp)
     # PAGE: reference the cropped image file
     page.add_AlternativeImage(
         AlternativeImageType(filename=image_path,
                              comments=page_xywh['features']))
Ejemplo n.º 2
0
    def _process_page(self, page, page_image, page_xywh, input_file, zoom=1.0):
        """Detect the page frame, annotate it as Border and save the cropped image.

        The frame polygon comes from ``select_borderLine`` and is shrunk by
        the ``padding`` parameter. If that detector does not report a
        preferable result, ``detect_textboxes`` is consulted: when its
        (merged) result is a single box larger than ``columnAreaMin`` times
        the image area, that box, grown by ``padding``, replaces the polygon.

        The polygon is passed through ``coordinates_for_segment``, clamped to
        ``[0, page_image.width] x [0, page_image.height]`` and set as the
        Border of ``page``. The page image is then retrieved again from the
        workspace, saved as ``<file ID>.IMG-CROP`` in the output fileGrp and
        referenced as an AlternativeImage of ``page``.

        ``zoom`` is the factor by which the image is scaled before detection;
        detected coordinates are divided by it afterwards.
        """
        pad = self.parameter['padding']
        pixels = pil2array(page_image)
        if len(pixels.shape) == 2:
            # single channel: replicate into three channels
            pixels = np.stack([pixels, pixels, pixels], axis=-1)
        rows, cols, _ = pixels.shape
        size = rows * cols
        # zoom to 300 DPI (larger density: faster; most fixed parameters here expect 300)
        if zoom != 1.0:
            self.logger.info("scaling %dx%d image by %.2f", cols, rows, zoom)
            pixels = cv2.resize(pixels, None, fx=zoom, fy=zoom,
                                interpolation=cv2.INTER_CUBIC)

        # a ruler may be placed next to the page as scale reference:
        ruler_mask, ruler_box = self.detect_ruler(pixels)
        # page frame from the line segment detector:
        frame, prefer_border = self.select_borderLine(pixels, ruler_box)
        frame = np.array(frame) / zoom  # back to unzoomed coordinates
        # pad inwards:
        frame = Polygon(frame).buffer(-pad).exterior.coords[:-1]

        if prefer_border:
            self.logger.info("Preferring line detector")
        else:
            self.logger.info("Falling back to text detector")
            boxes = self.detect_textboxes(pixels, ruler_mask)
            if len(boxes) > 1:
                boxes = self.merge_boxes(boxes, pixels)
            boxes = np.array(boxes) / zoom  # back to unzoomed coordinates

            if len(boxes) == 1:
                min_area = self.parameter['columnAreaMin'] * size
                if min_area < self.get_area(boxes[0]):
                    self.logger.info("Using text area (%d%% area)",
                                     100 * self.get_area(boxes[0]) / size)
                    x0, y0, x1, y1 = boxes[0]
                    # pad outwards:
                    frame = polygon_from_bbox(x0 - pad, y0 - pad,
                                              x1 + pad, y1 + pad)

        frame = coordinates_for_segment(frame, page_image, page_xywh)
        # keep every point within the extent of the page image:
        frame = [(max(0, min(page_image.width, x)),
                  max(0, min(page_image.height, y)))
                 for x, y in frame]
        page.set_Border(
            BorderType(Coords=CoordsType(points_from_polygon(frame))))

        # fetch the image again, now that the page has a border:
        page_image, page_xywh, _ = self.workspace.image_from_page(
            page, input_file.pageId, fill='background', transparency=True)
        file_id = make_file_id(input_file, self.output_file_grp)
        image_path = self.workspace.save_image_file(
            page_image,
            file_id + '.IMG-CROP',
            page_id=input_file.pageId,
            file_grp=self.output_file_grp)
        page.add_AlternativeImage(
            AlternativeImageType(filename=image_path,
                                 comments=page_xywh['features']))
Ejemplo n.º 3
0
    def process_lines(self, textlines, maxlevel, region_image, region_coords):
        """Recognize the text of each line and annotate the result.

        For every line, the line image is extracted from ``region_image``,
        existing TextEquiv and Word annotation of the line is removed, and
        the image is passed to ``recognize``. The predicted text is added as
        TextEquiv of the line together with a confidence value. If
        ``maxlevel`` is ``'word'`` or ``'glyph'``, Word elements (and for
        ``'glyph'`` also Glyph elements) are added, each with a box spanning
        the full line height between the horizontal positions returned by
        ``recognize``.

        Lines whose image is less than 16 pixels high, and lines for which
        ``recognize`` raises, are skipped after their old annotation has
        been removed; this is reported in the log as an error.

        Args:
            textlines: iterable of line elements to process.
            maxlevel: ``'word'`` or ``'glyph'`` to also annotate below the
                line level; any other value annotates lines only.
            region_image: image of the parent region.
            region_coords: coordinate description belonging to
                ``region_image``, as expected by ``image_from_segment``.

        Returns:
            Tuple ``(edits, lengs)``: the sum of Levenshtein distances
            between each prediction and the line's previous text, and the
            sum of the lengths of the previous texts. Skipped lines do not
            contribute.
        """

        def strip_points(start, end, scale, image, coords):
            # Points of the full-height box between two horizontal
            # positions; the positions are divided by ``scale`` before
            # being mapped through ``coordinates_for_segment``.
            box = polygon_from_bbox(start / scale, 0, end / scale,
                                    image.height)
            return points_from_polygon(
                coordinates_for_segment(np.array(box), image, coords))

        edits = 0
        lengs = 0
        for line in textlines:
            line_image, line_coords = self.workspace.image_from_segment(
                line, region_image, region_coords)

            self.logger.info("Recognizing text in line '%s'", line.id)
            if line.get_TextEquiv():
                # a TextEquiv without Unicode content counts as empty text
                linegt = line.TextEquiv[0].Unicode or ''
            else:
                linegt = ''
            self.logger.debug("GT  '%s': '%s'", line.id, linegt)
            # remove existing annotation below line level:
            line.set_TextEquiv([])
            line.set_Word([])

            if line_image.size[1] < 16:
                self.logger.error(
                    "ERROR: bounding box is too narrow at line %s", line.id)
                continue
            # resize image (to 48 pixel height, according to the original
            # comment -- TODO confirm against resize_keep_ratio)
            final_img, scale = resize_keep_ratio(line_image)

            # process ocropy:
            try:
                linepred, clist, rlist, confidlist = recognize(final_img,
                                                               self.pad,
                                                               self.network,
                                                               check=True)
            except Exception as err:
                # best effort: report the failure and go on with the next line
                self.logger.error('error processing line "%s": %s', line.id,
                                  err)
                continue
            self.logger.debug("OCR '%s': '%s'", line.id, linepred)
            edits += Levenshtein.distance(linepred, linegt)
            lengs += len(linegt)

            words = [x.strip() for x in linepred.split(' ') if x.strip()]

            word_r_list = [[0]]  # r-positions of every glyph in every word
            word_conf_list = [[]]  # confidences of every glyph in every word
            if words:
                w_no = 0
                found_char = False
                for i, c in enumerate(clist):
                    if c != ' ':
                        found_char = True
                        word_conf_list[w_no].append(confidlist[i])
                        word_r_list[w_no].append(rlist[i])
                    elif (found_char and i + 1 < len(clist)
                          and clist[i + 1] != ' '):
                        # last space before the next word: its position
                        # becomes the start of that word
                        word_conf_list.append([])
                        word_r_list.append([rlist[i]])
                        w_no += 1
            else:
                # no text recognized: a single dummy entry spanning the line
                word_conf_list = [[0]]
                word_r_list = [[0, line_image.width]]

            # conf for each word: midpoint between lowest and highest glyph conf
            wordsconf = [(min(x) + max(x)) / 2 for x in word_conf_list]
            # conf for the line: midpoint between lowest and highest word conf
            line_conf = (min(wordsconf) + max(wordsconf)) / 2
            # line text
            line.add_TextEquiv(TextEquivType(Unicode=linepred, conf=line_conf))

            if maxlevel not in ['word', 'glyph']:
                continue
            for word_no, word_str in enumerate(words):
                positions = word_r_list[word_no]
                word_id = '%s_word%04d' % (line.id, word_no)
                word = WordType(
                    id=word_id,
                    Coords=CoordsType(
                        strip_points(positions[0], positions[-1], scale,
                                     line_image, line_coords)))
                line.add_Word(word)
                word.add_TextEquiv(
                    TextEquivType(Unicode=word_str, conf=wordsconf[word_no]))

                if maxlevel != 'glyph':
                    continue
                for glyph_no, glyph_str in enumerate(word_str):
                    glyph_id = '%s_glyph%04d' % (word.id, glyph_no)
                    glyph = GlyphType(
                        id=glyph_id,
                        Coords=CoordsType(
                            strip_points(positions[glyph_no],
                                         positions[glyph_no + 1], scale,
                                         line_image, line_coords)))
                    word.add_Glyph(glyph)
                    glyph.add_TextEquiv(
                        TextEquivType(
                            Unicode=glyph_str,
                            conf=word_conf_list[word_no][glyph_no]))
        return edits, lengs
Ejemplo n.º 4
0
    def process(self):
        """Performs page cropping with Tesseract on the workspace.

        Open and deserialize PAGE input files and their respective images.
        Set up Tesseract to detect text blocks on each page, and find
        the largest coordinate extent spanning all of them. Use this
        extent in defining a Border, and add that to the page.

        Moreover, crop the original image accordingly, and reference the
        resulting image file as AlternativeImage in the Page element.

        Add the new image file to the workspace along with the output fileGrp,
        and using a file ID with suffix ``.IMG-CROP`` along with further
        identification of the input element.

        Produce new output files by serialising the resulting hierarchy.
        The PAGE output file is written for every input file, even if no
        valid border could be determined (in which case an error is logged
        and the page is left without a new Border).
        """
        LOG = getLogger('processor.TesserocrCrop')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        padding = self.parameter['padding']
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            # disable table detection here (tables count as text blocks),
            # because we do not want to risk confusing the spine with
            # a column separator and thus creeping into a neighbouring
            # page:
            tessapi.SetVariable("textord_tabfind_find_tables", "0")
            for (n, input_file) in enumerate(self.input_files):
                page_id = input_file.pageId or input_file.ID
                # needed for the PAGE output below regardless of whether
                # a border is found, so determine it up front:
                file_id = make_file_id(input_file, self.output_file_grp)
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(
                    self.workspace.download_file(input_file))
                self.add_metadata(pcgts)
                page = pcgts.get_Page()

                # warn of existing Border:
                border = page.get_Border()
                if border:
                    left, top, right, bottom = bbox_from_points(
                        border.get_Coords().points)
                    # same x-range,y-range order as all other messages here
                    LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                                left, right, top, bottom)

                page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                    page,
                    page_id,
                    # image must not have been cropped already,
                    # abort if no such image can be produced:
                    feature_filter='cropped')
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info(
                        "Page '%s' images will use %d DPI from parameter override",
                        page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        dpi = round(dpi * 2.54)
                    LOG.info(
                        "Page '%s' images will use %d DPI from image meta-data",
                        page_id, dpi)
                else:
                    dpi = 0
                    LOG.info(
                        "Page '%s' images will use DPI estimated from segmentation",
                        page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))
                    zoom = 300 / dpi
                else:
                    zoom = 1

                # warn of existing segmentation (its extent is only
                # reported, not used for the border):
                regions = page.get_TextRegion()
                if regions:
                    min_x = page_image.width
                    min_y = page_image.height
                    max_x = 0
                    max_y = 0
                    for region in regions:
                        left, top, right, bottom = bbox_from_points(
                            region.get_Coords().points)
                        min_x = min(min_x, left)
                        min_y = min(min_y, top)
                        max_x = max(max_x, right)
                        max_y = max(max_y, bottom)
                    LOG.warning(
                        'Ignoring extent from existing TextRegions: %i:%i,%i:%i',
                        min_x, max_x, min_y, max_y)

                LOG.debug("Cropping with Tesseract")
                tessapi.SetImage(page_image)
                # PSM.SPARSE_TEXT: get as much text as possible in no particular order
                # PSM.AUTO (default): includes tables (dangerous)
                tessapi.SetPageSegMode(tesserocr.PSM.SPARSE_TEXT)
                # running extent of all accepted text blocks; starts out
                # inverted (empty), so it is only valid after an update:
                min_x = page_image.width
                min_y = page_image.height
                max_x = 0
                max_y = 0
                # iterate over all text blocks and compare their
                # bbox extent to the running min and max values
                for component in tessapi.GetComponentImages(
                        tesserocr.RIL.BLOCK, True):
                    image, xywh, index, _ = component
                    # identifier of the block, used in log messages only
                    ID = "region%04d" % index
                    left, top, right, bottom = bbox_from_xywh(xywh)
                    LOG.debug("Detected text region '%s': %i:%i,%i:%i", ID,
                              left, right, top, bottom)
                    # filter region results:
                    bin_bbox = image.getbbox()
                    if not bin_bbox:
                        # this does happen!
                        LOG.info(
                            "Ignoring region '%s' because its binarization is empty",
                            ID)
                        continue
                    width = bin_bbox[2] - bin_bbox[0]
                    if width < 25 / zoom:
                        # we must be conservative here: page numbers are tiny regions, too!
                        LOG.info(
                            "Ignoring region '%s' because its width is too small (%d)",
                            ID, width)
                        continue
                    height = bin_bbox[3] - bin_bbox[1]
                    if height < 25 / zoom:
                        # we must be conservative here: page numbers are tiny regions, too!
                        LOG.debug(
                            "Ignoring region '%s' because its height is too small (%d)",
                            ID, height)
                        continue
                    min_x = min(min_x, left)
                    min_y = min(min_y, top)
                    max_x = max(max_x, right)
                    max_y = max(max_y, bottom)
                    LOG.info("Updated page border: %i:%i,%i:%i", min_x, max_x,
                             min_y, max_y)

                #
                # set the identified page border
                #
                if min_x < max_x and min_y < max_y:
                    # add padding:
                    min_x = max(min_x - padding, 0)
                    max_x = min(max_x + padding, page_image.width)
                    min_y = max(min_y - padding, 0)
                    max_y = min(max_y + padding, page_image.height)
                    LOG.info("Padded page border: %i:%i,%i:%i", min_x, max_x,
                             min_y, max_y)
                    polygon = polygon_from_bbox(min_x, min_y, max_x, max_y)
                    polygon = coordinates_for_segment(polygon, page_image,
                                                      page_xywh)
                    polygon = polygon_for_parent(polygon, page)
                    if polygon is None:
                        # guard against polygon_for_parent yielding no
                        # polygon: leave the page without a new border
                        LOG.error("Ignoring extant border")
                    else:
                        border = BorderType(
                            Coords=CoordsType(points_from_polygon(polygon)))
                        # intersection with parent could have changed bbox,
                        # so recalculate:
                        bbox = bbox_from_polygon(
                            coordinates_of_segment(border, page_image,
                                                   page_xywh))
                        # update PAGE (annotate border):
                        page.set_Border(border)
                        # update METS (add the image file):
                        page_image = crop_image(page_image, box=bbox)
                        page_xywh['features'] += ',cropped'
                        file_path = self.workspace.save_image_file(
                            page_image,
                            file_id + '.IMG-CROP',
                            page_id=input_file.pageId,
                            file_grp=self.output_file_grp)
                        # update PAGE (reference the image file):
                        page.add_AlternativeImage(
                            AlternativeImageType(
                                filename=file_path,
                                comments=page_xywh['features']))
                else:
                    LOG.error("Cannot find valid extent for page '%s'",
                              page_id)

                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(ID=file_id,
                                        file_grp=self.output_file_grp,
                                        pageId=input_file.pageId,
                                        mimetype=MIMETYPE_PAGE,
                                        local_filename=os.path.join(
                                            self.output_file_grp,
                                            file_id + '.xml'),
                                        content=to_xml(pcgts))
Ejemplo n.º 5
0
    def process(self):
        """Extract page images and region descriptions (type and coordinates) from the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the region level.
        
        Get all regions with their types (region element class), sub-types (@type)
        and coordinates relative to the page (which depending on the workflow could
        already be cropped, deskewed, dewarped, binarized etc). Extract the image of
        the (cropped, deskewed, dewarped) page, both in binarized form (if available)
        and raw form. For the latter, apply ``feature_filter`` (a comma-separated list
        of image features, cf. :py:func:`ocrd.workspace.Workspace.image_from_page`)
        to skip specific features when retrieving derived images. If ``transparency``
        is true, then also add an alpha channel which is fully transparent outside of
        the mask.
        
        In addition, create a new (third) image with masks for each segment type in
        ``plot_segmasks``, color-coded by class according to ``colordict``.
        
        Create two JSON files with region types and coordinates: one (page-wise) in
        our custom format and one (global) in MS-COCO.
        
        The output file group may be given as a comma-separated list to separate
        these 3 kinds of images. If fewer than 3 fileGrps are specified, they will
        share the same fileGrp (and directory). In particular, write files as follows:
        * in the first (or only) output file group (directory):
          - ID + '.png': raw image of the page (preprocessed, but with ``feature_filter``)
          - ID + '.json': region coordinates/classes (custom format)
        * in the second (or only) output file group (directory):
          - ID + '.bin.png': binarized image of the (preprocessed) page, if available
        * in the third (or second or only) output file group (directory):
          - ID + '.pseg.png': mask image of page; contents depend on ``plot_segmasks``:
            1. if it contains `page`, fill page frame,
            2. if it contains `region`, fill region segmentation/classification,
            3. if it contains `line`, fill text line segmentation,
            4. if it contains `word`, fill word segmentation,
            5. if it contains `glyph`, fill glyph segmentation,
            where each follow-up layer and segment draws over the previous state, starting
            with a blank (white) image - unless ``plot_overlay`` is true, in which case
            each layer and segment is superimposed (alpha blended) onto the previous one,
            starting with the above raw image.
        
        In addition, write a file for all pages at once:
        * in the third (or first) output file group (directory):
          - output_file_grp + '.coco.json': region coordinates/classes (MS-COCO format)
          - output_file_grp + '.colordict.json': the used ``colordict``
        
        (This is intended for training and evaluation of region segmentation models.)
        """
        LOG = getLogger('processor.ExtractPages')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        file_groups = self.output_file_grp.split(',')
        if len(file_groups) > 3:
            raise Exception(
                "at most 3 output file grps allowed (raw, [binarized, [mask]] image)"
            )
        if len(file_groups) > 2:
            mask_image_grp = file_groups[2]
        else:
            mask_image_grp = file_groups[0]
            LOG.info(
                "No output file group for mask images specified, falling back to output filegrp '%s'",
                mask_image_grp)
        if len(file_groups) > 1:
            bin_image_grp = file_groups[1]
        else:
            bin_image_grp = file_groups[0]
            LOG.info(
                "No output file group for binarized images specified, falling back to output filegrp '%s'",
                bin_image_grp)
        self.output_file_grp = file_groups[0]
        classes = self.parameter['colordict']

        # COCO: init data structures
        images = list()
        annotations = list()
        categories = list()
        i = 0
        for cat, color in classes.items():
            # COCO format does not allow alpha channel
            color = (int(color[0:2], 16), int(color[2:4],
                                              16), int(color[4:6], 16))
            try:
                supercat, name = cat.split(':')
            except ValueError:
                name = cat
                supercat = ''
            categories.append({
                'id': i,
                'name': name,
                'supercategory': supercat,
                'source': 'PAGE',
                'color': color
            })
            i += 1

        i = 0
        # pylint: disable=attribute-defined-outside-init
        for n, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            try:
                # separate non-numeric part of page ID to retain the numeric part
                num_page_id = int(page_id.strip(page_id.strip("0123456789")))
            except Exception:
                num_page_id = n
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            ptype = page.get_type()
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page,
                page_id,
                feature_filter=self.parameter['feature_filter'],
                transparency=self.parameter['transparency'])
            if page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
            else:
                dpi = None
            file_id = make_file_id(input_file, self.output_file_grp)
            file_path = self.workspace.save_image_file(
                page_image,
                file_id,
                self.output_file_grp,
                page_id=page_id,
                mimetype=self.parameter['mimetype'])
            try:
                page_image_bin, _, _ = self.workspace.image_from_page(
                    page,
                    page_id,
                    feature_selector='binarized',
                    transparency=self.parameter['transparency'])
                self.workspace.save_image_file(page_image_bin,
                                               file_id + '.bin',
                                               bin_image_grp,
                                               page_id=page_id)
            except Exception as err:
                if err.args[0].startswith('Found no AlternativeImage'):
                    LOG.warning(
                        'Page "%s" has no binarized images, skipping .bin',
                        page_id)
                else:
                    raise
            # init multi-level mask output
            if self.parameter['plot_overlay']:
                page_image_segmask = page_image.convert('RGBA')
            else:
                page_image_segmask = Image.new(mode='RGBA',
                                               size=page_image.size,
                                               color='#FFFFFF00')
            neighbors = dict()
            for level in ['page', 'region', 'line', 'word', 'glyph']:
                neighbors[level] = list()
            # produce border mask plot, if necessary
            if page.get_Border():
                poly = segment_poly(page_id, page.get_Border(), page_coords)
            else:
                poly = Polygon(
                    polygon_from_bbox(0, 0, page_image.width,
                                      page_image.height))
            if 'page' in self.parameter['plot_segmasks']:
                plot_segment(page_id, page.get_Border(), poly, 'Border',
                             classes, page_image_segmask, [],
                             self.parameter['plot_overlay'])
            # get regions and aggregate masks on all hierarchy levels
            description = {'angle': page.get_orientation()}
            regions = dict()
            for name in classes.keys():
                if not name or not name.endswith('Region'):
                    # no region subtypes or non-region types here
                    continue
                #regions[name] = getattr(page, 'get_' + name)()
                regions[name] = page.get_AllRegions(classes=name[:-6],
                                                    order='reading-order')
            for rtype, rlist in regions.items():
                for region in rlist:
                    if rtype in ['TextRegion', 'ChartRegion', 'GraphicRegion']:
                        subrtype = region.get_type()
                    else:
                        subrtype = None
                    if subrtype:
                        rtype0 = rtype + ':' + subrtype
                    else:
                        rtype0 = rtype
                    poly = segment_poly(page_id, region, page_coords)
                    # produce region mask plot, if necessary
                    if 'region' in self.parameter['plot_segmasks']:
                        plot_segment(page_id, region, poly, rtype0, classes,
                                     page_image_segmask, neighbors['region'],
                                     self.parameter['plot_overlay'])
                    if rtype == 'TextRegion':
                        lines = region.get_TextLine()
                        for line in lines:
                            # produce line mask plot, if necessary
                            poly = segment_poly(page_id, line, page_coords)
                            if 'line' in self.parameter['plot_segmasks']:
                                plot_segment(page_id, line, poly, 'TextLine',
                                             classes, page_image_segmask,
                                             neighbors['line'],
                                             self.parameter['plot_overlay'])
                            words = line.get_Word()
                            for word in words:
                                # produce line mask plot, if necessary
                                poly = segment_poly(page_id, word, page_coords)
                                if 'word' in self.parameter['plot_segmasks']:
                                    plot_segment(
                                        page_id, word, poly, 'Word', classes,
                                        page_image_segmask, neighbors['word'],
                                        self.parameter['plot_overlay'])
                                glyphs = word.get_Glyph()
                                for glyph in glyphs:
                                    # produce line mask plot, if necessary
                                    poly = segment_poly(
                                        page_id, glyph, page_coords)
                                    if 'glyph' in self.parameter[
                                            'plot_segmasks']:
                                        plot_segment(
                                            page_id, glyph, poly, 'Glyph',
                                            classes, page_image_segmask,
                                            neighbors['glyph'],
                                            self.parameter['plot_overlay'])
                    if not poly:
                        continue
                    polygon = np.array(poly.exterior, np.int)[:-1].tolist()
                    xywh = xywh_from_polygon(polygon)
                    area = poly.area
                    description.setdefault('regions', []).append({
                        'type':
                        rtype,
                        'subtype':
                        subrtype,
                        'coords':
                        polygon,
                        'area':
                        area,
                        'features':
                        page_coords['features'],
                        'DPI':
                        dpi,
                        'region.ID':
                        region.id,
                        'page.ID':
                        page_id,
                        'page.type':
                        ptype,
                        'file_grp':
                        self.input_file_grp,
                        'METS.UID':
                        self.workspace.mets.unique_identifier
                    })
                    # COCO: add annotations
                    i += 1
                    annotations.append({
                        'id':
                        i,
                        'image_id':
                        num_page_id,
                        'category_id':
                        next(
                            (cat['id']
                             for cat in categories if cat['name'] == subrtype),
                            next((cat['id'] for cat in categories
                                  if cat['name'] == rtype))),
                        'segmentation':
                        np.array(poly.exterior,
                                 np.int)[:-1].reshape(1, -1).tolist(),
                        'area':
                        area,
                        'bbox': [xywh['x'], xywh['y'], xywh['w'], xywh['h']],
                        'iscrowd':
                        0
                    })

            if 'order' in self.parameter['plot_segmasks']:
                plot_order(page.get_ReadingOrder(), classes,
                           page_image_segmask, neighbors['region'],
                           self.parameter['plot_overlay'])
            if self.parameter['plot_segmasks']:
                self.workspace.save_image_file(
                    page_image_segmask,
                    file_id + '.pseg',
                    mask_image_grp,
                    page_id=page_id,
                    mimetype=self.parameter['mimetype'])
            self.workspace.add_file(
                ID=file_id + '.json',
                file_grp=mask_image_grp,
                pageId=input_file.pageId,
                local_filename=file_path.replace(
                    MIME_TO_EXT[self.parameter['mimetype']], '.json'),
                mimetype='application/json',
                content=json.dumps(description))

            # COCO: add image
            images.append({
                # COCO does not allow string identifiers:
                # -> use numerical part of page_id
                'id': num_page_id,
                # all exported coordinates are relative to the cropped page:
                # -> use that for reference (instead of original page.imageFilename)
                'file_name': file_path,
                # -> use its size (instead of original page.imageWidth/page.imageHeight)
                'width': page_image.width,
                'height': page_image.height
            })

        # COCO: write result
        file_id = mask_image_grp + '.coco.json'
        LOG.info('Writing COCO result file "%s" in "%s"', file_id,
                 mask_image_grp)
        self.workspace.add_file(ID=file_id,
                                file_grp=mask_image_grp,
                                local_filename=os.path.join(
                                    mask_image_grp, file_id),
                                mimetype='application/json',
                                pageId=None,
                                content=json.dumps({
                                    'categories': categories,
                                    'images': images,
                                    'annotations': annotations
                                }))

        # write inverse colordict (for ocrd-segment-from-masks)
        file_id = mask_image_grp + '.colordict.json'
        LOG.info('Writing colordict file "%s" in .', file_id)
        with open(file_id, 'w') as out:
            json.dump(
                dict((col, name) for name, col in classes.items() if name),
                out)
def polygon_from_segment(segment: RectSegment):
    """Convert a rectangular segment into a polygon.

    Reads the ``y_start``, ``x_start``, ``y_end`` and ``x_end`` attributes
    of ``segment`` and forwards them, in that order, to
    ``ocrd_utils.polygon_from_bbox``; whatever that helper returns is
    returned unchanged.
    """
    # imported lazily, so ocrd_utils is only required once this is called
    from ocrd_utils import polygon_from_bbox

    # NOTE(review): the y coordinates are passed ahead of the x coordinates;
    # presumably this matches RectSegment's axis convention -- confirm
    # against the parameter order expected by polygon_from_bbox.
    corners = (
        segment.y_start,
        segment.x_start,
        segment.y_end,
        segment.x_end,
    )
    return polygon_from_bbox(*corners)