コード例 #1
0
ファイル: test_utils.py プロジェクト: b2m/core
    def test_membername(self):
        """Check that ``membername`` maps an attribute value back to its name."""
        class Holder:
            def __init__(self):
                self.prop = 42

        # the only attribute holding 42 is ``prop``, so that name is expected
        resolved = membername(Holder(), 42)
        self.assertEqual(resolved, 'prop')
コード例 #2
0
 def _process_region(self, it, region, rogroup, region_image,
                     region_coords):
     """Annotate the text blocks found by ``it`` as sub-regions (cells) of ``region``.

     Walks the iterator at block level. Every block whose type is one of the
     text types handled below becomes a ``TextRegionType`` child of ``region``
     (ID: ``region.id`` plus a 4-digit running index) and, if ``rogroup`` is
     given, is also referenced there via a ``RegionRefIndexedType``.
     Blocks of any other type are skipped.

     Arguments:
         it: block-level layout iterator (must provide ``Empty``,
             ``BoundingBox``, ``BlockType`` and ``Next``); if falsy,
             nothing is added
         region: parent region receiving the new text regions
         rogroup: reading order group to extend, or a falsy value
         region_image: image of the parent region, passed on to
             ``coordinates_for_segment``
         region_coords: coordinate information for that image, passed on to
             ``coordinates_for_segment``
     """
     LOG = getLogger('processor.TesserocrSegmentTable')
     # equivalent to GetComponentImages with raw_image=True,
     # (which would also give raw coordinates),
     # except we are also interested in the iterator's BlockType() here,
     index = 0
     if rogroup:
         # continue counting after the largest index already used in the group
         for elem in (rogroup.get_RegionRefIndexed() +
                      rogroup.get_OrderedGroupIndexed() +
                      rogroup.get_UnorderedGroupIndexed()):
             if elem.index >= index:
                 index = elem.index + 1
     while it and not it.Empty(RIL.BLOCK):
         bbox = it.BoundingBox(RIL.BLOCK)
         polygon = polygon_from_x0y0x1y1(bbox)
         polygon = coordinates_for_segment(polygon, region_image,
                                           region_coords)
         points = points_from_polygon(polygon)
         coords = CoordsType(points=points)
         # if xywh['w'] < 30 or xywh['h'] < 30:
         #     LOG.info('Ignoring too small region: %s', points)
         #     it.Next(RIL.BLOCK)
         #     continue
         #
         # add the region reference in the reading order element
         # (but ignore non-text regions entirely)
         ID = region.id + "_%04d" % index
         # NOTE: the constructor keyword is ``type_`` (not ``type``);
         # paragraph is the default which FLOWING_TEXT keeps below
         subregion = TextRegionType(id=ID,
                                    Coords=coords,
                                    type_=TextTypeSimpleType.PARAGRAPH)
         block_type = it.BlockType()
         if block_type == PT.FLOWING_TEXT:
             pass
         elif block_type == PT.HEADING_TEXT:
             subregion.set_type(TextTypeSimpleType.HEADING)
         elif block_type == PT.PULLOUT_TEXT:
             subregion.set_type(TextTypeSimpleType.FLOATING)
         elif block_type == PT.CAPTION_TEXT:
             subregion.set_type(TextTypeSimpleType.CAPTION)
         elif block_type == PT.VERTICAL_TEXT:
             subregion.set_orientation(90.0)
         else:
             # not a text block: skip without consuming an index
             it.Next(RIL.BLOCK)
             continue
         LOG.info("Detected cell '%s': %s (%s)", ID, points,
                  membername(PT, block_type))
         region.add_TextRegion(subregion)
         if rogroup:
             rogroup.add_RegionRefIndexed(
                 RegionRefIndexedType(regionRef=ID, index=index))
         #
         # iterator increment
         #
         index += 1
         it.Next(RIL.BLOCK)
コード例 #3
0
 def _process_segment(self, tessapi, segment, image, xywh, where, page_id,
                      file_id):
     """Detect orientation, script and skew of ``segment`` and annotate the result.

     Runs orientation/script detection (``DetectOrientationScript``) and
     layout analysis (``AnalyseLayout``) on ``image``. Orientation and script
     results are only applied when their confidence is high enough. The
     resulting angle (OSD orientation plus layout deskewing angle, on top of
     the angle already applied to ``image``) is written to the segment's
     ``@orientation``; reading direction, text line order and primary script
     are set for pages and text regions. Finally, a new image is retrieved
     from the workspace, saved, and referenced as AlternativeImage.

     Returns early (annotating no orientation or image) when the image has
     zero size or when layout analysis yields no result.

     Arguments:
         tessapi: Tesseract API object used for detection
         segment: PAGE segment (page or region) to annotate
         image: image of the segment
         xywh: dict describing ``image``; its ``angle`` and ``features``
             entries are read here
         where: description of the segment for log messages
         page_id: page ID, used for image retrieval and for the saved file
         file_id: base ID of the image file to save
             (``.IMG-DESKEW`` is appended)
     """
     LOG = getLogger('processor.TesserocrDeskew')
     if not image.width or not image.height:
         LOG.warning("Skipping %s with zero size", where)
         return
     angle0 = xywh[
         'angle']  # deskewing (w.r.t. top image) already applied to image
     angle = 0.  # additional angle to be applied at current level
     tessapi.SetImage(image)
     #tessapi.SetPageSegMode(PSM.AUTO_OSD)
     #
     # orientation/script
     #
     osr = tessapi.DetectOrientationScript()
     if osr:
         assert not math.isnan(osr['orient_conf']), \
             "orientation detection failed (Tesseract probably compiled without legacy OEM, or osd model not installed)"
         if osr['orient_conf'] < self.parameter[
                 'min_orientation_confidence']:
             LOG.info(
                 'ignoring OSD orientation result %d° clockwise due to low confidence %.0f in %s',
                 osr['orient_deg'], osr['orient_conf'], where)
         else:
             LOG.info(
                 'applying OSD orientation result %d° clockwise with high confidence %.0f in %s',
                 osr['orient_deg'], osr['orient_conf'], where)
             # defined as 'the detected clockwise rotation of the input image'
             # i.e. the same amount to be applied counter-clockwise for deskewing:
             angle = osr['orient_deg']
         assert not math.isnan(osr['script_conf']), \
             "script detection failed (Tesseract probably compiled without legacy OEM, or osd model not installed)"
         if osr['script_conf'] < 10:
             LOG.info(
                 'ignoring OSD script result "%s" due to low confidence %.0f in %s',
                 osr['script_name'], osr['script_conf'], where)
         else:
             LOG.info(
                 'applying OSD script result "%s" with high confidence %.0f in %s',
                 osr['script_name'], osr['script_conf'], where)
             if isinstance(segment, (TextRegionType, PageType)):
                 # map the detected script name to the value annotated as
                 # @primaryScript (falling back to Latin for unknown names)
                 segment.set_primaryScript({
                     "Arabic": "Arab - Arabic",
                     "Armenian": "Armn - Armenian",
                     "Bengali": "Beng - Bengali",
                     "Canadian_Aboriginal":
                     "Cans - Unified Canadian Aboriginal Syllabics",
                     "Cherokee": "Cher - Cherokee",
                     "Common": "Latn - Latin",  # not in scripts/
                     "Cyrillic": "Cyrl - Cyrillic",
                     "Devanagari": "Deva - Devanagari (Nagari)",
                     "Ethiopic": "Ethi - Ethiopic",
                     "Fraktur": "Latf - Latin (Fraktur variant)",
                     "Georgian": "Geor - Georgian (Mkhedruli)",
                     "Greek": "Grek - Greek",
                     "Gujarati": "Gujr - Gujarati",
                     "Gurmukhi": "Guru - Gurmukhi",
                     "Han":
                     "Hant - Han (Traditional variant)",  # not in scripts/
                     "Hangul": "Hang - Hangul",
                     "Hangul_vert": "Hang - Hangul",
                     "HanS": "Hans - Han (Simplified variant)",
                     "HanS_vert": "Hans - Han (Simplified variant)",
                     "HanT": "Hant - Han (Traditional variant)",
                     "HanT_vert": "Hant - Han (Traditional variant)",
                     "Hebrew": "Hebr - Hebrew",
                     "Hiragana": "Jpan - Japanese",  # not in scripts/
                     "Japanese": "Jpan - Japanese",
                     "Japanese_vert": "Jpan - Japanese",
                     "Kannada": "Knda - Kannada",
                     "Katakana": "Jpan - Japanese",  # not in scripts/
                     "Khmer": "Khmr - Khmer",
                     "Lao": "Laoo - Lao",
                     "Latin": "Latn - Latin",
                     "Malayalam": "Mlym - Malayalam",
                     "Myanmar": "Mymr - Myanmar (Burmese)",
                     "Oriya": "Orya - Oriya",
                     "Sinhala": "Sinh - Sinhala",
                     "Syriac": "Syrc - Syriac",
                     "Tamil": "Taml - Tamil",
                     "Telugu": "Telu - Telugu",
                     "Thaana": "Thaa - Thaana",
                     "Thai": "Thai - Thai",
                     "Tibetan": "Tibt - Tibetan",
                     # NOTE(review): Tai Viet looks like the wrong script
                     # for Vietnamese (presumably Latin) - confirm
                     "Vietnamese": "Tavt - Tai Viet",
                 }.get(osr['script_name'], "Latn - Latin"))
     else:
         LOG.warning('no OSD result in %s', where)
     #
     # orientation/skew
     #
     layout = tessapi.AnalyseLayout()
     if not layout:
         LOG.warning('no result iterator in %s', where)
         return
     orientation, writing_direction, textline_order, deskew_angle = layout.Orientation(
     )
     if isinstance(segment, (TextRegionType, PageType)):
         # anything not listed explicitly is annotated as bottom-to-top
         segment.set_readingDirection({
             WritingDirection.LEFT_TO_RIGHT:
             'left-to-right',
             WritingDirection.RIGHT_TO_LEFT:
             'right-to-left',
             WritingDirection.TOP_TO_BOTTOM:
             'top-to-bottom'
         }.get(writing_direction, 'bottom-to-top'))
         segment.set_textLineOrder({
             TextlineOrder.LEFT_TO_RIGHT:
             'left-to-right',
             TextlineOrder.RIGHT_TO_LEFT:
             'right-to-left',
             TextlineOrder.TOP_TO_BOTTOM:
             'top-to-bottom'
         }.get(textline_order, 'bottom-to-top'))
     # baseline = layout.Baseline(RIL.BLOCK)
     # if baseline:
     #     points = points_from_x0y0x1y1(list(baseline[0]) + list(baseline[1]))
     #     segment.add_Baseline(BaselineType(points=points))
     # defined as 'how many radians does one have to rotate the block anti-clockwise'
     # i.e. positive amount to be applied counter-clockwise for deskewing:
     deskew_angle *= 180 / math.pi
     LOG.info('orientation/deskewing for %s: %s / %s / %s / %.3f°', where,
              membername(Orientation, orientation),
              membername(WritingDirection, writing_direction),
              membername(TextlineOrder, textline_order), deskew_angle)
     # defined as 'the amount of clockwise rotation to be applied to the input image'
     # i.e. the negative amount to be applied counter-clockwise for deskewing:
     # (as defined in Tesseract OrientationIdToValue):
     angle2 = {
         Orientation.PAGE_RIGHT: 90,
         Orientation.PAGE_DOWN: 180,
         Orientation.PAGE_LEFT: 270
     }.get(orientation, 0)
     if angle2 != angle:
         # This effectively ignores Orientation from AnalyseLayout,
         # because it is usually wrong when it deviates from OSD results.
         # (We do keep deskew_angle, though – see below.)
         LOG.warning(
             'inconsistent angles from layout analysis (%d) and orientation detection (%d) in %s',
             angle2, angle, where)
     # annotate result:
     angle += deskew_angle
     # page angle: PAGE @orientation is defined clockwise,
     # whereas PIL/ndimage rotation is in mathematical direction:
     orientation = -(angle + angle0)
     orientation = 180 - (180 - orientation) % 360  # map to [-179.999,180]
     segment.set_orientation(
         orientation)  # also removes all deskewed AlternativeImages
     # Tesseract layout analysis already rotates the image, even for each
     # sub-segment (depending on RIL), but the accuracy is not as good
     # as setting the image to the sub-segments and running without iterator.
     # (These images can be queried via GetBinaryImage/GetImage, cf. segment_region)
     # Unfortunately, it does _not_ use expand=True, but chops off corners.
     # So we must do it here from the original image ourselves.
     # We can delegate to OCR-D core for reflection, deskewing and re-cropping:
     if isinstance(segment, PageType):
         image, xywh, _ = self.workspace.image_from_page(segment,
                                                         page_id,
                                                         fill='background',
                                                         transparency=True)
     else:
         image, xywh = self.workspace.image_from_segment(segment,
                                                         image,
                                                         xywh,
                                                         fill='background',
                                                         transparency=True)
     if not angle:
         # zero rotation does not change coordinates,
         # but assures consuming processors that the
         # workflow had deskewing
         xywh['features'] += ',deskewed'
     features = xywh['features']  # features already applied to image
     # update METS (add the image file):
     file_path = self.workspace.save_image_file(
         image,
         file_id + '.IMG-DESKEW',
         page_id=page_id,
         file_grp=self.output_file_grp)
     # update PAGE (reference the image file):
     segment.add_AlternativeImage(
         AlternativeImageType(filename=file_path, comments=features))
コード例 #4
0
    def process(self):
        """Performs region segmentation by reading mask images in pseudo-colour.

        Open and deserialize each PAGE input file (or generate from image input file)
        from the first input file group, as well as mask image file from the second.

        Then iterate over all connected (equally colored) mask segments and compute
        contours for them. Convert them to (simplified) polygons, and look up their
        color value in ``colordict`` to instantiate the appropriate region types
        (optionally with subtype). Instantiate and annotate regions accordingly.

        Colors which are not mapped in ``colordict`` (or mapped to an empty name)
        are treated as background. Contours which are too small or degenerate
        are ignored with a warning.

        Produce a new output file by serialising the resulting hierarchy.

        Raises:
            Exception: if there are not exactly 2 input file groups, or
                if ``colordict`` names a class for which no ``<class>Type``
                is known in this module.
        """
        colordict = self.parameter['colordict']
        if not colordict:
            LOG.info('Using default PAGE colordict')
            colordict = dict(('#' + col, name)
                             for name, col in CLASSES.items()
                             if name)
        # enumerations of predefined subtypes, keyed by region class name
        # (keys must be class names as validated via ``classname + "Type"``
        # below, hence "ChartRegion" and not "ChartType")
        typedict = {"TextRegion": TextTypeSimpleType,
                    "GraphicRegion": GraphicsTypeSimpleType,
                    "ChartRegion": ChartTypeSimpleType}
        ifgs = self.input_file_grp.split(",") # input file groups
        if len(ifgs) != 2:
            raise Exception("need 2 input file groups (base and mask)")
        # collect input file tuples
        ifts = self.zip_input_files(ifgs) # input file tuples
        # process input file tuples
        for n, ift in enumerate(ifts):
            input_file, segmentation_file = ift
            LOG.info("processing page %s", input_file.pageId)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()

            # add metadata about this operation and its runtime parameters:
            metadata = pcgts.get_Metadata() # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(type_="processingStep",
                                 name=self.ocrd_tool['steps'][0],
                                 value=TOOL,
                                 Labels=[LabelsType(
                                     externalModel="ocrd-tool",
                                     externalId="parameters",
                                     Label=[LabelType(type_=name,
                                                      value=self.parameter[name])
                                            for name in self.parameter.keys()])]))

            # import mask image
            segmentation_filename = self.workspace.download_file(segmentation_file).local_filename
            with pushd_popd(self.workspace.directory):
                segmentation_pil = Image.open(segmentation_filename)
            has_alpha = segmentation_pil.mode == 'RGBA'
            if has_alpha:
                colorformat = "#%08X"
            else:
                colorformat = "#%06X"
                if segmentation_pil.mode != 'RGB':
                    segmentation_pil = segmentation_pil.convert('RGB')
            # convert to array
            segmentation_array = np.array(segmentation_pil)
            # collapse the color channels (3 or 4) into a single integer per pixel
            segmentation_array = segmentation_array.dot(
                np.array([2**24, 2**16, 2**8, 1], np.uint32)[0 if has_alpha else 1:])
            # partition mapped colors vs background
            colors = np.unique(segmentation_array)
            bgcolors = []
            for i, color in enumerate(colors):
                colorname = colorformat % color
                if (colorname not in colordict or
                    not colordict[colorname]):
                    #raise Exception("Unknown color %s (not in colordict)" % colorname)
                    LOG.info("Ignoring background color %s", colorname)
                    bgcolors.append(i)
            # binary mask of all pixels with a background color
            background = np.zeros_like(segmentation_array, np.uint8)
            if bgcolors:
                for i in bgcolors:
                    background += np.array(segmentation_array == colors[i], np.uint8)
                colors = np.delete(colors, bgcolors, 0)
            # iterate over mask for each mapped color/class
            regionno = 0
            for color in colors:
                # get region (sub)type
                colorname = colorformat % color
                classname = colordict[colorname]
                regiontype = None
                custom = None
                if ":" in classname:
                    classname, regiontype = classname.split(":")
                    if classname in typedict:
                        typename = membername(typedict[classname], regiontype)
                        if typename == regiontype:
                            # not predefined in PAGE: use other + custom
                            custom = "subtype:%s" % regiontype
                            regiontype = "other"
                    else:
                        custom = "subtype:%s" % regiontype
                if classname + "Type" not in globals():
                    raise Exception("Unknown class '%s' for color %s in colordict" % (classname, colorname))
                classtype = globals()[classname + "Type"]
                if classtype is BorderType:
                    # mask from all non-background regions
                    classmask = 1 - background
                else:
                    # mask from current color/class
                    classmask = np.array(segmentation_array == color, np.uint8)
                if not np.count_nonzero(classmask):
                    continue
                # now get the contours and make polygons for them
                contours, _ = cv2.findContours(classmask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                for contour in contours:
                    # (could also just take bounding boxes to avoid islands/inclusions...)
                    area = cv2.contourArea(contour)
                    # filter too small regions
                    area_pct = area / np.prod(segmentation_array.shape) * 100
                    if area < 100 and area_pct < 0.1:
                        LOG.warning('ignoring contour of only %.1f%% area for %s',
                                    area_pct, classname)
                        continue
                    LOG.info('found region %s:%s:%s with area %.1f%%',
                             classname, regiontype or '', custom or '', area_pct)
                    # simplify shape
                    poly = cv2.approxPolyDP(contour, 2, False)[:, 0, ::] # already ordered x,y
                    if len(poly) < 4:
                        LOG.warning('ignoring contour of only %d points (area %.1f%%) for %s',
                                    len(poly), area_pct, classname)
                        continue
                    if classtype is BorderType:
                        # add Border (only one per page: stop after the first usable contour)
                        page.set_Border(BorderType(Coords=CoordsType(points=points_from_polygon(poly))))
                        break
                    else:
                        # instantiate region
                        regionno += 1
                        region = classtype(id="region_%d" % regionno, type_=regiontype, custom=custom,
                                           Coords=CoordsType(points=points_from_polygon(poly)))
                        # add region
                        getattr(page, 'add_%s' % classname)(region)

            # Use input_file's basename for the new file -
            # this way the files retain the same basenames:
            file_id = input_file.ID.replace(ifgs[0], self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp,
                                            file_id + '.xml'),
                content=to_xml(pcgts))
コード例 #5
0
 def _process_page(self, it, page, page_image, page_coords, page_id):
     """Annotate the blocks found by ``it`` as regions of ``page``.

     Walks the iterator at block level. For each block, a polygon (the
     block polygon if parameter ``crop_polygons`` is set, otherwise the
     padded bounding box) is converted to page coordinates and fitted
     into the parent; blocks for which that fails are skipped. The block
     type then decides which kind of region (text, image, separator,
     maths, table or noise) is added to the page. All regions except
     separator and noise regions are also referenced in the top-level
     OrderedGroup of the page's ReadingOrder, which is created if missing
     and removed again if it ends up empty.

     Arguments:
         it: block-level layout iterator (must provide ``Empty``,
             ``BoundingBox``, ``BlockPolygon``, ``BlockType`` and
             ``Next``); if falsy, no regions are added
         page: PAGE page object to annotate
         page_image: image of the page, passed on to
             ``coordinates_for_segment``
         page_coords: coordinate information for that image, passed on to
             ``coordinates_for_segment``
         page_id: page ID (not used in this method)
     """
     LOG = getLogger('processor.TesserocrSegmentRegion')
     # equivalent to GetComponentImages with raw_image=True,
     # (which would also give raw coordinates),
     # except we are also interested in the iterator's BlockType() here,
     # and its BlockPolygon()
     index = 0
     ro = page.get_ReadingOrder()
     if not ro:
         ro = ReadingOrderType()
         page.set_ReadingOrder(ro)
     og = ro.get_OrderedGroup()
     if og:
         # start counting from largest existing index
         for elem in (og.get_RegionRefIndexed() +
                      og.get_OrderedGroupIndexed() +
                      og.get_UnorderedGroupIndexed()):
             if elem.index >= index:
                 index = elem.index + 1
     else:
         # new top-level group
         og = OrderedGroupType(id="reading-order")
         ro.set_OrderedGroup(og)
     while it and not it.Empty(RIL.BLOCK):
         # (padding will be passed to both BoundingBox and GetImage)
         # (actually, Tesseract honours padding only on the left and bottom,
         #  whereas right and top are increased less!)
         bbox = it.BoundingBox(RIL.BLOCK, padding=self.parameter['padding'])
         # sometimes these polygons are not planar, which causes
         # PIL.ImageDraw.Draw.polygon (and likely others as well)
         # to misbehave; however, PAGE coordinate semantics prohibit
         # multi-path polygons!
         # (probably a bug in Tesseract itself, cf. tesseract#2826):
         if self.parameter['crop_polygons']:
             polygon = it.BlockPolygon()
         else:
             polygon = polygon_from_x0y0x1y1(bbox)
         polygon = coordinates_for_segment(polygon, page_image, page_coords)
         polygon2 = polygon_for_parent(polygon, page)
         if polygon2 is not None:
             polygon = polygon2
         points = points_from_polygon(polygon)
         coords = CoordsType(points=points)
         if polygon2 is None:
             # no valid polygon within the parent: skip this block
             LOG.info('Ignoring extant region: %s', points)
             it.Next(RIL.BLOCK)
             continue
         # if xywh['w'] < 30 or xywh['h'] < 30:
         #     LOG.info('Ignoring too small region: %s', points)
         #     it.Next(RIL.BLOCK)
         #     continue
         # region_image_bin = it.GetBinaryImage(RIL.BLOCK)
         # if not region_image_bin.getbbox():
         #     LOG.info('Ignoring binary-empty region: %s', points)
         #     it.Next(RIL.BLOCK)
         #     continue
         #
         # add the region reference in the reading order element
         # (will be removed again if Separator/Noise region below)
         ID = "region%04d" % index
         og.add_RegionRefIndexed(
             RegionRefIndexedType(regionRef=ID, index=index))
         #
         # region type switch
         #
         block_type = it.BlockType()
         if block_type in [
                 PT.FLOWING_TEXT,
                 PT.HEADING_TEXT,
                 PT.PULLOUT_TEXT,
                 PT.CAPTION_TEXT,
                 # TABLE is contained in PTIsTextType, but
                 # it is a bad idea to create a TextRegion
                 # for it (better set `find_tables` False):
                 # PT.TABLE,
                 # will also get a 90° @orientation
                 # (but that can be overridden by deskew/OSD):
                 PT.VERTICAL_TEXT
         ]:
             # NOTE: the constructor keyword is ``type_`` (not ``type``);
             # paragraph is the default, refined by block type below
             region = TextRegionType(id=ID,
                                     Coords=coords,
                                     type_=TextTypeSimpleType.PARAGRAPH)
             if block_type == PT.VERTICAL_TEXT:
                 region.set_orientation(90.0)
             elif block_type == PT.HEADING_TEXT:
                 region.set_type(TextTypeSimpleType.HEADING)
             elif block_type == PT.PULLOUT_TEXT:
                 region.set_type(TextTypeSimpleType.FLOATING)
             elif block_type == PT.CAPTION_TEXT:
                 region.set_type(TextTypeSimpleType.CAPTION)
             page.add_TextRegion(region)
             if self.parameter['sparse_text']:
                 # in sparse text mode, each region gets type "other" and
                 # a single line with the same coordinates
                 region.set_type(TextTypeSimpleType.OTHER)
                 region.add_TextLine(
                     TextLineType(id=region.id + '_line', Coords=coords))
         elif block_type in [
                 PT.FLOWING_IMAGE, PT.HEADING_IMAGE, PT.PULLOUT_IMAGE
         ]:
             region = ImageRegionType(id=ID, Coords=coords)
             page.add_ImageRegion(region)
         elif block_type in [PT.HORZ_LINE, PT.VERT_LINE]:
             region = SeparatorRegionType(id=ID, Coords=coords)
             page.add_SeparatorRegion(region)
             # undo appending in ReadingOrder
             og.set_RegionRefIndexed(og.get_RegionRefIndexed()[:-1])
         elif block_type in [PT.INLINE_EQUATION, PT.EQUATION]:
             region = MathsRegionType(id=ID, Coords=coords)
             page.add_MathsRegion(region)
         elif block_type == PT.TABLE:
             # without API access to StructuredTable we cannot
             # do much for a TableRegionType (i.e. nrows, ncols,
             # coordinates of cells for recursive regions etc),
             # but this can be achieved afterwards by segment-table
             region = TableRegionType(id=ID, Coords=coords)
             page.add_TableRegion(region)
         else:
             region = NoiseRegionType(id=ID, Coords=coords)
             # the region must be passed, otherwise nothing gets added
             page.add_NoiseRegion(region)
             # undo appending in ReadingOrder
             og.set_RegionRefIndexed(og.get_RegionRefIndexed()[:-1])
         LOG.info("Detected region '%s': %s (%s)", ID, points,
                  membername(PT, block_type))
         #
         # iterator increment
         #
         index += 1
         it.Next(RIL.BLOCK)
     if (not og.get_RegionRefIndexed() and not og.get_OrderedGroupIndexed()
             and not og.get_UnorderedGroupIndexed()):
         # schema forbids empty OrderedGroup
         ro.set_OrderedGroup(None)
コード例 #6
0
    def process(self):
        """Performs region segmentation by reading from COCO annotations.
        
        Open and deserialize the COCO JSON file from the second input file group.
        (It lists region categories/subtypes, file names and segmentations for all pages.)
        
        Open and deserialize each PAGE input file (or generate from image input file)
        from the first input file group. Now find this page in COCO:
        - try to match the PAGE ``imageFilename`` or METS file path matches to some
          COCO ``file_name``, otherwise
        - try to match the numeric part of the METS physical page ID to some
          COCO ``id``, otherwise
        - skip with an error.
        
        Then create and add a region for each ``segmentation``, converting its polygon
        to coordinate points and its COCO category to a region type (and subtype),
        either for a PubLayNet classification or PAGE classification (as produced by
        ocrd-segment-extract-pages), as indicated by ``source``.
        
        Produce a new output file by serialising the resulting hierarchy.
        
        Afterwards, if there are still COCO images left unaccounted for (i.e. without
        corresponding input files), then show a warning.
        """
        LOG = getLogger('processor.ImportCOCOSegmentation')
        # Load JSON
        assert_file_grp_cardinality(self.input_file_grp, 2, 'base and COCO')
        # pylint: disable=attribute-defined-outside-init
        self.input_file_grp, coco_grp = self.input_file_grp.split(',')
        # pylint: disable=attribute-defined-outside-init
        if not self.input_files:
            LOG.warning('No input files to process')
            return
        if coco_grp in self.workspace.mets.file_groups:
            try:
                cocofile = next(
                    f for f in self.workspace.mets.find_files(fileGrp=coco_grp)
                    # if f.mimetype == 'application/json' and not f.pageId
                    if not f.pageId)
            except StopIteration:
                raise Exception(
                    "no non-page-specific file in second file group (COCO file)",
                    coco_grp)
            cocofile = self.workspace.download_file(cocofile).local_filename
        elif os.path.isfile(coco_grp):
            cocofile = coco_grp
        else:
            raise Exception("file not found in second file group (COCO file)",
                            coco_grp)

        LOG.info('Loading COCO annotations from "%s" into memory...', cocofile)
        with open(cocofile, 'r') as inp:
            coco = json.load(inp)
        LOG.info('Loaded JSON for %d images with %d regions in %d categories',
                 len(coco['images']), len(coco['annotations']),
                 len(coco['categories']))
        coco_source = 'PubLayNet'
        # Convert to usable dicts
        # classes:
        categories = dict()
        subcategories = dict()
        for cat in coco['categories']:
            if cat['source'] == 'PAGE':
                coco_source = 'PAGE'
            if 'supercategory' in cat and cat['supercategory']:
                categories[cat['id']] = cat['supercategory']
                subcategories[cat['id']] = cat['name']
            else:
                categories[cat['id']] = cat['name']
        # images and annotations:
        images_by_id = dict()
        images_by_filename = dict()
        for image in coco['images']:
            images_by_id[image['id']] = image
            images_by_filename[image['file_name']] = image
        for annotation in coco['annotations']:
            image = images_by_id[annotation['image_id']]
            regions = image.setdefault('regions', list())
            regions.append(annotation)
        del coco

        LOG.info('Converting %s annotations into PAGE-XML', coco_source)
        for n, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            num_page_id = int(page_id.strip(page_id.strip("0123456789")))
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()

            # find COCO image
            if page.imageFilename in images_by_filename:
                image = images_by_filename[page.imageFilename]
            elif num_page_id in images_by_id:
                image = images_by_id[num_page_id]
            else:
                LOG.error('Page "%s" / file "%s" not found in COCO', page_id,
                          page.imageFilename)
                # todo: maybe we should at least write the (unchanged) output PAGE?
                continue
            if image['width'] != page.imageWidth:
                LOG.error(
                    'Page "%s" width %d does not match annotated width %d',
                    page_id, page.imageWidth, image['width'])
            if image['height'] != page.imageHeight:
                LOG.error(
                    'Page "%s" height %d does not match annotated height %d',
                    page_id, page.imageHeight, image['height'])

            # todo: remove existing segmentation first?
            for region in image['regions']:
                assert isinstance(
                    region['segmentation'],
                    list), "importing RLE/mask segmentation not implemented"
                polygon = np.array(region['segmentation'])
                polygon = np.reshape(polygon, (polygon.shape[1] // 2, 2))
                coords = CoordsType(points=points_from_polygon(polygon))
                category = categories[region['category_id']]
                if region['category_id'] in subcategories:
                    subcategory = subcategories[region['category_id']]
                else:
                    subcategory = None
                region_id = 'r' + str(region['id'])
                LOG.info('Adding region %s:%s [area %d]', category, subcategory
                         or '', region['area'])
                if coco_source == 'PubLayNet':
                    if category == 'text':
                        region_obj = TextRegionType(
                            id=region_id,
                            Coords=coords,
                            type_=TextTypeSimpleType.PARAGRAPH)
                        page.add_TextRegion(region_obj)
                    elif category == 'title':
                        region_obj = TextRegionType(
                            id=region_id,
                            Coords=coords,
                            type_=TextTypeSimpleType.HEADING)  # CAPTION?
                        page.add_TextRegion(region_obj)
                    elif category == 'list':
                        region_obj = TextRegionType(
                            id=region_id,
                            Coords=coords,
                            type_=TextTypeSimpleType.LISTLABEL)  # OTHER?
                        page.add_TextRegion(region_obj)
                    elif category == 'table':
                        region_obj = TableRegionType(id=region_id,
                                                     Coords=coords)
                        page.add_TableRegion(region_obj)
                    elif category == 'figure':
                        region_obj = ImageRegionType(id=region_id,
                                                     Coords=coords)
                        page.add_ImageRegion(region_obj)
                    else:
                        raise Exception('unknown region category: %s' %
                                        category)
                else:  # 'PAGE'
                    args = {'id': region_id, 'Coords': coords}
                    if subcategory:
                        typedict = {
                            "TextRegion": TextTypeSimpleType,
                            "GraphicRegion": GraphicsTypeSimpleType,
                            "ChartType": ChartTypeSimpleType
                        }
                        if category in typedict:
                            subtype = membername(typedict[category],
                                                 subcategory)
                            if subtype == subcategory:
                                # not predefined in PAGE: use other + custom
                                args['custom'] = "subtype:%s" % subcategory
                                args['type_'] = "other"
                            else:
                                args['type_'] = subcategory
                        else:
                            args['custom'] = "subtype:%s" % subcategory
                    if category + 'Type' not in globals():
                        raise Exception('unknown region category: %s' %
                                        category)
                    region_type = globals()[category + 'Type']
                    if region_type is BorderType:
                        page.set_Border(BorderType(Coords=coords))
                    else:
                        region_obj = region_type(**args)
                        getattr(page, 'add_%s' % category)(region_obj)
            # remove image from dicts
            images_by_id.pop(num_page_id, None)
            images_by_filename.pop(page.imageFilename, None)

            file_id = make_file_id(input_file, self.output_file_grp)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))

        # warn of remaining COCO images
        if images_by_filename and not self.page_id:
            LOG.warning('%d images remain unaccounted for after processing',
                        len(images_by_filename))
            if LOG.isEnabledFor(logging.DEBUG):
                for filename in images_by_filename:
                    LOG.debug('not found in workspace: "%s"', filename)