Example #1
def _split_word_at_glyph(word, glyph):
    prev_ = WordType(id=word.id + '_l')
    next_ = WordType(id=word.id + '_r')
    xywh_glyph = xywh_from_points(glyph.get_Coords().points)
    xywh_word = xywh_from_points(word.get_Coords().points)
    xywh_prev = xywh_word.copy()
    xywh_prev.update({'w': xywh_glyph['x'] - xywh_word['x']})
    prev_.set_Coords(CoordsType(points=points_from_xywh(xywh_prev)))
    xywh_next = xywh_word.copy()
    xywh_next.update({
        'x': xywh_glyph['x'] + xywh_glyph['w'],  # next_ starts at the glyph's right edge
        'w': xywh_word['w'] - xywh_prev['w'] - xywh_glyph['w']
    })
    next_.set_Coords(CoordsType(points=points_from_xywh(xywh_next)))
    if word.get_language():
        prev_.set_language(word.get_language())
        next_.set_language(word.get_language())
    if word.get_TextStyle():
        prev_.set_TextStyle(word.get_TextStyle())
        next_.set_TextStyle(word.get_TextStyle())
    glyphs = word.get_Glyph()
    pos = glyphs.index(glyph)
    prev_.set_Glyph(glyphs[0:pos])
    next_.set_Glyph(glyphs[pos + 1:])
    # TextEquiv: will be overwritten by page_update_higher_textequiv_levels
    return prev_, next_
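The geometry above leans on the ocrd_utils bounding-box helpers. A minimal round-trip sketch of the behavior this code assumes (not an authoritative spec of ocrd_utils):

# Sketch only: assumed behavior of the ocrd_utils helpers used above.
from ocrd_utils import points_from_xywh, xywh_from_points

xywh = xywh_from_points("10,20 110,20 110,70 10,70")
assert xywh == {'x': 10, 'y': 20, 'w': 100, 'h': 50}          # top-left corner plus size
assert points_from_xywh(xywh) == "10,20 110,20 110,70 10,70"  # clockwise from top-left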
Example #2
def _split_word_at_space(word):
    prev_ = WordType(id=word.id + '_l')
    next_ = WordType(id=word.id + '_r')
    xywh = xywh_from_points(word.get_Coords().points)
    textequiv = word.get_TextEquiv()[0]
    pos = textequiv.Unicode.index(" ")
    fract = pos / len(textequiv.Unicode)
    xywh_prev = xywh.copy()
    xywh_prev.update({'w': xywh['w'] * fract})
    prev_.set_Coords(CoordsType(points=points_from_xywh(xywh_prev)))
    xywh_next = xywh.copy()
    xywh_next.update({
        'x': xywh['x'] + xywh['w'] * fract,
        'w': xywh['w'] * (1 - fract)
    })
    next_.set_Coords(CoordsType(points=points_from_xywh(xywh_next)))
    if word.get_language():
        prev_.set_language(word.get_language())
        next_.set_language(word.get_language())
    if word.get_TextStyle():
        prev_.set_TextStyle(word.get_TextStyle())
        next_.set_TextStyle(word.get_TextStyle())
    # Glyphs: irrelevant at this processing level
    textequiv_prev = TextEquivType(Unicode=textequiv.Unicode[0:pos],
                                   conf=textequiv.conf)
    textequiv_next = TextEquivType(Unicode=textequiv.Unicode[pos + 1:],
                                   conf=textequiv.conf)
    prev_.set_TextEquiv([textequiv_prev])
    next_.set_TextEquiv([textequiv_next])
    return prev_, next_
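A hypothetical driver for the split above (the word and its box are invented; ocrd_models is assumed as in the other examples): the text is cut at the first space and the box divided proportionally.

# Illustrative only: exercising _split_word_at_space on a made-up word.
from ocrd_models.ocrd_page import CoordsType, TextEquivType, WordType

word = WordType(id='w1', Coords=CoordsType(points="0,0 70,0 70,20 0,20"))
word.add_TextEquiv(TextEquivType(Unicode="foo bar", conf=0.9))
prev_, next_ = _split_word_at_space(word)
assert prev_.get_TextEquiv()[0].Unicode == "foo"  # left part, 3/7 of the width
assert next_.get_TextEquiv()[0].Unicode == "bar"  # right part, after the space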
Example #3
    def process(self):
        """
        Segment with ocropy
        """

        for (n, input_file) in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            downloaded_file = self.workspace.download_file(input_file)
            log.info("downloaded_file %s", downloaded_file)
            pcgts = page_from_file(downloaded_file)
            page_width = pcgts.get_Page().get_imageWidth()
            page_height = pcgts.get_Page().get_imageHeight()
            # TODO binarized variant from get_AlternativeImage()
            image_url = pcgts.get_Page().imageFilename
            log.info("pcgts %s", pcgts)

            binary = ocrolib.read_image_binary(
                self.workspace.download_url(image_url))
            binary = 1 - binary

            scale = self.parameter['scale'] if self.parameter[
                'scale'] != 0 else psegutils.estimate_scale(binary)
            log.debug(binary)

            pseg = self.compute_segmentation(binary, scale)
            log.debug("pseg=%s", pseg)

            # TODO reading order / enumber
            #  log.debug("finding reading order")
            #  lines = psegutils.compute_lines(pseg, scale)
            #  order = psegutils.reading_order([l.bounds for l in lines])
            #  lsort = psegutils.topsort(order)

            regions = ocrolib.RegionExtractor()
            regions.setPageLines(pseg)

            dummyRegion = TextRegionType(
                id="dummy",
                Coords=CoordsType(
                    points="0,0 %s,0 %s,%s 0,%s" %
                    (page_width, page_width, page_height, page_height)))
            pcgts.get_Page().add_TextRegion(dummyRegion)

            for lineno in range(1, regions.length()):
                log.debug("id=%s bbox=%s", regions.id(lineno),
                          regions.bbox(lineno))
                textline = TextLineType(
                    id=concat_padded("line", lineno),
                    Coords=CoordsType(
                        points=points_from_y0x0y1x1(regions.bbox(lineno))))
                dummyRegion.add_TextLine(textline)
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(ID=ID,
                                    file_grp=self.output_file_grp,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename="%s/%s.xml" %
                                    (self.output_file_grp, ID),
                                    content=to_xml(pcgts))
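ocropy's RegionExtractor reports bounding boxes in (y0, x0, y1, x1) order, hence the conversion helper. A sketch of the mapping this code assumes:

# Sketch only: assumed mapping from an ocropy (y0, x0, y1, x1) box to
# a clockwise PAGE points string starting at the top-left corner.
from ocrd_utils import points_from_y0x0y1x1

assert points_from_y0x0y1x1([10, 5, 30, 80]) == "5,10 80,10 80,30 5,30"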
Example #4
 def process(self):
     """
     Performs the line segmentation.
     """
     with PyTessBaseAPI(
         psm=PSM.SINGLE_LINE,
         path=TESSDATA_PREFIX,
     ) as tessapi:
         for (n, input_file) in enumerate(self.input_files):
             pcgts = page_from_file(self.workspace.download_file(input_file))
             image_url = pcgts.get_Page().imageFilename
             for region in pcgts.get_Page().get_TextRegion():
                 for line in region.get_TextLine():
                     log.debug("Detecting words in line '%s'", line.id)
                     image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(line.get_Coords().points))
                     tessapi.SetImage(image)
                     offset = xywh_from_points(line.get_Coords().points)
                     for (word_no, component) in enumerate(tessapi.GetComponentImages(RIL.WORD, True)):
                         word_id = '%s_word%04d' % (line.id, word_no)
                         word_xywh = component[1]
                         word_xywh['x'] += offset['x']
                         word_xywh['y'] += offset['y']
                         line.add_Word(WordType(id=word_id, Coords=CoordsType(points_from_xywh(word_xywh))))
             ID = concat_padded(self.output_file_grp, n)
             self.workspace.add_file(
                 ID=ID,
                 file_grp=self.output_file_grp,
                 local_filename='%s/%s' % (self.output_file_grp, ID),
                 mimetype=MIMETYPE_PAGE,
                 content=to_xml(pcgts).encode('utf-8'),
             )
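The bounding boxes that GetComponentImages yields are relative to the image passed to SetImage, which is why the line's offset is added back. Shape of one component tuple as consumed above (a sketch, not the tesserocr documentation):

# Sketch: tessapi.GetComponentImages(RIL.WORD, True) yields tuples like
#   (pil_image, {'x': int, 'y': int, 'w': int, 'h': int}, block_id, paragraph_id)
# component[1] is the bbox in cropped-line coordinates; adding the parent
# line's x/y offset converts it back to page coordinates.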
Example #5
 def process(self):
     """
     Performs the line segmentation.
     """
     with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
         for (n, input_file) in enumerate(self.input_files):
             pcgts = page_from_file(
                 self.workspace.download_file(input_file))
             image_url = pcgts.get_Page().imageFilename
             for region in pcgts.get_Page().get_TextRegion():
                 log.debug("Detecting lines in %s with tesseract",
                           region.id)
                 image = self.workspace.resolve_image_as_pil(
                     image_url,
                     polygon_from_points(region.get_Coords().points))
                 tessapi.SetImage(image)
                 offset = xywh_from_points(region.get_Coords().points)
                 for (line_no, component) in enumerate(
                         tessapi.GetComponentImages(RIL.TEXTLINE, True)):
                     line_id = '%s_line%04d' % (region.id, line_no)
                     line_xywh = component[1]
                     line_xywh['x'] += offset['x']
                     line_xywh['y'] += offset['y']
                     line_points = points_from_xywh(line_xywh)
                     region.add_TextLine(
                         TextLineType(id=line_id,
                                      Coords=CoordsType(line_points)))
             ID = concat_padded(self.output_file_grp, n)
             self.workspace.add_file(
                 ID=ID,
                 file_grp=self.output_file_grp,
                 mimetype=MIMETYPE_PAGE,
                 local_filename='%s/%s' % (self.output_file_grp, ID),
                 content=to_xml(pcgts).encode('utf-8'),
             )
Example #6
 def test_create_with_warning(self):
     ds = SeparatorRegionType(id='r6', Coords=CoordsType(points="239,1303 508,1303 899,1302 1626,1307 2441,1307 2444,1319 2414,1322 1664,1319 619,1317 235,1317 237,1302 235,1302"))
     with self.assertLogs('ocrd_browser.model.page_xml_renderer', level='WARNING') as log_watch:
         region = self.factory.create(ds)
     self.assertIsNotNone(region)
     self.assertRegex(log_watch.output[0], r'WARNING:ocrd_browser\.model\.page_xml_renderer\.RegionFactory:Page "DUMMY_0001" @ SeparatorRegion#r6 Self-intersection.+')
     self.assertRegex(region.warnings[0], r'Self-intersection.+')
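The warning under test flags self-intersecting outlines. A minimal sketch of such a check with shapely, assuming that is roughly what the renderer's RegionFactory does:

# Sketch: a bowtie polygon crosses itself; shapely reports it as invalid.
from shapely.geometry import Polygon
from shapely.validation import explain_validity

bowtie = Polygon([(0, 0), (2, 2), (2, 0), (0, 2)])
assert not bowtie.is_valid
print(explain_validity(bowtie))  # e.g. 'Self-intersection[...]'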
Example #7
    def process(self):
        """
        Performs the cropping.
        """
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            #  print(self.input_file_grp)
            for (n, input_file) in enumerate(self.input_files):
                #  print(input_file)
                pcgts = page_from_file(self.workspace.download_file(input_file))
                image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename)
                log.debug("Cropping with tesseract")
                tessapi.SetImage(image)
                
                #
                # helper variables for saving the box coordinates
                #
                min_x = image.width
                min_y = image.height
                max_x = 0
                max_y = 0

                # iterate over all boxes and compare their extent
                # to the min and max values
                for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
                    points, index = points_from_xywh(component[1]), component[2]

                    #
                    # the region reference in the reading order element
                    #
                    ID = "region%04d" % index
                    log.debug("Detected region '%s': %s", ID, points)

                    for pair in points.split(' '):
                        x, y = (int(pair.split(',')[0]), int(pair.split(',')[1]))
                        # each bound must be checked independently, otherwise
                        # a point can update one extreme and mask the others
                        if x < min_x:
                            min_x = x
                        if x > max_x:
                            max_x = x
                        if y < min_y:
                            min_y = y
                        if y > max_y:
                            max_y = y
                    log.debug("Updated page border: %i,%i %i,%i %i,%i %i,%i",
                              min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y)

                #
                # set the identified page border
                #
                brd = BorderType(Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" % (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y)))
                pcgts.get_Page().set_Border(brd)

                ID = concat_padded(self.output_file_grp, n)
                self.workspace.add_file(
                    ID=ID,
                    file_grp=self.output_file_grp,
                    mimetype=MIMETYPE_PAGE,
                    local_filename='%s/%s' % (self.output_file_grp, ID),
                    content=to_xml(pcgts).encode('utf-8'),
                )
Example #8
 def _process_region(self, it, region, rogroup, region_image,
                     region_coords):
     LOG = getLogger('processor.TesserocrSegmentTable')
     # equivalent to GetComponentImages with raw_image=True,
     # (which would also give raw coordinates),
     # except we are also interested in the iterator's BlockType() here,
     index = 0
     if rogroup:
         for elem in (rogroup.get_RegionRefIndexed() +
                      rogroup.get_OrderedGroupIndexed() +
                      rogroup.get_UnorderedGroupIndexed()):
             if elem.index >= index:
                 index = elem.index + 1
     while it and not it.Empty(RIL.BLOCK):
         bbox = it.BoundingBox(RIL.BLOCK)
         polygon = polygon_from_x0y0x1y1(bbox)
         polygon = coordinates_for_segment(polygon, region_image,
                                           region_coords)
         points = points_from_polygon(polygon)
         coords = CoordsType(points=points)
         # if xywh['w'] < 30 or xywh['h'] < 30:
         #     LOG.info('Ignoring too small region: %s', points)
         #     it.Next(RIL.BLOCK)
         #     continue
         #
         # add the region reference in the reading order element
         # (but ignore non-text regions entirely)
         ID = region.id + "_%04d" % index
         subregion = TextRegionType(id=ID,
                                    Coords=coords,
                                    type=TextTypeSimpleType.PARAGRAPH)
         block_type = it.BlockType()
         if block_type == PT.FLOWING_TEXT:
             pass
         elif block_type == PT.HEADING_TEXT:
             subregion.set_type(TextTypeSimpleType.HEADING)
         elif block_type == PT.PULLOUT_TEXT:
             subregion.set_type(TextTypeSimpleType.FLOATING)
         elif block_type == PT.CAPTION_TEXT:
             subregion.set_type(TextTypeSimpleType.CAPTION)
         elif block_type == PT.VERTICAL_TEXT:
             subregion.set_orientation(90.0)
         else:
             it.Next(RIL.BLOCK)
             continue
         LOG.info("Detected cell '%s': %s (%s)", ID, points,
                  membername(PT, block_type))
         region.add_TextRegion(subregion)
         if rogroup:
             rogroup.add_RegionRefIndexed(
                 RegionRefIndexedType(regionRef=ID, index=index))
         #
         # iterator increment
         #
         index += 1
         it.Next(RIL.BLOCK)
Example #9
    def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n):
        img_array = ocrolib.pil2array(page_image)

        # Check if image is RGB or not #FIXME: check not needed anymore?
        if len(img_array.shape) == 2:
            img_array = np.stack((img_array,)*3, axis=-1)

        img_array_bin = np.array(
            img_array > ocrolib.midrange(img_array), 'i')

        lineDetectH = []
        lineDetectV = []
        img_array_rr = self.remove_rular(img_array)

        textarea, img_array_rr_ta, height, width = self.detect_textarea(
            img_array_rr)
        colSeparator = int(
            width * self.parameter['colSeparator'])
        if len(textarea) > 1:
            textarea = self.crop_area(
                textarea, img_array_bin, img_array_rr_ta, colSeparator)

            if len(textarea) == 0:
                min_x, min_y, max_x, max_y = self.select_borderLine(
                    img_array_rr, lineDetectH, lineDetectV)
            else:
                min_x, min_y, max_x, max_y = textarea[0]
        elif len(textarea) == 1 and (height*width*0.5 < (abs(textarea[0][2]-textarea[0][0]) * abs(textarea[0][3]-textarea[0][1]))):
            x1, y1, x2, y2 = textarea[0]
            x1 = x1-20 if x1 > 20 else 0
            x2 = x2+20 if x2 < width-20 else width
            y1 = y1-40 if y1 > 40 else 0
            y2 = y2+40 if y2 < height-40 else height

            min_x, min_y, max_x, max_y = x1, y1, x2, y2  # use the padded box
        else:
            min_x, min_y, max_x, max_y = self.select_borderLine(
                img_array_rr, lineDetectH, lineDetectV)

        border_polygon = [[min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y]]
        border_polygon = coordinates_for_segment(border_polygon, page_image, page_xywh)
        border_points = points_from_polygon(border_polygon)
        brd = BorderType(Coords=CoordsType(border_points))
        page.set_Border(brd)

        page_image = crop_image(page_image, box=(min_x, min_y, max_x, max_y))
        page_xywh['features'] += ',cropped'

        file_id = make_file_id(input_file, self.output_file_grp)

        file_path = self.workspace.save_image_file(page_image,
                                                   file_id + '-IMG',
                                                   page_id=page_id,
                                                   file_grp=self.output_file_grp)
        page.add_AlternativeImage(AlternativeImageType(
            filename=file_path, comments=page_xywh['features']))
Example #10
 def test_create(self):
     ds = SeparatorRegionType(id='r6', Coords=CoordsType(points="0,0 0,1 1,1 1,0"))
     try:
         with self.assertLogs('ocrd_browser.model.page_xml_renderer', level='WARNING') as log_watch:
             region = self.factory.create(ds)
             raise ValueError('Dummy instead of assertNoLogs')
     except ValueError:
         pass
     self.assertEqual(len(log_watch.output), 0, '{:d} Warning(s) logged: "{:s}"'.format(len(log_watch.output), '\n'.join(log_watch.output)))
     self.assertIsInstance(region, Region)
     self.assertGreater(region.poly.area, 0)
Example #11
 def _process_words_in_line(self, result_it, line, line_xywh):
     LOG = getLogger('processor.TesserocrRecognize')
     if not result_it or result_it.Empty(RIL.WORD):
         LOG.warning("No text in line '%s'", line.id)
         return
     # iterate until IsAtFinalElement(RIL.LINE, RIL.WORD):
     word_no = 0
     while result_it and not result_it.Empty(RIL.WORD):
         word_id = '%s_word%04d' % (line.id, word_no)
         LOG.debug("Decoding text in word '%s'", word_id)
         bbox = result_it.BoundingBox(RIL.WORD)
         # convert to absolute coordinates:
         polygon = coordinates_for_segment(polygon_from_x0y0x1y1(bbox),
                                           None, line_xywh) - self.parameter['padding']
         polygon2 = polygon_for_parent(polygon, line)
         if polygon2 is not None:
             polygon = polygon2
         points = points_from_polygon(polygon)
         word = WordType(id=word_id, Coords=CoordsType(points))
         if polygon2 is None:
             # could happen due to rotation
             LOG.info('Ignoring extant word: %s', points)
         else:
             line.add_Word(word)
         # todo: determine if font attributes available for word level will work with LSTM models
         word_attributes = result_it.WordFontAttributes()
         if word_attributes:
             word_style = TextStyleType(
                 fontSize=word_attributes['pointsize']
                 if 'pointsize' in word_attributes else None,
                 fontFamily=word_attributes['font_name']
                 if 'font_name' in word_attributes else None,
                 bold=word_attributes['bold']
                 if 'bold' in word_attributes else None,
                 italic=word_attributes['italic']
                 if 'italic' in word_attributes else None,
                 underlined=word_attributes['underlined']
                 if 'underlined' in word_attributes else None,
                 monospace=word_attributes['monospace']
                 if 'monospace' in word_attributes else None,
                 serif=word_attributes['serif']
                 if 'serif' in word_attributes else None)
             word.set_TextStyle(word_style) # (or somewhere in custom attribute?)
         # add word annotation unconditionally (i.e. even for glyph level):
         word.add_TextEquiv(TextEquivType(
             Unicode=result_it.GetUTF8Text(RIL.WORD),
             conf=result_it.Confidence(RIL.WORD)/100))
         if self.parameter['textequiv_level'] != 'word':
             self._process_glyphs_in_word(result_it, word, line_xywh)
         if result_it.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
             break
         else:
             word_no += 1
             result_it.Next(RIL.WORD)
Example #12
 def _process_words_in_line(self, line, maxlevel, result_it):
     for word_no in range(
             0, MAX_ELEMENTS
     ):  # iterate until IsAtFinalElement(RIL.LINE, RIL.WORD)
         if not result_it:
             log.error("No iterator at '%s'", line.id)
             break
         if result_it.Empty(RIL.WORD):
             log.debug("No word here")
             break
         word_id = '%s_word%04d' % (line.id, word_no)
         log.debug("Recognizing text in word '%s'", word_id)
         word_bbox = result_it.BoundingBox(RIL.WORD)
         word = WordType(id=word_id,
                         Coords=CoordsType(points_from_x0y0x1y1(word_bbox)))
         line.add_Word(word)
         # todo: determine if font attributes available for word level will work with LSTM models
         word_attributes = result_it.WordFontAttributes()
         if word_attributes:
             word_style = TextStyleType(
                 fontSize=word_attributes['pointsize']
                 if 'pointsize' in word_attributes else None,
                 fontFamily=word_attributes['font_name']
                 if 'font_name' in word_attributes else None,
                 bold=None if 'bold' not in word_attributes else
                 word_attributes['bold'],
                 italic=None if 'italic' not in word_attributes else
                 word_attributes['italic'],
                 underlined=None if 'underlined' not in word_attributes else
                 word_attributes['underlined'],
                 monospace=None if 'monospace' not in word_attributes else
                 word_attributes['monospace'],
                 serif=None if 'serif' not in word_attributes else
                 word_attributes['serif'])
             word.set_TextStyle(
                 word_style)  # (or somewhere in custom attribute?)
         # add word annotation unconditionally (i.e. even for glyph level):
         word.add_TextEquiv(
             TextEquivType(Unicode=result_it.GetUTF8Text(RIL.WORD),
                           conf=result_it.Confidence(RIL.WORD) / 100))
         if maxlevel == 'word':
             pass
         else:
             self._process_glyphs_in_word(word, result_it)
         if result_it.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
             break
         else:
             result_it.Next(RIL.WORD)
Example #13
    def _process_segment(self, page_image, page, page_xywh, page_id,
                         input_file, n, mrcnn_model, class_names):

        img_array = ocrolib.pil2array(page_image)
        results = mrcnn_model.detect([img_array], verbose=1)
        r = results[0]

        page_xywh['features'] += ',blksegmented'

        for i in range(len(r['rois'])):

            width, height, _ = img_array.shape
            min_x = r['rois'][i][0]
            min_y = r['rois'][i][1]
            max_x = r['rois'][i][2]
            max_y = r['rois'][i][3]

            # small post-processing for paragraphs so the last letters are not cut off
            if (min_x - 5) > 0 and r['class_ids'][i] == 2:
                min_x -= 5
            if (max_x + 10) < width and r['class_ids'][i] == 2:
                max_x += 10

            # TODO: test this; decide whether the previous comments are still needed

            region_img = img_array[min_x:max_x, min_y:
                                   max_y]  #extract from points and img_array
            region_img = ocrolib.array2pil(region_img)
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.image_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.image_grp, n)

            file_path = self.workspace.save_image_file(region_img,
                                                       file_id + "_" + str(i),
                                                       page_id=page_id,
                                                       file_grp=self.image_grp)

            ai = AlternativeImageType(filename=file_path,
                                      comments=page_xywh['features'])
            coords = CoordsType(
                "%i,%i %i,%i %i,%i %i,%i" %
                (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y))
            textregion = TextRegionType(Coords=coords,
                                        type_=class_names[r['class_ids'][i]])
            textregion.add_AlternativeImage(ai)
            page.add_TextRegion(textregion)
Example #14
        def add_region(region: RectSegment, index: int, region_type: str):
            from ocrd_utils import coordinates_for_segment, points_from_polygon
            polygon = polygon_from_segment(region)
            polygon = coordinates_for_segment(polygon, page_image, page_coords)
            points = points_from_polygon(polygon)

            indexed_id = "region%04d" % index
            coords = CoordsType(points=points)
            if region_type == "text":
                page.add_TextRegion(
                    TextRegionType(id=indexed_id, Coords=coords))
            elif region_type == "image":
                page.add_ImageRegion(
                    ImageRegionType(id=indexed_id, Coords=coords))
            else:
                page.add_NoiseRegion(
                    NoiseRegionType(id=indexed_id, Coords=coords))
Example #15
    def process(self):
        """
        Performs the region segmentation.
        """
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            #  print(self.input_file_grp)
            for (n, input_file) in enumerate(self.input_files):
                #  print(input_file)
                pcgts = page_from_file(self.workspace.download_file(input_file))
                image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename)
                log.debug("Detecting regions with tesseract")
                tessapi.SetImage(image)
                for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
                    points, index = points_from_xywh(component[1]), component[2]

                    #
                    # the region reference in the reading order element
                    #
                    ID = "region%04d" % index
                    log.debug("Detected region '%s': %s", ID, points)
                    # <pg:ReadingOrder>
                    ro = pcgts.get_Page().get_ReadingOrder()
                    if ro is None:
                        ro = ReadingOrderType()
                        pcgts.get_Page().set_ReadingOrder(ro)
                    # <pg:OrderedGroup>
                    og = ro.get_OrderedGroup()
                    if og is None:
                        og = OrderedGroupType(id="reading-order")
                        ro.set_OrderedGroup(og)
                    # <pg:RegionRefIndexed>
                    og.add_RegionRefIndexed(RegionRefIndexedType(regionRef=ID, index=index))

                    #
                    #  text region
                    #
                    pcgts.get_Page().add_TextRegion(TextRegionType(id=ID, Coords=CoordsType(points=points)))

                ID = concat_padded(self.output_file_grp, n)
                self.workspace.add_file(
                    ID=ID,
                    file_grp=self.output_file_grp,
                    mimetype=MIMETYPE_PAGE,
                    local_filename='%s/%s' % (self.output_file_grp, ID),
                    content=to_xml(pcgts).encode('utf-8'),
                )
Example #16
 def add_region(region: Segment, index: int, type: str):
     indexed_id = "region%04d" % index
     # clockwise from top-left, serialized in the PAGE points format
     points = "%i,%i %i,%i %i,%i %i,%i" % (
         region.x_start, region.y_start,
         region.x_end, region.y_start,
         region.x_end, region.y_end,
         region.x_start, region.y_end,
     )
     coords = CoordsType(points=points)
     if type == "text":
         page.add_TextRegion(
             TextRegionType(id=indexed_id, Coords=coords))
     elif type == "image":
         page.add_ImageRegion(
             ImageRegionType(id=indexed_id, Coords=coords))
     else:
         page.add_NoiseRegion(
             NoiseRegionType(id=indexed_id, Coords=coords))
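PAGE expects points as space-separated x,y pairs; ocrd_utils ships a serializer for that, used throughout the other examples. A quick sketch of the assumed output format:

# Sketch only: assumed behavior of points_from_polygon from ocrd_utils.
from ocrd_utils import points_from_polygon

polygon = [(0, 0), (10, 0), (10, 5), (0, 5)]
assert points_from_polygon(polygon) == "0,0 10,0 10,5 0,5"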
Example #17
 def _process_glyphs_in_word(self, result_it, word, word_xywh):
     LOG = getLogger('processor.TesserocrRecognize')
     if not result_it or result_it.Empty(RIL.SYMBOL):
         LOG.debug("No glyph in word '%s'", word.id)
         return
     # iterate until IsAtFinalElement(RIL.WORD, RIL.SYMBOL):
     glyph_no = 0
     while result_it and not result_it.Empty(RIL.SYMBOL):
         glyph_id = '%s_glyph%04d' % (word.id, glyph_no)
         LOG.debug("Decoding text in glyph '%s'", glyph_id)
         #  glyph_text = result_it.GetUTF8Text(RIL.SYMBOL) # equals first choice?
         glyph_conf = result_it.Confidence(RIL.SYMBOL)/100 # equals first choice?
         #LOG.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf)
         bbox = result_it.BoundingBox(RIL.SYMBOL)
         # convert to absolute coordinates:
         polygon = coordinates_for_segment(polygon_from_x0y0x1y1(bbox),
                                           None, word_xywh) - self.parameter['padding']
         polygon2 = polygon_for_parent(polygon, word)
         if polygon2 is not None:
             polygon = polygon2
         points = points_from_polygon(polygon)
         glyph = GlyphType(id=glyph_id, Coords=CoordsType(points))
         if polygon2 is None:
             # could happen due to rotation
             LOG.info('Ignoring extant glyph: %s', points)
         else:
             word.add_Glyph(glyph)
         choice_it = result_it.GetChoiceIterator()
         for (choice_no, choice) in enumerate(choice_it):
             alternative_text = choice.GetUTF8Text()
             alternative_conf = choice.Confidence()/100
             #LOG.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf)
             if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF or
                 choice_no > CHOICE_THRESHOLD_NUM):
                 break
             # todo: consider SymbolIsSuperscript (TextStyle), SymbolIsDropcap (RelationType) etc
             glyph.add_TextEquiv(TextEquivType(index=choice_no, Unicode=alternative_text, conf=alternative_conf))
         if result_it.IsAtFinalElement(RIL.WORD, RIL.SYMBOL):
             break
         else:
             glyph_no += 1
             result_it.Next(RIL.SYMBOL)
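The choice loop above cuts the alternatives off via two module-level constants that the snippet does not show. Plausible illustrative values (hypothetical, not taken from the source module):

# Hypothetical cutoffs; the real module defines its own values.
CHOICE_THRESHOLD_NUM = 6     # keep at most this many alternatives per glyph
CHOICE_THRESHOLD_CONF = 0.2  # drop choices this far below the best confidence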
Example #18
    def process(self):
        """
        Segment with kraken
        """
        log = getLogger('processor.KrakenSegment')
        for (n, input_file) in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            downloaded_file = self.workspace.download_file(input_file)
            log.info("downloaded_file %s", downloaded_file)
            pcgts = page_from_file(downloaded_file)
            # TODO binarized variant from get_AlternativeImage()
            image_url = pcgts.get_Page().imageFilename
            log.info("pcgts %s", pcgts)

            im = self.workspace.resolve_image_as_pil(image_url)

            log.info('Segmenting')
            log.info('Params %s', self.parameter)
            res = segment(im, self.parameter['text_direction'],
                          self.parameter['scale'],
                          self.parameter['maxcolseps'],
                          self.parameter['black_colseps'])
            if self.parameter['script_detect']:
                res = detect_scripts(im, res)

            dummyRegion = TextRegionType()
            pcgts.get_Page().add_TextRegion(dummyRegion)
            #  print(res)
            for lineno, box in enumerate(res['boxes']):
                textline = TextLineType(
                    id=concat_padded("line", lineno),
                    Coords=CoordsType(points=points_from_x0y0x1y1(box)))
                dummyRegion.add_TextLine(textline)
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(self.output_file_grp,
                                    pageId=input_file.pageId,
                                    ID=ID,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename="%s/%s.xml" %
                                    (self.output_file_grp, ID),
                                    content=to_xml(pcgts).encode('utf-8'))
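For orientation, kraken's segment() returns a plain dict; the fields consumed above look roughly like this (a sketch, not kraken's authoritative schema):

# Sketch of a legacy kraken box-segmenter result as consumed above;
# 'boxes' holds one (x0, y0, x1, y1) per detected line.
res = {
    'text_direction': 'horizontal-lr',
    'boxes': [[57, 102, 510, 139],
              [57, 145, 512, 183]],
    'script_detection': False,
}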
Example #19
    def _process_page(page, page_image, page_xywh, pageId, file_id):
        settings = SegmentationSettings(debug=False,
                                        enable_preprocessing=False)
        # TODO: does this still need to be cropped or do we not need page_xywh?
        #       Same for points below
        #       page_image[page_xywh["x"]:page_xywh["w"], page_xywh["y"]:page_xywh["h"]]
        regions, classification = Segmentator(settings).segmentate_image(
            np.asarray(page_image))

        count = 0
        for region, prediction in zip(regions, classification):
            ID = "region%04d" % count
            points = points_from_polygon(region.exterior.coords[:-1])  # drop shapely's repeated closing vertex
            coords = CoordsType(points=points)
            # FIXME: these are not all types in the model, also check if they match
            if prediction == 1:
                page.add_TextRegion(TextRegionType(id=ID, Coords=coords))
            elif prediction == 2:
                page.add_ImageRegion(ImageRegionType(id=ID, Coords=coords))
            else:
                page.add_NoiseRegion(NoiseRegionType(id=ID, Coords=coords))
            count += 1
Example #20
 def _process_glyphs_in_word(self, word, result_it):
     for glyph_no in range(
             0, MAX_ELEMENTS
     ):  # iterate until IsAtFinalElement(RIL.WORD, RIL.SYMBOL)
         if not result_it:
             log.error("No iterator at '%s'", word.id)
             break
         if result_it.Empty(RIL.SYMBOL):
             log.debug("No glyph here")
             break
         glyph_id = '%s_glyph%04d' % (word.id, glyph_no)
         log.debug("Recognizing text in glyph '%s'", glyph_id)
         #  glyph_text = result_it.GetUTF8Text(RIL.SYMBOL) # equals first choice?
         glyph_conf = result_it.Confidence(
             RIL.SYMBOL) / 100  # equals first choice?
         #log.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf)
         glyph_bbox = result_it.BoundingBox(RIL.SYMBOL)
         glyph = GlyphType(id=glyph_id,
                           Coords=CoordsType(
                               points_from_x0y0x1y1(glyph_bbox)))
         word.add_Glyph(glyph)
         choice_it = result_it.GetChoiceIterator()
         for (choice_no, choice) in enumerate(choice_it):
             alternative_text = choice.GetUTF8Text()
             alternative_conf = choice.Confidence() / 100
             #log.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf)
             if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF
                     or choice_no > CHOICE_THRESHOLD_NUM):
                 break
             # todo: consider SymbolIsSuperscript (TextStyle), SymbolIsDropcap (RelationType) etc
             glyph.add_TextEquiv(
                 TextEquivType(index=choice_no,
                               Unicode=alternative_text,
                               conf=alternative_conf))
         if result_it.IsAtFinalElement(RIL.WORD, RIL.SYMBOL):
             break
         else:
             result_it.Next(RIL.SYMBOL)
Example #21
def _merge_words(prev_, next_):
    merged = WordType(id=prev_.id + '.' + next_.id)
    merged.set_Coords(
        CoordsType(points=points_from_xywh(
            xywh_from_points(prev_.get_Coords().points + ' ' +
                             next_.get_Coords().points))))
    if prev_.get_language():
        merged.set_language(prev_.get_language())
    if prev_.get_TextStyle():
        merged.set_TextStyle(prev_.get_TextStyle())
    if prev_.get_Glyph() or next_.get_Glyph():
        merged.set_Glyph(prev_.get_Glyph() + next_.get_Glyph())
    if prev_.get_TextEquiv():
        merged.set_TextEquiv(prev_.get_TextEquiv())
    else:
        merged.set_TextEquiv([TextEquivType(Unicode='', conf=1.0)])
    if next_.get_TextEquiv():
        textequiv = merged.get_TextEquiv()[0]
        textequiv2 = next_.get_TextEquiv()[0]
        textequiv.Unicode += textequiv2.Unicode
        if textequiv.conf and textequiv2.conf:
            textequiv.conf *= textequiv2.conf
    return merged
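A hypothetical round trip with the helper above (both input words are invented): the texts concatenate and, per the code above, the confidences multiply.

# Illustrative only: merging two made-up words with _merge_words.
from ocrd_models.ocrd_page import CoordsType, TextEquivType, WordType

a = WordType(id='w1_l', Coords=CoordsType(points="0,0 40,0 40,20 0,20"),
             TextEquiv=[TextEquivType(Unicode='foo', conf=0.9)])
b = WordType(id='w1_r', Coords=CoordsType(points="40,0 100,0 100,20 40,20"),
             TextEquiv=[TextEquivType(Unicode='bar', conf=0.8)])
merged = _merge_words(a, b)
assert merged.get_TextEquiv()[0].Unicode == 'foobar'
assert abs(merged.get_TextEquiv()[0].conf - 0.72) < 1e-9  # 0.9 * 0.8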
Example #22
 def process_page(self, page, page_image, page_xywh, bounds, file_id, page_id):
     """Set the identified page border, if valid."""
     LOG = getLogger('processor.TesserocrCrop')
     left, top, right, bottom = bounds
     if left >= right or top >= bottom:
         LOG.error("Cannot find valid extent for page '%s'", page_id)
         return
     padding = self.parameter['padding']
     # add padding:
     left = max(left - padding, 0)
     right = min(right + padding, page_image.width)
     top = max(top - padding, 0)
     bottom = min(bottom + padding, page_image.height)
     LOG.info("Padded page border: %i:%i,%i:%i", left, right, top, bottom)
     polygon = polygon_from_bbox(left, top, right, bottom)
     polygon = coordinates_for_segment(polygon, page_image, page_xywh)
     polygon = polygon_for_parent(polygon, page)
     if polygon is None:
         LOG.error("Ignoring extant border")
         return
     border = BorderType(Coords=CoordsType(
         points_from_polygon(polygon)))
     # intersection with parent could have changed bbox,
     # so recalculate:
     bbox = bbox_from_polygon(coordinates_of_segment(border, page_image, page_xywh))
     # update PAGE (annotate border):
     page.set_Border(border)
     # update METS (add the image file):
     page_image = crop_image(page_image, box=bbox)
     page_xywh['features'] += ',cropped'
     file_path = self.workspace.save_image_file(
         page_image, file_id + '.IMG-CROP',
         page_id=page_id, file_grp=self.output_file_grp)
     # update PAGE (reference the image file):
     page.add_AlternativeImage(AlternativeImageType(
         filename=file_path, comments=page_xywh['features']))
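The crop logic converts between bbox and polygon twice; a round-trip sketch with the ocrd_utils helpers used above (behavior assumed):

# Sketch: bbox -> polygon -> bbox with the helpers from the example.
from ocrd_utils import bbox_from_polygon, polygon_from_bbox

poly = polygon_from_bbox(10, 20, 110, 70)  # four corners, clockwise from top-left
assert bbox_from_polygon(poly) == (10, 20, 110, 70)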
Example #23
    def _process_segment(self, page_image, page, page_xywh, page_id,
                         input_file, n, mrcnn_model, class_names, mask):
        LOG = getLogger('OcrdAnybaseocrBlockSegmenter')
        # check for existing text regions and whether to overwrite them
        border = None
        if page.get_TextRegion():
            if self.parameter['overwrite']:
                LOG.info('removing existing TextRegions in page "%s"', page_id)
                page.set_TextRegion([])
            else:
                LOG.warning('keeping existing TextRegions in page "%s"',
                            page_id)
                return
        # check if border exists
        if page.get_Border():
            border_coords = page.get_Border().get_Coords()
            border_points = polygon_from_points(border_coords.get_points())
            border = Polygon(border_points)


#            page_image, page_xy = self.workspace.image_from_segment(page.get_Border(), page_image, page_xywh)

        img_array = ocrolib.pil2array(page_image)
        page_image.save('./checkthis.png')
        if len(img_array.shape) <= 2:
            img_array = np.stack((img_array, ) * 3, axis=-1)
        results = mrcnn_model.detect([img_array], verbose=1)
        r = results[0]

        th = self.parameter['th']
        # check for an existing segmentation mask
        # (this code executes only when use_deeplr is set to True in the ocrd-tool.json file)
        if mask:
            mask = ocrolib.pil2array(mask)
            mask = mask // 255
            mask = 1 - mask
            # label each bounding box area in the mask with a distinct id (i + 2)
            for i in range(len(r['rois'])):

                min_x = r['rois'][i][0]
                min_y = r['rois'][i][1]
                max_x = r['rois'][i][2]
                max_y = r['rois'][i][3]
                mask[min_x:max_x, min_y:max_y] *= i + 2
            cv2.imwrite('mask_check.png', mask * (255 / (len(r['rois']) + 2)))

            # check for left over pixels and add them to the bounding boxes
            pixel_added = True

            while pixel_added:

                pixel_added = False
                left_over = np.where(mask == 1)
                for x, y in zip(left_over[0], left_over[1]):
                    local_mask = mask[x - th:x + th, y - th:y + th]
                    candidates = np.where(local_mask > 1)
                    candidates = [k for k in zip(candidates[0], candidates[1])]
                    if len(candidates) > 0:
                        pixel_added = True
                        # find closest pixel with x>1
                        candidates.sort(key=lambda j: np.sqrt((j[0] - th)**2 +
                                                              (j[1] - th)**2))
                        index = local_mask[candidates[0]] - 2

                        # add pixel to mask/bbox
                        # x,y to bbox with index
                        if x < r['rois'][index][0]:
                            r['rois'][index][0] = x

                        elif x > r['rois'][index][2]:
                            r['rois'][index][2] = x

                        if y < r['rois'][index][1]:
                            r['rois'][index][1] = y

                        elif y > r['rois'][index][3]:
                            r['rois'][index][3] = y

                        # update the mask
                        mask[x, y] = index + 2

        # resolving overlapping problem
        bbox_dict = {}  # to check any overlapping bbox
        class_id_check = []

        for i in range(len(r['rois'])):
            min_x = r['rois'][i][0]
            min_y = r['rois'][i][1]
            max_x = r['rois'][i][2]
            max_y = r['rois'][i][3]

            region_bbox = [min_y, min_x, max_y, max_x]

            for key in bbox_dict:
                for bbox in bbox_dict[key]:

                    # checking for ymax case with vertical overlapping
                    # along with y, check both for xmax and xmin
                    if (region_bbox[3] <= bbox[3] and region_bbox[3] >= bbox[1]
                            and ((region_bbox[0] >= bbox[0]
                                  and region_bbox[0] <= bbox[2]) or
                                 (region_bbox[2] >= bbox[0]
                                  and region_bbox[2] <= bbox[2]) or
                                 (region_bbox[0] <= bbox[0]
                                  and region_bbox[2] >= bbox[2]))
                            and r['class_ids'][i] != 5):

                        r['rois'][i][2] = bbox[1] - 1

                    # checking for ymin now
                    # along with y, check both for xmax and xmin
                    if (region_bbox[1] <= bbox[3] and region_bbox[1] >= bbox[1]
                            and ((region_bbox[0] >= bbox[0]
                                  and region_bbox[0] <= bbox[2]) or
                                 (region_bbox[2] >= bbox[0]
                                  and region_bbox[2] <= bbox[2]) or
                                 (region_bbox[0] <= bbox[0]
                                  and region_bbox[2] >= bbox[2]))
                            and r['class_ids'][i] != 5):

                        r['rois'][i][0] = bbox[3] + 1

            if r['class_ids'][i] not in class_id_check:
                bbox_dict[r['class_ids'][i]] = []
                class_id_check.append(r['class_ids'][i])

            bbox_dict[r['class_ids'][i]].append(region_bbox)

        # resolving overlapping problem code

        # define reading order on basis of coordinates
        reading_order = []

        for i in range(len(r['rois'])):
            width, height, _ = img_array.shape
            min_x = r['rois'][i][0]
            min_y = r['rois'][i][1]
            max_x = r['rois'][i][2]
            max_y = r['rois'][i][3]

            if (min_y - 5) > 0 and r['class_ids'][i] == 2:
                min_y -= 5
            if (max_y + 10) < width and r['class_ids'][i] == 2:
                max_y += 10
            reading_order.append((min_y, min_x, max_y, max_x))

        reading_order = sorted(reading_order,
                               key=lambda reading_order:
                               (reading_order[1], reading_order[0]))
        for i in range(len(reading_order)):
            min_y, min_x, max_y, max_x = reading_order[i]
            min_y = 0
            i_poly = Polygon([[min_x, min_y], [max_x, min_y], [max_x, max_y],
                              [min_x, max_y]])
            for j in range(i + 1, len(reading_order)):
                min_y, min_x, max_y, max_x = reading_order[j]
                j_poly = Polygon([[min_x, min_y], [max_x, min_y],
                                  [max_x, max_y], [min_x, max_y]])
                inter = i_poly.intersection(j_poly)
                if inter:
                    reading_order.insert(j + 1, reading_order[i])
                    del reading_order[i]

        # Creating Reading Order object in PageXML
        order_group = OrderedGroupType(caption="Regions reading order",
                                       id=page_id)

        for i in range(len(r['rois'])):
            min_x = r['rois'][i][0]
            min_y = r['rois'][i][1]
            max_x = r['rois'][i][2]
            max_y = r['rois'][i][3]
            if (min_y - 5) > 0 and r['class_ids'][i] == 2:
                min_y -= 5
            if (max_y + 10) < width and r['class_ids'][i] == 2:
                max_y += 10

            region_polygon = [[min_x, min_y], [max_x, min_y], [max_x, max_y],
                              [min_x, max_y]]

            if border:
                cut_region_polygon = border.intersection(
                    Polygon(region_polygon))
                if cut_region_polygon.is_empty:
                    continue
            else:
                cut_region_polygon = Polygon(region_polygon)

            order_index = reading_order.index((min_y, min_x, max_y, max_x))
            region_id = '%s_region%04d' % (page_id, i)
            regionRefIndex = RegionRefIndexedType(index=order_index,
                                                  regionRef=region_id)
            order_group.add_RegionRefIndexed(regionRefIndex)

        reading_order_object = ReadingOrderType()
        reading_order_object.set_OrderedGroup(order_group)
        page.set_ReadingOrder(reading_order_object)

        for i in range(len(r['rois'])):
            width, height, _ = img_array.shape
            min_x = r['rois'][i][0]
            min_y = r['rois'][i][1]
            max_x = r['rois'][i][2]
            max_y = r['rois'][i][3]

            if (min_y - 5) > 0 and r['class_ids'][i] == 2:
                min_y -= 5
            if (max_y + 10) < width and r['class_ids'][i] == 2:
                max_y += 10

            # one change here to resolve flipped coordinates
            region_polygon = [[min_y, min_x], [max_y, min_x], [max_y, max_x],
                              [min_y, max_x]]

            cut_region_polygon = border.intersection(Polygon(region_polygon))

            if cut_region_polygon.is_empty:
                continue
            cut_region_polygon = [
                j for j in zip(list(cut_region_polygon.exterior.coords.xy[0]),
                               list(cut_region_polygon.exterior.coords.xy[1]))
            ][:-1]

            # checking whether coordinates are flipped

            region_polygon = coordinates_for_segment(cut_region_polygon,
                                                     page_image, page_xywh)
            region_points = points_from_polygon(region_polygon)

            read_order = reading_order.index((min_y, min_x, max_y, max_x))

            # TODO: test this; decide whether the previous comments are still needed
            # resolving overlapping problem

            region_img = img_array[min_x:max_x, min_y:
                                   max_y]  # extract from points and img_array

            region_img = ocrolib.array2pil(region_img)

            file_id = make_file_id(input_file, self.output_file_grp)
            file_path = self.workspace.save_image_file(
                region_img,
                file_id + "_" + str(i),
                page_id=page_id,
                file_grp=self.output_file_grp)

            # ai = AlternativeImageType(filename=file_path, comments=page_xywh['features'])
            region_id = '%s_region%04d' % (page_id, i)
            coords = CoordsType(region_points)

            # in case of an ImageRegion
            if r['class_ids'][i] == 15:
                image_region = ImageRegionType(
                    custom='readingOrder {index:' + str(read_order) + ';}',
                    id=region_id,
                    Coords=coords,
                    type_=class_names[r['class_ids'][i]])
                # image_region.add_AlternativeImage(ai)
                page.add_ImageRegion(image_region)
                continue
            if r['class_ids'][i] == 16:
                table_region = TableRegionType(
                    custom='readingOrder {index:' + str(read_order) + ';}',
                    id=region_id,
                    Coords=coords,
                    type_=class_names[r['class_ids'][i]])
                # table_region.add_AlternativeImage(ai)
                page.add_TableRegion(table_region)
                continue
            if r['class_ids'][i] == 17:
                graphic_region = GraphicRegionType(
                    custom='readingOrder {index:' + str(read_order) + ';}',
                    id=region_id,
                    Coords=coords,
                    type_=class_names[r['class_ids'][i]])
                # graphic_region.add_AlternativeImage(ai)
                page.add_GraphicRegion(graphic_region)
                continue

            textregion = TextRegionType(custom='readingOrder {index:' +
                                        str(read_order) + ';}',
                                        id=region_id,
                                        Coords=coords,
                                        type_=class_names[r['class_ids'][i]])
            # textregion.add_AlternativeImage(ai)

            #border = page.get_Border()
            # if border:
            #    border.add_TextRegion(textregion)
            # else:
            page.add_TextRegion(textregion)
Example #24
    def process(self):
        """
        Performs the recognition.
        """

        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        self._init_calamari()

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            log.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))

            page = pcgts.get_Page()
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id)

            for region in pcgts.get_Page().get_TextRegion():
                region_image, region_xywh = self.workspace.image_from_segment(
                    region, page_image, page_xywh)

                textlines = region.get_TextLine()
                log.info("About to recognize %i lines of region '%s'",
                         len(textlines), region.id)
                for (line_no, line) in enumerate(textlines):
                    log.debug("Recognizing line '%s' in region '%s'", line.id,
                              region.id)

                    line_image, line_coords = self.workspace.image_from_segment(
                        line, region_image, region_xywh)
                    line_image_np = np.array(line_image, dtype=np.uint8)

                    raw_results = list(
                        self.predictor.predict_raw([line_image_np],
                                                   progress_bar=False))[0]
                    for i, p in enumerate(raw_results):
                        p.prediction.id = "fold_{}".format(i)

                    prediction = self.voter.vote_prediction_result(raw_results)
                    prediction.id = "voted"

                    # Build line text on our own
                    #
                    # Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
                    # on prediction.positions. Do it on our own to have consistency.
                    #
                    # XXX Check Calamari's built-in post-processing on prediction.sentence

                    def _sort_chars(p):
                        """Filter and sort chars of prediction p"""
                        chars = p.chars
                        chars = [
                            c for c in chars if c.char
                        ]  # XXX Note that omission probabilities are not normalized?!
                        chars = [
                            c for c in chars if c.probability >=
                            self.parameter['glyph_conf_cutoff']
                        ]
                        chars = sorted(chars,
                                       key=lambda k: k.probability,
                                       reverse=True)
                        return chars

                    def _drop_leading_spaces(positions):
                        return list(
                            itertools.dropwhile(
                                lambda p: _sort_chars(p)[0].char == " ",
                                positions))

                    def _drop_trailing_spaces(positions):
                        return list(
                            reversed(_drop_leading_spaces(
                                reversed(positions))))

                    def _drop_double_spaces(positions):
                        def _drop_double_spaces_generator(positions):
                            last_was_space = False
                            for p in positions:
                                if p.chars[0].char == " ":
                                    if not last_was_space:
                                        yield p
                                    last_was_space = True
                                else:
                                    yield p
                                    last_was_space = False

                        return list(_drop_double_spaces_generator(positions))

                    positions = prediction.positions
                    positions = _drop_leading_spaces(positions)
                    positions = _drop_trailing_spaces(positions)
                    positions = _drop_double_spaces(positions)
                    positions = list(positions)

                    line_text = ''.join(
                        _sort_chars(p)[0].char for p in positions)
                    if line_text != prediction.sentence:
                        log.warning(
                            "Our own line text is not the same as Calamari's: '%s' != '%s'",
                            line_text, prediction.sentence)

                    # Delete existing results
                    if line.get_TextEquiv():
                        log.warning("Line '%s' already contained text results",
                                    line.id)
                    line.set_TextEquiv([])
                    if line.get_Word():
                        log.warning(
                            "Line '%s' already contained word segmentation",
                            line.id)
                    line.set_Word([])

                    # Save line results
                    line_conf = prediction.avg_char_probability
                    line.set_TextEquiv(
                        [TextEquivType(Unicode=line_text, conf=line_conf)])

                    # Save word results
                    #
                    # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
                    # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
                    # hierarchy of lines > words > glyphs.

                    def _words(s):
                        """Split words based on spaces and include spaces as 'words'"""
                        spaces = None
                        word = ''
                        for c in s:
                            if c == ' ' and spaces is True:
                                word += c
                            elif c != ' ' and spaces is False:
                                word += c
                            else:
                                if word:
                                    yield word
                                word = c
                                spaces = (c == ' ')
                        yield word

                    if self.parameter['textequiv_level'] in ['word', 'glyph']:
                        word_no = 0
                        i = 0

                        for word_text in _words(line_text):
                            word_length = len(word_text)
                            if not all(c == ' ' for c in word_text):
                                word_positions = positions[i:i + word_length]
                                word_start = word_positions[0].global_start
                                word_end = word_positions[-1].global_end

                                polygon = polygon_from_x0y0x1y1([
                                    word_start, 0, word_end, line_image.height
                                ])
                                points = points_from_polygon(
                                    coordinates_for_segment(
                                        polygon, None, line_coords))
                                # XXX Crop to line polygon?

                                word = WordType(id='%s_word%04d' %
                                                (line.id, word_no),
                                                Coords=CoordsType(points))
                                word.add_TextEquiv(
                                    TextEquivType(Unicode=word_text))

                                if self.parameter[
                                        'textequiv_level'] == 'glyph':
                                    for glyph_no, p in enumerate(
                                            word_positions):
                                        glyph_start = p.global_start
                                        glyph_end = p.global_end

                                        polygon = polygon_from_x0y0x1y1([
                                            glyph_start, 0, glyph_end,
                                            line_image.height
                                        ])
                                        points = points_from_polygon(
                                            coordinates_for_segment(
                                                polygon, None, line_coords))

                                        glyph = GlyphType(
                                            id='%s_glyph%04d' %
                                            (word.id, glyph_no),
                                            Coords=CoordsType(points))

                                        # Add predictions (= TextEquivs)
                                        char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                        for char_index, char in enumerate(
                                                _sort_chars(p),
                                                start=char_index_start):
                                            glyph.add_TextEquiv(
                                                TextEquivType(
                                                    Unicode=char.char,
                                                    index=char_index,
                                                    conf=char.probability))

                                        word.add_Glyph(glyph)

                                line.add_Word(word)
                                word_no += 1

                            i += word_length

            _page_update_higher_textequiv_levels('line', pcgts)

            # Add metadata about this operation and its runtime parameters:
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
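The word and glyph regions above are serialized by turning an x-interval of the
line image into a rectangle polygon and then into a PAGE-XML points string. A
minimal self-contained sketch of that conversion (re-implementing the two
ocrd_utils helpers locally, so the exact shapes here are assumptions):

def polygon_from_x0y0x1y1(bbox):
    # rectangle polygon from a bounding box, clockwise from the top left
    x0, y0, x1, y1 = bbox
    return [[x0, y0], [x1, y0], [x1, y1], [x0, y1]]

def points_from_polygon(polygon):
    # PAGE-XML serializes polygons as "x,y x,y ..."
    return ' '.join('%i,%i' % (x, y) for x, y in polygon)

# a word spanning x=10..42 in a line image 57 px high:
print(points_from_polygon(polygon_from_x0y0x1y1([10, 0, 42, 57])))
# -> 10,0 42,0 42,57 10,57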
Example #25
    def process(self):
        """Performs word segmentation with Tesseract on the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the textline level,
        and remove any existing Word elements (if ``overwrite_words``
        is set).
        
        Set up Tesseract to detect words, and add each one to the line
        at the detected coordinates.
        
        Produce a new output file by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.TesserocrSegmentWord')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        overwrite_words = self.parameter['overwrite_words']

        with PyTessBaseAPI(
            psm=PSM.SINGLE_LINE,
            path=TESSDATA_PREFIX
        ) as tessapi:
            for (n, input_file) in enumerate(self.input_files):
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(self.workspace.download_file(input_file))
                self.add_metadata(pcgts)
                page = pcgts.get_Page()
                
                page_image, page_coords, page_image_info = self.workspace.image_from_page(
                    page, page_id)
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        dpi = round(dpi * 2.54)
                    LOG.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi)
                else:
                    dpi = 0
                    LOG.info("Page '%s' images will use DPI estimated from segmentation", page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))
                
                for region in page.get_TextRegion():
                    region_image, region_coords = self.workspace.image_from_segment(
                        region, page_image, page_coords)
                    for line in region.get_TextLine():
                        if line.get_Word():
                            if overwrite_words:
                                LOG.info('removing existing Words in line "%s"', line.id)
                                line.set_Word([])
                            else:
                                LOG.warning('keeping existing Words in line "%s"', line.id)
                        LOG.debug("Detecting words in line '%s'", line.id)
                        line_image, line_coords = self.workspace.image_from_segment(
                            line, region_image, region_coords)
                        tessapi.SetImage(line_image)
                        for word_no, component in enumerate(tessapi.GetComponentImages(RIL.WORD, True, raw_image=True)):
                            word_id = '%s_word%04d' % (line.id, word_no)
                            word_polygon = polygon_from_xywh(component[1])
                            word_polygon = coordinates_for_segment(word_polygon, line_image, line_coords)
                            word_polygon2 = polygon_for_parent(word_polygon, line)
                            if word_polygon2 is not None:
                                word_polygon = word_polygon2
                            word_points = points_from_polygon(word_polygon)
                            if word_polygon2 is None:
                                # could happen due to rotation
                                LOG.info('Ignoring extant word: %s', word_points)
                                continue
                            line.add_Word(WordType(
                                id=word_id, Coords=CoordsType(word_points)))
                            
                file_id = make_file_id(input_file, self.output_file_grp)
                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(
                    ID=file_id,
                    file_grp=self.output_file_grp,
                    pageId=input_file.pageId,
                    mimetype=MIMETYPE_PAGE,
                    local_filename=os.path.join(self.output_file_grp,
                                                file_id + '.xml'),
                    content=to_xml(pcgts))
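The DPI plumbing in Example #25 prefers an explicit parameter override, then
image metadata (converting a pixels-per-centimetre resolution to DPI), and
otherwise lets Tesseract estimate. A sketch of just that decision
(``effective_dpi`` is a hypothetical helper, not part of ocrd_tesserocr):

def effective_dpi(param_dpi, resolution, resolution_unit):
    if param_dpi > 0:
        return param_dpi                     # parameter override wins
    if resolution != 1:                      # 1 means "unknown" in image metadata
        if resolution_unit == 'cm':
            return round(resolution * 2.54)  # px/cm -> px/inch
        return resolution
    return 0                                 # 0: let Tesseract estimate

assert effective_dpi(0, 118, 'cm') == 300    # 118 px/cm is roughly 300 DPI
assert effective_dpi(400, 118, 'cm') == 400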
Example #26
def convert(cocofile, directory):
    """Convert MS-COCO JSON to METS/PAGE XML files.
    
    Load JSON ``cocofile`` (in MS-COCO format)
    and chdir to ``directory`` (which its file paths refer to).
    
    Start a METS file mets.xml with references to
    the image files (under fileGrp ``OCR-D-IMG``)
    and their corresponding PAGE-XML annotations
    (under fileGrp ``OCR-D-GT-SEG-BLOCK``), as
    parsed from ``cocofile`` and written using
    the same basename.
    """
    resolver = Resolver()
    with pushd_popd(directory):
        workspace = resolver.workspace_from_nothing('.')
        # https://github.com/ibm-aur-nlp/PubLayNet
        workspace.mets.unique_identifier = 'ocrd_PubLayNet_' + directory
        coco = json.load(cocofile)
        LOG.info('Loaded JSON for %d images with %d regions in %d categories',
                 len(coco['images']), len(coco['annotations']),
                 len(coco['categories']))
        categories = dict()
        for cat in coco['categories']:
            categories[cat['id']] = cat['name']
        images = dict()
        for image in coco['images']:
            images[image['id']] = image
        for annotation in coco['annotations']:
            image = images[annotation['image_id']]
            regions = image.setdefault('regions', list())
            regions.append(annotation)
        del coco
        LOG.info('Parsing annotations into PAGE-XML')
        for image in images.values():
            page_id = 'p' + str(image['id'])
            file_base, file_ext = os.path.splitext(image['file_name'])
            filename = file_base + '.xml'
            image_file = workspace.add_file('OCR-D-IMG',
                                            ID='OCR-D-IMG_' + page_id,
                                            pageId=page_id,
                                            mimetype=EXT_TO_MIME[file_ext],
                                            local_filename=image['file_name'])
            LOG.info('Added page %s file %s of type %s', image_file.pageId,
                     image_file.local_filename, image_file.mimetype)
            pcgts = page_from_image(image_file)
            pcgts.set_pcGtsId(page_id)
            page = pcgts.get_Page()
            assert page.imageWidth == image['width']
            assert page.imageHeight == image['height']
            for region in image['regions']:
                polygon = np.array(region['segmentation'])
                polygon = np.reshape(polygon, (polygon.shape[1] // 2, 2))
                coords = CoordsType(points=points_from_polygon(polygon))
                category = categories[region['category_id']]
                region_id = 'r' + str(region['id'])
                if category == 'text':
                    region_obj = TextRegionType(
                        id=region_id,
                        Coords=coords,
                        type_=TextTypeSimpleType.PARAGRAPH)
                    page.add_TextRegion(region_obj)
                elif category == 'title':
                    region_obj = TextRegionType(
                        id=region_id,
                        Coords=coords,
                        type_=TextTypeSimpleType.HEADING)  # CAPTION?
                    page.add_TextRegion(region_obj)
                elif category == 'list':
                    region_obj = TextRegionType(
                        id=region_id,
                        Coords=coords,
                        type_=TextTypeSimpleType.LISTLABEL)  # OTHER?
                    page.add_TextRegion(region_obj)
                elif category == 'table':
                    region_obj = TableRegionType(id=region_id, Coords=coords)
                    page.add_TableRegion(region_obj)
                elif category == 'figure':
                    region_obj = ImageRegionType(id=region_id, Coords=coords)
                    page.add_ImageRegion(region_obj)
                else:
                    raise Exception('unknown image category: %s' % category)
            page_file = workspace.add_file('OCR-D-GT-SEG-BLOCK',
                                           ID='OCR-D-GT-SEG-BLOCK_' + page_id,
                                           pageId=page_id,
                                           mimetype=MIMETYPE_PAGE,
                                           local_filename=filename,
                                           content=to_xml(pcgts))
            LOG.info('Added page %s file %s with %d regions', page_file.pageId,
                     page_file.local_filename, len(image['regions']))
        LOG.info('All done')
        workspace.save_mets()
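Example #26 depends on COCO's flat segmentation encoding: each region polygon
arrives as a single [x0, y0, x1, y1, ...] list and is reshaped into an (N, 2)
point array before serialization. A minimal demonstration of that step:

import numpy as np

seg = [[10, 20, 110, 20, 110, 80, 10, 80]]   # one flat 4-point polygon
poly = np.array(seg)                         # shape (1, 8)
poly = np.reshape(poly, (poly.shape[1] // 2, 2))
assert poly.tolist() == [[10, 20], [110, 20], [110, 80], [10, 80]]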
Example #27
    def process(self):
        for (n, input_file) in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))
            fname = pcgts.get_Page().imageFilename
            img = self.workspace.resolve_image_as_pil(fname)
            #fname = str(fname)
            print("Process file: ", fname)
            base, _ = ocrolib.allsplitext(fname)

            img_array = ocrolib.pil2array(img)
            img_array_bin = np.array(img_array > ocrolib.midrange(img_array),
                                     'i')

            lineDetectH = []
            lineDetectV = []
            img_array_rr = self.remove_rular(img_array)

            textarea, img_array_rr_ta, height, width = self.detect_textarea(
                img_array_rr)
            # NOTE: this rescales the parameter in place, so the scaling
            # compounds across multiple input files; a local variable
            # would be safer here.
            self.parameter['colSeparator'] = int(
                width * self.parameter['colSeparator'])

            if len(textarea) > 1:
                textarea = self.crop_area(textarea, img_array_bin,
                                          img_array_rr_ta)

                if len(textarea) == 0:
                    min_x, min_y, max_x, max_y = self.select_borderLine(
                        img_array_rr, lineDetectH, lineDetectV)
                else:
                    min_x, min_y, max_x, max_y = textarea[0]
            elif len(textarea) == 1 and (
                    height * width * 0.5 <
                (abs(textarea[0][2] - textarea[0][0]) *
                 abs(textarea[0][3] - textarea[0][1]))):
                x1, y1, x2, y2 = textarea[0]
                x1 = x1 - 20 if x1 > 20 else 0
                x2 = x2 + 20 if x2 < width - 20 else width
                y1 = y1 - 40 if y1 > 40 else 0
                y2 = y2 + 40 if y2 < height - 40 else height

                #self.save_pf(base, [x1, y1, x2, y2])
                min_x, min_y, max_x, max_y = textarea[0]
            else:
                min_x, min_y, max_x, max_y = self.select_borderLine(
                    img_array_rr, lineDetectH, lineDetectV)

            brd = BorderType(Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" %
                                               (min_x, min_y, max_x, min_y,
                                                max_x, max_y, min_x, max_y)))
            pcgts.get_Page().set_Border(brd)

            # Use input_file's basename for the new file -
            # this way the files retain the same basenames:
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
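The Border element above is just the detected page bounding box serialized
clockwise as a PAGE-XML points string; factored out (``border_points`` is a
hypothetical helper, not part of the processor):

def border_points(min_x, min_y, max_x, max_y):
    return "%i,%i %i,%i %i,%i %i,%i" % (
        min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y)

assert border_points(0, 0, 100, 50) == "0,0 100,0 100,50 0,50"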
Example #28
    def process_lines(self, textlines, predfiles, fgrp, regionid):

        for line in textlines:

            for file in predfiles:
                if file == '-'.join([fgrp, regionid, line.id]):
                    self.log.info("Processing text in line '%s'", line.id)

                    filepath = self.root + '/' + file + '.json'
                    with open(filepath) as f:
                        data = json.load(f)

                        linepred = data['predictions'][0]['sentence']
                        line_conf = []
                        line_pos = []

                        w = ''
                        word_conf = []
                        words = []
                        word_pos = []

                        positions = data['predictions'][0]['positions']
                        for i, d in enumerate(positions):
                            char = d['chars'][0]['char']
                            char_conf = d['chars'][0]['probability']
                            char_pos = (d['globalStart'], d['globalEnd'])

                            if char == ' ':
                                words.append(w)
                                w = ''
                                line_conf.append(word_conf)
                                word_conf = []
                                line_pos.append(word_pos)
                                word_pos = []
                            else:
                                w += char
                                word_conf.append(char_conf)
                                word_pos.append(char_pos)
                                if i == len(positions) - 1:
                                    words.append(w)
                                    line_conf.append(word_conf)
                                    line_pos.append(word_pos)

                        wconfs = [(min(conf) + max(conf)) / 2
                                  for conf in line_conf]
                        lineconf = (min(wconfs) + max(wconfs)) / 2

                        line.replace_TextEquiv_at(
                            0,
                            TextEquivType(Unicode=linepred,
                                          conf=str(lineconf)))

                        if self.maxlevel in ('word', 'glyph'):
                            box = bounding_box(line.get_Coords().points)
                            line.Word = []
                            for w_no, w in enumerate(words):

                                # Coords of word
                                wordbounding = (line_pos[w_no][0][0],
                                                line_pos[w_no][-1][-1])
                                word_bbox = [
                                    box[0] + wordbounding[0], box[1],
                                    box[2] + wordbounding[1], box[3]
                                ]
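                                # NOTE: the character offsets are relative to the
                                # line's left edge (box[0]); adding the end offset
                                # to box[2] (the line's right edge) likely
                                # overestimates the word width.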

                                word_id = '%s_word%04d' % (line.id, w_no)
                                word = WordType(
                                    id=word_id,
                                    Coords=CoordsType(
                                        points_from_x0y0x1y1(word_bbox)))

                                line.add_Word(word)
                                word.add_TextEquiv(
                                    TextEquivType(Unicode=w,
                                                  conf=str(wconfs[w_no])))

                                if self.maxlevel == 'glyph':
                                    for glyph_no, g in enumerate(w):
                                        glyphbounding = (
                                            line_pos[w_no][glyph_no][0],
                                            line_pos[w_no][glyph_no][-1])
                                        glyph_bbox = [
                                            box[0] + glyphbounding[0], box[1],
                                            box[2] + glyphbounding[1], box[3]
                                        ]

                                        glyph_id = '%s_glyph%04d' % (word.id,
                                                                     glyph_no)
                                        glyph = GlyphType(
                                            id=glyph_id,
                                            Coords=CoordsType(
                                                points_from_x0y0x1y1(
                                                    glyph_bbox)))

                                        word.add_Glyph(glyph)
                                        glyph.add_TextEquiv(
                                            TextEquivType(
                                                Unicode=g,
                                                conf=str(line_conf[w_no]
                                                         [glyph_no])))
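Example #28 aggregates character confidences into word and line confidences
using the mid-range (mean of minimum and maximum), not the arithmetic mean.
The aggregation in isolation:

def midrange(values):
    return (min(values) + max(values)) / 2

word_confs = [midrange([0.80, 0.90, 0.95]),  # word 1: 0.875
              midrange([0.60, 0.99])]        # word 2: 0.795
line_conf = midrange(word_confs)             # (0.795 + 0.875) / 2 == 0.835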
Example #29
    def process(self):
        """
        Perform text recognition with Calamari on the workspace.

        If ``textequiv_level`` is ``word`` or ``glyph``, then additionally create word / glyph level segments by
        splitting at white space characters / glyph boundaries. In the case of ``glyph``, add all alternative character
        hypotheses down to the ``glyph_conf_cutoff`` confidence threshold.
        """
        log = getLogger('processor.CalamariRecognize')

        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            log.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))

            page = pcgts.get_Page()
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_selector=self.features)

            for region in page.get_AllRegions(classes=['Text']):
                region_image, region_coords = self.workspace.image_from_segment(
                    region,
                    page_image,
                    page_coords,
                    feature_selector=self.features)

                textlines = region.get_TextLine()
                log.info("About to recognize %i lines of region '%s'",
                         len(textlines), region.id)
                line_images_np = []
                line_coordss = []
                for line in textlines:
                    log.debug("Recognizing line '%s' in region '%s'", line.id,
                              region.id)

                    line_image, line_coords = self.workspace.image_from_segment(
                        line,
                        region_image,
                        region_coords,
                        feature_selector=self.features)
                    if ('binarized' not in line_coords['features']
                            and 'grayscale_normalized'
                            not in line_coords['features']
                            and self.network_input_channels == 1):
                        # We cannot use a feature selector for this since we don't
                        # know whether the model expects (has been trained on)
                        # binarized or grayscale images; but raw images are likely
                        # always inadequate:
                        log.warning(
                            "Using raw image for line '%s' in region '%s'",
                            line.id, region.id)

                    line_image = line_image if all(line_image.size) else [[0]]
                    line_image_np = np.array(line_image, dtype=np.uint8)
                    line_images_np.append(line_image_np)
                    line_coordss.append(line_coords)
                raw_results_all = self.predictor.predict_raw(
                    line_images_np, progress_bar=False)

                for line, line_coords, raw_results in zip(
                        textlines, line_coordss, raw_results_all):

                    for i, p in enumerate(raw_results):
                        p.prediction.id = "fold_{}".format(i)

                    prediction = self.voter.vote_prediction_result(raw_results)
                    prediction.id = "voted"

                    # Build line text on our own
                    #
                    # Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
                    # on prediction.positions. Do it on our own to have consistency.
                    #
                    # XXX Check Calamari's built-in post-processing on prediction.sentence

                    def _sort_chars(p):
                        """Filter and sort chars of prediction p"""
                        chars = p.chars
                        chars = [
                            c for c in chars if c.char
                        ]  # XXX Note that omission probabilities are not normalized?!
                        chars = [
                            c for c in chars if c.probability >=
                            self.parameter['glyph_conf_cutoff']
                        ]
                        chars = sorted(chars,
                                       key=lambda k: k.probability,
                                       reverse=True)
                        return chars
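                    # so _sort_chars(p)[0] is the most probable surviving hypothesis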

                    def _drop_leading_spaces(positions):
                        return list(
                            itertools.dropwhile(
                                lambda p: _sort_chars(p)[0].char == " ",
                                positions))

                    def _drop_trailing_spaces(positions):
                        return list(
                            reversed(_drop_leading_spaces(
                                reversed(positions))))

                    def _drop_double_spaces(positions):
                        def _drop_double_spaces_generator(positions):
                            last_was_space = False
                            for p in positions:
                                if p.chars[0].char == " ":
                                    if not last_was_space:
                                        yield p
                                    last_was_space = True
                                else:
                                    yield p
                                    last_was_space = False

                        return list(_drop_double_spaces_generator(positions))
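                    # Taken together, the three helpers above trim and collapse
                    # whitespace: positions whose top hypotheses spell " a  b "
                    # are reduced to the positions spelling "a b".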

                    positions = prediction.positions
                    positions = _drop_leading_spaces(positions)
                    positions = _drop_trailing_spaces(positions)
                    positions = _drop_double_spaces(positions)
                    positions = list(positions)

                    line_text = ''.join(
                        _sort_chars(p)[0].char for p in positions)
                    if line_text != prediction.sentence:
                        log.warning(
                            "Our own line text is not the same as Calamari's: '%s' != '%s'",
                            line_text, prediction.sentence)

                    # Delete existing results
                    if line.get_TextEquiv():
                        log.warning("Line '%s' already contained text results",
                                    line.id)
                    line.set_TextEquiv([])
                    if line.get_Word():
                        log.warning(
                            "Line '%s' already contained word segmentation",
                            line.id)
                    line.set_Word([])

                    # Save line results
                    line_conf = prediction.avg_char_probability
                    line.set_TextEquiv(
                        [TextEquivType(Unicode=line_text, conf=line_conf)])

                    # Save word results
                    #
                    # Calamari OCR does not provide word positions, so we infer them
                    # from (a) the text segmentation and (b) the glyph positions. This
                    # is necessary because the PAGE XML format enforces a strict
                    # hierarchy of lines > words > glyphs.

                    def _words(s):
                        """Split words based on spaces and include spaces as 'words'"""
                        spaces = None
                        word = ''
                        for c in s:
                            if c == ' ' and spaces is True:
                                word += c
                            elif c != ' ' and spaces is False:
                                word += c
                            else:
                                if word:
                                    yield word
                                word = c
                                spaces = (c == ' ')
                        yield word
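                    # e.g. list(_words('ab  cd')) yields ['ab', '  ', 'cd']: runs of
                    # spaces are kept as pseudo-words so that the running index i
                    # below stays aligned with the glyph positions.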

                    if self.parameter['textequiv_level'] in ['word', 'glyph']:
                        word_no = 0
                        i = 0

                        for word_text in _words(line_text):
                            word_length = len(word_text)
                            if not all(c == ' ' for c in word_text):
                                word_positions = positions[i:i + word_length]
                                word_start = word_positions[0].global_start
                                word_end = word_positions[-1].global_end

                                polygon = polygon_from_x0y0x1y1([
                                    word_start, 0, word_end, line_image.height
                                ])
                                points = points_from_polygon(
                                    coordinates_for_segment(
                                        polygon, None, line_coords))
                                # XXX Crop to line polygon?

                                word = WordType(id='%s_word%04d' %
                                                (line.id, word_no),
                                                Coords=CoordsType(points))
                                word.add_TextEquiv(
                                    TextEquivType(Unicode=word_text))

                                if self.parameter[
                                        'textequiv_level'] == 'glyph':
                                    for glyph_no, p in enumerate(
                                            word_positions):
                                        glyph_start = p.global_start
                                        glyph_end = p.global_end

                                        polygon = polygon_from_x0y0x1y1([
                                            glyph_start, 0, glyph_end,
                                            line_image.height
                                        ])
                                        points = points_from_polygon(
                                            coordinates_for_segment(
                                                polygon, None, line_coords))

                                        glyph = GlyphType(
                                            id='%s_glyph%04d' %
                                            (word.id, glyph_no),
                                            Coords=CoordsType(points))

                                        # Add predictions (= TextEquivs)
                                        char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                        for char_index, char in enumerate(
                                                _sort_chars(p),
                                                start=char_index_start):
                                            glyph.add_TextEquiv(
                                                TextEquivType(
                                                    Unicode=char.char,
                                                    index=char_index,
                                                    conf=char.probability))

                                        word.add_Glyph(glyph)

                                line.add_Word(word)
                                word_no += 1

                            i += word_length

            _page_update_higher_textequiv_levels('line', pcgts)

            # Add metadata about this operation and its runtime parameters:
            self.add_metadata(pcgts)
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
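At the glyph level, Example #29 stores every surviving character hypothesis as
an indexed TextEquiv; PAGE requires these indices to start at 1 (see
https://ocr-d.github.io/page#multiple-textequivs). Stripped of the PAGE API,
the enumeration amounts to:

hypotheses = [('e', 0.81), ('c', 0.12)]      # sorted by descending probability
for index, (char, prob) in enumerate(hypotheses, start=1):
    print('TextEquiv index=%d Unicode=%r conf=%.2f' % (index, char, prob))
# TextEquiv index=1 Unicode='e' conf=0.81
# TextEquiv index=2 Unicode='c' conf=0.12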
Example #30
    def _process_page(self, page, page_image, page_xywh, input_file, zoom=1.0):
        padding = self.parameter['padding']
        img_array = pil2array(page_image)
        # ensure RGB image
        if len(img_array.shape) == 2:
            img_array = np.stack((img_array, ) * 3, axis=-1)
        height, width, _ = img_array.shape
        size = height * width
        # zoom to 300 DPI (larger density: faster; most fixed parameters here expect 300)
        if zoom != 1.0:
            self.logger.info("scaling %dx%d image by %.2f", width, height,
                             zoom)
            img_array = cv2.resize(img_array,
                                   None,
                                   fx=zoom,
                                   fy=zoom,
                                   interpolation=cv2.INTER_CUBIC)

        # detect rule placed in image next to page for scale reference:
        mask_array, mask_box = self.detect_ruler(img_array)
        # detect page frame via line segment detector:
        border_polygon, prefer_border = self.select_borderLine(
            img_array, mask_box)
        border_polygon = np.array(border_polygon) / zoom  # unzoom
        # pad inwards:
        border_polygon = Polygon(border_polygon).buffer(
            -padding).exterior.coords[:-1]
        # get the bounding box from the border polygon:
        # min_x, min_y = border_polygon.min(axis=0)
        # max_x, max_y = border_polygon.max(axis=0)
        # get the inner rectangle from the border polygon:
        # _, min_x, max_x, _ = np.sort(border_polygon[:,0])
        # _, min_y, max_y, _ = np.sort(border_polygon[:,1])
        if prefer_border:
            self.logger.info("Preferring line detector")
        else:
            self.logger.info("Falling back to text detector")
            textboxes = self.detect_textboxes(img_array, mask_array)
            if len(textboxes) > 1:
                textboxes = self.merge_boxes(textboxes, img_array)
            textboxes = np.array(textboxes) / zoom  # unzoom

            if (len(textboxes) == 1 and self.parameter['columnAreaMin'] * size
                    < self.get_area(textboxes[0])):
                self.logger.info("Using text area (%d%% area)",
                                 100 * self.get_area(textboxes[0]) / size)
                min_x, min_y, max_x, max_y = textboxes[0]
                # pad outwards
                border_polygon = polygon_from_bbox(min_x - padding,
                                                   min_y - padding,
                                                   max_x + padding,
                                                   max_y + padding)

        def clip(point):
            x, y = point
            x = max(0, min(page_image.width, x))
            y = max(0, min(page_image.height, y))
            return x, y

        border_polygon = coordinates_for_segment(border_polygon, page_image,
                                                 page_xywh)
        border_polygon = list(map(clip, border_polygon))
        border_points = points_from_polygon(border_polygon)
        border = BorderType(Coords=CoordsType(border_points))
        page.set_Border(border)
        # get clipped relative coordinates for current image
        page_image, page_xywh, _ = self.workspace.image_from_page(
            page, input_file.pageId, fill='background', transparency=True)
        file_id = make_file_id(input_file, self.output_file_grp)
        file_path = self.workspace.save_image_file(
            page_image,
            file_id + '.IMG-CROP',
            page_id=input_file.pageId,
            file_grp=self.output_file_grp)
        page.add_AlternativeImage(
            AlternativeImageType(filename=file_path,
                                 comments=page_xywh['features']))
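Example #30 pads the detected border inwards by buffering the polygon with a
negative distance, which for a convex border erodes every edge by ``padding``
pixels. A standalone illustration with shapely:

from shapely.geometry import Polygon

border = Polygon([(0, 0), (100, 0), (100, 50), (0, 50)])
inner = border.buffer(-5).exterior.coords[:-1]  # drop the repeated closing point
print(list(inner))  # rectangle eroded to (5, 5)..(95, 45)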