def test_alternative_image_additions():
    """Attach an AlternativeImage at Page, TextRegion, TextLine, Word and Glyph level.

    The test only checks that building the hierarchy and calling
    ``add_AlternativeImage`` on each level does not raise.
    """
    document = PcGtsType(pcGtsId="foo")
    assert document.pcGtsId == 'foo'

    # act
    # the page itself is hooked into the document and gets the first image
    root = PageType()
    document.set_Page(root)
    root.add_AlternativeImage(AlternativeImageType())

    # every further level: create the child, register it with its parent via
    # the parent's adder method, then give the child its own AlternativeImage
    hierarchy = (
        ('add_TextRegion', TextRegionType),
        ('add_TextLine', TextLineType),
        ('add_Word', WordType),
        ('add_Glyph', GlyphType),
    )
    parent = root
    for adder_name, segment_type in hierarchy:
        child = segment_type()
        getattr(parent, adder_name)(child)
        child.add_AlternativeImage(AlternativeImageType())
        parent = child
Exemple #2
0
 def _process_segment(self, segment, segment_image, segment_coords,
                      segment_id, page_id, file_id):
     """Deskew one segment image and annotate the result.

     Estimates an additional skew angle for ``segment_image``, stores the
     combined orientation on ``segment``, rotates the image when the angle
     is non-zero, saves it under ``file_id + '.IMG-DESKEW'`` in the output
     file group and references it from ``segment`` as an AlternativeImage
     whose comments are ``segment_coords['features'] + ',deskewed'``.

     Args:
         segment: PAGE element to annotate (needs ``set_orientation`` and
             ``add_AlternativeImage``).
         segment_image: image of the segment; must expose ``width`` and
             ``height``.
         segment_coords: dict providing at least ``'angle'`` and
             ``'features'``.
         segment_id: identifier used in log messages only.
         page_id: page ID passed on when saving the image file.
         file_id: base file ID for the derived image.

     Returns:
         None. Segments with zero width or height are skipped with a
         warning and left unannotated.
     """
     LOG = getLogger('processor.OcropyDeskew')
     if not segment_image.width or not segment_image.height:
         # an empty image offers nothing to estimate a skew angle from
         LOG.warning("Skipping %s with zero size", segment_id)
         return
     # deskewing (w.r.t. top image) already applied to segment_image
     angle0 = segment_coords['angle']
     LOG.info("About to deskew %s", segment_id)
     # additional angle to be applied
     angle = deskew(segment_image, maxskew=self.parameter['maxskew'])
     # segment angle: PAGE orientation is defined clockwise,
     # whereas PIL/ndimage rotation is in mathematical direction:
     orientation = -(angle + angle0)
     orientation = 180 - (180 - orientation) % 360  # map to [-179.999,180]
     segment.set_orientation(orientation)
     LOG.info("Found angle for %s: %.1f", segment_id, angle)
     if angle:
         LOG.debug("Rotating segment '%s' by %.2f°", segment_id, angle)
         segment_image = rotate_image(segment_image,
                                      angle,
                                      fill='background',
                                      transparency=True)
     # update METS (add the image file):
     file_path = self.workspace.save_image_file(
         segment_image,
         file_id + '.IMG-DESKEW',
         page_id=page_id,
         file_grp=self.output_file_grp)
     # update PAGE (reference the image file):
     segment.add_AlternativeImage(
         AlternativeImageType(filename=file_path,
                              comments=segment_coords['features'] +
                              ',deskewed'))
    def _process_segment(self, model, dataset, page, page_xywh, page_id,
                         input_file, orig_img_size, n):
        """Run the model on each dataset item and annotate the dewarped image.

        For every item in ``dataset`` the model output is thresholded at its
        midrange, resized to ``orig_img_size``, saved in ``self.image_grp``
        and referenced from ``page`` as an AlternativeImage. Each iteration
        appends ``',dewarped'`` to ``page_xywh['features']``.
        """
        for sample in dataset:
            target_width, target_height = orig_img_size
            prediction = model.inference(sample['label'], sample['inst'],
                                         sample['image'])
            # first output tensor with its first axis moved last, as array
            pixels = array(prediction.data[0].permute(1, 2, 0).detach().cpu())
            # threshold at the midrange: 255 above it, 0 otherwise
            thresholded = array(255 * (pixels > ocrolib.midrange(pixels)),
                                'B')
            result_image = ocrolib.array2pil(thresholded)
            result_image = result_image.resize((target_width, target_height))

            page_xywh['features'] += ',dewarped'

            # derive the output file ID from the input one; fall back to a
            # padded ID when the input group name is not part of it
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.image_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.image_grp, n)

            image_path = self.workspace.save_image_file(
                result_image,
                file_id,
                page_id=page_id,
                file_grp=self.image_grp,
                force=self.parameter['force'])
            annotation = AlternativeImageType(filename=image_path,
                                              comments=page_xywh['features'])
            page.add_AlternativeImage(annotation)
Exemple #4
0
 def process_segment(self, segment, segment_mask, segment_polygon,
                     neighbours, background_image, parent_image,
                     parent_coords, parent_bin, page_id, file_id):
     """Clip a segment against its neighbours and annotate the result.

     Cuts the segment out of ``parent_image`` along ``segment_polygon``,
     then, for every ``(neighbour, neighbour_mask)`` pair in ``neighbours``,
     pastes ``background_image`` over the pixels identified as intruding
     from that neighbour. The result is cropped to the polygon's bounding
     box, saved as ``file_id + '.IMG-CLIP'`` in the output file group and
     referenced from ``segment`` as an AlternativeImage.

     Side effects visible in this block: ``segment_mask`` is reduced in
     place by the suppressed pixels, and ``segment`` gains an
     AlternativeImage.

     NOTE(review): array/image types are not visible here; the comments
     below presume numpy arrays for the masks and PIL images for the
     ``*_image`` arguments -- confirm against the caller.
     """
     LOG = getLogger('processor.OcropyClip')
     # initialize AlternativeImage@comments classes from parent, except
     # for those operations that can apply on multiple hierarchy levels:
     features = ','.join([
         feature
         for feature in parent_coords['features'].split(',') if feature in
         ['binarized', 'grayscale_normalized', 'despeckled', 'dewarped']
     ]) + ',clipped'
     # mask segment within parent image:
     segment_image = image_from_polygon(parent_image, segment_polygon)
     segment_bbox = bbox_from_polygon(segment_polygon)
     for neighbour, neighbour_mask in neighbours:
         # no pixel where the segment mask exceeds the neighbour mask:
         # the neighbour covers the segment completely, so skip it
         if not np.any(segment_mask > neighbour_mask):
             LOG.info(
                 'Ignoring enclosing neighbour "%s" of segment "%s" on page "%s"',
                 neighbour.id, segment.id, page_id)
             continue
         # find connected components that (only) belong to the neighbour:
         intruders = segment_mask * morph.keep_marked(
             parent_bin, neighbour_mask > 0)  # overlaps neighbour
         intruders = morph.remove_marked(
             intruders, segment_mask > neighbour_mask)  # but exclusively
         num_intruders = np.count_nonzero(intruders)
         # foreground pixel count is used for the debug message only
         num_foreground = np.count_nonzero(segment_mask * parent_bin)
         if not num_intruders:
             continue
         LOG.debug(
             'segment "%s" vs neighbour "%s": suppressing %d of %d pixels on page "%s"',
             segment.id, neighbour.id, num_intruders, num_foreground,
             page_id)
         # suppress in segment_mask so these intruders can stay in the neighbours
         # (are not removed from both sides)
         # (in-place operator: presumably this modifies the caller's array)
         segment_mask -= intruders
         # suppress in derived image result to be annotated
         clip_mask = array2pil(intruders)
         segment_image.paste(background_image,
                             mask=clip_mask)  # suppress in raw image
         if segment_image.mode in ['RGB', 'L', 'RGBA', 'LA']:
             # for consumers that do not have to rely on our
             # guessed background color, but can cope with transparency:
             # NOTE(review): this runs once per neighbour; if putalpha
             # replaces the whole alpha band, only the last neighbour's
             # intruders stay transparent -- confirm this is intended
             segment_image.putalpha(ImageOps.invert(clip_mask))
     # recrop segment into rectangle, just as image_from_segment would do
     # (and also clipping with background colour):
     segment_image = crop_image(segment_image, box=segment_bbox)
     # update METS (add the image file):
     file_path = self.workspace.save_image_file(
         segment_image,
         file_id=file_id + '.IMG-CLIP',
         page_id=page_id,
         file_grp=self.output_file_grp)
     # update PAGE (reference the image file):
     segment.add_AlternativeImage(
         AlternativeImageType(filename=file_path, comments=features))
    def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n):
        """Detect the page border, annotate it and save the cropped page image.

        The crop box is taken from the detected text area when one is found
        (after ``crop_area`` filtering if several candidates exist, or
        directly if a single candidate covers more than half of the image);
        otherwise it comes from ``select_borderLine``. The box is written to
        ``page`` as a Border, the image is cropped to it, saved as
        ``<file_id>-IMG`` in the output file group and referenced from
        ``page`` as an AlternativeImage with ``',cropped'`` appended to
        ``page_xywh['features']``.

        Args:
            page_image: image of the page to crop.
            page: PAGE page element receiving Border and AlternativeImage.
            page_xywh: coordinate dict of ``page_image``; its ``'features'``
                entry is extended in place.
            page_id: page ID passed on when saving the image file.
            input_file: input file object used to derive the output file ID.
            n: unused here; kept for interface compatibility.
        """
        img_array = ocrolib.pil2array(page_image)

        # Check if image is RGB or not #FIXME: check not needed anymore?
        # (single-channel arrays are replicated into three channels)
        if len(img_array.shape) == 2:
            img_array = np.stack((img_array,)*3, axis=-1)

        img_array_bin = np.array(
            img_array > ocrolib.midrange(img_array), 'i')

        # both lists are handed to select_borderLine as given (always empty)
        lineDetectH = []
        lineDetectV = []
        img_array_rr = self.remove_rular(img_array)

        textarea, img_array_rr_ta, height, width = self.detect_textarea(
            img_array_rr)
        colSeparator = int(
            width * self.parameter['colSeparator'])
        if len(textarea) > 1:
            textarea = self.crop_area(
                textarea, img_array_bin, img_array_rr_ta, colSeparator)

            if len(textarea) == 0:
                min_x, min_y, max_x, max_y = self.select_borderLine(
                    img_array_rr, lineDetectH, lineDetectV)
            else:
                min_x, min_y, max_x, max_y = textarea[0]
        elif len(textarea) == 1 and (height*width*0.5 < (abs(textarea[0][2]-textarea[0][0]) * abs(textarea[0][3]-textarea[0][1]))):
            # single text area covering more than half of the image
            # NOTE(review): an earlier revision computed a box padded by
            # 20px horizontally / 40px vertically here but never used it;
            # that dead code is removed and the unpadded box stays in
            # effect -- confirm whether padding was meant to be applied
            min_x, min_y, max_x, max_y = textarea[0]
        else:
            min_x, min_y, max_x, max_y = self.select_borderLine(
                img_array_rr, lineDetectH, lineDetectV)

        # annotate the border, converted from image to page coordinates
        border_polygon = [[min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y]]
        border_polygon = coordinates_for_segment(border_polygon, page_image, page_xywh)
        border_points = points_from_polygon(border_polygon)
        brd = BorderType(Coords=CoordsType(border_points))
        page.set_Border(brd)

        page_image = crop_image(page_image, box=(min_x, min_y, max_x, max_y))
        page_xywh['features'] += ',cropped'

        file_id = make_file_id(input_file, self.output_file_grp)

        # update METS (add the image file) and PAGE (reference it)
        file_path = self.workspace.save_image_file(page_image,
                                                   file_id + '-IMG',
                                                   page_id=page_id,
                                                   file_grp=self.output_file_grp)
        page.add_AlternativeImage(AlternativeImageType(
            filename=file_path, comments=page_xywh['features']))
Exemple #6
0
 def test_alternativeImage(self):
     """Attach an AlternativeImage at Page, TextRegion, TextLine, Word and Glyph level.

     Only checks that building the hierarchy and adding the images does
     not raise.
     """
     document = PcGtsType(pcGtsId="foo")
     self.assertEqual(document.pcGtsId, 'foo')
     # Page/AlternativeImage
     root = PageType()
     document.set_Page(root)
     root.add_AlternativeImage(AlternativeImageType())
     # TextRegion, TextLine, Word, Glyph: create each child, register it
     # with its parent, then add an AlternativeImage to the child
     hierarchy = (
         ('add_TextRegion', TextRegionType),
         ('add_TextLine', TextLineType),
         ('add_Word', WordType),
         ('add_Glyph', GlyphType),
     )
     parent = root
     for adder_name, segment_type in hierarchy:
         child = segment_type()
         getattr(parent, adder_name)(child)
         child.add_AlternativeImage(AlternativeImageType())
         parent = child
Exemple #7
0
 def process_region(self, region, region_image, region_xywh, zoom, page_id,
                    file_id):
     """Binarize (or grayscale-normalize) one region and annotate the result.

     Skips regions of zero size. Deskewing is only attempted when
     ``region_xywh`` does not already carry a non-zero ``'angle'``; a newly
     found angle is stored back into ``region_xywh``. The derived image is
     saved in the output file group and referenced from ``region`` as an
     AlternativeImage whose comments list the operations applied.
     """
     if not (region_image.width and region_image.height):
         self.logger.warning("Skipping region '%s' with zero size",
                             region.id)
         return
     self.logger.info("About to binarize page '%s' region '%s'", page_id,
                      region.id)
     features = region_xywh['features']
     # orientation has already been annotated (by previous deskewing)?
     # then skip deskewing here by not allowing any skew:
     previously_deskewed = 'angle' in region_xywh and region_xywh['angle']
     max_skew = 0 if previously_deskewed else self.parameter['maxskew']
     bin_image, detected_angle = binarize(region_image,
                                          method=self.parameter['method'],
                                          maxskew=max_skew,
                                          nrm=self.parameter['grayscale'],
                                          zoom=zoom)
     if not previously_deskewed:
         if detected_angle:
             features += ',deskewed'
         region_xywh['angle'] = detected_angle
     noise_maxsize = self.parameter['noise_maxsize']
     bin_image = remove_noise(bin_image, maxsize=noise_maxsize)
     if noise_maxsize:
         features += ',despeckled'
     # annotate angle in PAGE (to allow consumers of the AlternativeImage
     # to do consistent coordinate transforms, and non-consumers
     # to redo the rotation themselves); map to [-179.999,180]:
     clockwise = -region_xywh['angle']
     region.set_orientation(180 - (180 - clockwise) % 360)
     # update METS (add the image file):
     if self.parameter['grayscale']:
         suffix, tag = '.IMG-NRM', ',grayscale_normalized'
     else:
         suffix, tag = '.IMG-BIN', ',binarized'
     file_id += suffix
     features += tag
     image_path = self.workspace.save_image_file(
         bin_image, file_id, page_id=page_id, file_grp=self.output_file_grp)
     # update PAGE (reference the image file):
     annotation = AlternativeImageType(filename=image_path, comments=features)
     region.add_AlternativeImage(annotation)
    def _process_segment(self, page_image, page, page_xywh, page_id,
                         input_file, n, mrcnn_model, class_names):
        """Detect blocks on a page and add each one as a TextRegion.

        Runs ``mrcnn_model.detect`` on the page image. For every detected
        box the corresponding part of the image is saved in
        ``self.image_grp`` and a TextRegion with matching Coords, the
        detected class name as type and the saved image as AlternativeImage
        is added to ``page``. ``',blksegmented'`` is appended to
        ``page_xywh['features']`` once.

        Args:
            page_image: image of the page.
            page: PAGE page element receiving the new regions.
            page_xywh: coordinate dict; its ``'features'`` entry is extended
                in place.
            page_id: page ID passed on when saving image files.
            input_file: input file object used to derive output file IDs.
            n: running number used for the fallback file ID.
            mrcnn_model: detector exposing ``detect``.
            class_names: sequence mapping class IDs to region type names.
        """
        img_array = ocrolib.pil2array(page_image)
        results = mrcnn_model.detect([img_array], verbose=1)
        r = results[0]

        page_xywh['features'] += ',blksegmented'

        for i, roi in enumerate(r['rois']):

            # NOTE(review): names kept from the original. ``width`` is the
            # size of the FIRST array axis, and the "x" values below index
            # that same axis when slicing, while Coords are emitted as
            # "x,y" pairs -- confirm the axis order of r['rois'] against
            # the detector's API.
            width, height, _ = img_array.shape
            min_x = roi[0]
            min_y = roi[1]
            max_x = roi[2]
            max_y = roi[3]
            class_id = r['class_ids'][i]

            # small post-processing in case of paragraph (class 2 according
            # to the original comment) to not cut off the outermost letters:
            # grow the box by 5 at the lower and 10 at the upper end, while
            # staying inside the image.
            # Fixes: the lower check used to compare against ``width`` (never
            # true for a box inside the image), and the upper check used to
            # shift ``min_x`` instead of ``max_x``.
            if class_id == 2:
                if min_x - 5 >= 0:
                    min_x -= 5
                if max_x + 10 < width:
                    max_x += 10

            # this can be tested, provided whether we need previous comments or not?

            # extract from points and img_array
            region_img = img_array[min_x:max_x, min_y:max_y]
            region_img = ocrolib.array2pil(region_img)
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.image_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.image_grp, n)

            file_path = self.workspace.save_image_file(region_img,
                                                       file_id + "_" + str(i),
                                                       page_id=page_id,
                                                       file_grp=self.image_grp)

            ai = AlternativeImageType(filename=file_path,
                                      comments=page_xywh['features'])
            coords = CoordsType(
                "%i,%i %i,%i %i,%i %i,%i" %
                (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y))
            textregion = TextRegionType(Coords=coords,
                                        type_=class_names[class_id])
            textregion.add_AlternativeImage(ai)
            page.add_TextRegion(textregion)
Exemple #9
0
 def process_segment(self, segment, segment_image, segment_xywh, zoom,
                     page_id, file_id):
     """Despeckle one segment image and annotate the result.

     The cleaned image is saved as ``file_id + '.IMG-DESPECK'`` in the
     output file group and referenced from ``segment`` as an
     AlternativeImage with ``',despeckled'`` appended to the features.
     """
     logger = getLogger('processor.OcropyDenoise')
     logger.info("About to despeckle '%s'", file_id)
     # noise_maxsize is in pt; scale it by zoom and 300/72
     max_speckle = self.parameter['noise_maxsize'] / zoom * 300 / 72
     cleaned = remove_noise(segment_image, maxsize=max_speckle)
     # update METS (add the image file):
     image_path = self.workspace.save_image_file(
         cleaned,
         file_id + '.IMG-DESPECK',
         page_id=page_id,
         file_grp=self.output_file_grp)
     # update PAGE (reference the image file):
     annotation = AlternativeImageType(
         filename=image_path,
         comments=segment_xywh['features'] + ',despeckled')
     segment.add_AlternativeImage(annotation)
Exemple #10
0
 def _process_segment(self, segment, segment_image, segment_coords,
                      segment_id, page_id, file_id):
     """Deskew one segment and annotate orientation and derived image.

     Estimates an additional skew angle, stores the combined orientation
     on ``segment``, then lets the workspace re-derive the segment image
     (``image_from_page`` for pages, ``image_from_segment`` otherwise).
     The image is saved as ``file_id + '.IMG-DESKEW'`` and referenced from
     ``segment`` as an AlternativeImage. Zero-size images are skipped.
     """
     log = getLogger('processor.OcropyDeskew')
     if not (segment_image.width and segment_image.height):
         log.warning("Skipping %s with zero size", segment_id)
         return
     # deskewing (w.r.t. top image) already applied to segment_image
     previous_angle = segment_coords['angle']
     log.info("About to deskew %s", segment_id)
     # additional angle to be applied
     extra_angle = deskew(segment_image, maxskew=self.parameter['maxskew'])
     # segment angle: PAGE orientation is defined clockwise,
     # whereas PIL/ndimage rotation is in mathematical direction;
     # the modulo maps the value to [-179.999,180]
     clockwise = -(extra_angle + previous_angle)
     # also removes all deskewed AlternativeImages
     segment.set_orientation(180 - (180 - clockwise) % 360)
     log.info("Found angle for %s: %.1f", segment_id, extra_angle)
     # delegate reflection, rotation and re-cropping to core:
     if isinstance(segment, PageType):
         segment_image, segment_coords, _ = self.workspace.image_from_page(
             segment, page_id, fill='background', transparency=True)
     else:
         segment_image, segment_coords = self.workspace.image_from_segment(
             segment,
             segment_image,
             segment_coords,
             fill='background',
             transparency=True)
     if not extra_angle:
         # zero rotation does not change coordinates,
         # but assures consuming processors that the
         # workflow had deskewing
         segment_coords['features'] += ',deskewed'
     # update METS (add the image file):
     image_path = self.workspace.save_image_file(
         segment_image,
         file_id + '.IMG-DESKEW',
         page_id=page_id,
         file_grp=self.output_file_grp)
     # update PAGE (reference the image file):
     annotation = AlternativeImageType(filename=image_path,
                                       comments=segment_coords['features'])
     segment.add_AlternativeImage(annotation)
    def _process_segment(self, page_image, page, page_xywh, page_id,
                         input_file, n):
        """Separate text from non-text on a page and annotate the text image.

        Builds a fill mask from the inverted, normalized page image via
        ``pixMorphSequence_mask_seed_fill_holes`` and ``pixSeedfillBinary``,
        dilates and expands it to the page size, and keeps only the pixels
        outside that mask as the text part. The text part is thresholded at
        its midrange, saved in ``self.image_grp`` and referenced from
        ``page`` as an AlternativeImage with ``',tiseged'`` appended to
        ``page_xywh['features']``.

        The non-text image that used to be computed here was never used and
        is no longer built.

        Args:
            page_image: image of the page.
            page: PAGE page element receiving the AlternativeImage.
            page_xywh: coordinate dict; its ``'features'`` entry is extended
                in place.
            page_id: page ID passed on when saving the image file.
            input_file: input file object used to derive the output file ID.
            n: running number used for the fallback file ID.
        """
        I = ocrolib.pil2array(page_image)
        if len(I.shape) > 2:
            # collapse the channel axis by averaging
            I = np.mean(I, 2)
        # invert and normalize by the maximum value
        I = 1 - I / I.max()
        rows, cols = I.shape

        # Generate Mask and Seed Images
        Imask, Iseed = self.pixMorphSequence_mask_seed_fill_holes(I)

        # Iseedfill: Union of Mask and Seed Images
        Iseedfill = self.pixSeedfillBinary(Imask, Iseed)

        # Dilation of Iseedfill
        mask = ones((3, 3))
        Iseedfill = ndimage.binary_dilation(Iseedfill, mask)

        # Expansion of Iseedfill to become equal in size of I
        Iseedfill = self.expansion(Iseedfill, (rows, cols))

        # Text image: everything outside the filled (non-text) mask
        text_part = array((1 - I * (1 - Iseedfill)), dtype=int)
        # originally marked "only for visualisation purpose"; kept because
        # it takes part in the midrange threshold computed below
        text_part[0, 0] = 0

        page_xywh['features'] += ',tiseged'

        bin_array = array(255 * (text_part > ocrolib.midrange(text_part)), 'B')
        bin_image = ocrolib.array2pil(bin_array)

        file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.image_grp, n)
        file_path = self.workspace.save_image_file(bin_image,
                                                   file_id,
                                                   page_id=page_id,
                                                   file_grp=self.image_grp)
        page.add_AlternativeImage(
            AlternativeImageType(filename=file_path,
                                 comments=page_xywh['features']))
 def _process_segment(self, tessapi, ril, segment, image, xywh, where, page_id, file_id):
     """Binarize one segment image via Tesseract and annotate the result.

     Asks ``tessapi`` for a layout analysis of ``image`` and takes the
     binary image at level ``ril`` from it. If either step yields nothing,
     an error is logged and the segment is left untouched. Otherwise the
     image is saved as ``file_id + '.IMG-BIN'`` and referenced from
     ``segment`` as an AlternativeImage with ``",binarized"`` appended to
     the features.
     """
     log = getLogger('processor.TesserocrBinarize')
     tessapi.SetImage(image)
     layout = tessapi.AnalyseLayout()
     binarized = layout.GetBinaryImage(ril) if layout else None
     if not binarized:
         log.error('Cannot binarize %s', where)
         return
     # update METS (add the image file):
     image_path = self.workspace.save_image_file(
         binarized,
         file_id + '.IMG-BIN',
         page_id=page_id,
         file_grp=self.output_file_grp)
     # update PAGE (reference the image file):
     annotation = AlternativeImageType(
         filename=image_path,
         comments=xywh['features'] + ",binarized")
     segment.add_AlternativeImage(annotation)
Exemple #13
0
 def process_line(self, line, line_image, line_xywh, zoom, page_id,
                  region_id, file_id):
     """Binarize (or grayscale-normalize) one text line and annotate the result.

     Skips lines of zero size. The detected skew angle cannot be stored on
     line level, so it is only reported through a warning. The derived
     image is saved in the output file group and referenced from ``line``
     as an AlternativeImage whose comments list the operations applied.
     """
     if not (line_image.width and line_image.height):
         self.logger.warning("Skipping line '%s' with zero size", line.id)
         return
     self.logger.info("About to binarize page '%s' region '%s' line '%s'",
                      page_id, region_id, line.id)
     # collect feature tags and join them with commas at the end
     tags = [line_xywh['features']]
     bin_image, angle = binarize(line_image,
                                 method=self.parameter['method'],
                                 maxskew=self.parameter['maxskew'],
                                 nrm=self.parameter['grayscale'],
                                 zoom=zoom)
     if angle:
         tags.append('deskewed')
     # The angle would have to be annotated in PAGE (to allow consumers of
     # the AlternativeImage to do consistent coordinate transforms, and
     # non-consumers to redo the rotation themselves), but orientation
     # does not exist on line level, hence only this warning:
     self.logger.warning(
         "cannot add orientation %.2f to page '%s' region '%s' line '%s'",
         -angle, page_id, region_id, line.id)
     noise_maxsize = self.parameter['noise_maxsize']
     bin_image = remove_noise(bin_image, maxsize=noise_maxsize)
     if noise_maxsize:
         tags.append('despeckled')
     # update METS (add the image file):
     if self.parameter['grayscale']:
         file_id += '.IMG-NRM'
         tags.append('grayscale_normalized')
     else:
         file_id += '.IMG-BIN'
         tags.append('binarized')
     image_path = self.workspace.save_image_file(
         bin_image, file_id, page_id=page_id, file_grp=self.output_file_grp)
     # update PAGE (reference the image file):
     annotation = AlternativeImageType(filename=image_path,
                                       comments=','.join(tags))
     line.add_AlternativeImage(annotation)
 def process_page(self, page, page_image, page_xywh, bounds, file_id, page_id):
     """Set the identified page border, if valid.

     ``bounds`` is ``(left, top, right, bottom)`` in image coordinates.
     After padding (clamped to the image), the box is converted to page
     coordinates, passed through ``polygon_for_parent`` and annotated as
     Border. The page image is cropped to the resulting box, saved as
     ``file_id + '.IMG-CROP'`` and referenced from ``page`` as an
     AlternativeImage with ``',cropped'`` appended to the features.
     Empty extents and rejected polygons are logged and skipped.
     """
     log = getLogger('processor.TesserocrCrop')
     left, top, right, bottom = bounds
     if left >= right or top >= bottom:
         log.error("Cannot find valid extent for page '%s'", page_id)
         return
     # add padding, without leaving the image:
     pad = self.parameter['padding']
     left, top = max(left - pad, 0), max(top - pad, 0)
     right = min(right + pad, page_image.width)
     bottom = min(bottom + pad, page_image.height)
     log.info("Padded page border: %i:%i,%i:%i", left, right, top, bottom)
     outline = polygon_for_parent(
         coordinates_for_segment(
             polygon_from_bbox(left, top, right, bottom),
             page_image, page_xywh),
         page)
     if outline is None:
         log.error("Ignoring extant border")
         return
     border = BorderType(Coords=CoordsType(points_from_polygon(outline)))
     # intersection with parent could have changed bbox,
     # so recalculate:
     crop_box = bbox_from_polygon(
         coordinates_of_segment(border, page_image, page_xywh))
     # update PAGE (annotate border):
     page.set_Border(border)
     # update METS (add the image file):
     cropped = crop_image(page_image, box=crop_box)
     page_xywh['features'] += ',cropped'
     image_path = self.workspace.save_image_file(
         cropped, file_id + '.IMG-CROP',
         page_id=page_id, file_grp=self.output_file_grp)
     # update PAGE (reference the image file):
     annotation = AlternativeImageType(filename=image_path,
                                       comments=page_xywh['features'])
     page.add_AlternativeImage(annotation)
    def _process_segment(self, page, page_image, page_coords, page_id,
                         input_file):
        """Split a page into text and non-text images and annotate both.

        With ``self.model`` set, the page is resized to 800x1024, fed to
        the model and the per-pixel class with the highest score decides:
        class 1 becomes black in the text image, class 2 black in the
        non-text image. Both are resized back to the page size. Without a
        model, a morphological mask-and-seed fill is used instead.

        Both images are saved in the output file group (``<file_id>_img``
        and ``<file_id>_txt``) and referenced from ``page`` as
        AlternativeImages, with ``',non_text'`` and ``',clipped'``
        respectively appended to ``page_coords['features']``.

        Args:
            page: PAGE page element receiving the AlternativeImages.
            page_image: image of the page.
            page_coords: coordinate dict providing ``'features'``.
            page_id: unused here; ``input_file.pageId`` is used when saving.
            input_file: input file object used for the file ID and page ID.
        """
        LOG = getLogger('OcrdAnybaseocrTiseg')

        if self.model:

            # Image.LANCZOS is the filter formerly also available as
            # Image.ANTIALIAS, which newer Pillow releases no longer provide
            I = ocrolib.pil2array(
                page_image.resize((800, 1024), Image.LANCZOS))
            I = np.array(I)
            # The channel check has to happen before the batch axis is
            # added: afterwards it could never trigger, and the 4-index
            # subscript below would already have failed on a 2-d array.
            if I.ndim < 3:
                LOG.warning(
                    'Input image has no channel axis, replicating it into 3 channels')
                I = np.stack((I,) * 3, axis=-1)
            I = I[np.newaxis, :, :, :]
            LOG.info('I shape %s', I.shape)

            # get prediction
            out = self.model.predict(I)
            # hard-coded output geometry: 2048x1600 pixels, 3 classes
            out = out.reshape((2048, 1600, 3)).argmax(axis=2)

            text_part = 255 * np.ones(out.shape, 'B')
            text_part[np.where(out == 1)] = 0
            LOG.info(
                'text: %d percent',
                100 * (1 - np.count_nonzero(text_part) / np.prod(out.shape)))

            image_part = 255 * np.ones(out.shape, 'B')
            image_part[np.where(out == 2)] = 0
            LOG.info(
                'image: %d percent',
                100 * (1 - np.count_nonzero(image_part) / np.prod(out.shape)))

            image_part = ocrolib.array2pil(image_part)
            text_part = ocrolib.array2pil(text_part)

            image_part = image_part.resize(page_image.size, Image.BICUBIC)
            text_part = text_part.resize(page_image.size, Image.BICUBIC)

        else:
            I = ocrolib.pil2array(page_image)

            if len(I.shape) > 2:
                # collapse the channel axis by averaging
                I = np.mean(I, 2)
            # invert and normalize by the maximum value
            I = 1 - I / I.max()
            rows, cols = I.shape

            # Generate Mask and Seed Images
            Imask, Iseed = self.pixMorphSequence_mask_seed_fill_holes(I)

            # Iseedfill: Union of Mask and Seed Images
            Iseedfill = self.pixSeedfillBinary(Imask, Iseed)

            # Dilation of Iseedfill
            mask = np.ones((3, 3))
            Iseedfill = ndimage.binary_dilation(Iseedfill, mask)

            # Expansion of Iseedfill to become equal in size of I
            Iseedfill = self.expansion(Iseedfill, (rows, cols))

            # Write Text and Non-Text images
            image_part = np.array(255 * (1 - I * Iseedfill), dtype='B')
            text_part = np.array(255 * (1 - I * (1 - Iseedfill)), dtype='B')
            LOG.info(
                'text: %d percent',
                100 * (1 - np.count_nonzero(text_part) / np.prod(I.shape)))
            LOG.info(
                'image: %d percent',
                100 * (1 - np.count_nonzero(image_part) / np.prod(I.shape)))

            image_part = ocrolib.array2pil(image_part)
            text_part = ocrolib.array2pil(text_part)

        file_id = make_file_id(input_file, self.output_file_grp)
        # non-text image
        file_path = self.workspace.save_image_file(
            image_part,
            file_id + "_img",
            page_id=input_file.pageId,
            file_grp=self.output_file_grp,
        )
        page.add_AlternativeImage(
            AlternativeImageType(filename=file_path,
                                 comments=page_coords['features'] +
                                 ',non_text'))

        # text image
        file_path = self.workspace.save_image_file(
            text_part,
            file_id + "_txt",
            page_id=input_file.pageId,
            file_grp=self.output_file_grp,
        )
        page.add_AlternativeImage(
            AlternativeImageType(filename=file_path,
                                 comments=page_coords['features'] +
                                 ',clipped'))
 def _process_segment(self, tessapi, segment, image, xywh, where, page_id,
                      file_id):
     LOG = getLogger('processor.TesserocrDeskew')
     if not image.width or not image.height:
         LOG.warning("Skipping %s with zero size", where)
         return
     angle0 = xywh[
         'angle']  # deskewing (w.r.t. top image) already applied to image
     angle = 0.  # additional angle to be applied at current level
     tessapi.SetImage(image)
     #tessapi.SetPageSegMode(PSM.AUTO_OSD)
     #
     # orientation/script
     #
     osr = tessapi.DetectOrientationScript()
     if osr:
         assert not math.isnan(osr['orient_conf']), \
             "orientation detection failed (Tesseract probably compiled without legacy OEM, or osd model not installed)"
         if osr['orient_conf'] < self.parameter[
                 'min_orientation_confidence']:
             LOG.info(
                 'ignoring OSD orientation result %d° clockwise due to low confidence %.0f in %s',
                 osr['orient_deg'], osr['orient_conf'], where)
         else:
             LOG.info(
                 'applying OSD orientation result %d° clockwise with high confidence %.0f in %s',
                 osr['orient_deg'], osr['orient_conf'], where)
             # defined as 'the detected clockwise rotation of the input image'
             # i.e. the same amount to be applied counter-clockwise for deskewing:
             angle = osr['orient_deg']
         assert not math.isnan(osr['script_conf']), \
             "script detection failed (Tesseract probably compiled without legacy OEM, or osd model not installed)"
         if osr['script_conf'] < 10:
             LOG.info(
                 'ignoring OSD script result "%s" due to low confidence %.0f in %s',
                 osr['script_name'], osr['script_conf'], where)
         else:
             LOG.info(
                 'applying OSD script result "%s" with high confidence %.0f in %s',
                 osr['script_name'], osr['script_conf'], where)
             if isinstance(segment, (TextRegionType, PageType)):
                 segment.set_primaryScript({
                     "Arabic": "Arab - Arabic",
                     "Armenian": "Armn - Armenian",
                     "Bengali": "Armn - Armenian",
                     "Canadian_Aboriginal":
                     "Cans - Unified Canadian Aboriginal Syllabics",
                     "Cherokee": "Cher - Cherokee",
                     "Common": "Latn - Latin",  # not in scripts/
                     "Cyrillic": "Cyrl - Cyrillic",
                     "Devanagari": "Deva - Devanagari (Nagari)",
                     "Ethiopic": "Ethi - Ethiopic",
                     "Fraktur": "Latf - Latin (Fraktur variant)",
                     "Georgian": "Geor - Georgian (Mkhedruli)",
                     "Greek": "Grek - Greek",
                     "Gujarati": "Gujr - Gujarati",
                     "Gurmukhi": "Guru - Gurmukhi",
                     "Han":
                     "Hant - Han (Traditional variant)",  # not in scripts/
                     "Hangul": "Hang - Hangul",
                     "Hangul_vert": "Hang - Hangul",
                     "HanS": "Hans - Han (Simplified variant)",
                     "HanS_vert": "Hans - Han (Simplified variant)",
                     "HanT": "Hant - Han (Traditional variant)",
                     "HanT_vert": "Hant - Han (Traditional variant)",
                     "Hebrew": "Hebr - Hebrew",
                     "Hiragana": "Jpan - Japanese",  # not in scripts/
                     "Japanese": "Jpan - Japanese",
                     "Japanese_vert": "Jpan - Japanese",
                     "Kannada": "Knda - Kannada",
                     "Katakana": "Jpan - Japanese",  # not in scripts/
                     "Khmer": "Khmr - Khmer",
                     "Lao": "Laoo - Lao",
                     "Latin": "Latn - Latin",
                     "Malayalam": "Mlym - Malayalam",
                     "Myanmar": "Mymr - Myanmar (Burmese)",
                     "Oriya": "Orya - Oriya",
                     "Sinhala": "Sinh - Sinhala",
                     "Syriac": "Syrc - Syriac",
                     "Tamil": "Taml - Tamil",
                     "Telugu": "Telu - Telugu",
                     "Thaana": "Thaa - Thaana",
                     "Thai": "Thai - Thai",
                     "Tibetan": "Tibt - Tibetan",
                     "Vietnamese": "Tavt - Tai Viet",
                 }.get(osr['script_name'], "Latn - Latin"))
     else:
         LOG.warning('no OSD result in %s', where)
     #
     # orientation/skew
     #
     layout = tessapi.AnalyseLayout()
     if not layout:
         LOG.warning('no result iterator in %s', where)
         return
     orientation, writing_direction, textline_order, deskew_angle = layout.Orientation(
     )
     if isinstance(segment, (TextRegionType, PageType)):
         segment.set_readingDirection({
             WritingDirection.LEFT_TO_RIGHT:
             'left-to-right',
             WritingDirection.RIGHT_TO_LEFT:
             'right-to-left',
             WritingDirection.TOP_TO_BOTTOM:
             'top-to-bottom'
         }.get(writing_direction, 'bottom-to-top'))
         segment.set_textLineOrder({
             TextlineOrder.LEFT_TO_RIGHT:
             'left-to-right',
             TextlineOrder.RIGHT_TO_LEFT:
             'right-to-left',
             TextlineOrder.TOP_TO_BOTTOM:
             'top-to-bottom'
         }.get(textline_order, 'bottom-to-top'))
     # baseline = layout.Baseline(RIL.BLOCK)
     # if baseline:
     #     points = points_from_x0y0x1y1(list(baseline[0]) + list(baseline[1]))
     #     segment.add_Baseline(BaselineType(points=points))
     # defined as 'how many radians does one have to rotate the block anti-clockwise'
     # i.e. positive amount to be applied counter-clockwise for deskewing:
     deskew_angle *= 180 / math.pi
     LOG.info('orientation/deskewing for %s: %s / %s / %s / %.3f°', where,
              membername(Orientation, orientation),
              membername(WritingDirection, writing_direction),
              membername(TextlineOrder, textline_order), deskew_angle)
     # defined as 'the amount of clockwise rotation to be applied to the input image'
     # i.e. the negative amount to be applied counter-clockwise for deskewing:
     # (as defined in Tesseract OrientationIdToValue):
     angle2 = {
         Orientation.PAGE_RIGHT: 90,
         Orientation.PAGE_DOWN: 180,
         Orientation.PAGE_LEFT: 270
     }.get(orientation, 0)
     if angle2 != angle:
         # This effectively ignores Orientation from AnalyseLayout,
         # because it is usually wrong when it deviates from OSD results.
         # (We do keep deskew_angle, though – see below.)
         LOG.warning(
             'inconsistent angles from layout analysis (%d) and orientation detection (%d) in %s',
             angle2, angle, where)
     # annotate result:
     angle += deskew_angle
     # page angle: PAGE @orientation is defined clockwise,
     # whereas PIL/ndimage rotation is in mathematical direction:
     orientation = -(angle + angle0)
     orientation = 180 - (180 - orientation) % 360  # map to [-179.999,180]
     segment.set_orientation(
         orientation)  # also removes all deskewed AlternativeImages
     # Tesseract layout analysis already rotates the image, even for each
     # sub-segment (depending on RIL), but the accuracy is not as good
     # as setting the image to the sub-segments and running without iterator.
     # (These images can be queried via GetBinaryImage/GetImage, cf. segment_region)
     # Unfortunately, it does _not_ use expand=True, but chops off corners.
     # So we must do it here from the original image ourselves.
     # We can delegate to OCR-D core for reflection, deskewing and re-cropping:
     if isinstance(segment, PageType):
         image, xywh, _ = self.workspace.image_from_page(segment,
                                                         page_id,
                                                         fill='background',
                                                         transparency=True)
     else:
         image, xywh = self.workspace.image_from_segment(segment,
                                                         image,
                                                         xywh,
                                                         fill='background',
                                                         transparency=True)
     if not angle:
         # zero rotation does not change coordinates,
         # but assures consuming processors that the
         # workflow had deskewing
         xywh['features'] += ',deskewed'
     features = xywh['features']  # features already applied to image
     # update METS (add the image file):
     file_path = self.workspace.save_image_file(
         image,
         file_id + '.IMG-DESKEW',
         page_id=page_id,
         file_grp=self.output_file_grp)
     # update PAGE (reference the image file):
     segment.add_AlternativeImage(
         AlternativeImageType(filename=file_path, comments=features))
Exemple #17
0
    def _process_segment(self, page_image, page, page_xywh, page_id,
                         input_file, n, model):
        """Split a page image into a text part and a non-text part.

        With a ``model``, its ``predict_segmentation`` output is turned into
        two masks: the text part is zeroed where the prediction equals 1, the
        image part where it equals 2; both are resized to the page size.
        Without a model, a mask/seed image pair is derived from the inverted
        grayscale page (via the morphological helpers on ``self``), seed-filled,
        dilated and expanded to page size, and then used to blank out either
        side of the page before binarizing each part.

        Both parts are saved into ``self.image_grp`` (file ID suffixes
        ``_img`` and ``_txt``) and referenced as AlternativeImage of ``page``.

        Args:
            page_image: PIL image of the page.
            page: PAGE element receiving the two AlternativeImage entries.
            page_xywh: coordinate metadata dict; its ``'features'`` entry is
                extended by ``',clipped'`` (in place).
            page_id: page identifier passed on to the workspace.
            input_file: input file object; its ``ID`` seeds the output file ID.
            n: running number, used for the file ID if ``input_file.ID`` does
                not contain the input fileGrp name.
            model: segmentation model or a falsy value for the rule-based path.
        """
        I = ocrolib.pil2array(page_image)
        LOG.info('image size: %s', page_image.size)

        if model:

            if len(I.shape) < 3:
                # best effort: only warn, the prediction is still attempted
                LOG.warning('Wrong input shape. Image should have 3 channel')

            # get prediction
            # NOTE(review): both calls below leave debug images behind
            # (/tmp/out.png and out_image.png in the working directory)
            out = model.predict_segmentation(inp=I, out_fname="/tmp/out.png")
            cv2.imwrite('out_image.png', out * (255 / 2))
            text_part = np.ones(out.shape)
            text_part[np.where(out == 1)] = 0

            image_part = np.ones(out.shape)
            image_part[np.where(out == 2)] = 0

            image_part = array(255 * (image_part), 'B')
            image_part = ocrolib.array2pil(image_part)

            text_part = array(255 * (text_part), 'B')
            text_part = ocrolib.array2pil(text_part)

            text_part = text_part.resize(page_image.size, Image.BICUBIC)
            image_part = image_part.resize(page_image.size, Image.BICUBIC)

        else:

            if len(I.shape) > 2:
                I = np.mean(I, 2)
            I = 1 - I / I.max()
            rows, cols = I.shape

            # Generate Mask and Seed Images
            Imask, Iseed = self.pixMorphSequence_mask_seed_fill_holes(I)

            # Iseedfill: Union of Mask and Seed Images
            Iseedfill = self.pixSeedfillBinary(Imask, Iseed)

            # Dilation of Iseedfill
            mask = ones((3, 3))
            Iseedfill = ndimage.binary_dilation(Iseedfill, mask)

            # Expansion of Iseedfill to become equal in size of I
            Iseedfill = self.expansion(Iseedfill, (rows, cols))

            # Write Text and Non-Text images
            image_array = array((1 - I * Iseedfill), dtype=int)
            text_array = array((1 - I * (1 - Iseedfill)), dtype=int)

            # binarize each part against its own midrange; keep the arrays
            # separate from the PIL results so neither threshold operates
            # on an already converted image
            text_part = ocrolib.array2pil(
                array(255 * (text_array > ocrolib.midrange(text_array)), 'B'))
            image_part = ocrolib.array2pil(
                array(255 * (image_array > ocrolib.midrange(image_array)),
                      'B'))

        # both outputs share the same base file ID
        file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.image_grp, n)

        file_path = self.workspace.save_image_file(
            image_part,
            file_id + "_img",
            page_id=page_id,
            file_grp=self.image_grp,
            force=self.parameter['force'])
        page.add_AlternativeImage(
            AlternativeImageType(filename=file_path,
                                 comments=page_xywh['features'] + ',non_text'))

        # only the text image is annotated as clipped
        page_xywh['features'] += ',clipped'
        file_path = self.workspace.save_image_file(
            text_part,
            file_id + "_txt",
            page_id=page_id,
            file_grp=self.image_grp,
            force=self.parameter['force'])
        page.add_AlternativeImage(
            AlternativeImageType(filename=file_path,
                                 comments=page_xywh['features']))
Exemple #18
0
    def _process_segment(self, page, filename, page_id, file_id):
        """Binarize the image at ``filename`` and annotate it on ``page``.

        The grayscale image is normalized to [0, 1], optionally sanity-checked
        (``self.check_page``), flattened by subtracting an estimate of the
        local white level (unless it already looks binarized), rescaled
        between the ``lo`` and ``hi`` percentiles and finally thresholded.
        The result is saved into ``self.image_grp`` under ``file_id`` and
        referenced as AlternativeImage of ``page``.

        Returns early (without adding an image) when the image is constant or
        when the page check reports a problem.

        Args:
            page: PAGE element receiving the AlternativeImage.
            filename: path of the image, read with ``ocrolib.read_image_gray``.
            page_id: page identifier, used for logging and for the workspace.
            file_id: ID of the image file to be created.
        """
        raw = ocrolib.read_image_gray(filename)
        self.dshow(raw, "input")

        # perform image normalization
        image = raw - amin(raw)
        if amax(image) == amin(image):
            LOG.info("# image is empty: %s", page_id)
            return
        image /= amax(image)

        if not self.parameter['nocheck']:
            check = self.check_page(amax(image) - image)
            if check is not None:
                # identify the page by the ID we were actually given
                LOG.error("%s SKIPPED. %s (use -n to disable this check)",
                          page_id, check)
                return

        # check whether the image is already effectively binarized
        if self.parameter['gray']:
            extreme = 0
        else:
            extreme = (np.sum(image < 0.05) +
                       np.sum(image > 0.95)) * 1.0 / np.prod(image.shape)
        if extreme > 0.95:
            comment = "no-normalization"
            flat = image
        else:
            comment = ""
            # if not, we need to flatten it by estimating the local whitelevel
            LOG.info("Flattening")
            m = interpolation.zoom(image, self.parameter['zoom'])
            m = filters.percentile_filter(m,
                                          self.parameter['perc'],
                                          size=(self.parameter['range'], 2))
            m = filters.percentile_filter(m,
                                          self.parameter['perc'],
                                          size=(2, self.parameter['range']))
            m = interpolation.zoom(m, 1.0 / self.parameter['zoom'])
            if self.parameter['debug'] > 0:
                clf()
                imshow(m, vmin=0, vmax=1)
                ginput(1, self.parameter['debug'])
            # zooming back and forth may change the size by a pixel or so;
            # restrict both arrays to their common extent
            w, h = minimum(array(image.shape), array(m.shape))
            flat = clip(image[:w, :h] - m[:w, :h] + 1, 0, 1)
            if self.parameter['debug'] > 0:
                clf()
                imshow(flat, vmin=0, vmax=1)
                ginput(1, self.parameter['debug'])

        # estimate low and high thresholds
        LOG.info("Estimating Thresholds")
        d0, d1 = flat.shape
        # ignore a margin of `bignore` (fraction of each dimension)
        o0, o1 = int(self.parameter['bignore'] * d0), int(
            self.parameter['bignore'] * d1)
        est = flat[o0:d0 - o0, o1:d1 - o1]
        if self.parameter['escale'] > 0:
            # by default, we use only regions that contain
            # significant variance; this makes the percentile
            # based low and high estimates more reliable
            e = self.parameter['escale']
            v = est - filters.gaussian_filter(est, e * 20.0)
            v = filters.gaussian_filter(v**2, e * 20.0)**0.5
            v = (v > 0.3 * amax(v))
            v = morphology.binary_dilation(v, structure=ones((int(e * 50), 1)))
            v = morphology.binary_dilation(v, structure=ones((1, int(e * 50))))
            if self.parameter['debug'] > 0:
                imshow(v)
                ginput(1, self.parameter['debug'])
            est = est[v]
        lo = stats.scoreatpercentile(est.ravel(), self.parameter['lo'])
        hi = stats.scoreatpercentile(est.ravel(), self.parameter['hi'])
        # rescale the image to get the gray scale image
        LOG.info("Rescaling")
        flat -= lo
        flat /= (hi - lo)
        flat = clip(flat, 0, 1)
        if self.parameter['debug'] > 0:
            imshow(flat, vmin=0, vmax=1)
            ginput(1, self.parameter['debug'])
        binarized = 1 * (flat > self.parameter['threshold'])

        # output the thresholded image
        LOG.info("%s lo-hi (%.2f %.2f) %s", page_id, lo, hi, comment)
        LOG.info("writing")
        if self.parameter['debug'] > 0 or self.parameter['show']:
            clf()
            gray()
            imshow(binarized)
            ginput(1, max(0.1, self.parameter['debug']))

        bin_array = array(255 * (binarized > ocrolib.midrange(binarized)), 'B')
        bin_image = ocrolib.array2pil(bin_array)

        file_path = self.workspace.save_image_file(bin_image,
                                                   file_id,
                                                   page_id=page_id,
                                                   file_grp=self.image_grp)
        # the attribute is called `comments` (as used by the other processors)
        page.add_AlternativeImage(
            AlternativeImageType(filename=file_path, comments="binarized"))
Exemple #19
0
    def _process_element(self,
                         element,
                         ignore,
                         image,
                         coords,
                         element_id,
                         file_id,
                         page_id,
                         zoom=1.0,
                         rogroup=None):
        """Add PAGE layout elements by segmenting an image.

        Given a PageType, TableRegionType or TextRegionType ``element``, and
        a corresponding binarized PIL.Image object ``image`` with coordinate
        metadata ``coords``, run line segmentation with Ocropy.

        If operating on the full page (or table), then also detect horizontal
        and vertical separators, and aggregate the lines into text regions
        afterwards.

        Add the resulting sub-segments to the parent ``element``.

        If ``ignore`` is not empty, then first suppress all foreground components
        in any of those segments' coordinates during segmentation, and if also
        in full page/table mode, then combine all separators among them with the
        newly detected separators to guide region segmentation.

        Args:
            element: the PAGE element to segment (and to add results to).
            ignore: existing segments to be masked during segmentation.
            image: PIL image of ``element``.
            coords: coordinate metadata for ``image``; its ``'features'``
                entry is used to annotate the derived (clipped) image.
            element_id: ID of ``element``, used for logging and as prefix of
                new segment IDs.
            file_id: prefix of the file ID for the derived ``.IMG-CLIP`` image.
            page_id: page identifier passed on to the workspace.
            zoom: scaling factor passed to the checks and the segmentation;
                also scales the minimum contour areas.
            rogroup: reading order group to append new (and ignored) regions
                to; nothing is added to the reading order if falsy.

        Returns:
            None. Results are added to ``element`` (and the workspace).
        """
        LOG = getLogger('processor.OcropySegment')
        if not image.width or not image.height:
            LOG.warning("Skipping '%s' with zero size", element_id)
            return
        element_array = pil2array(image)
        # use builtin bool/int: the np.bool/np.int aliases no longer exist
        # in current NumPy releases
        element_bin = np.array(element_array <= midrange(element_array), bool)
        sep_bin = np.zeros_like(element_bin, bool)
        ignore_labels = np.zeros_like(element_bin, int)
        for i, segment in enumerate(ignore):
            LOG.debug('masking foreground of %s "%s" for "%s"',
                      type(segment).__name__[:-4], segment.id, element_id)
            # mark these segments (e.g. separator regions, tables, images)
            # for workflows where they have been detected already;
            # these will be:
            # - ignored during text line segmentation (but not h/v-line detection)
            # - kept and reading-ordered during region segmentation (but not seps)
            segment_polygon = coordinates_of_segment(segment, image, coords)
            # If segment_polygon lies outside of element (causing
            # negative/above-max indices), either fully or partially,
            # then this will silently ignore them. The caller does
            # not need to concern herself with this.
            if isinstance(segment, SeparatorRegionType):
                sep_bin[draw.polygon(segment_polygon[:, 1],
                                     segment_polygon[:, 0],
                                     sep_bin.shape)] = True
            ignore_labels[draw.polygon(
                segment_polygon[:, 1], segment_polygon[:, 0],
                ignore_labels.shape)] = i + 1  # mapped back for RO
        if isinstance(element, PageType):
            element_name = 'page'
            fullpage = True
            report = check_page(element_bin, zoom)
        elif isinstance(element, TableRegionType) or (
                # sole/congruent text region of a table region?
                element.id.endswith('_text')
                and isinstance(element.parent_object_, TableRegionType)):
            element_name = 'table'
            fullpage = True
            report = check_region(element_bin, zoom)
        else:
            element_name = 'region'
            fullpage = False
            report = check_region(element_bin, zoom)
        LOG.info('computing line segmentation for %s "%s"', element_name,
                 element_id)
        # TODO: we should downscale if DPI is large enough to save time
        try:
            if report:
                # a failed plausibility check is handled like a failed
                # segmentation (see the except clause)
                raise Exception(report)
            line_labels, hlines, vlines, images, colseps, scale = compute_segmentation(
                # suppress separators and ignored regions for textline estimation
                # but keep them for h/v-line detection (in fullpage mode):
                element_bin,
                seps=(sep_bin + ignore_labels) > 0,
                zoom=zoom,
                fullpage=fullpage,
                spread_dist=round(self.parameter['spread'] / zoom * 300 /
                                  72),  # in pt
                # these are ignored when not in fullpage mode:
                maxcolseps=self.parameter['maxcolseps'],
                maxseps=self.parameter['maxseps'],
                maximages=self.parameter['maximages']
                if element_name != 'table' else 0,
                csminheight=self.parameter['csminheight'],
                hlminwidth=self.parameter['hlminwidth'])
        except Exception as err:
            if isinstance(element, TextRegionType):
                LOG.error('Cannot line-segment region "%s": %s', element_id,
                          err)
                # as a fallback, add a single text line comprising the whole region:
                element.add_TextLine(
                    TextLineType(id=element_id + "_line",
                                 Coords=element.get_Coords()))
            else:
                LOG.error('Cannot line-segment %s "%s": %s', element_name,
                          element_id, err)
            return

        LOG.info('Found %d text lines for %s "%s"',
                 len(np.unique(line_labels)) - 1, element_name, element_id)
        # post-process line labels
        if isinstance(element, (PageType, TableRegionType)):
            # aggregate text lines to text regions
            try:
                # pass ignored regions as "line labels with initial assignment",
                # i.e. identical line and region labels
                # to detect their reading order among the others
                # (these cannot be split or grouped together with other regions)
                line_labels = np.where(line_labels, line_labels + len(ignore),
                                       ignore_labels)
                # suppress separators/images in fg and try to use for partitioning slices
                sepmask = np.maximum(np.maximum(hlines, vlines),
                                     np.maximum(sep_bin, images))
                region_labels = lines2regions(
                    element_bin,
                    line_labels,
                    rlabels=ignore_labels,
                    sepmask=np.maximum(sepmask, colseps),  # add bg
                    # decide horizontal vs vertical cut when gaps of similar size
                    prefer_vertical=not isinstance(element, TableRegionType),
                    gap_height=self.parameter['gap_height'],
                    gap_width=self.parameter['gap_width'],
                    scale=scale,
                    zoom=zoom)
                LOG.info('Found %d text regions for %s "%s"',
                         len(np.unique(region_labels)) - 1, element_name,
                         element_id)
            except Exception as err:
                LOG.error('Cannot region-segment %s "%s": %s', element_name,
                          element_id, err)
                # fallback: keep ignored regions as they are, and put
                # all new lines into one single new region
                region_labels = np.where(line_labels > len(ignore),
                                         1 + len(ignore), line_labels)

            # prepare reading order group index
            if rogroup:
                if isinstance(rogroup,
                              (OrderedGroupType, OrderedGroupIndexedType)):
                    index = 0
                    # start counting from largest existing index
                    for elem in (rogroup.get_RegionRefIndexed() +
                                 rogroup.get_OrderedGroupIndexed() +
                                 rogroup.get_UnorderedGroupIndexed()):
                        if elem.index >= index:
                            index = elem.index + 1
                else:
                    index = None
            # find contours around region labels (can be non-contiguous):
            region_no = 0
            for region_label in np.unique(region_labels):
                if not region_label:
                    continue  # no bg
                region_mask = region_labels == region_label
                region_line_labels = line_labels * region_mask
                region_line_labels0 = np.setdiff1d(region_line_labels, [0])
                if not np.all(region_line_labels0 > len(ignore)):
                    # existing region from `ignore` merely to be ordered
                    # (no new region, no actual text lines)
                    region_line_labels0 = np.intersect1d(
                        region_line_labels0, ignore_labels)
                    assert len(region_line_labels0) == 1, \
                        "region label %d has both existing regions and new lines (%s)" % (
                            region_label, str(region_line_labels0))
                    region = ignore[region_line_labels0[0] - 1]
                    if rogroup and region.parent_object_ == element and not isinstance(
                            region, SeparatorRegionType):
                        index = page_add_to_reading_order(
                            rogroup, region.id, index)
                    LOG.debug('Region label %d is for ignored region "%s"',
                              region_label, region.id)
                    continue
                # normal case: new lines inside new regions
                # remove binary-empty labels, and re-order locally
                order = morph.reading_order(region_line_labels)
                order[np.setdiff1d(region_line_labels0,
                                   element_bin * region_line_labels)] = 0
                region_line_labels = order[region_line_labels]
                # avoid horizontal gaps
                region_line_labels = hmerge_line_seeds(element_bin,
                                                       region_line_labels,
                                                       scale,
                                                       seps=np.maximum(
                                                           sepmask, colseps))
                region_mask |= region_line_labels > 0
                # find contours for region (can be non-contiguous)
                regions, _ = masks2polygons(
                    region_mask * region_label,
                    element_bin,
                    '%s "%s"' % (element_name, element_id),
                    min_area=6000 / zoom / zoom,
                    simplify=ignore_labels * ~(sep_bin))
                # find contours for lines (can be non-contiguous)
                lines, _ = masks2polygons(region_line_labels,
                                          element_bin,
                                          'region "%s"' % element_id,
                                          min_area=640 / zoom / zoom)
                # create new lines in new regions (allocating by intersection)
                line_polys = [Polygon(polygon) for _, polygon in lines]
                for _, region_polygon in regions:
                    region_poly = prep(Polygon(region_polygon))
                    # convert back to absolute (page) coordinates:
                    region_polygon = coordinates_for_segment(
                        region_polygon, image, coords)
                    region_polygon = polygon_for_parent(
                        region_polygon, element)
                    if region_polygon is None:
                        LOG.warning(
                            'Ignoring extant region contour for region label %d',
                            region_label)
                        continue
                    # annotate result:
                    region_no += 1
                    region_id = element_id + "_region%04d" % region_no
                    LOG.debug('Region label %d becomes ID "%s"', region_label,
                              region_id)
                    region = TextRegionType(
                        id=region_id,
                        Coords=CoordsType(
                            points=points_from_polygon(region_polygon)))
                    # find out which line (contours) belong to which region (contours)
                    line_no = 0
                    for i, line_poly in enumerate(line_polys):
                        if not region_poly.intersects(line_poly):  # .contains
                            continue
                        line_label, line_polygon = lines[i]
                        # convert back to absolute (page) coordinates:
                        line_polygon = coordinates_for_segment(
                            line_polygon, image, coords)
                        line_polygon = polygon_for_parent(line_polygon, region)
                        if line_polygon is None:
                            LOG.warning(
                                'Ignoring extant line contour for region label %d line label %d',
                                region_label, line_label)
                            continue
                        # annotate result:
                        line_no += 1
                        line_id = region_id + "_line%04d" % line_no
                        LOG.debug('Line label %d becomes ID "%s"', line_label,
                                  line_id)
                        line = TextLineType(
                            id=line_id,
                            Coords=CoordsType(
                                points=points_from_polygon(line_polygon)))
                        region.add_TextLine(line)
                    # if the region has received text lines, keep it
                    if region.get_TextLine():
                        element.add_TextRegion(region)
                        LOG.info('Added region "%s" with %d lines for %s "%s"',
                                 region_id, line_no, element_name, element_id)
                        if rogroup:
                            index = page_add_to_reading_order(
                                rogroup, region.id, index)
            # add additional image/non-text regions from compute_segmentation
            # (e.g. drop-capitals or images) ...
            image_labels, num_images = morph.label(images)
            LOG.info('Found %d large non-text/image regions for %s "%s"',
                     num_images, element_name, element_id)
            # find contours around region labels (can be non-contiguous):
            image_polygons, _ = masks2polygons(
                image_labels, element_bin,
                '%s "%s"' % (element_name, element_id))
            for image_label, polygon in image_polygons:
                # convert back to absolute (page) coordinates:
                region_polygon = coordinates_for_segment(
                    polygon, image, coords)
                region_polygon = polygon_for_parent(region_polygon, element)
                if region_polygon is None:
                    LOG.warning(
                        'Ignoring extant region contour for image label %d',
                        image_label)
                    continue
                # the running region number is shared by text, image
                # and separator regions
                region_no += 1
                # annotate result:
                region_id = element_id + "_image%04d" % region_no
                element.add_ImageRegion(
                    ImageRegionType(
                        id=region_id,
                        Coords=CoordsType(
                            points=points_from_polygon(region_polygon))))
            # split rulers into separator regions:
            hline_labels, num_hlines = morph.label(hlines)
            vline_labels, num_vlines = morph.label(vlines)
            LOG.info('Found %d/%d h/v-lines for %s "%s"', num_hlines,
                     num_vlines, element_name, element_id)
            # find contours around region labels (can be non-contiguous):
            hline_polygons, _ = masks2polygons(
                hline_labels, element_bin,
                '%s "%s"' % (element_name, element_id))
            vline_polygons, _ = masks2polygons(
                vline_labels, element_bin,
                '%s "%s"' % (element_name, element_id))
            for _, polygon in hline_polygons + vline_polygons:
                # convert back to absolute (page) coordinates:
                region_polygon = coordinates_for_segment(
                    polygon, image, coords)
                region_polygon = polygon_for_parent(region_polygon, element)
                if region_polygon is None:
                    LOG.warning('Ignoring extant region contour for separator')
                    continue
                # annotate result:
                region_no += 1
                region_id = element_id + "_sep%04d" % region_no
                element.add_SeparatorRegion(
                    SeparatorRegionType(
                        id=region_id,
                        Coords=CoordsType(
                            points=points_from_polygon(region_polygon))))
            # annotate a text/image-separated image
            element_array[sepmask] = np.amax(element_array)  # clip to white/bg
            image_clipped = array2pil(element_array)
            file_path = self.workspace.save_image_file(
                image_clipped,
                file_id + '.IMG-CLIP',
                page_id=page_id,
                file_grp=self.output_file_grp)
            element.add_AlternativeImage(
                AlternativeImageType(filename=file_path,
                                     comments=coords['features'] + ',clipped'))
        else:
            # get mask from region polygon:
            region_polygon = coordinates_of_segment(element, image, coords)
            region_mask = np.zeros_like(element_bin, bool)
            region_mask[draw.polygon(region_polygon[:, 1],
                                     region_polygon[:, 0],
                                     region_mask.shape)] = True
            # ensure the new line labels do not extrude from the region:
            line_labels = line_labels * region_mask
            # find contours around labels (can be non-contiguous):
            line_polygons, _ = masks2polygons(line_labels,
                                              element_bin,
                                              'region "%s"' % element_id,
                                              min_area=640 / zoom / zoom)
            line_no = 0
            for line_label, polygon in line_polygons:
                # convert back to absolute (page) coordinates:
                line_polygon = coordinates_for_segment(polygon, image, coords)
                line_polygon = polygon_for_parent(line_polygon, element)
                if line_polygon is None:
                    LOG.warning(
                        'Ignoring extant line contour for line label %d',
                        line_label)
                    continue
                # annotate result:
                line_no += 1
                line_id = element_id + "_line%04d" % line_no
                element.add_TextLine(
                    TextLineType(
                        id=line_id,
                        Coords=CoordsType(
                            points=points_from_polygon(line_polygon))))
            if not sep_bin.any():
                return  # no derived image
            # annotate a text/image-separated image
            element_array[sep_bin] = np.amax(element_array)  # clip to white/bg
            image_clipped = array2pil(element_array)
            file_path = self.workspace.save_image_file(
                image_clipped,
                file_id + '.IMG-CLIP',
                page_id=page_id,
                file_grp=self.output_file_grp)
            # update PAGE (reference the image file):
            element.add_AlternativeImage(
                AlternativeImageType(filename=file_path,
                                     comments=coords['features'] + ',clipped'))
Exemple #20
0
    def process(self):
        """Performs page cropping with Tesseract on the workspace.

        Open and deserialize PAGE input files and their respective images.
        Set up Tesseract to detect text blocks on each page, and find
        the largest coordinate extent spanning all of them. Use this
        extent in defining a Border, and add that to the page.

        Moreover, crop the original image accordingly, and reference the
        resulting image file as AlternativeImage in the Page element.

        Add the new image file to the workspace along with the output fileGrp,
        and using a file ID with suffix ``.IMG-CROP`` along with further
        identification of the input element.

        Produce new output files by serialising the resulting hierarchy.
        A PAGE output file is written for every input file; if no valid
        extent can be found, an error is logged and neither a new Border
        nor a cropped image is added for that page.
        """
        LOG = getLogger('processor.TesserocrCrop')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        padding = self.parameter['padding']
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            # disable table detection here (tables count as text blocks),
            # because we do not want to risk confusing the spine with
            # a column separator and thus creeping into a neighbouring
            # page:
            tessapi.SetVariable("textord_tabfind_find_tables", "0")
            for (n, input_file) in enumerate(self.input_files):
                page_id = input_file.pageId or input_file.ID
                # The output file ID is needed for every page (also when no
                # valid extent is found), so determine it up front instead of
                # only inside the cropping branch.
                file_id = make_file_id(input_file, self.output_file_grp)
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(
                    self.workspace.download_file(input_file))
                self.add_metadata(pcgts)
                page = pcgts.get_Page()

                # warn of existing Border:
                border = page.get_Border()
                if border:
                    left, top, right, bottom = bbox_from_points(
                        border.get_Coords().points)
                    # same x-range,y-range order as all other extent messages
                    LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                                left, right, top, bottom)

                page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                    page,
                    page_id,
                    # image must not have been cropped already,
                    # abort if no such image can be produced:
                    feature_filter='cropped')
                # choose the pixel density: parameter override first,
                # then image meta-data, else leave it to Tesseract
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info(
                        "Page '%s' images will use %d DPI from parameter override",
                        page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        # convert pixels per cm to pixels per inch
                        dpi = round(dpi * 2.54)
                    LOG.info(
                        "Page '%s' images will use %d DPI from image meta-data",
                        page_id, dpi)
                else:
                    dpi = 0
                    LOG.info(
                        "Page '%s' images will use DPI estimated from segmentation",
                        page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))
                    # size thresholds below are scaled relative to 300 DPI
                    zoom = 300 / dpi
                else:
                    zoom = 1

                # warn of existing segmentation:
                regions = page.get_TextRegion()
                if regions:
                    min_x = page_image.width
                    min_y = page_image.height
                    max_x = 0
                    max_y = 0
                    for region in regions:
                        left, top, right, bottom = bbox_from_points(
                            region.get_Coords().points)
                        min_x = min(min_x, left)
                        min_y = min(min_y, top)
                        max_x = max(max_x, right)
                        max_y = max(max_y, bottom)
                    LOG.warning(
                        'Ignoring extent from existing TextRegions: %i:%i,%i:%i',
                        min_x, max_x, min_y, max_y)

                LOG.debug("Cropping with Tesseract")
                tessapi.SetImage(page_image)
                # PSM.SPARSE_TEXT: get as much text as possible in no particular order
                # PSM.AUTO (default): includes tables (dangerous)
                tessapi.SetPageSegMode(tesserocr.PSM.SPARSE_TEXT)
                #
                # helper variables for saving the box coordinates
                # (start with an inverted/empty extent)
                #
                min_x = page_image.width
                min_y = page_image.height
                max_x = 0
                max_y = 0
                # iterate over all text blocks and compare their
                # bbox extent to the running min and max values
                for component in tessapi.GetComponentImages(
                        tesserocr.RIL.BLOCK, True):
                    image, xywh, index, _ = component
                    #
                    # the region reference in the reading order element
                    #
                    ID = "region%04d" % index
                    left, top, right, bottom = bbox_from_xywh(xywh)
                    LOG.debug("Detected text region '%s': %i:%i,%i:%i", ID,
                              left, right, top, bottom)
                    # filter region results:
                    bin_bbox = image.getbbox()
                    if not bin_bbox:
                        # this does happen!
                        LOG.info(
                            "Ignoring region '%s' because its binarization is empty",
                            ID)
                        continue
                    width = bin_bbox[2] - bin_bbox[0]
                    if width < 25 / zoom:
                        # we must be conservative here: page numbers are tiny regions, too!
                        LOG.info(
                            "Ignoring region '%s' because its width is too small (%d)",
                            ID, width)
                        continue
                    height = bin_bbox[3] - bin_bbox[1]
                    if height < 25 / zoom:
                        # we must be conservative here: page numbers are tiny regions, too!
                        LOG.debug(
                            "Ignoring region '%s' because its height is too small (%d)",
                            ID, height)
                        continue
                    min_x = min(min_x, left)
                    min_y = min(min_y, top)
                    max_x = max(max_x, right)
                    max_y = max(max_y, bottom)
                    LOG.info("Updated page border: %i:%i,%i:%i", min_x, max_x,
                             min_y, max_y)

                #
                # set the identified page border
                #
                if min_x < max_x and min_y < max_y:
                    # add padding (clipped to the image bounds):
                    min_x = max(min_x - padding, 0)
                    max_x = min(max_x + padding, page_image.width)
                    min_y = max(min_y - padding, 0)
                    max_y = min(max_y + padding, page_image.height)
                    LOG.info("Padded page border: %i:%i,%i:%i", min_x, max_x,
                             min_y, max_y)
                    polygon = polygon_from_bbox(min_x, min_y, max_x, max_y)
                    polygon = coordinates_for_segment(polygon, page_image,
                                                      page_xywh)
                    polygon = polygon_for_parent(polygon, page)
                    border = BorderType(
                        Coords=CoordsType(points_from_polygon(polygon)))
                    # intersection with parent could have changed bbox,
                    # so recalculate:
                    bbox = bbox_from_polygon(
                        coordinates_of_segment(border, page_image, page_xywh))
                    # update PAGE (annotate border):
                    page.set_Border(border)
                    # update METS (add the image file):
                    page_image = crop_image(page_image, box=bbox)
                    page_xywh['features'] += ',cropped'
                    file_path = self.workspace.save_image_file(
                        page_image,
                        file_id + '.IMG-CROP',
                        page_id=input_file.pageId,
                        file_grp=self.output_file_grp)
                    # update PAGE (reference the image file):
                    page.add_AlternativeImage(
                        AlternativeImageType(filename=file_path,
                                             comments=page_xywh['features']))
                else:
                    LOG.error("Cannot find valid extent for page '%s'",
                              page_id)

                # serialise the (possibly unchanged) PAGE hierarchy
                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(ID=file_id,
                                        file_grp=self.output_file_grp,
                                        pageId=input_file.pageId,
                                        mimetype=MIMETYPE_PAGE,
                                        local_filename=os.path.join(
                                            self.output_file_grp,
                                            file_id + '.xml'),
                                        content=to_xml(pcgts))
    def _process_segment(self, page_image, page, page_xywh, page_id,
                         input_file, n):
        """Determine the page border, annotate it and save the cropped image.

        The crop box is taken from the detected text area(s) when usable,
        otherwise from ``self.select_borderLine``. The box is set as Border
        of ``page``, ``page_image`` is cropped to it, the cropped image is
        saved into ``self.image_grp`` and referenced as AlternativeImage.

        Args:
            page_image: PIL image of the page (converted via ``ocrolib.pil2array``).
            page: PAGE element receiving the Border and AlternativeImage.
            page_xywh: coordinate dict of ``page_image``; its ``'features'``
                entry is extended by ``',cropped'`` in place.
            page_id: page ID passed on when saving the image file.
            input_file: input file whose ID is used to derive the image file ID.
            n: running index, used for the fallback image file ID.
        """
        # NOTE: page orientation is not taken into account here (an earlier
        # attempt via page.get_orientation() was disabled as not working).
        img_array = ocrolib.pil2array(page_image)

        # Check if image is RGB or not #FIXME: check not needed anymore?
        if len(img_array.shape) == 2:
            img_array = np.stack((img_array, ) * 3, axis=-1)

        img_array_bin = np.array(img_array > ocrolib.midrange(img_array), 'i')

        lineDetectH = []
        lineDetectV = []
        img_array_rr = self.remove_rular(img_array)

        textarea, img_array_rr_ta, height, width = self.detect_textarea(
            img_array_rr)
        # The parameter is relative to the image width, while the helpers
        # called below read it from self.parameter as an absolute value.
        # Set it temporarily and restore it afterwards, so that the factor
        # does not compound from one call (page) to the next.
        col_separator = self.parameter['colSeparator']
        self.parameter['colSeparator'] = int(width * col_separator)
        try:
            if len(textarea) > 1:
                textarea = self.crop_area(textarea, img_array_bin,
                                          img_array_rr_ta)

                if len(textarea) == 0:
                    min_x, min_y, max_x, max_y = self.select_borderLine(
                        img_array_rr, lineDetectH, lineDetectV)
                else:
                    min_x, min_y, max_x, max_y = textarea[0]
            elif len(textarea) == 1 and (
                    height * width * 0.5 <
                    (abs(textarea[0][2] - textarea[0][0]) *
                     abs(textarea[0][3] - textarea[0][1]))):
                # single text area covering more than half of the image:
                # pad it outwards (20px horizontally, 40px vertically),
                # clipped to the image bounds
                x1, y1, x2, y2 = textarea[0]
                x1 = x1 - 20 if x1 > 20 else 0
                x2 = x2 + 20 if x2 < width - 20 else width
                y1 = y1 - 40 if y1 > 40 else 0
                y2 = y2 + 40 if y2 < height - 40 else height

                # use the padded box (it used to be computed, then discarded)
                min_x, min_y, max_x, max_y = x1, y1, x2, y2
            else:
                min_x, min_y, max_x, max_y = self.select_borderLine(
                    img_array_rr, lineDetectH, lineDetectV)
        finally:
            self.parameter['colSeparator'] = col_separator

        # annotate the border as a rectangle (clockwise from top-left)
        brd = BorderType(Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" %
                                           (min_x, min_y, max_x, min_y, max_x,
                                            max_y, min_x, max_y)))
        page.set_Border(brd)

        page_image = crop_image(page_image, box=(min_x, min_y, max_x, max_y))
        page_xywh['features'] += ',cropped'

        # derive the image file ID from the input file ID; fall back to a
        # numbered ID when the input fileGrp name does not occur in it
        file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.image_grp, n)

        file_path = self.workspace.save_image_file(page_image,
                                                   file_id,
                                                   page_id=page_id,
                                                   file_grp=self.image_grp)
        page.add_AlternativeImage(
            AlternativeImageType(filename=file_path,
                                 comments=page_xywh['features']))
    def _process_segment(self, page_image, page, textregion, region_xywh,
                         page_id, input_file, n):
        """Segment an image into text lines and add them to ``textregion``.

        For every detected line, a bounding-box polygon is converted to
        absolute coordinates, a denoised binary line image is saved to the
        output fileGrp, and a TextLine (with that image as AlternativeImage)
        is appended to ``textregion`` in reading order.

        Returns early (adding nothing) if the region already has TextLines
        and ``overwrite`` is not set, if the scale is unusable, or if more
        than ``maxlines`` lines are found.

        Args:
            page_image: PIL image to segment.
            page: not used in this method.
            textregion: PAGE TextRegion receiving the new TextLines.
            region_xywh: coordinate dict belonging to ``page_image``.
            page_id: identifier used in log messages, line IDs and when
                saving image files.
                NOTE(review): line IDs are built from this value and the line
                index only - confirm with the caller that it is unique per
                region, otherwise IDs collide across regions.
            input_file: input file used to derive the image file ID.
            n: index appended to the image file names.
        """
        LOG = getLogger('OcrdAnybaseocrTextline')
        #check for existing text lines and whether to overwrite them
        if textregion.get_TextLine():
            if self.parameter['overwrite']:
                LOG.info('removing existing TextLines in region "%s"', page_id)
                textregion.set_TextLine([])
            else:
                LOG.warning('keeping existing TextLines in region "%s"',
                            page_id)
                return

        binary = ocrolib.pil2array(page_image)

        # reduce multi-channel input to a single channel, then normalise
        # and invert
        if len(binary.shape) > 2:
            binary = np.mean(binary, 2)
        binary = np.array(1 - binary / np.amax(binary), 'B')

        if self.parameter['scale'] == 0:
            scale = psegutils.estimate_scale(binary)
        else:
            scale = self.parameter['scale']

        if np.isnan(
                scale) or scale > 1000.0 or scale < self.parameter['minscale']:
            LOG.warning("%s: bad scale; skipping!\n", scale)
            return

        segmentation = self.compute_segmentation(binary, scale)
        if np.amax(segmentation) > self.parameter['maxlines']:
            LOG.warning("too many lines %i; skipping!\n",
                        (np.amax(segmentation)))
            return
        lines = psegutils.compute_lines(segmentation, scale)
        order = psegutils.reading_order([l.bounds for l in lines])
        lsort = psegutils.topsort(order)

        # renumber the labels so that they conform to the specs
        # (the renumbered label map is not used further in this method)

        nlabels = np.amax(segmentation) + 1
        renumber = np.zeros(nlabels, 'i')
        for i, v in enumerate(lsort):
            renumber[lines[v].label] = 0x010000 + (i + 1)
        segmentation = renumber[segmentation]

        lines = [lines[i] for i in lsort]
        cleaned = ocrolib.remove_noise(binary, self.parameter['noise'])

        for i, l in enumerate(lines):
            # l.bounds is a pair of slices in array axis order (it is used
            # to index `cleaned` below): the first axis of the image array
            # is vertical (rows = y), the second horizontal (columns = x).
            min_y, max_y = (l.bounds[0].start, l.bounds[0].stop)
            min_x, max_x = (l.bounds[1].start, l.bounds[1].stop)

            # bounding box as polygon in x,y order (clockwise from top-left)
            line_polygon = [[min_x, min_y], [max_x, min_y], [max_x, max_y],
                            [min_x, max_y]]

            # convert back to absolute coordinates
            line_polygon = coordinates_for_segment(line_polygon, page_image,
                                                   region_xywh)
            line_points = points_from_polygon(line_polygon)

            # cut out the line from the denoised image, re-binarise to
            # 0/255 and invert
            img = cleaned[l.bounds[0], l.bounds[1]]
            img = np.array(255 * (img > ocrolib.midrange(img)), 'B')
            img = 255 - img
            img = ocrolib.array2pil(img)

            file_id = make_file_id(input_file, self.output_file_grp)
            file_path = self.workspace.save_image_file(
                img,
                file_id + "_" + str(n) + "_" + str(i),
                page_id=page_id,
                file_grp=self.output_file_grp)
            ai = AlternativeImageType(filename=file_path,
                                      comments=region_xywh['features'])
            line_id = '%s_line%04d' % (page_id, i)
            line = TextLineType(custom='readingOrder {index:' + str(i) + ';}',
                                id=line_id,
                                Coords=CoordsType(line_points))
            line.add_AlternativeImage(ai)
            textregion.add_TextLine(line)
Exemple #23
0
def test_deskewing(plain_workspace):
    """Check region image extraction with and without deskewing.

    A non-rectangular region with an ``orientation`` annotation is cropped
    from a synthetic page image (1) without deskewing, (2) with deskewing,
    and (3) via a previously stored AlternativeImage. Size, transform,
    features and foreground pixel counts of the results are compared.
    """
    size = (3000, 4000)
    poly = [[1403, 2573], [1560, 2573], [1560, 2598], [2311,
                                                       2598], [2311, 2757],
            [2220, 2757], [2220, 2798], [2311, 2798], [2311, 2908],
            [1403, 2908]]
    xywh = xywh_from_polygon(poly)
    bbox = bbox_from_polygon(poly)
    skew = 4.625
    image = Image.new('L', size)
    image = polygon_mask(image, poly)
    pixels = np.count_nonzero(np.array(image) > 0)
    name = 'foo0'
    assert plain_workspace.save_image_file(image, name, 'IMG')
    pcgts = page_from_file(next(plain_workspace.mets.find_files(ID=name)))
    page = pcgts.get_Page()
    region = TextRegionType(
        id='nonrect',
        Coords=CoordsType(points=points_from_polygon(poly)),
        orientation=-skew)
    page.add_TextRegion(region)
    page_image, page_coords, _ = plain_workspace.image_from_page(page, '')
    assert list(image.getdata()) == list(page_image.getdata())
    assert np.all(page_coords['transform'] == np.eye(3))
    # first without deskewing (plain crop to the bounding box)
    reg_image, reg_coords = plain_workspace.image_from_segment(
        region, page_image, page_coords, feature_filter='deskewed', fill=0)
    assert list(image.crop(bbox).getdata()) == list(reg_image.getdata())
    assert reg_image.width == xywh['w'] == 908
    assert reg_image.height == xywh['h'] == 335
    assert reg_coords['transform'][0, 2] == -xywh['x']
    assert reg_coords['transform'][1, 2] == -xywh['y']
    # same fg after cropping to minimal bbox
    reg_pixels = np.count_nonzero(np.array(reg_image) > 0)
    assert pixels == reg_pixels
    # now with deskewing (test for size after recropping)
    reg_image, reg_coords = plain_workspace.image_from_segment(region,
                                                               page_image,
                                                               page_coords,
                                                               fill=0)
    assert reg_image.width == 932 > xywh['w']
    assert reg_image.height == 382 > xywh['h']
    assert reg_coords['transform'][0, 1] != 0
    assert reg_coords['transform'][1, 0] != 0
    assert 'deskewed' in reg_coords['features']
    # same fg after cropping to minimal bbox (roughly - due to aliasing)
    reg_pixels = np.count_nonzero(np.array(reg_image) > 0)
    assert np.abs(pixels - reg_pixels) / pixels < 0.005
    reg_array = np.array(reg_image) > 0
    # now via AlternativeImage
    path = plain_workspace.save_image_file(reg_image, region.id + '_img',
                                           'IMG')
    region.add_AlternativeImage(
        AlternativeImageType(filename=path, comments=reg_coords['features']))
    # capture anything logged by crop_image while the image is retrieved
    logger_capture = FIFOIO(256)
    logger_handler = logging.StreamHandler(logger_capture)
    logger = logging.getLogger('ocrd_utils.crop_image')
    logger.addHandler(logger_handler)
    try:
        reg_image2, reg_coords2 = plain_workspace.image_from_segment(
            region, page_image, page_coords, fill=0)
        logger_output = logger_capture.getvalue()
    finally:
        # detach the handler before closing its stream, so that later
        # log records are not written to a closed stream
        logger.removeHandler(logger_handler)
        logger_capture.close()
    assert logger_output == ''
    assert reg_image2.width == reg_image.width
    assert reg_image2.height == reg_image.height
    assert np.allclose(reg_coords2['transform'], reg_coords['transform'])
    assert reg_coords2['features'] == reg_coords['features']
    # same fg after cropping to minimal bbox (roughly - due to aliasing);
    # count on the second image (counting reg_image again is a tautology)
    reg_pixels2 = np.count_nonzero(np.array(reg_image2) > 0)
    assert np.abs(reg_pixels2 - reg_pixels) / reg_pixels < 0.005
    reg_array2 = np.array(reg_image2) > 0
    assert 0.98 < np.sum(reg_array == reg_array2) / reg_array.size <= 1.0
Exemple #24
0
    def _process_page(self, page, page_image, page_xywh, input_file, zoom=1.0):
        """Find the page border, annotate it and store the cropped image.

        The border polygon comes from ``self.select_borderLine`` (shrunk by
        the ``padding`` parameter). If that result is not preferred and
        exactly one sufficiently large text box is detected, the text box
        (grown by ``padding``) is used instead. The polygon is converted to
        absolute coordinates, clipped to the image size and set as Border
        of ``page``; the resulting page image is then saved to the output
        fileGrp and referenced as AlternativeImage.

        Args:
            page: PAGE element to annotate.
            page_image: PIL image of the page.
            page_xywh: coordinate dict belonging to ``page_image``.
            input_file: input file (provides page ID and file ID).
            zoom: scale factor applied to the image before detection;
                detected coordinates are divided by it again.
        """
        pad = self.parameter['padding']
        # clipping limits refer to the incoming image (the name
        # page_image is rebound further down)
        limit_x = page_image.width
        limit_y = page_image.height

        pixels = pil2array(page_image)
        # ensure RGB image
        if len(pixels.shape) == 2:
            pixels = np.stack([pixels] * 3, axis=-1)
        (height, width, _) = pixels.shape
        total_area = height * width
        # zoom to 300 DPI (larger density: faster; most fixed parameters here expect 300)
        if zoom != 1.0:
            self.logger.info("scaling %dx%d image by %.2f", width, height,
                             zoom)
            pixels = cv2.resize(pixels, None, fx=zoom, fy=zoom,
                                interpolation=cv2.INTER_CUBIC)

        # detect rule placed in image next to page for scale reference:
        ruler_mask, ruler_box = self.detect_ruler(pixels)
        # detect page frame via line segment detector:
        frame, use_frame = self.select_borderLine(pixels, ruler_box)
        frame = np.array(frame) / zoom  # unzoom
        # pad inwards (dropping the closing point of the ring):
        shrunk = Polygon(frame).buffer(-pad)
        border_polygon = shrunk.exterior.coords[:-1]

        if not use_frame:
            self.logger.info("Falling back to text detector")
            boxes = self.detect_textboxes(pixels, ruler_mask)
            if len(boxes) > 1:
                boxes = self.merge_boxes(boxes, pixels)
            boxes = np.array(boxes) / zoom  # unzoom

            if len(boxes) == 1:
                min_area = self.parameter['columnAreaMin'] * total_area
                if min_area < self.get_area(boxes[0]):
                    self.logger.info("Using text area (%d%% area)",
                                     100 * self.get_area(boxes[0]) / total_area)
                    left, top, right, bottom = boxes[0]
                    # pad outwards
                    border_polygon = polygon_from_bbox(left - pad,
                                                       top - pad,
                                                       right + pad,
                                                       bottom + pad)
        else:
            self.logger.info("Preferring line detector")

        absolute = coordinates_for_segment(border_polygon, page_image,
                                           page_xywh)
        # clamp every point into [0, width] x [0, height]
        clipped = [(max(0, min(limit_x, x)), max(0, min(limit_y, y)))
                   for x, y in absolute]
        page.set_Border(
            BorderType(Coords=CoordsType(points_from_polygon(clipped))))
        # get clipped relative coordinates for current image
        page_image, page_xywh, _ = self.workspace.image_from_page(
            page, input_file.pageId, fill='background', transparency=True)
        image_id = make_file_id(input_file, self.output_file_grp) + '.IMG-CROP'
        image_path = self.workspace.save_image_file(
            page_image,
            image_id,
            page_id=input_file.pageId,
            file_grp=self.output_file_grp)
        alternative = AlternativeImageType(filename=image_path,
                                           comments=page_xywh['features'])
        page.add_AlternativeImage(alternative)
Exemple #25
0
    def process(self):
        """
        Binarize with sbb_binarization

        For every input file, open the PAGE file and retrieve the page image
        (which must not be binarized already). Depending on the
        ``operation_level`` parameter, binarize the page image (``page``),
        the image of each text/table region (``region``), or the image of
        each text line (``line``). Save every result as an image file with
        suffix ``.IMG-BIN`` in the output fileGrp and reference it as
        AlternativeImage of the respective element, adding ``binarized`` to
        its features. Finally, add the resulting PAGE file to the workspace.
        """
        LOG = getLogger('processor.SbbBinarize')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        oplevel = self.parameter['operation_level']
        model_path = self.resolve_resource(self.parameter['model'])
        binarizer = SbbBinarizer(model_dir=model_path, logger=LOG)

        for n, input_file in enumerate(self.input_files):
            file_id = make_file_id(input_file, self.output_file_grp)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            pcgts.set_pcGtsId(file_id)
            page = pcgts.get_Page()
            page_image, page_xywh, _ = self.workspace.image_from_page(
                page, page_id, feature_filter='binarized')

            if oplevel == 'page':
                LOG.info("Binarizing on 'page' level in page '%s'", page_id)
                bin_image = cv2pil(
                    binarizer.run(image=pil2cv(page_image), use_patches=True))
                # update METS (add the image file):
                bin_image_path = self.workspace.save_image_file(
                    bin_image,
                    file_id + '.IMG-BIN',
                    page_id=input_file.pageId,
                    file_grp=self.output_file_grp)
                # update PAGE (reference the image file):
                page.add_AlternativeImage(
                    AlternativeImageType(filename=bin_image_path,
                                         comments='%s,binarized' %
                                         page_xywh['features']))

            elif oplevel == 'region':
                regions = page.get_AllRegions(['Text', 'Table'], depth=1)
                if not regions:
                    LOG.warning("Page '%s' contains no text/table regions",
                                page_id)
                for region in regions:
                    region_image, region_xywh = self.workspace.image_from_segment(
                        region,
                        page_image,
                        page_xywh,
                        feature_filter='binarized')
                    region_image_bin = cv2pil(
                        binarizer.run(image=pil2cv(region_image),
                                      use_patches=True))
                    region_image_bin_path = self.workspace.save_image_file(
                        region_image_bin,
                        "%s_%s.IMG-BIN" % (file_id, region.id),
                        page_id=input_file.pageId,
                        file_grp=self.output_file_grp)
                    region.add_AlternativeImage(
                        AlternativeImageType(filename=region_image_bin_path,
                                             comments='%s,binarized' %
                                             region_xywh['features']))

            elif oplevel == 'line':
                # one (region ID, line) pair per text line: get_TextLine()
                # yields all lines of a region, so flatten them here
                region_line_tuples = [
                    (region.id, line)
                    for region in page.get_AllRegions(['Text'], depth=0)
                    for line in region.get_TextLine()
                ]
                if not region_line_tuples:
                    LOG.warning("Page '%s' contains no text lines", page_id)
                for region_id, line in region_line_tuples:
                    line_image, line_xywh = self.workspace.image_from_segment(
                        line,
                        page_image,
                        page_xywh,
                        feature_filter='binarized')
                    line_image_bin = cv2pil(
                        binarizer.run(image=pil2cv(line_image),
                                      use_patches=True))
                    line_image_bin_path = self.workspace.save_image_file(
                        line_image_bin,
                        "%s_%s_%s.IMG-BIN" % (file_id, region_id, line.id),
                        page_id=input_file.pageId,
                        file_grp=self.output_file_grp)
                    line.add_AlternativeImage(
                        AlternativeImageType(filename=line_image_bin_path,
                                             comments='%s,binarized' %
                                             line_xywh['features']))

            # serialise the PAGE hierarchy into the output fileGrp
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
Exemple #26
0
    def process(self):
        """Dewarp the lines of the workspace.

        Open and deserialise PAGE input files and their respective images,
        then iterate over the element hierarchy down to the TextLine level.

        Next, get each line image according to the layout annotation (from
        the alternative image of the line, or by cropping via coordinates
        into the higher-level image), and dewarp it (without resizing).
        Export the result as an image file.

        Add the new image file to the workspace along with the output fileGrp,
        and using a file ID with suffix ``.IMG-DEWARP`` along with further
        identification of the input element.

        Reference each new image in the AlternativeImage of the element.

        Produce a new output file by serialising the resulting hierarchy.
        """
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for (n, input_file) in enumerate(self.input_files):
            self.logger.info("INPUT FILE %i / %s", n, input_file.pageId
                             or input_file.ID)
            file_id = make_file_id(input_file, self.output_file_grp)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID  # (PageType has no id)
            page = pcgts.get_Page()

            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id)
            if self.parameter['dpi'] > 0:
                zoom = 300.0 / self.parameter['dpi']
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi *= 2.54
                self.logger.info('Page "%s" uses %f DPI', page_id, dpi)
                zoom = 300.0 / dpi
            else:
                zoom = 1

            regions = page.get_AllRegions(classes=['Text'])
            if not regions:
                self.logger.warning('Page "%s" contains no text regions',
                                    page_id)
            for region in regions:
                region_image, region_xywh = self.workspace.image_from_segment(
                    region, page_image, page_xywh)

                lines = region.get_TextLine()
                if not lines:
                    self.logger.warning('Region %s contains no text lines',
                                        region.id)
                for line in lines:
                    line_image, line_xywh = self.workspace.image_from_segment(
                        line, region_image, region_xywh)

                    self.logger.info(
                        "About to dewarp page '%s' region '%s' line '%s'",
                        page_id, region.id, line.id)
                    try:
                        dew_image = dewarp(
                            line_image,
                            self.lnorm,
                            check=True,
                            max_neighbour=self.parameter['max_neighbour'],
                            zoom=zoom)
                    except InvalidLine as err:
                        self.logger.error('cannot dewarp line "%s": %s',
                                          line.id, err)
                        continue
                    except InadequateLine as err:
                        self.logger.warning('cannot dewarp line "%s": %s',
                                            line.id, err)
                        # as a fallback, simply pad the image vertically
                        # (just as dewarping would do on average, so at least
                        #  this line has similar margins as the others):
                        dew_image = padvert(line_image,
                                            self.parameter['range'])
                    # update METS (add the image file):
                    file_path = self.workspace.save_image_file(
                        dew_image,
                        file_id + '_' + region.id + '_' + line.id +
                        '.IMG-DEWARP',
                        page_id=input_file.pageId,
                        file_grp=self.output_file_grp)
                    # update PAGE (reference the image file):
                    alternative_image = line.get_AlternativeImage()
                    line.add_AlternativeImage(
                        AlternativeImageType(filename=file_path,
                                             comments=line_xywh['features'] +
                                             ',dewarped'))

            # update METS (add the PAGE file):
            file_path = os.path.join(self.output_file_grp, file_id + '.xml')
            pcgts.set_pcGtsId(file_id)
            out = self.workspace.add_file(ID=file_id,
                                          file_grp=self.output_file_grp,
                                          pageId=input_file.pageId,
                                          local_filename=file_path,
                                          mimetype=MIMETYPE_PAGE,
                                          content=to_xml(pcgts))
            self.logger.info('created file ID: %s, file_grp: %s, path: %s',
                             file_id, self.output_file_grp, out.local_filename)
Exemple #27
0
    def process(self):
        """Binarize pages, regions or text lines with Tesseract's thresholding.

        For every PAGE input file: deserialize it, retrieve the page image
        and descend to the level requested by the ``operation_level``
        parameter (``page``, ``region``, or any other value for text lines).

        On page level, pass the page image to Tesseract, fetch its
        thresholded image, store it as a new image file (file ID suffix
        ``.IMG-BIN``) in the output fileGrp and reference it as
        AlternativeImage of the page. On the lower levels, hand each segment
        image over to ``_process_segment``.

        Finally, serialise the resulting hierarchy as a new PAGE file in the
        output fileGrp.
        """
        LOG = getLogger('processor.TesserocrBinarize')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        level = self.parameter['operation_level']

        with PyTessBaseAPI(path=get_tessdata_path()) as tessapi:
            for index, input_file in enumerate(self.input_files):
                file_id = make_file_id(input_file, self.output_file_grp)
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", index, page_id)
                local_file = self.workspace.download_file(input_file)
                pcgts = page_from_file(local_file)
                self.add_metadata(pcgts)
                page = pcgts.get_Page()

                page_image, page_xywh, _ = self.workspace.image_from_page(
                    page, page_id)
                LOG.info("Binarizing on '%s' level in page '%s'", level,
                         page_id)

                if level != 'page':
                    # region or line level: walk text and table regions
                    regions = page.get_TextRegion() + page.get_TableRegion()
                    if not regions:
                        LOG.warning("Page '%s' contains no text regions",
                                    page_id)
                    for region in regions:
                        region_image, region_xywh = \
                            self.workspace.image_from_segment(
                                region, page_image, page_xywh)
                        if level == 'region':
                            tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
                            self._process_segment(
                                tessapi, RIL.BLOCK, region, region_image,
                                region_xywh, "region '%s'" % region.id,
                                input_file.pageId,
                                file_id + '_' + region.id)
                            continue
                        # line level: only text regions carry text lines
                        if not isinstance(region, TextRegionType):
                            continue
                        lines = region.get_TextLine()
                        if not lines:
                            LOG.warning(
                                "Page '%s' region '%s' contains no text lines",
                                page_id, region.id)
                        for line in lines:
                            line_image, line_xywh = \
                                self.workspace.image_from_segment(
                                    line, region_image, region_xywh)
                            tessapi.SetPageSegMode(PSM.SINGLE_LINE)
                            self._process_segment(
                                tessapi, RIL.TEXTLINE, line, line_image,
                                line_xywh, "line '%s'" % line.id,
                                input_file.pageId,
                                file_id + '_' + region.id + '_' + line.id)
                else:
                    # page level: threshold the whole page image at once
                    tessapi.SetPageSegMode(PSM.AUTO_ONLY)
                    tessapi.SetImage(page_image)
                    binarized = tessapi.GetThresholdedImage()
                    if not binarized:
                        LOG.error('Cannot binarize %s', "page '%s'" % page_id)
                    else:
                        # update METS (add the image file):
                        image_path = self.workspace.save_image_file(
                            binarized,
                            file_id + '.IMG-BIN',
                            page_id=page_id,
                            file_grp=self.output_file_grp)
                        # update PAGE (reference the image file):
                        page.add_AlternativeImage(
                            AlternativeImageType(
                                filename=image_path,
                                comments=page_xywh['features'] + ",binarized"))

                # update METS (add the PAGE file), deriving its ID afresh:
                file_id = make_file_id(input_file, self.output_file_grp)
                pcgts.set_pcGtsId(file_id)
                page_path = os.path.join(self.output_file_grp,
                                         file_id + '.xml')
                self.workspace.add_file(ID=file_id,
                                        file_grp=self.output_file_grp,
                                        pageId=input_file.pageId,
                                        mimetype=MIMETYPE_PAGE,
                                        local_filename=page_path,
                                        content=to_xml(pcgts))
    def process(self):
        """Extract page image and replace original with it.

        Open and deserialize PAGE input files and their respective images,
        then go to the page hierarchy level.

        Retrieve the image of the (cropped, deskewed, dewarped) page, preferring
        the last annotated form (which, depending on the workflow, could be
        binarized or raw). Add that image file to the workspace with the fileGrp
        USE given in the output fileGrp.
        Reference that file in the page (not as AlternativeImage but) as original
        image. Adjust all segment coordinates accordingly (only if the
        ``transform_coordinates`` parameter is set).

        Produce a new output file by serialising the resulting hierarchy.

        Parameters read from ``self.parameter``:
            feature_selector / feature_filter: passed on to
                ``workspace.image_from_page`` to choose the page image.
            transform_coordinates: whether to rewrite the coordinates of all
                regions, lines, words and glyphs relative to the new image.
        """
        LOG = getLogger('processor.ReplaceOriginal')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)
        feature_selector = self.parameter['feature_selector']
        feature_filter = self.parameter['feature_filter']
        adapt_coords = self.parameter['transform_coordinates']
        # coordinate-sensitive features that are already applied to the
        # image which becomes the new "original"
        applied_features = ("cropped", "deskewed", "rotated-90",
                            "rotated-180", "rotated-270")

        def _adapt_segment(segment, image, coords):
            """Rewrite the polygon of `segment` relative to `image`."""
            polygon = coordinates_of_segment(segment, image, coords)
            segment.get_Coords().set_points(points_from_polygon(polygon))
            ensure_valid(segment)

        # pylint: disable=attribute-defined-outside-init
        for n, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            # the image info (third return value) is not needed here
            page_image, page_coords, _ = self.workspace.image_from_page(
                page,
                page_id,
                feature_filter=feature_filter,
                feature_selector=feature_selector)
            # annotate extracted image
            file_id = make_file_id(input_file, self.output_file_grp)
            file_path = self.workspace.save_image_file(
                page_image,
                file_id + '-IMG',
                self.output_file_grp,
                page_id=input_file.pageId,
                mimetype='image/png')
            # replace original image
            page.set_imageFilename(file_path)
            page.set_imageWidth(page_image.width)
            page.set_imageHeight(page_image.height)
            # remove all coordinate-sensitive page-level annotations
            page.set_Border(None)  # also removes all derived images
            page.set_orientation(None)
            # also add image as derived image (in order to preserve image features)
            # (but exclude coordinate-sensitive features that have already been applied over the "original")
            features = ','.join(
                feature for feature in page_coords['features'].split(",")
                if feature not in applied_features)
            page.add_AlternativeImage(
                AlternativeImageType(filename=file_path, comments=features))
            # adjust all coordinates (depth-first: region, line, word, glyph)
            if adapt_coords:
                for region in page.get_AllRegions():
                    _adapt_segment(region, page_image, page_coords)
                    if not isinstance(region, TextRegionType):
                        continue
                    for line in region.get_TextLine():
                        _adapt_segment(line, page_image, page_coords)
                        for word in line.get_Word():
                            _adapt_segment(word, page_image, page_coords)
                            for glyph in word.get_Glyph():
                                _adapt_segment(glyph, page_image, page_coords)

            # update METS (add the PAGE file):
            out = self.workspace.add_file(ID=file_id,
                                          file_grp=self.output_file_grp,
                                          pageId=input_file.pageId,
                                          local_filename=os.path.join(
                                              self.output_file_grp,
                                              file_id + '.xml'),
                                          mimetype=MIMETYPE_PAGE,
                                          content=to_xml(pcgts))
            LOG.info('created file ID: %s, file_grp: %s, path: %s', file_id,
                     self.output_file_grp, out.local_filename)
Exemple #29
0
    def _process_segment(self, page_image, page, page_xywh, page_id,
                         input_file, n):
        """Deskew and binarize a page image and annotate the result.

        Estimate the skew angle (unless the ``maxskew`` parameter is 0) and
        rotate the image by it, rescale the gray levels between the ``lo``
        and ``hi`` percentiles, and threshold the result. The angle is set
        as orientation of the page; the thresholded image is saved in
        ``self.image_grp`` and referenced as AlternativeImage of the page.

        Arguments:
            page_image: image of the page (converted with
                ``ocrolib.pil2array``).
            page: PAGE element receiving the orientation and the
                AlternativeImage.
            page_xywh: dict whose ``'features'`` string gets ``,deskewed``
                appended (modified in place).
            page_id: page identifier passed on when saving the image file.
            input_file: input file whose ID is the basis of the new file ID.
            n: running number used for the file ID when it cannot be derived
                from the input file ID.
        """
        raw = ocrolib.pil2array(page_image)
        flat = raw.astype("float64")

        # border margins to ignore during estimation; computed only once,
        # because the rotation below uses reshape=0 and so keeps the shape
        d0, d1 = flat.shape
        o0, o1 = int(self.parameter['bignore'] * d0), int(
            self.parameter['bignore'] * d1)

        # estimate skew angle and rotate
        if self.parameter['maxskew'] > 0:
            if self.parameter['parallel'] < 2:
                LOG.info("Estimating Skew Angle")
            # invert for estimation and rotation (undone after rotating)
            flat = amax(flat) - flat
            flat -= amin(flat)
            est = flat[o0:d0 - o0, o1:d1 - o1]
            ma = self.parameter['maxskew']
            ms = int(2 * self.parameter['maxskew'] *
                     self.parameter['skewsteps'])
            angle = self.estimate_skew_angle(est, linspace(-ma, ma, ms + 1))
            if angle is None:
                # guard against an angle of None *before* it is used
                # for the rotation (and later for the page orientation)
                angle = 0
            flat = interpolation.rotate(flat,
                                        angle,
                                        mode='constant',
                                        reshape=0)
            flat = amax(flat) - flat
        else:
            angle = 0

        # estimate low and high thresholds
        if self.parameter['parallel'] < 2:
            LOG.info("Estimating Thresholds")
        est = flat[o0:d0 - o0, o1:d1 - o1]
        if self.parameter['escale'] > 0:
            # by default, we use only regions that contain
            # significant variance; this makes the percentile
            # based low and high estimates more reliable
            e = self.parameter['escale']
            v = est - filters.gaussian_filter(est, e * 20.0)
            v = filters.gaussian_filter(v**2, e * 20.0)**0.5
            v = (v > 0.3 * amax(v))
            v = morphology.binary_dilation(v, structure=ones((int(e * 50), 1)))
            v = morphology.binary_dilation(v, structure=ones((1, int(e * 50))))
            if self.parameter['debug'] > 0:
                imshow(v)
                ginput(1, self.parameter['debug'])
            est = est[v]
        lo = stats.scoreatpercentile(est.ravel(), self.parameter['lo'])
        hi = stats.scoreatpercentile(est.ravel(), self.parameter['hi'])

        # rescale the image to get the gray scale image
        if self.parameter['parallel'] < 2:
            LOG.info("Rescaling")
        flat -= lo
        # NOTE(review): hi == lo (e.g. a uniform image) makes this a
        # division by zero -- confirm whether that case needs handling
        flat /= (hi - lo)
        flat = clip(flat, 0, 1)
        if self.parameter['debug'] > 0:
            imshow(flat, vmin=0, vmax=1)
            ginput(1, self.parameter['debug'])
        deskewed = 1 * (flat > self.parameter['threshold'])

        # TODO: Need some clarification as the results affect the following
        # pre-processing steps: should the orientation rather be
        #   orientation = -angle
        #   orientation = 180 - ((180 - orientation) % 360)
        page.set_orientation(angle)

        page_xywh['features'] += ',deskewed'
        bin_array = array(255 * (deskewed > ocrolib.midrange(deskewed)), 'B')
        page_image = ocrolib.array2pil(bin_array)

        # derive the image file ID from the input file ID, falling back
        # to a numbered ID if the input fileGrp is not part of it
        file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.image_grp, n)
        file_path = self.workspace.save_image_file(page_image,
                                                   file_id,
                                                   page_id=page_id,
                                                   file_grp=self.image_grp)
        page.add_AlternativeImage(
            AlternativeImageType(filename=file_path,
                                 comments=page_xywh['features']))
Exemple #30
0
    def _process_segment(self, page_image, page, region_xywh, page_id,
                         input_file, n):
        """Segment a page image into text lines and annotate them.

        Convert the image into an inverted binary array, compute a line
        segmentation on it, and for every detected line (in reading order)
        save the cropped line image in ``self.image_grp`` and add a new
        TextLine (bounding box coordinates plus AlternativeImage) to the
        last text region of the page. If the page has no text region, one
        covering the full image is created first.

        Nothing is annotated (apart from a newly created region) if the
        scale is implausible or more than ``maxlines`` lines are found.

        Arguments:
            page_image: image of the page (converted with
                ``ocrolib.pil2array``).
            page: PAGE element whose text region receives the lines.
            region_xywh: dict whose ``'features'`` string gets ``,textline``
                appended (modified in place).
            page_id: page identifier, used for saving images and in
                warnings.
            input_file: input file whose ID is the basis of the new file IDs.
            n: running number used for the file ID when it cannot be derived
                from the input file ID.
        """
        binary = ocrolib.pil2array(page_image)
        binary = np.array(1 - binary / np.amax(binary), 'B')
        text_regions = page.get_TextRegion()
        if not text_regions:
            # array axes are (rows, columns), i.e. (y, x),
            # whereas PAGE points are written as "x,y"
            min_y, max_y = (0, binary.shape[0])
            min_x, max_x = (0, binary.shape[1])
            textregion = TextRegionType(
                Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" %
                                  (min_x, min_y, max_x, min_y, max_x, max_y,
                                   min_x, max_y)))
            page.add_TextRegion(textregion)
        else:
            textregion = text_regions[-1]
        # NOTE(review): looks like a leftover debug dump into the current
        # working directory -- confirm whether it can be removed
        ocrolib.write_image_binary("test.bin.png", binary)
        if self.parameter['scale'] == 0:
            scale = psegutils.estimate_scale(binary)
        else:
            scale = self.parameter['scale']
        if np.isnan(
                scale) or scale > 1000.0 or scale < self.parameter['minscale']:
            LOG.warning("%s: bad scale (%g); skipping\n", page_id, scale)
            return

        segmentation = self.compute_segmentation(binary, scale)
        if np.amax(segmentation) > self.parameter['maxlines']:
            LOG.warning("%s: too many lines %i", page_id,
                        np.amax(segmentation))
            return
        lines = psegutils.compute_lines(segmentation, scale)
        order = psegutils.reading_order([l.bounds for l in lines])
        lsort = psegutils.topsort(order)

        # renumber the labels so that they conform to the specs

        nlabels = np.amax(segmentation) + 1
        renumber = np.zeros(nlabels, 'i')
        for i, v in enumerate(lsort):
            renumber[lines[v].label] = 0x010000 + (i + 1)
        segmentation = renumber[segmentation]

        lines = [lines[i] for i in lsort]
        # NOTE(review): the denoised image is not used below -- confirm
        # whether the lines should be cropped from it instead of `binary`
        cleaned = ocrolib.remove_noise(binary, self.parameter['noise'])
        region_xywh['features'] += ",textline"

        # derive the base file ID once (it does not depend on the line),
        # falling back to a numbered ID if the input fileGrp is not part of it
        file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.image_grp, n)

        for i, l in enumerate(lines):
            # NOTE(review): debug dump, overwritten for every line
            ocrolib.write_image_binary("test.bin.png", binary[l.bounds[0],
                                                              l.bounds[1]])
            # l.bounds[0] slices the rows (y), l.bounds[1] the columns (x)
            min_y, max_y = (l.bounds[0].start, l.bounds[0].stop)
            min_x, max_x = (l.bounds[1].start, l.bounds[1].stop)

            img = binary[l.bounds[0], l.bounds[1]]
            img = np.array(255 * (img > ocrolib.midrange(img)), 'B')
            img = ocrolib.array2pil(img)

            file_path = self.workspace.save_image_file(img,
                                                       file_id + "_" + str(i),
                                                       page_id=page_id,
                                                       file_grp=self.image_grp)
            ai = AlternativeImageType(filename=file_path,
                                      comments=region_xywh['features'])
            line = TextLineType(
                Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" %
                                  (min_x, min_y, max_x, min_y, max_x, max_y,
                                   min_x, max_y)))
            line.add_AlternativeImage(ai)
            textregion.add_TextLine(line)