Example #1
0
 def sanitize_page(self, page, page_id):
     """Shrink each TextRegion of ``page`` to the outline of its TextLines.

     For every region, rasterize all line polygons into a binary mask,
     morphologically close that mask vertically (by the median line
     height), then take the simplified outer contour as the new region
     polygon — unless the mask decomposes into several significant
     parts, in which case the region is left unchanged.
     """
     regions = page.get_TextRegion()
     page_image, page_coords, _ = self.workspace.image_from_page(
         page, page_id)
     for region in regions:
         LOG.info('Sanitizing region "%s"', region.id)
         lines = region.get_TextLine()
         heights = []
         # get labels:
         region_mask = np.zeros((page_image.height, page_image.width), dtype=np.uint8)
         for line in lines:
             # line polygon in page-image coordinates (columns are x,y)
             line_polygon = coordinates_of_segment(line, page_image, page_coords)
             heights.append(xywh_from_polygon(line_polygon)['h'])
             # fill both interior and perimeter (draw.polygon alone can
             # miss boundary pixels for thin shapes):
             region_mask[draw.polygon(line_polygon[:, 1],
                                      line_polygon[:, 0],
                                      region_mask.shape)] = 1
             region_mask[draw.polygon_perimeter(line_polygon[:, 1],
                                                line_polygon[:, 0],
                                                region_mask.shape)] = 1
         # estimate scale (median line height across the region):
         scale = int(np.median(np.array(heights)))
         # close labels: vertical morphological closing (max then min
         # filter) to bridge inter-line gaps up to one line height
         region_mask = np.pad(region_mask, scale) # protect edges
         region_mask = filters.maximum_filter(region_mask, (scale, 1), origin=0)
         region_mask = filters.minimum_filter(region_mask, (scale, 1), origin=0)
         region_mask = region_mask[scale:-scale, scale:-scale] # unprotect
         # find outer contour (parts):
         contours, _ = cv2.findContours(region_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
         # determine areas of parts:
         areas = [cv2.contourArea(contour) for contour in contours]
         total_area = sum(areas)
         if not total_area:
             # ignore if too small
             LOG.warning('Zero contour area in region "%s"', region.id)
             continue
         # pick contour and convert to absolute:
         region_polygon = None
         for i, contour in enumerate(contours):
             area = areas[i]
             # ignore parts contributing less than 10% of the total mask
             if area / total_area < 0.1:
                 LOG.warning('Ignoring contour %d too small (%d/%d) in region "%s"',
                             i, area, total_area, region.id)
                 continue
             # simplify shape (2px tolerance):
             polygon = cv2.approxPolyDP(contour, 2, False)[:, 0, ::] # already ordered x,y
             if len(polygon) < 4:
                 LOG.warning('Ignoring contour %d less than 4 points in region "%s"',
                             i, region.id)
                 continue
             if region_polygon is not None:
                 # more than one significant part: cannot be represented
                 # as a single polygon, so leave the region untouched
                 LOG.error('Skipping region "%s" due to non-contiguous contours',
                           region.id)
                 region_polygon = None
                 break
             # convert back from page-image to absolute (PAGE) coordinates:
             region_polygon = coordinates_for_segment(polygon, page_image, page_coords)
         if region_polygon is not None:
             LOG.info('Using new coordinates for region "%s"', region.id)
             region.get_Coords().points = points_from_polygon(region_polygon)
Example #2
0
    def _set_Border(self, page, page_image, page_xywh, border_polygon):
        """Annotate ``border_polygon`` (image coordinates) as the page Border."""
        # translate image-relative coordinates into absolute page coordinates
        abs_polygon = coordinates_for_segment(
            border_polygon, page_image, page_xywh)
        points = points_from_polygon(abs_polygon)
        # annotate the result on the page
        page.set_Border(BorderType(Coords=CoordsType(points=points)))
 def _process_region(self, it, region, rogroup, region_image,
                     region_coords):
     """Annotate Tesseract-detected blocks as text cells of table ``region``.

     Walks the layout iterator ``it`` at BLOCK level, converting each
     block's bounding box to absolute coordinates and adding it as a new
     TextRegion child of ``region`` (typed from Tesseract's BlockType).
     Each cell is also referenced in the reading-order group ``rogroup``
     (if given), continuing after the highest index already present.
     Non-text blocks are skipped entirely.
     """
     LOG = getLogger('processor.TesserocrSegmentTable')
     # equivalent to GetComponentImages with raw_image=True,
     # (which would also give raw coordinates),
     # except we are also interested in the iterator's BlockType() here,
     index = 0
     if rogroup:
         # continue numbering after the highest existing index:
         for elem in (rogroup.get_RegionRefIndexed() +
                      rogroup.get_OrderedGroupIndexed() +
                      rogroup.get_UnorderedGroupIndexed()):
             if elem.index >= index:
                 index = elem.index + 1
     while it and not it.Empty(RIL.BLOCK):
         bbox = it.BoundingBox(RIL.BLOCK)
         # convert to absolute (page) coordinates:
         polygon = polygon_from_x0y0x1y1(bbox)
         polygon = coordinates_for_segment(polygon, region_image,
                                           region_coords)
         points = points_from_polygon(polygon)
         coords = CoordsType(points=points)
         # if xywh['w'] < 30 or xywh['h'] < 30:
         #     LOG.info('Ignoring too small region: %s', points)
         #     it.Next(RIL.BLOCK)
         #     continue
         #
         # add the region reference in the reading order element
         # (but ignore non-text regions entirely)
         ID = region.id + "_%04d" % index
         subregion = TextRegionType(id=ID,
                                    Coords=coords,
                                    type=TextTypeSimpleType.PARAGRAPH)
         block_type = it.BlockType()
         # map Tesseract block types onto PAGE region subtypes:
         if block_type == PT.FLOWING_TEXT:
             pass
         elif block_type == PT.HEADING_TEXT:
             subregion.set_type(TextTypeSimpleType.HEADING)
         elif block_type == PT.PULLOUT_TEXT:
             subregion.set_type(TextTypeSimpleType.FLOATING)
         elif block_type == PT.CAPTION_TEXT:
             subregion.set_type(TextTypeSimpleType.CAPTION)
         elif block_type == PT.VERTICAL_TEXT:
             subregion.set_orientation(90.0)
         else:
             # non-text block: skip without consuming an index
             it.Next(RIL.BLOCK)
             continue
         LOG.info("Detected cell '%s': %s (%s)", ID, points,
                  membername(PT, block_type))
         region.add_TextRegion(subregion)
         if rogroup:
             rogroup.add_RegionRefIndexed(
                 RegionRefIndexedType(regionRef=ID, index=index))
         #
         # iterator increment
         #
         index += 1
         it.Next(RIL.BLOCK)
    def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n):
        """Detect the printspace of ``page_image``, annotate it as the page
        Border, and save the cropped image as AlternativeImage.

        Uses ruler removal and text-area detection to find a crop
        rectangle; falls back to border-line detection when no plausible
        text area can be found.
        """
        img_array = ocrolib.pil2array(page_image)

        # Check if image is RGB or not #FIXME: check not needed anymore?
        if len(img_array.shape) == 2:
            img_array = np.stack((img_array,)*3, axis=-1)

        # global binarization (used by crop_area below):
        img_array_bin = np.array(
            img_array > ocrolib.midrange(img_array), 'i')

        lineDetectH = []
        lineDetectV = []
        img_array_rr = self.remove_rular(img_array)

        textarea, img_array_rr_ta, height, width = self.detect_textarea(
            img_array_rr)
        colSeparator = int(
            width * self.parameter['colSeparator'])
        if len(textarea) > 1:
            # multiple candidate areas: merge/filter via column separator
            textarea = self.crop_area(
                textarea, img_array_bin, img_array_rr_ta, colSeparator)

            if len(textarea) == 0:
                # nothing left: fall back to border-line detection
                min_x, min_y, max_x, max_y = self.select_borderLine(
                    img_array_rr, lineDetectH, lineDetectV)
            else:
                min_x, min_y, max_x, max_y = textarea[0]
        elif len(textarea) == 1 and (height*width*0.5 < (abs(textarea[0][2]-textarea[0][0]) * abs(textarea[0][3]-textarea[0][1]))):
            # single dominant area (covering more than half the page):
            # widen it slightly, clamped to the image bounds
            x1, y1, x2, y2 = textarea[0]
            x1 = x1-20 if x1 > 20 else 0
            x2 = x2+20 if x2 < width-20 else width
            y1 = y1-40 if y1 > 40 else 0
            y2 = y2+40 if y2 < height-40 else height

            # FIX: actually use the padded coordinates — previously the
            # unpadded textarea[0] was re-assigned here, which made the
            # padding computation above dead code:
            min_x, min_y, max_x, max_y = x1, y1, x2, y2
        else:
            min_x, min_y, max_x, max_y = self.select_borderLine(
                img_array_rr, lineDetectH, lineDetectV)

        # annotate the Border in absolute (page) coordinates:
        border_polygon = [[min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y]]
        border_polygon = coordinates_for_segment(border_polygon, page_image, page_xywh)
        border_points = points_from_polygon(border_polygon)
        # (keyword argument for consistency with the other annotation helpers)
        brd = BorderType(Coords=CoordsType(points=border_points))
        page.set_Border(brd)

        # crop the page image to the detected rectangle and save it:
        page_image = crop_image(page_image, box=(min_x, min_y, max_x, max_y))
        page_xywh['features'] += ',cropped'

        file_id = make_file_id(input_file, self.output_file_grp)

        file_path = self.workspace.save_image_file(page_image,
                                                   file_id + '-IMG',
                                                   page_id=page_id,
                                                   file_grp=self.output_file_grp)
        page.add_AlternativeImage(AlternativeImageType(
            filename=file_path, comments=page_xywh['features']))
Example #5
0
 def _process_words_in_line(self, result_it, line, line_xywh):
     """Decode all words of ``line`` from the Tesseract result iterator.

     Iterates ``result_it`` at WORD level until the line's final element,
     annotating for each word its coordinates (clipped to the parent
     line), font style attributes (if reported), and text with
     confidence; recurses to glyph level when ``textequiv_level``
     demands it.
     """
     LOG = getLogger('processor.TesserocrRecognize')
     if not result_it or result_it.Empty(RIL.WORD):
         LOG.warning("No text in line '%s'", line.id)
         return
     # iterate until IsAtFinalElement(RIL.LINE, RIL.WORD):
     word_no = 0
     while result_it and not result_it.Empty(RIL.WORD):
         word_id = '%s_word%04d' % (line.id, word_no)
         LOG.debug("Decoding text in word '%s'", word_id)
         bbox = result_it.BoundingBox(RIL.WORD)
         # convert to absolute coordinates:
         # (subtracting 'padding' — presumably compensating padding that
         # was added around the line image before recognition; confirm)
         polygon = coordinates_for_segment(polygon_from_x0y0x1y1(bbox),
                                           None, line_xywh) - self.parameter['padding']
         # clip to the parent line polygon, if possible:
         polygon2 = polygon_for_parent(polygon, line)
         if polygon2 is not None:
             polygon = polygon2
         points = points_from_polygon(polygon)
         word = WordType(id=word_id, Coords=CoordsType(points))
         if polygon2 is None:
             # could happen due to rotation
             LOG.info('Ignoring extant word: %s', points)
         else:
             line.add_Word(word)
         # todo: determine if font attributes available for word level will work with LSTM models
         word_attributes = result_it.WordFontAttributes()
         if word_attributes:
             word_style = TextStyleType(
                 fontSize=word_attributes['pointsize']
                 if 'pointsize' in word_attributes else None,
                 fontFamily=word_attributes['font_name']
                 if 'font_name' in word_attributes else None,
                 bold=word_attributes['bold']
                 if 'bold' in word_attributes else None,
                 italic=word_attributes['italic']
                 if 'italic' in word_attributes else None,
                 underlined=word_attributes['underlined']
                 if 'underlined' in word_attributes else None,
                 monospace=word_attributes['monospace']
                 if 'monospace' in word_attributes else None,
                 serif=word_attributes['serif']
                 if 'serif' in word_attributes else None)
             word.set_TextStyle(word_style) # (or somewhere in custom attribute?)
         # add word annotation unconditionally (i.e. even for glyph level):
         # (Tesseract confidences are percentages; normalize to [0,1])
         word.add_TextEquiv(TextEquivType(
             Unicode=result_it.GetUTF8Text(RIL.WORD),
             conf=result_it.Confidence(RIL.WORD)/100))
         if self.parameter['textequiv_level'] != 'word':
             self._process_glyphs_in_word(result_it, word, line_xywh)
         if result_it.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
             break
         else:
             word_no += 1
             result_it.Next(RIL.WORD)
Example #6
0
    def _add_TextRegion(self, page, page_image, page_xywh, page_id,
                        region_polygon, region_id):
        """Annotate ``region_polygon`` (image coordinates) as a TextRegion."""
        # translate image-relative coordinates into absolute page coordinates
        abs_polygon = coordinates_for_segment(
            region_polygon, page_image, page_xywh)
        coords = CoordsType(points=points_from_polygon(abs_polygon))
        # annotate the result on the page
        page.add_TextRegion(
            TextRegionType(id=page_id + region_id, Coords=coords))
Example #7
0
    def _add_TextLine(self, page_id, region, region_image, region_xywh,
                      region_id, line_polygon, line_id):
        """Annotate ``line_polygon`` (image coordinates) as a TextLine of ``region``."""
        # translate image-relative coordinates into absolute page coordinates
        abs_polygon = coordinates_for_segment(
            line_polygon, region_image, region_xywh)
        coords = CoordsType(points=points_from_polygon(abs_polygon))
        # annotate the result on the region
        region.add_TextLine(
            TextLineType(id=page_id + region_id + line_id, Coords=coords))
Example #8
0
def adapt_coords(segment, parent, transform):
    """Re-project ``segment`` coordinates via ``transform`` and clip to ``parent``.

    Returns the modified segment, or None if the clipped polygon vanishes.
    """
    polygon = polygon_from_points(segment.get_Coords().get_points())
    # back-transform to absolute coords (undoing e.g. deskewing of page coords)
    absolute = coordinates_for_segment(polygon, None, transform)
    # intersect with the parent polygon
    clipped = polygon_for_parent(absolute, parent)
    if clipped is None:
        return None
    segment.set_Coords(CoordsType(points=points_from_polygon(clipped)))
    return segment
        def add_region(region: RectSegment, index: int, region_type: str):
            """Annotate ``region`` as a Text/Image/Noise region on the page."""
            from ocrd_utils import coordinates_for_segment, points_from_polygon
            # image-relative polygon -> absolute page coordinates -> points
            abs_polygon = coordinates_for_segment(
                polygon_from_segment(region), page_image, page_coords)
            coords = CoordsType(points=points_from_polygon(abs_polygon))
            indexed_id = "region%04d" % index
            if region_type == "text":
                page.add_TextRegion(
                    TextRegionType(id=indexed_id, Coords=coords))
            elif region_type == "image":
                page.add_ImageRegion(
                    ImageRegionType(id=indexed_id, Coords=coords))
            else:
                # anything else counts as noise
                page.add_NoiseRegion(
                    NoiseRegionType(id=indexed_id, Coords=coords))
Example #10
0
 def _process_glyphs_in_word(self, result_it, word, word_xywh):
     """Decode all glyphs of ``word`` from the Tesseract result iterator.

     Iterates ``result_it`` at SYMBOL level until the word's final
     element, annotating for each glyph its coordinates (clipped to the
     parent word) and its top text alternatives with confidences.
     """
     LOG = getLogger('processor.TesserocrRecognize')
     if not result_it or result_it.Empty(RIL.SYMBOL):
         LOG.debug("No glyph in word '%s'", word.id)
         return
     # iterate until IsAtFinalElement(RIL.WORD, RIL.SYMBOL):
     glyph_no = 0
     while result_it and not result_it.Empty(RIL.SYMBOL):
         glyph_id = '%s_glyph%04d' % (word.id, glyph_no)
         LOG.debug("Decoding text in glyph '%s'", glyph_id)
         #  glyph_text = result_it.GetUTF8Text(RIL.SYMBOL) # equals first choice?
         glyph_conf = result_it.Confidence(RIL.SYMBOL)/100 # equals first choice?
         #LOG.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf)
         bbox = result_it.BoundingBox(RIL.SYMBOL)
         # convert to absolute coordinates:
         # (subtracting 'padding' — presumably compensating padding that
         # was added around the line image before recognition; confirm)
         polygon = coordinates_for_segment(polygon_from_x0y0x1y1(bbox),
                                           None, word_xywh) - self.parameter['padding']
         # clip to the parent word polygon, if possible:
         polygon2 = polygon_for_parent(polygon, word)
         if polygon2 is not None:
             polygon = polygon2
         points = points_from_polygon(polygon)
         glyph = GlyphType(id=glyph_id, Coords=CoordsType(points))
         if polygon2 is None:
             # could happen due to rotation
             LOG.info('Ignoring extant glyph: %s', points)
         else:
             word.add_Glyph(glyph)
         choice_it = result_it.GetChoiceIterator()
         for (choice_no, choice) in enumerate(choice_it):
             alternative_text = choice.GetUTF8Text()
             alternative_conf = choice.Confidence()/100
             #LOG.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf)
             # cut off alternatives much worse than the best choice, or
             # beyond the module-level maximum number of choices:
             if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF or
                 choice_no > CHOICE_THRESHOLD_NUM):
                 break
             # todo: consider SymbolIsSuperscript (TextStyle), SymbolIsDropcap (RelationType) etc
             glyph.add_TextEquiv(TextEquivType(index=choice_no, Unicode=alternative_text, conf=alternative_conf))
         if result_it.IsAtFinalElement(RIL.WORD, RIL.SYMBOL):
             break
         else:
             glyph_no += 1
             result_it.Next(RIL.SYMBOL)
 def process_page(self, page, page_image, page_xywh, bounds, file_id, page_id):
     """Set the identified page border, if valid.

     Given a bounding box ``bounds`` (left, top, right, bottom) in
     page-image coordinates, pad it by the configured ``padding``
     (clamped to the image), clip it to the page frame, annotate it as
     the Border, crop the page image accordingly, and reference the
     cropped image as AlternativeImage (saved under ``file_id``).
     """
     LOG = getLogger('processor.TesserocrCrop')
     left, top, right, bottom = bounds
     if left >= right or top >= bottom:
         LOG.error("Cannot find valid extent for page '%s'", page_id)
         return
     padding = self.parameter['padding']
     # add padding:
     left = max(left - padding, 0)
     right = min(right + padding, page_image.width)
     top = max(top - padding, 0)
     bottom = min(bottom + padding, page_image.height)
     LOG.info("Padded page border: %i:%i,%i:%i", left, right, top, bottom)
     polygon = polygon_from_bbox(left, top, right, bottom)
     polygon = coordinates_for_segment(polygon, page_image, page_xywh)
     # clip to the page frame (may fail, e.g. after rotation):
     polygon = polygon_for_parent(polygon, page)
     if polygon is None:
         LOG.error("Ignoring extant border")
         return
     border = BorderType(Coords=CoordsType(
         points_from_polygon(polygon)))
     # intersection with parent could have changed bbox,
     # so recalculate:
     bbox = bbox_from_polygon(coordinates_of_segment(border, page_image, page_xywh))
     # update PAGE (annotate border):
     page.set_Border(border)
     # update METS (add the image file):
     page_image = crop_image(page_image, box=bbox)
     page_xywh['features'] += ',cropped'
     file_path = self.workspace.save_image_file(
         page_image, file_id + '.IMG-CROP',
         page_id=page_id, file_grp=self.output_file_grp)
     # update PAGE (reference the image file):
     page.add_AlternativeImage(AlternativeImageType(
         filename=file_path, comments=page_xywh['features']))
Example #12
0
    def process_lines(self, textlines, maxlevel, region_image, region_coords):
        """Recognize text in all ``textlines`` with the loaded ocropy model.

        Replaces any existing annotation below line level, annotates
        TextEquiv (with confidence) on line level — and, if ``maxlevel``
        is 'word' or 'glyph', also on word/glyph level — and returns the
        accumulated (Levenshtein edit distance, GT length) over all
        lines for error-rate reporting.
        """
        edits = 0
        lengs = 0
        for line in textlines:
            line_image, line_coords = self.workspace.image_from_segment(
                line, region_image, region_coords)

            self.logger.info("Recognizing text in line '%s'", line.id)
            # keep any existing line text as ground truth for CER:
            if line.get_TextEquiv():
                linegt = line.TextEquiv[0].Unicode
            else:
                linegt = ''
            self.logger.debug("GT  '%s': '%s'", line.id, linegt)
            # remove existing annotation below line level:
            line.set_TextEquiv([])
            line.set_Word([])

            if line_image.size[1] < 16:
                self.logger.debug(
                    "ERROR: bounding box is too narrow at line %s", line.id)
                continue
            # resize image to 48 pixel height
            final_img, scale = resize_keep_ratio(line_image)

            # process ocropy:
            try:
                linepred, clist, rlist, confidlist = recognize(final_img,
                                                               self.pad,
                                                               self.network,
                                                               check=True)
            except Exception as err:
                self.logger.debug('error processing line "%s": %s', line.id,
                                  err)
                continue
            self.logger.debug("OCR '%s': '%s'", line.id, linepred)
            edits += Levenshtein.distance(linepred, linegt)
            lengs += len(linegt)

            words = [x.strip() for x in linepred.split(' ') if x.strip()]

            # split per-glyph results into words at the spaces of the
            # prediction (rlist presumably holds glyph x-offsets in the
            # scaled line image — TODO confirm against recognize()):
            word_r_list = [[0]]  # r-positions of every glyph in every word
            word_conf_list = [[]]  # confidences of every glyph in every word
            if words != []:
                w_no = 0
                found_char = False
                for i, c in enumerate(clist):
                    if c != ' ':
                        found_char = True
                        word_conf_list[w_no].append(confidlist[i])
                        word_r_list[w_no].append(rlist[i])

                    if c == ' ' and found_char:
                        if i == 0:
                            word_r_list[0][0] = rlist[i]

                        elif i + 1 <= len(clist) - 1 and clist[i + 1] != ' ':
                            # start a new word at the next non-space glyph
                            word_conf_list.append([])
                            word_r_list.append([rlist[i]])
                            w_no += 1
            else:
                # empty prediction: one zero-confidence pseudo-word
                word_conf_list = [[0]]
                word_r_list = [[0, line_image.width]]

            # conf for each word (mid-range of its glyph confidences)
            wordsconf = [(min(x) + max(x)) / 2 for x in word_conf_list]
            # conf for the line (mid-range of its word confidences)
            line_conf = (min(wordsconf) + max(wordsconf)) / 2
            # line text
            line.add_TextEquiv(TextEquivType(Unicode=linepred, conf=line_conf))

            if maxlevel in ['word', 'glyph']:
                for word_no, word_str in enumerate(words):
                    # word bbox from first/last glyph r-position, scaled
                    # back to the original line image height:
                    word_points = points_from_polygon(
                        coordinates_for_segment(
                            np.array(
                                polygon_from_bbox(
                                    word_r_list[word_no][0] / scale, 0,
                                    word_r_list[word_no][-1] / scale,
                                    0 + line_image.height)), line_image,
                            line_coords))
                    word_id = '%s_word%04d' % (line.id, word_no)
                    word = WordType(id=word_id, Coords=CoordsType(word_points))
                    line.add_Word(word)
                    word.add_TextEquiv(
                        TextEquivType(Unicode=word_str,
                                      conf=wordsconf[word_no]))

                    if maxlevel == 'glyph':
                        for glyph_no, glyph_str in enumerate(word_str):
                            # glyph bbox from consecutive r-positions:
                            glyph_points = points_from_polygon(
                                coordinates_for_segment(
                                    np.array(
                                        polygon_from_bbox(
                                            word_r_list[word_no][glyph_no] /
                                            scale, 0,
                                            word_r_list[word_no][glyph_no + 1]
                                            / scale, 0 + line_image.height)),
                                    line_image, line_coords))
                            glyph_id = '%s_glyph%04d' % (word.id, glyph_no)
                            glyph = GlyphType(id=glyph_id,
                                              Coords=CoordsType(glyph_points))
                            word.add_Glyph(glyph)
                            glyph.add_TextEquiv(
                                TextEquivType(
                                    Unicode=glyph_str,
                                    conf=word_conf_list[word_no][glyph_no]))
        return edits, lengs
Example #13
0
    def process(self):
        """Performs page cropping with Tesseract on the workspace.

        Open and deserialize PAGE input files and their respective images.
        Set up Tesseract to detect text blocks on each page, and find
        the largest coordinate extent spanning all of them. Use this
        extent in defining a Border, and add that to the page.

        Moreover, crop the original image accordingly, and reference the
        resulting image file as AlternativeImage in the Page element.

        Add the new image file to the workspace along with the output fileGrp,
        and using a file ID with suffix ``.IMG-CROP`` along with further
        identification of the input element.

        Produce new output files by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.TesserocrCrop')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        padding = self.parameter['padding']
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            # disable table detection here (tables count as text blocks),
            # because we do not want to risk confusing the spine with
            # a column separator and thus creeping into a neighbouring
            # page:
            tessapi.SetVariable("textord_tabfind_find_tables", "0")
            for (n, input_file) in enumerate(self.input_files):
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", n, page_id)
                # FIX: compute the output file ID up front — it is needed
                # for PAGE serialisation below even when no valid extent
                # can be found (it used to be assigned only inside the
                # success branch, raising NameError in the error branch):
                file_id = make_file_id(input_file, self.output_file_grp)
                pcgts = page_from_file(
                    self.workspace.download_file(input_file))
                self.add_metadata(pcgts)
                page = pcgts.get_Page()

                # warn of existing Border:
                border = page.get_Border()
                if border:
                    left, top, right, bottom = bbox_from_points(
                        border.get_Coords().points)
                    LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                                left, top, right, bottom)

                page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                    page,
                    page_id,
                    # image must not have been cropped already,
                    # abort if no such image can be produced:
                    feature_filter='cropped')
                # determine DPI: parameter override, then image meta-data,
                # then Tesseract's own estimate:
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info(
                        "Page '%s' images will use %d DPI from parameter override",
                        page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        dpi = round(dpi * 2.54)
                    LOG.info(
                        "Page '%s' images will use %d DPI from image meta-data",
                        page_id, dpi)
                else:
                    dpi = 0
                    LOG.info(
                        "Page '%s' images will use DPI estimated from segmentation",
                        page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))
                    # scale the size thresholds below relative to 300 DPI:
                    zoom = 300 / dpi
                else:
                    zoom = 1

                # warn of existing segmentation:
                regions = page.get_TextRegion()
                if regions:
                    min_x = page_image.width
                    min_y = page_image.height
                    max_x = 0
                    max_y = 0
                    for region in regions:
                        left, top, right, bottom = bbox_from_points(
                            region.get_Coords().points)
                        min_x = min(min_x, left)
                        min_y = min(min_y, top)
                        max_x = max(max_x, right)
                        max_y = max(max_y, bottom)
                    LOG.warning(
                        'Ignoring extent from existing TextRegions: %i:%i,%i:%i',
                        min_x, max_x, min_y, max_y)

                LOG.debug("Cropping with Tesseract")
                tessapi.SetImage(page_image)
                # PSM.SPARSE_TEXT: get as much text as possible in no particular order
                # PSM.AUTO (default): includes tables (dangerous)
                tessapi.SetPageSegMode(tesserocr.PSM.SPARSE_TEXT)
                #
                # helper variables for saving the box coordinates
                #
                min_x = page_image.width
                min_y = page_image.height
                max_x = 0
                max_y = 0
                # iterate over all text blocks and compare their
                # bbox extent to the running min and max values
                for component in tessapi.GetComponentImages(
                        tesserocr.RIL.BLOCK, True):
                    image, xywh, index, _ = component
                    #
                    # the region reference in the reading order element
                    #
                    ID = "region%04d" % index
                    left, top, right, bottom = bbox_from_xywh(xywh)
                    LOG.debug("Detected text region '%s': %i:%i,%i:%i", ID,
                              left, right, top, bottom)
                    # filter region results:
                    bin_bbox = image.getbbox()
                    if not bin_bbox:
                        # this does happen!
                        LOG.info(
                            "Ignoring region '%s' because its binarization is empty",
                            ID)
                        continue
                    width = bin_bbox[2] - bin_bbox[0]
                    if width < 25 / zoom:
                        # we must be conservative here: page numbers are tiny regions, too!
                        LOG.info(
                            "Ignoring region '%s' because its width is too small (%d)",
                            ID, width)
                        continue
                    height = bin_bbox[3] - bin_bbox[1]
                    if height < 25 / zoom:
                        # we must be conservative here: page numbers are tiny regions, too!
                        LOG.debug(
                            "Ignoring region '%s' because its height is too small (%d)",
                            ID, height)
                        continue
                    min_x = min(min_x, left)
                    min_y = min(min_y, top)
                    max_x = max(max_x, right)
                    max_y = max(max_y, bottom)
                    LOG.info("Updated page border: %i:%i,%i:%i", min_x, max_x,
                             min_y, max_y)

                #
                # set the identified page border
                #
                if min_x < max_x and min_y < max_y:
                    # add padding:
                    min_x = max(min_x - padding, 0)
                    max_x = min(max_x + padding, page_image.width)
                    min_y = max(min_y - padding, 0)
                    max_y = min(max_y + padding, page_image.height)
                    LOG.info("Padded page border: %i:%i,%i:%i", min_x, max_x,
                             min_y, max_y)
                    polygon = polygon_from_bbox(min_x, min_y, max_x, max_y)
                    polygon = coordinates_for_segment(polygon, page_image,
                                                      page_xywh)
                    polygon = polygon_for_parent(polygon, page)
                    if polygon is None:
                        # FIX: guard against an empty intersection with the
                        # page frame (as done in process_page elsewhere),
                        # instead of crashing in points_from_polygon:
                        LOG.error("Ignoring extant border for page '%s'",
                                  page_id)
                    else:
                        border = BorderType(
                            Coords=CoordsType(points_from_polygon(polygon)))
                        # intersection with parent could have changed bbox,
                        # so recalculate:
                        bbox = bbox_from_polygon(
                            coordinates_of_segment(border, page_image, page_xywh))
                        # update PAGE (annotate border):
                        page.set_Border(border)
                        # update METS (add the image file):
                        page_image = crop_image(page_image, box=bbox)
                        page_xywh['features'] += ',cropped'
                        file_path = self.workspace.save_image_file(
                            page_image,
                            file_id + '.IMG-CROP',
                            page_id=input_file.pageId,
                            file_grp=self.output_file_grp)
                        # update PAGE (reference the image file):
                        page.add_AlternativeImage(
                            AlternativeImageType(filename=file_path,
                                                 comments=page_xywh['features']))
                else:
                    LOG.error("Cannot find valid extent for page '%s'",
                              page_id)

                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(ID=file_id,
                                        file_grp=self.output_file_grp,
                                        pageId=input_file.pageId,
                                        mimetype=MIMETYPE_PAGE,
                                        local_filename=os.path.join(
                                            self.output_file_grp,
                                            file_id + '.xml'),
                                        content=to_xml(pcgts))
    def process(self):
        """Segment words within each text line of the workspace using Tesseract.

        Open and deserialize each PAGE input file together with its page
        image, walk the element hierarchy down to the textline level, and
        clear any Word elements already present (when ``overwrite_words``
        is set; otherwise keep them and merely warn).

        Configure Tesseract for single-line analysis, detect word
        components per line, and annotate each as a new Word element at
        the detected coordinates.

        Serialize the modified hierarchy into a new PAGE output file.
        """
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        overwrite_words = self.parameter['overwrite_words']

        with PyTessBaseAPI(psm=PSM.SINGLE_LINE,
                           path=TESSDATA_PREFIX) as tessapi:
            for n, input_file in enumerate(self.input_files):
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(self.workspace.download_file(input_file))
                page = pcgts.get_Page()

                # record this processing step (incl. runtime parameters)
                # in the PAGE metadata:
                param_labels = [LabelType(type_=key,
                                          value=self.parameter[key])
                                for key in self.parameter.keys()]
                metadata = pcgts.get_Metadata() # ensured by from_file()
                metadata.add_MetadataItem(MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[LabelsType(externalModel="ocrd-tool",
                                       externalId="parameters",
                                       Label=param_labels)]))

                page_image, page_coords, page_image_info = self.workspace.image_from_page(
                    page, page_id)
                # decide which DPI to report to Tesseract:
                # parameter override > image meta-data > let Tesseract estimate
                dpi = self.parameter['dpi']
                if dpi > 0:
                    LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        dpi = round(dpi * 2.54)
                    LOG.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi)
                else:
                    dpi = 0
                    LOG.info("Page '%s' images will use DPI estimated from segmentation", page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))

                for region in page.get_TextRegion():
                    region_image, region_coords = self.workspace.image_from_segment(
                        region, page_image, page_coords)
                    for line in region.get_TextLine():
                        if line.get_Word():
                            if not overwrite_words:
                                LOG.warning('keeping existing Words in line "%s"', line.id)
                            else:
                                LOG.info('removing existing Words in line "%s"', line.id)
                                line.set_Word([])
                        LOG.debug("Detecting words in line '%s'", line.id)
                        line_image, line_coords = self.workspace.image_from_segment(
                            line, region_image, region_coords)
                        tessapi.SetImage(line_image)
                        components = tessapi.GetComponentImages(
                            RIL.WORD, True, raw_image=True)
                        for word_no, component in enumerate(components):
                            # component[1] is the bounding box of the word
                            word_polygon = polygon_from_xywh(component[1])
                            word_polygon = coordinates_for_segment(
                                word_polygon, line_image, line_coords)
                            line.add_Word(WordType(
                                id='%s_word%04d' % (line.id, word_no),
                                Coords=CoordsType(
                                    points_from_polygon(word_polygon))))

                file_id = make_file_id(input_file, self.output_file_grp)
                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(
                    ID=file_id,
                    file_grp=self.output_file_grp,
                    pageId=input_file.pageId,
                    mimetype=MIMETYPE_PAGE,
                    local_filename=os.path.join(self.output_file_grp,
                                                file_id + '.xml'),
                    content=to_xml(pcgts))
    def _process_segment(self, page_image, page, textregion, region_xywh,
                         page_id, input_file, n):
        """Segment one text region into text lines with Ocropy.

        Binarize ``page_image``, estimate (or take the configured) scale,
        run line segmentation, and for each detected line in reading order
        add a TextLine (with coordinates and a cropped, re-binarized line
        image as AlternativeImage) to ``textregion``. Returns early when
        existing lines are kept, the scale is implausible, or too many
        lines are found.
        """
        LOG = getLogger('OcrdAnybaseocrTextline')
        # check for existing text lines and whether to overwrite them
        if textregion.get_TextLine():
            if self.parameter['overwrite']:
                LOG.info('removing existing TextLines in region "%s"', page_id)
                textregion.set_TextLine([])
            else:
                LOG.warning('keeping existing TextLines in region "%s"',
                            page_id)
                return

        binary = ocrolib.pil2array(page_image)

        if len(binary.shape) > 2:
            binary = np.mean(binary, 2)
        # invert and normalize to a 0/1 uint8 image (foreground = 1)
        binary = np.array(1 - binary / np.amax(binary), 'B')

        if self.parameter['scale'] == 0:
            scale = psegutils.estimate_scale(binary)
        else:
            scale = self.parameter['scale']

        if np.isnan(
                scale) or scale > 1000.0 or scale < self.parameter['minscale']:
            # FIX: lazy %-style logging instead of string concatenation,
            # and no trailing newline in the log message
            LOG.warning("%s: bad scale; skipping!", scale)
            return

        segmentation = self.compute_segmentation(binary, scale)
        # FIX: compute the label maximum once instead of three times
        maxlabel = np.amax(segmentation)
        if maxlabel > self.parameter['maxlines']:
            LOG.warning("too many lines %i; skipping!", maxlabel)
            return
        lines = psegutils.compute_lines(segmentation, scale)
        order = psegutils.reading_order([l.bounds for l in lines])
        lsort = psegutils.topsort(order)

        # renumber the labels so that they conform to the specs
        nlabels = maxlabel + 1
        renumber = np.zeros(nlabels, 'i')
        for i, v in enumerate(lsort):
            renumber[lines[v].label] = 0x010000 + (i + 1)
        segmentation = renumber[segmentation]

        lines = [lines[i] for i in lsort]
        cleaned = ocrolib.remove_noise(binary, self.parameter['noise'])
        # FIX: file_id is loop-invariant — compute it once, not per line
        file_id = make_file_id(input_file, self.output_file_grp)

        for i, l in enumerate(lines):
            # NOTE(review): l.bounds[0] is the row (y) slice in Ocropy
            # conventions, so these names look swapped and the polygon
            # points may effectively be (y, x) — confirm against
            # psegutils.compute_lines before relying on the names
            min_x, max_x = (l.bounds[0].start, l.bounds[0].stop)
            min_y, max_y = (l.bounds[1].start, l.bounds[1].stop)

            line_polygon = [[min_x, min_y], [max_x, min_y], [max_x, max_y],
                            [min_x, max_y]]
            line_polygon = coordinates_for_segment(line_polygon, page_image,
                                                   region_xywh)
            line_points = points_from_polygon(line_polygon)

            # crop the line from the denoised page, re-binarize and invert
            img = cleaned[l.bounds[0], l.bounds[1]]
            img = np.array(255 * (img > ocrolib.midrange(img)), 'B')
            img = 255 - img
            img = ocrolib.array2pil(img)

            file_path = self.workspace.save_image_file(
                img,
                file_id + "_" + str(n) + "_" + str(i),
                page_id=page_id,
                file_grp=self.output_file_grp)
            ai = AlternativeImageType(filename=file_path,
                                      comments=region_xywh['features'])
            line_id = '%s_line%04d' % (page_id, i)
            line = TextLineType(custom='readingOrder {index:' + str(i) + ';}',
                                id=line_id,
                                Coords=CoordsType(line_points))
            line.add_AlternativeImage(ai)
            textregion.add_TextLine(line)
Exemple #16
0
 def _process_segment(self, parent, parent_image, parent_coords, page_id,
                      zoom, lines, ignore):
     """Resegment the existing text ``lines`` of ``parent`` (page or region).

     Rasterize the current line polygons, then — depending on the
     ``method`` parameter — either spread connected-component/baseline
     seed labels into the foreground (non-'lineest' methods), or run a
     full Ocropy line segmentation and re-assign the resulting new line
     labels to the existing lines by fore-/background overlap ratios.
     Matching lines get their ``Coords`` updated in place; areas of
     ``ignore`` segments are masked out; overlapping neighbours are
     subtracted afterwards.
     """
     LOG = getLogger('processor.OcropyResegment')
     threshold = self.parameter['min_fraction']
     margin = self.parameter['extend_margins']
     method = self.parameter['method']
     # prepare line segmentation
     parent_array = pil2array(parent_image)
     # FIX: np.bool/np.int/np.float were removed in NumPy 1.24 —
     # use the builtin types as dtypes throughout
     parent_bin = np.array(parent_array <= midrange(parent_array), bool)
     ignore_bin = np.ones_like(parent_bin, bool)
     if isinstance(parent, PageType):
         tag = 'page'
         fullpage = True
         report = check_page(parent_bin, zoom)
     else:
         tag = 'region'
         fullpage = False
         report = check_region(parent_bin, zoom)
     if report:
         LOG.warning('Invalid %s "%s": %s', tag,
                     page_id if fullpage else parent.id, report)
         return
     # get existing line labels:
     line_labels = np.zeros_like(parent_bin, bool)
     line_labels = np.tile(line_labels[np.newaxis], (len(lines), 1, 1))
     line_polygons = []
     for i, segment in enumerate(lines):
         segment_polygon = coordinates_of_segment(segment, parent_image,
                                                  parent_coords)
         segment_polygon = make_valid(
             Polygon(segment_polygon)).buffer(margin)
         line_polygons.append(prep(segment_polygon))
         segment_polygon = np.array(segment_polygon.exterior, int)[:-1]
         # draw.polygon: If any segment_polygon lies outside of parent
         # (causing negative/above-max indices), either fully or partially,
         # then this will silently ignore them. The caller does not need
         # to concern herself with this.
         segment_y, segment_x = draw.polygon(segment_polygon[:, 1],
                                             segment_polygon[:, 0],
                                             parent_bin.shape)
         line_labels[i, segment_y, segment_x] = True
     # only text region(s) may contain new text lines
     # (FIX: dropped unused enumerate index)
     for segment in set(line.parent_object_ for line in lines):
         LOG.debug('unmasking area of text region "%s" for "%s"',
                   segment.id, page_id if fullpage else parent.id)
         segment_polygon = coordinates_of_segment(segment, parent_image,
                                                  parent_coords)
         segment_polygon = make_valid(
             Polygon(segment_polygon)).buffer(margin)
         segment_polygon = np.array(segment_polygon.exterior, int)[:-1]
         ignore_bin[draw.polygon(segment_polygon[:, 1],
                                 segment_polygon[:, 0],
                                 parent_bin.shape)] = False
     # mask/ignore overlapping neighbours
     for segment in ignore:
         LOG.debug('masking area of %s "%s" for "%s"',
                   type(segment).__name__[:-4], segment.id,
                   page_id if fullpage else parent.id)
         segment_polygon = coordinates_of_segment(segment, parent_image,
                                                  parent_coords)
         ignore_bin[draw.polygon(segment_polygon[:, 1],
                                 segment_polygon[:, 0],
                                 parent_bin.shape)] = True
     if method != 'lineest':
         LOG.debug(
             'calculating connected component and distance transforms for "%s"',
             parent.id)
         # FIX: renamed from `bin`, which shadowed the builtin
         text_bin = parent_bin & ~ignore_bin
         components, _ = morph.label(text_bin)
         # estimate glyph scale (roughly)
         _, counts = np.unique(components, return_counts=True)
         if counts.shape[0] > 1:
             counts = np.sqrt(3 * counts)
             # NOTE(review): if no component size falls inside the
             # (5/zoom, 100/zoom) window, the median of an empty array is
             # NaN and int() raises — confirm inputs cannot trigger this
             scale = int(
                 np.median(counts[(5 / zoom < counts)
                                  & (counts < 100 / zoom)]))
             # suppress tiny components (speckle) as seeds
             components *= (counts > 15 / zoom)[components]
             LOG.debug("estimated scale: %d", scale)
         else:
             scale = 43
         if method == 'ccomps':
             # slot 0 holds the ignore mask so it competes in argmax below
             labels = np.insert(line_labels, 0, ignore_bin, axis=0)
             distances = np.zeros_like(labels, np.uint8)
             for i, label in enumerate(labels):
                 distances[i] = morph.dist_labels(label.astype(np.uint8))
                 # normalize the distances of all lines so larger ones do not displace smaller ones
                 # FIX: guard against division by zero on empty labels
                 dist_max = distances[i].max()
                 if dist_max:
                     distances[i] = distances[i] / dist_max * 255
             # use depth to flatten overlapping lines as seed labels
             new_labels = np.argmax(distances, axis=0)
         else:
             new_labels = np.zeros_like(parent_bin, np.uint8)
             for i, line in enumerate(lines):
                 if line.Baseline is None:
                     LOG.warning("Skipping '%s' without baseline", line.id)
                     new_labels[line_labels[i]] = i + 1
                     continue
                 line_polygon = baseline_of_segment(line, parent_coords)
                 line_ltr = line_polygon[0, 0] < line_polygon[-1, 0]
                 line_polygon = make_valid(
                     join_polygons(
                         LineString(line_polygon).buffer(
                             # left-hand side if left-to-right, and vice versa
                             scale * (-1)**line_ltr,
                             single_sided=True),
                         loc=line.id))
                 line_polygon = np.array(line_polygon.exterior, int)[:-1]
                 line_y, line_x = draw.polygon(line_polygon[:, 1],
                                               line_polygon[:, 0],
                                               parent_bin.shape)
                 new_labels[line_y, line_x] = i + 1
         spread_dist(lines,
                     line_labels,
                     new_labels,
                     parent_bin,
                     components,
                     parent_coords,
                     scale=scale,
                     loc=parent.id,
                     threshold=threshold)
         return
     try:
         new_line_labels, _, _, _, _, scale = compute_segmentation(
             parent_bin,
             seps=ignore_bin,
             zoom=zoom,
             fullpage=fullpage,
             maxseps=0,
             maxcolseps=len(ignore),
             maximages=0)
     except Exception as err:
         LOG.warning('Cannot line-segment %s "%s": %s', tag,
                     page_id if fullpage else parent.id, err)
         return
     LOG.info("Found %d new line labels for %d existing lines on %s '%s'",
              new_line_labels.max(), len(lines), tag, parent.id)
     # polygonalize and prepare comparison
     new_line_polygons, new_line_labels = masks2polygons(
         new_line_labels,
         parent_bin,
         '%s "%s"' % (tag, parent.id),
         min_area=640 / zoom / zoom)
     new_line_polygons = [
         make_valid(Polygon(line_poly))
         for _label, line_poly in new_line_polygons
     ]
     # polygons for intersecting pairs
     intersections = dict()
     # ratio of overlap between intersection and new line (bg/fg area)
     fits_bg = np.zeros((len(new_line_polygons), len(line_polygons)),
                        float)
     fits_fg = np.zeros((len(new_line_polygons), len(line_polygons)),
                        float)
     # ratio of overlap between intersection and existing line (bg/fg area)
     covers_bg = np.zeros((len(new_line_polygons), len(line_polygons)),
                          float)
     covers_fg = np.zeros((len(new_line_polygons), len(line_polygons)),
                          float)
     # compare segmentations, calculating ratios of overlapping fore/background area
     for i, new_line_poly in enumerate(new_line_polygons):
         for j, line_poly in enumerate(line_polygons):
             # too strict: .contains
             if line_poly.intersects(new_line_poly):
                 inter = make_intersection(line_poly.context, new_line_poly)
                 if not inter:
                     continue
                 new_line_mask = (new_line_labels == i + 1) & parent_bin
                 line_mask = line_labels[j] & parent_bin
                 inter_mask = new_line_mask & line_mask
                 if (not np.count_nonzero(inter_mask)
                         or not np.count_nonzero(new_line_mask)
                         or not np.count_nonzero(line_mask)):
                     continue
                 intersections[(i, j)] = inter
                 fits_bg[i, j] = inter.area / new_line_poly.area
                 covers_bg[i, j] = inter.area / line_poly.context.area
                 fits_fg[i, j] = np.count_nonzero(
                     inter_mask) / np.count_nonzero(new_line_mask)
                 covers_fg[i, j] = np.count_nonzero(
                     inter_mask) / np.count_nonzero(line_mask)
     # assign new lines to existing lines, if possible
     # FIX: np.full instead of np.ones(..., np.int) * -1 (removed alias)
     assignments = np.full(len(new_line_polygons), -1, dtype=int)
     for i, new_line_poly in enumerate(new_line_polygons):
         if not fits_bg[i].any():
             LOG.debug("new line %d fits no existing line's background", i)
             continue
         if not fits_fg[i].any():
             LOG.debug("new line %d fits no existing line's foreground", i)
             continue
         fits = (fits_bg[i] > 0.6) & (fits_fg[i] > 0.9)
         if not fits.any():
             j = np.argmax(fits_bg[i] * fits_fg[i])
             LOG.debug(
                 "best fit '%s' for new line %d fits only %.1f%% bg / %.1f%% fg",
                 lines[j].id, i, fits_bg[i, j] * 100, fits_fg[i, j] * 100)
             continue
         covers = covers_bg[i] * covers_fg[i] * fits
         j = np.argmax(covers)
         line = lines[j]
         inter_polygon = intersections[(i, j)]
         new_line_polygon = new_line_polygons[i]
         new_center = inter_polygon.centroid
         center = new_line_polygon.centroid
         # FIXME: apply reasonable threshold for centroid distance
         LOG.debug("new line for '%s' has centroid distance %.2f", line.id,
                   center.distance(new_center))
         assignments[i] = j
     # validate assignments retain enough area and do not lose unassigned matches
     line_polygons = [
         poly.context.buffer(-margin) for poly in line_polygons
     ]
     for j, line in enumerate(lines):
         new_lines = np.nonzero(assignments == j)[0]
         if not new_lines.size:
             LOG.debug("no lines for '%s' match or fit", line.id)
             continue
         covers = np.sum(covers_bg[new_lines, j])
         if covers < threshold / 3:
             LOG.debug("new lines for '%s' only cover %.1f%% bg", line.id,
                       covers * 100)
             continue
         covers = np.sum(covers_fg[new_lines, j])
         if covers < threshold:
             LOG.debug("new lines for '%s' only cover %.1f%% fg", line.id,
                       covers * 100)
             continue
         looses = (assignments < 0) & (covers_bg[:, j] > 0.1)
         if looses.any():
             covers = np.sum(covers_bg[np.nonzero(looses)[0], j])
             # FIX: "loose" -> "lose" typo in the log message
             LOG.debug(
                 "new lines for '%s' would lose %d non-matching segments totalling %.1f%% bg",
                 line.id, np.count_nonzero(looses), covers * 100)
             continue
         line_count = np.count_nonzero(line_labels[j] & parent_bin)
         new_count = covers * line_count
         LOG.debug(
             'Black pixels before/after resegment of line "%s": %d/%d',
             line.id, line_count, new_count)
         # combine all assigned new lines to single outline polygon
         if len(new_lines) > 1:
             LOG.debug("joining %d new line polygons for '%s'",
                       len(new_lines), line.id)
         new_polygon = join_polygons(
             [intersections[(i, j)] for i in new_lines], loc=line.id)
         line_polygons[j] = new_polygon
         # convert back to absolute (page) coordinates:
         line_polygon = coordinates_for_segment(
             new_polygon.exterior.coords[:-1], parent_image, parent_coords)
         line_polygon = polygon_for_parent(line_polygon,
                                           line.parent_object_)
         if line_polygon is None:
             LOG.warning("Ignoring extant new polygon for line '%s'",
                         line.id)
             return
         # annotate result:
         line.get_Coords().set_points(points_from_polygon(line_polygon))
         # now also ensure the assigned lines do not overlap other existing lines
         for i in new_lines:
             for otherj in np.nonzero(fits_fg[i] > 0.1)[0]:
                 if j == otherj:
                     continue
                 otherline = lines[otherj]
                 LOG.debug("subtracting new '%s' from overlapping '%s'",
                           line.id, otherline.id)
                 other_polygon = diff_polygons(line_polygons[otherj],
                                               new_polygon)
                 if other_polygon.is_empty:
                     continue
                 # convert back to absolute (page) coordinates:
                 other_polygon = coordinates_for_segment(
                     other_polygon.exterior.coords[:-1], parent_image,
                     parent_coords)
                 other_polygon = polygon_for_parent(
                     other_polygon, otherline.parent_object_)
                 if other_polygon is None:
                     LOG.warning(
                         "Ignoring extant new polygon for line '%s'",
                         otherline.id)
                     continue
                 otherline.get_Coords().set_points(
                     points_from_polygon(other_polygon))
Exemple #17
0
    def _process_element(self,
                         element,
                         ignore,
                         image,
                         coords,
                         element_id,
                         file_id,
                         page_id,
                         zoom=1.0,
                         rogroup=None):
        """Add PAGE layout elements by segmenting an image.

        Given a PageType, TableRegionType or TextRegionType ``element``, and
        a corresponding binarized PIL.Image object ``image`` with coordinate
        metadata ``coords``, run line segmentation with Ocropy.
        
        If operating on the full page (or table), then also detect horizontal
        and vertical separators, and aggregate the lines into text regions
        afterwards.
        
        Add the resulting sub-segments to the parent ``element``.
        
        If ``ignore`` is not empty, then first suppress all foreground components
        in any of those segments' coordinates during segmentation, and if also
        in full page/table mode, then combine all separators among them with the
        newly detected separators to guide region segmentation.
        """
        LOG = getLogger('processor.OcropySegment')
        if not image.width or not image.height:
            LOG.warning("Skipping '%s' with zero size", element_id)
            return
        element_array = pil2array(image)
        element_bin = np.array(element_array <= midrange(element_array),
                               np.bool)
        sep_bin = np.zeros_like(element_bin, np.bool)
        ignore_labels = np.zeros_like(element_bin, np.int)
        for i, segment in enumerate(ignore):
            LOG.debug('masking foreground of %s "%s" for "%s"',
                      type(segment).__name__[:-4], segment.id, element_id)
            # mark these segments (e.g. separator regions, tables, images)
            # for workflows where they have been detected already;
            # these will be:
            # - ignored during text line segmentation (but not h/v-line detection)
            # - kept and reading-ordered during region segmentation (but not seps)
            segment_polygon = coordinates_of_segment(segment, image, coords)
            # If segment_polygon lies outside of element (causing
            # negative/above-max indices), either fully or partially,
            # then this will silently ignore them. The caller does
            # not need to concern herself with this.
            if isinstance(segment, SeparatorRegionType):
                sep_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:,
                                                                            0],
                                     sep_bin.shape)] = True
            ignore_labels[draw.polygon(
                segment_polygon[:, 1], segment_polygon[:, 0],
                ignore_labels.shape)] = i + 1  # mapped back for RO
        if isinstance(element, PageType):
            element_name = 'page'
            fullpage = True
            report = check_page(element_bin, zoom)
        elif isinstance(element, TableRegionType) or (
                # sole/congruent text region of a table region?
                element.id.endswith('_text')
                and isinstance(element.parent_object_, TableRegionType)):
            element_name = 'table'
            fullpage = True
            report = check_region(element_bin, zoom)
        else:
            element_name = 'region'
            fullpage = False
            report = check_region(element_bin, zoom)
        LOG.info('computing line segmentation for %s "%s"', element_name,
                 element_id)
        # TODO: we should downscale if DPI is large enough to save time
        try:
            if report:
                raise Exception(report)
            line_labels, hlines, vlines, images, colseps, scale = compute_segmentation(
                # suppress separators and ignored regions for textline estimation
                # but keep them for h/v-line detection (in fullpage mode):
                element_bin,
                seps=(sep_bin + ignore_labels) > 0,
                zoom=zoom,
                fullpage=fullpage,
                spread_dist=round(self.parameter['spread'] / zoom * 300 /
                                  72),  # in pt
                # these are ignored when not in fullpage mode:
                maxcolseps=self.parameter['maxcolseps'],
                maxseps=self.parameter['maxseps'],
                maximages=self.parameter['maximages']
                if element_name != 'table' else 0,
                csminheight=self.parameter['csminheight'],
                hlminwidth=self.parameter['hlminwidth'])
        except Exception as err:
            if isinstance(element, TextRegionType):
                LOG.error('Cannot line-segment region "%s": %s', element_id,
                          err)
                # as a fallback, add a single text line comprising the whole region:
                element.add_TextLine(
                    TextLineType(id=element_id + "_line",
                                 Coords=element.get_Coords()))
            else:
                LOG.error('Cannot line-segment %s "%s": %s', element_name,
                          element_id, err)
            return

        LOG.info('Found %d text lines for %s "%s"',
                 len(np.unique(line_labels)) - 1, element_name, element_id)
        # post-process line labels
        if isinstance(element, (PageType, TableRegionType)):
            # aggregate text lines to text regions
            try:
                # pass ignored regions as "line labels with initial assignment",
                # i.e. identical line and region labels
                # to detect their reading order among the others
                # (these cannot be split or grouped together with other regions)
                line_labels = np.where(line_labels, line_labels + len(ignore),
                                       ignore_labels)
                # suppress separators/images in fg and try to use for partitioning slices
                sepmask = np.maximum(np.maximum(hlines, vlines),
                                     np.maximum(sep_bin, images))
                region_labels = lines2regions(
                    element_bin,
                    line_labels,
                    rlabels=ignore_labels,
                    sepmask=np.maximum(sepmask, colseps),  # add bg
                    # decide horizontal vs vertical cut when gaps of similar size
                    prefer_vertical=not isinstance(element, TableRegionType),
                    gap_height=self.parameter['gap_height'],
                    gap_width=self.parameter['gap_width'],
                    scale=scale,
                    zoom=zoom)
                LOG.info('Found %d text regions for %s "%s"',
                         len(np.unique(region_labels)) - 1, element_name,
                         element_id)
            except Exception as err:
                LOG.error('Cannot region-segment %s "%s": %s', element_name,
                          element_id, err)
                region_labels = np.where(line_labels > len(ignore),
                                         1 + len(ignore), line_labels)

            # prepare reading order group index
            if rogroup:
                if isinstance(rogroup,
                              (OrderedGroupType, OrderedGroupIndexedType)):
                    index = 0
                    # start counting from largest existing index
                    for elem in (rogroup.get_RegionRefIndexed() +
                                 rogroup.get_OrderedGroupIndexed() +
                                 rogroup.get_UnorderedGroupIndexed()):
                        if elem.index >= index:
                            index = elem.index + 1
                else:
                    index = None
            # find contours around region labels (can be non-contiguous):
            region_no = 0
            for region_label in np.unique(region_labels):
                if not region_label:
                    continue  # no bg
                region_mask = region_labels == region_label
                region_line_labels = line_labels * region_mask
                region_line_labels0 = np.setdiff1d(region_line_labels, [0])
                if not np.all(region_line_labels0 > len(ignore)):
                    # existing region from `ignore` merely to be ordered
                    # (no new region, no actual text lines)
                    region_line_labels0 = np.intersect1d(
                        region_line_labels0, ignore_labels)
                    assert len(region_line_labels0) == 1, \
                        "region label %d has both existing regions and new lines (%s)" % (
                            region_label, str(region_line_labels0))
                    region = ignore[region_line_labels0[0] - 1]
                    if rogroup and region.parent_object_ == element and not isinstance(
                            region, SeparatorRegionType):
                        index = page_add_to_reading_order(
                            rogroup, region.id, index)
                    LOG.debug('Region label %d is for ignored region "%s"',
                              region_label, region.id)
                    continue
                # normal case: new lines inside new regions
                # remove binary-empty labels, and re-order locally
                order = morph.reading_order(region_line_labels)
                order[np.setdiff1d(region_line_labels0,
                                   element_bin * region_line_labels)] = 0
                region_line_labels = order[region_line_labels]
                # avoid horizontal gaps
                region_line_labels = hmerge_line_seeds(element_bin,
                                                       region_line_labels,
                                                       scale,
                                                       seps=np.maximum(
                                                           sepmask, colseps))
                region_mask |= region_line_labels > 0
                # find contours for region (can be non-contiguous)
                regions, _ = masks2polygons(
                    region_mask * region_label,
                    element_bin,
                    '%s "%s"' % (element_name, element_id),
                    min_area=6000 / zoom / zoom,
                    simplify=ignore_labels * ~(sep_bin))
                # find contours for lines (can be non-contiguous)
                lines, _ = masks2polygons(region_line_labels,
                                          element_bin,
                                          'region "%s"' % element_id,
                                          min_area=640 / zoom / zoom)
                # create new lines in new regions (allocating by intersection)
                line_polys = [Polygon(polygon) for _, polygon in lines]
                for _, region_polygon in regions:
                    region_poly = prep(Polygon(region_polygon))
                    # convert back to absolute (page) coordinates:
                    region_polygon = coordinates_for_segment(
                        region_polygon, image, coords)
                    region_polygon = polygon_for_parent(
                        region_polygon, element)
                    if region_polygon is None:
                        LOG.warning(
                            'Ignoring extant region contour for region label %d',
                            region_label)
                        continue
                    # annotate result:
                    region_no += 1
                    region_id = element_id + "_region%04d" % region_no
                    LOG.debug('Region label %d becomes ID "%s"', region_label,
                              region_id)
                    region = TextRegionType(
                        id=region_id,
                        Coords=CoordsType(
                            points=points_from_polygon(region_polygon)))
                    # find out which line (contours) belong to which region (contours)
                    line_no = 0
                    for i, line_poly in enumerate(line_polys):
                        if not region_poly.intersects(line_poly):  # .contains
                            continue
                        line_label, line_polygon = lines[i]
                        # convert back to absolute (page) coordinates:
                        line_polygon = coordinates_for_segment(
                            line_polygon, image, coords)
                        line_polygon = polygon_for_parent(line_polygon, region)
                        if line_polygon is None:
                            LOG.warning(
                                'Ignoring extant line contour for region label %d line label %d',
                                region_label, line_label)
                            continue
                        # annotate result:
                        line_no += 1
                        line_id = region_id + "_line%04d" % line_no
                        LOG.debug('Line label %d becomes ID "%s"', line_label,
                                  line_id)
                        line = TextLineType(
                            id=line_id,
                            Coords=CoordsType(
                                points=points_from_polygon(line_polygon)))
                        region.add_TextLine(line)
                    # if the region has received text lines, keep it
                    if region.get_TextLine():
                        element.add_TextRegion(region)
                        LOG.info('Added region "%s" with %d lines for %s "%s"',
                                 region_id, line_no, element_name, element_id)
                        if rogroup:
                            index = page_add_to_reading_order(
                                rogroup, region.id, index)
            # add additional image/non-text regions from compute_segmentation
            # (e.g. drop-capitals or images) ...
            image_labels, num_images = morph.label(images)
            LOG.info('Found %d large non-text/image regions for %s "%s"',
                     num_images, element_name, element_id)
            # find contours around region labels (can be non-contiguous):
            image_polygons, _ = masks2polygons(
                image_labels, element_bin,
                '%s "%s"' % (element_name, element_id))
            for image_label, polygon in image_polygons:
                # convert back to absolute (page) coordinates:
                region_polygon = coordinates_for_segment(
                    polygon, image, coords)
                region_polygon = polygon_for_parent(region_polygon, element)
                if region_polygon is None:
                    LOG.warning(
                        'Ignoring extant region contour for image label %d',
                        image_label)
                    continue
                region_no += 1
                # annotate result:
                region_id = element_id + "_image%04d" % region_no
                element.add_ImageRegion(
                    ImageRegionType(
                        id=region_id,
                        Coords=CoordsType(
                            points=points_from_polygon(region_polygon))))
            # split rulers into separator regions:
            hline_labels, num_hlines = morph.label(hlines)
            vline_labels, num_vlines = morph.label(vlines)
            LOG.info('Found %d/%d h/v-lines for %s "%s"', num_hlines,
                     num_vlines, element_name, element_id)
            # find contours around region labels (can be non-contiguous):
            hline_polygons, _ = masks2polygons(
                hline_labels, element_bin,
                '%s "%s"' % (element_name, element_id))
            vline_polygons, _ = masks2polygons(
                vline_labels, element_bin,
                '%s "%s"' % (element_name, element_id))
            for _, polygon in hline_polygons + vline_polygons:
                # convert back to absolute (page) coordinates:
                region_polygon = coordinates_for_segment(
                    polygon, image, coords)
                region_polygon = polygon_for_parent(region_polygon, element)
                if region_polygon is None:
                    LOG.warning('Ignoring extant region contour for separator')
                    continue
                # annotate result:
                region_no += 1
                region_id = element_id + "_sep%04d" % region_no
                element.add_SeparatorRegion(
                    SeparatorRegionType(
                        id=region_id,
                        Coords=CoordsType(
                            points=points_from_polygon(region_polygon))))
            # annotate a text/image-separated image
            element_array[sepmask] = np.amax(element_array)  # clip to white/bg
            image_clipped = array2pil(element_array)
            file_path = self.workspace.save_image_file(
                image_clipped,
                file_id + '.IMG-CLIP',
                page_id=page_id,
                file_grp=self.output_file_grp)
            element.add_AlternativeImage(
                AlternativeImageType(filename=file_path,
                                     comments=coords['features'] + ',clipped'))
        else:
            # get mask from region polygon:
            region_polygon = coordinates_of_segment(element, image, coords)
            region_mask = np.zeros_like(element_bin, np.bool)
            region_mask[draw.polygon(region_polygon[:, 1], region_polygon[:,
                                                                          0],
                                     region_mask.shape)] = True
            # ensure the new line labels do not extrude from the region:
            line_labels = line_labels * region_mask
            # find contours around labels (can be non-contiguous):
            line_polygons, _ = masks2polygons(line_labels,
                                              element_bin,
                                              'region "%s"' % element_id,
                                              min_area=640 / zoom / zoom)
            line_no = 0
            for line_label, polygon in line_polygons:
                # convert back to absolute (page) coordinates:
                line_polygon = coordinates_for_segment(polygon, image, coords)
                line_polygon = polygon_for_parent(line_polygon, element)
                if line_polygon is None:
                    LOG.warning(
                        'Ignoring extant line contour for line label %d',
                        line_label)
                    continue
                # annotate result:
                line_no += 1
                line_id = element_id + "_line%04d" % line_no
                element.add_TextLine(
                    TextLineType(
                        id=line_id,
                        Coords=CoordsType(
                            points=points_from_polygon(line_polygon))))
            if not sep_bin.any():
                return  # no derived image
            # annotate a text/image-separated image
            element_array[sep_bin] = np.amax(element_array)  # clip to white/bg
            image_clipped = array2pil(element_array)
            file_path = self.workspace.save_image_file(
                image_clipped,
                file_id + '.IMG-CLIP',
                page_id=page_id,
                file_grp=self.output_file_grp)
            # update PAGE (reference the image file):
            element.add_AlternativeImage(
                AlternativeImageType(filename=file_path,
                                     comments=coords['features'] + ',clipped'))
Exemple #18
0
    def _process_segment(self, page_image, page, page_xywh, page_id,
                         input_file, mask, dpi):
        """Segment one page image into regions with the Mask R-CNN model.

        Runs detection on ``page_image``, optionally refines the candidate
        bounding boxes with a precomputed text/non-text ``mask`` (from a
        prior tiseg step), extracts hull polygons from the instance masks,
        suppresses/merges overlapping candidates, derives a reading order
        from box coordinates, and annotates the surviving candidates as
        text/table/image/graphics regions on ``page``.
        """
        LOG = getLogger('processor.AnybaseocrBlockSegmenter')
        # check for existing text regions and whether to overwrite them
        if page.get_TextRegion() or page.get_TableRegion():
            if self.parameter['overwrite']:
                LOG.info('removing existing text/table regions in page "%s"',
                         page_id)
                page.set_TextRegion([])
                # also drop tables, as the log message above announces
                page.set_TableRegion([])
            else:
                LOG.warning('keeping existing text/table regions in page "%s"',
                            page_id)
        # check if border exists
        border_polygon = None
        if page.get_Border():
            border_coords = page.get_Border().get_Coords()
            border_points = polygon_from_points(border_coords.get_points())
            border_polygon = Polygon(border_points)

        LOG.info('detecting regions on page "%s"', page_id)
        img_array = ocrolib.pil2array(page_image)
        if len(img_array.shape) <= 2:
            # the model expects 3 channels; replicate grayscale
            img_array = np.stack((img_array, ) * 3, axis=-1)
        # convert active class names to an incidence matrix
        class_ids = np.array([[
            1 if category in self.parameter['active_classes'] else 0
            for category in CLASS_NAMES
        ]],
                             dtype=np.int32)
        results = self.mrcnn_model.detect([img_array],
                                          verbose=0,
                                          active_class_ids=class_ids)
        r = results[0]
        LOG.info('found %d candidates on page "%s"', len(r['rois']), page_id)

        th = self.parameter['th']
        # check for existing segmentation mask
        # this code executes only when the workflow had tiseg run before with use_deeplr=true
        if mask:
            mask = ocrolib.pil2array(mask)
            mask = mask // 255
            mask = 1 - mask
            # label each bounding box area with its candidate index + 2
            # (0 = background, 1 = foreground not covered by any box)
            for i in range(len(r['rois'])):
                min_y, min_x, max_y, max_x = r['rois'][i]
                mask[min_y:max_y, min_x:max_x] *= i + 2

            # check for left over pixels and add them to the bounding boxes
            pixel_added = True
            while pixel_added:
                pixel_added = False
                left_over = np.where(mask == 1)
                for y, x in zip(left_over[0], left_over[1]):
                    # th-sized neighbourhood around the stray pixel
                    # NOTE(review): y - th / x - th may go negative and wrap
                    # the slice around — presumably harmless near the page
                    # edge, but confirm
                    local_mask = mask[y - th:y + th, x - th:x + th]
                    candidates = np.where(local_mask > 1)
                    candidates = [k for k in zip(candidates[0], candidates[1])]
                    if len(candidates) > 0:
                        pixel_added = True
                        # find closest labelled pixel (local center is (th, th))
                        candidates.sort(key=lambda j: np.sqrt((j[0] - th)**2 +
                                                              (j[1] - th)**2))
                        index = local_mask[candidates[0]] - 2

                        # grow that candidate's bbox to include (y, x)
                        if y < r['rois'][index][0]:
                            r['rois'][index][0] = y
                        elif y > r['rois'][index][2]:
                            r['rois'][index][2] = y
                        if x < r['rois'][index][1]:
                            r['rois'][index][1] = x
                        elif x > r['rois'][index][3]:
                            r['rois'][index][3] = x

                        # update the mask
                        mask[y, x] = index + 2

        # sanity-check class ids against the configured class list
        for i in range(len(r['rois'])):
            class_id = r['class_ids'][i]
            if class_id >= len(CLASS_NAMES):
                raise Exception(
                    'Unexpected class id %d - model does not match' % class_id)

        # find hull contours on masks
        if self.parameter['use_masks']:
            r.setdefault('polygons', list())
            # estimate glyph scale (roughly)
            scale = int(dpi / 6)
            scale = scale + (scale + 1) % 2  # odd
            for i in range(len(r['rois'])):
                mask = r['masks'][:, :, i]
                mask = cv2.dilate(mask.astype(np.uint8),
                                  np.ones((scale, scale), np.uint8)) > 0
                # close mask until we have a single outer contour
                # (give up after 10 closings and take the first contour)
                contours = None
                for _ in range(10):
                    mask = cv2.morphologyEx(
                        mask.astype(np.uint8), cv2.MORPH_CLOSE,
                        np.ones((scale, scale), np.uint8)) > 0
                    contours, _ = cv2.findContours(mask.astype(np.uint8),
                                                   cv2.RETR_EXTERNAL,
                                                   cv2.CHAIN_APPROX_SIMPLE)
                    if len(contours) == 1:
                        break
                r['polygons'].append(Polygon(
                    contours[0][:, 0, :]))  # already in x,y order

        # to reduce overlaps, apply IoU-based non-maximum suppression
        # (and other post-processing against overlaps) across classes,
        # but not on the raw pixels, but the smoothed hull polygons
        LOG.info('post-processing detections on page "%s"', page_id)
        worse = []
        if self.parameter['post_process']:
            active = True

            def _merge_rois(i, j):
                """merges i into j"""
                nonlocal r, active
                r['rois'][j][0] = min(r['rois'][i][0], r['rois'][j][0])
                r['rois'][j][1] = min(r['rois'][i][1], r['rois'][j][1])
                r['rois'][j][2] = max(r['rois'][i][2], r['rois'][j][2])
                r['rois'][j][3] = max(r['rois'][i][3], r['rois'][j][3])
                r['polygons'][j] = r['polygons'][i].union(r['polygons'][j])
                #r['scores'][j] = max(r['scores'][i], r['scores'][i])
                active = True

            # find overlapping pairs; iterate until a fixed point is reached
            while active:
                active = False
                for i in range(len(r["class_ids"])):
                    if i in worse:
                        continue
                    for j in range(i + 1, len(r['class_ids'])):
                        if j in worse:
                            continue
                        iclass = r['class_ids'][i]
                        jclass = r['class_ids'][j]
                        iname = CLASS_NAMES[iclass]
                        jname = CLASS_NAMES[jclass]
                        if (iname == 'drop-capital') != (jname
                                                         == 'drop-capital'):
                            # ignore drop-capital overlapping with others
                            continue
                        # rs todo: lower priority for footnote?
                        if (r['rois'][i][1] > r['rois'][j][3]
                                or r['rois'][i][3] < r['rois'][j][1]
                                or r['rois'][i][0] > r['rois'][j][2]
                                or r['rois'][i][2] < r['rois'][j][0]):
                            # no overlap (cut)
                            continue
                        iscore = r['scores'][i]
                        jscore = r['scores'][j]
                        if not self.parameter['use_masks']:
                            # no polygons available: plain bbox NMS on score
                            LOG.debug(
                                "roi %d[%s] overlaps roi %d[%s] and %s (replacing)",
                                i, iname, j, jname,
                                "looses" if iscore < jscore else "wins")
                            if iscore < jscore:
                                worse.append(i)
                                break
                            else:
                                worse.append(j)
                                continue
                        # compare masks
                        ipoly = r['polygons'][i]
                        jpoly = r['polygons'][j]
                        isize = ipoly.area
                        jsize = jpoly.area
                        inter = ipoly.intersection(jpoly).area
                        union = ipoly.union(jpoly).area
                        # LOG.debug("%d/%d %dpx/%dpx shared %dpx overall %dpx",
                        #           i, j, isize, jsize, inter, union)
                        if inter / isize > self.parameter['min_share_drop']:
                            LOG.debug(
                                "roi %d[%s] contains roi %d[%s] (replacing)",
                                j, jname, i, iname)
                            worse.append(i)
                            break
                        elif inter / jsize > self.parameter['min_share_drop']:
                            LOG.debug(
                                "roi %d[%s] contains roi %d[%s] (replacing)",
                                i, iname, j, jname)
                            worse.append(j)
                        elif inter / union > self.parameter['min_iou_drop']:
                            LOG.debug(
                                "roi %d[%s] heavily overlaps roi %d[%s] and %s (replacing)",
                                i, iname, j, jname,
                                "looses" if iscore < jscore else "wins")
                            if iscore < jscore:
                                worse.append(i)
                                break
                            else:
                                worse.append(j)
                        elif inter / isize > self.parameter['min_share_merge']:
                            LOG.debug("roi %d[%s] covers roi %d[%s] (merging)",
                                      j, jname, i, iname)
                            worse.append(i)
                            _merge_rois(i, j)
                            break
                        elif inter / jsize > self.parameter['min_share_merge']:
                            LOG.debug("roi %d[%s] covers roi %d[%s] (merging)",
                                      i, iname, j, jname)
                            worse.append(j)
                            _merge_rois(j, i)
                        elif inter / union > self.parameter['min_iou_merge']:
                            LOG.debug(
                                "roi %d[%s] slightly overlaps roi %d[%s] and %s (merging)",
                                i, iname, j, jname,
                                "looses" if iscore < jscore else "wins")
                            if iscore < jscore:
                                worse.append(i)
                                _merge_rois(i, j)
                                break
                            else:
                                worse.append(j)
                                _merge_rois(j, i)

        # define reading order on basis of coordinates
        partial_order = np.zeros((len(r['rois']), len(r['rois'])), np.uint8)
        for i, (min_y_i, min_x_i, max_y_i, max_x_i) in enumerate(r['rois']):
            for j, (min_y_j, min_x_j, max_y_j,
                    max_x_j) in enumerate(r['rois']):
                if min_x_i < max_x_j and max_x_i > min_x_j:
                    # xoverlaps
                    if min_y_i < min_y_j:
                        partial_order[i, j] = 1
                else:
                    min_y = min(min_y_i, min_y_j)
                    max_y = max(max_y_i, max_y_j)
                    min_x = min(min_x_i, min_x_j)
                    max_x = max(max_x_i, max_x_j)
                    # i and j themselves always overlap their joint bbox, so
                    # they must be excluded from the "in between" test (the
                    # original test never skipped them, making this branch
                    # unreachable)
                    if next(
                        (False
                         for k, (min_y_k, min_x_k, max_y_k,
                                 max_x_k) in enumerate(r['rois'])
                         if k != i and k != j
                         and (min_y_k < max_y and max_y_k > min_y
                              and min_x_k < max_x and max_x_k > min_x)), True):
                        # no k in between
                        if ((min_y_j + max_y_j) / 2 < min_y_i
                                and (min_y_i + max_y_i) / 2 > max_y_j):
                            # vertically unrelated
                            partial_order[j, i] = 1
                        elif max_x_i < min_x_j:
                            partial_order[i, j] = 1

        def _topsort(po):
            """Topologically sort the partial-order matrix via DFS."""
            # builtin bool: the np.bool alias was removed in NumPy 1.24
            visited = np.zeros(po.shape[0], bool)
            result = list()

            def _visit(k):
                if visited[k]:
                    return
                visited[k] = True
                for l in np.nonzero(po[:, k])[0]:
                    _visit(l)
                result.append(k)

            for k in range(po.shape[0]):
                _visit(k)
            return result

        reading_order = _topsort(partial_order)

        # Creating Reading Order object in PageXML
        order_group = OrderedGroupType(caption="Regions reading order",
                                       id=page_id)
        reading_order_object = ReadingOrderType()
        reading_order_object.set_OrderedGroup(order_group)
        page.set_ReadingOrder(reading_order_object)

        for i in range(len(r['rois'])):
            # numpy shape is (rows, cols, channels) = (height, width, _);
            # the original unpacked these swapped, so the paragraph x-clamp
            # below was bounded by the page height instead of its width
            height, width, _ = img_array.shape
            min_y, min_x, max_y, max_x = r['rois'][i]
            score = r['scores'][i]
            class_id = r['class_ids'][i]
            class_name = CLASS_NAMES[class_id]
            if i in worse:
                LOG.debug(
                    "Ignoring instance %d[%s] overlapping better/larger neighbour",
                    i, class_name)
                continue

            if self.parameter['use_masks']:
                region_polygon = r['polygons'][i].exterior.coords[:-1]
            else:
                # widen paragraphs slightly (5px left, 10px right), clamped
                # to the image frame
                region_polygon = polygon_from_bbox(
                    max(min_x - 5, 0) if class_name == 'paragraph' else min_x,
                    min_y,
                    min(max_x +
                        10, width) if class_name == 'paragraph' else max_x,
                    max_y)

            # convert to absolute coordinates
            region_polygon = coordinates_for_segment(region_polygon,
                                                     page_image, page_xywh)
            # intersect with parent and plausibilize
            cut_region_polygon = Polygon(region_polygon)
            if border_polygon:
                cut_region_polygon = border_polygon.intersection(
                    cut_region_polygon)
            if cut_region_polygon.is_empty:
                LOG.warning('region %d does not intersect page frame', i)
                continue
            if not cut_region_polygon.is_valid:
                LOG.warning('region %d has invalid polygon', i)
                continue
            region_polygon = cut_region_polygon.exterior.coords[:-1]
            region_coords = CoordsType(points_from_polygon(region_polygon),
                                       conf=score)
            read_order = reading_order.index(i)
            region_args = {
                'custom': 'readingOrder {index:' + str(read_order) + ';}',
                'id': 'region%04d' % i,
                'Coords': region_coords
            }
            if class_name == 'image':
                image_region = ImageRegionType(**region_args)
                page.add_ImageRegion(image_region)
            elif class_name == 'table':
                table_region = TableRegionType(**region_args)
                page.add_TableRegion(table_region)
            elif class_name == 'graphics':
                graphic_region = GraphicRegionType(**region_args)
                page.add_GraphicRegion(graphic_region)
            else:
                region_args['type_'] = class_name
                textregion = TextRegionType(**region_args)
                page.add_TextRegion(textregion)
            regionRefIndex = RegionRefIndexedType(index=read_order,
                                                  regionRef=region_args['id'])
            order_group.add_RegionRefIndexed(regionRefIndex)
            LOG.info('added %s region on page "%s"', class_name, page_id)
Exemple #19
0
    def process(self):
        """Detect text lines with sbb_textline_detector and merge results.

        For each input file: saves the page image into a temporary
        directory, runs the external ``textline_detector`` on it, parses
        the PAGE-XML it produces, translates the detected border and text
        regions back into the coordinate system of the original page, and
        serializes the merged PAGE-XML to the output file group.
        """
        log = getLogger('processor.OcrdSbbTextlineDetectorRecognize')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            log.info("INPUT FILE %i / %s", n, input_file)

            file_id = make_file_id(input_file, self.output_file_grp)

            # ensure the output directory exists (including parents)
            os.makedirs(self.output_file_grp, exist_ok=True)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            page_image, page_coords, page_image_info = \
                self.workspace.image_from_page(
                        page, page_id,
                        feature_filter='cropped,binarized,grayscale_normalized'
                )

            with tempfile.TemporaryDirectory() as tmp_dirname:
                # Save the image; mkstemp returns an open OS-level file
                # descriptor which must be closed explicitly (the original
                # discarded it, leaking one fd per page)
                fd, image_file = tempfile.mkstemp(dir=tmp_dirname,
                                                  suffix='.png')
                os.close(fd)
                page_image.save(image_file)

                # Segment the image
                model = self.parameter['model']
                x = textline_detector(image_file, tmp_dirname, file_id, model)
                x.run()

                # Read segmentation results
                tmp_filename = os.path.join(tmp_dirname, file_id) + '.xml'
                tmp_pcgts = ocrd_models.ocrd_page.parse(tmp_filename,
                                                        silence=True)
                tmp_page = tmp_pcgts.get_Page()

            # Create a new PAGE file from the input file
            pcgts.set_pcGtsId(file_id)
            page = pcgts.get_Page()

            # Merge results → PAGE file

            # 1. Border
            if page.get_Border():
                log.warning("Page already contained a border")
            # We need to translate the coordinates:
            text_border = tmp_page.get_Border()
            coords = text_border.get_Coords().get_points()
            polygon = polygon_from_points(coords)
            polygon_new = coordinates_for_segment(polygon, page_image,
                                                  page_coords)
            points_new = points_from_polygon(polygon_new)
            coords_new = CoordsType(points=points_new)
            text_border.set_Coords(coords_new)
            page.set_Border(text_border)

            # 2. ReadingOrder
            if page.get_ReadingOrder():
                log.warning("Page already contained a reading order")
            page.set_ReadingOrder(tmp_page.get_ReadingOrder())

            # 3. TextRegion
            if page.get_TextRegion():
                log.warning("Page already contained text regions")
            # We need to translate the coordinates:
            text_regions_new = []
            for text_region in tmp_page.get_TextRegion():
                coords = text_region.get_Coords().get_points()
                polygon = polygon_from_points(coords)
                polygon_new = coordinates_for_segment(polygon, page_image,
                                                      page_coords)
                points_new = points_from_polygon(polygon_new)
                coords_new = CoordsType(points=points_new)
                text_region.set_Coords(coords_new)
                text_regions_new.append(text_region)
            page.set_TextRegion(text_regions_new)

            # Save metadata about this operation
            metadata = pcgts.get_Metadata()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))

            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=page_id,
                mimetype='application/vnd.prima.page+xml',
                local_filename=os.path.join(self.output_file_grp, file_id) +
                '.xml',
                content=ocrd_models.ocrd_page.to_xml(pcgts))
Example #20
0
    def process(self):
        """Perform word segmentation with Tesseract on the workspace.

        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the textline level,
        and remove any existing Word elements (unless ``overwrite_words``
        is False).

        Set up Tesseract to detect words, and add each one to the line
        at the detected coordinates.

        Produce a new output file by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.TesserocrSegmentWord')
        # exactly one input and one output fileGrp are expected
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        # whether pre-existing Word annotations get discarded or kept
        overwrite_words = self.parameter['overwrite_words']

        # PSM.SINGLE_LINE: each SetImage call below passes one textline image
        with PyTessBaseAPI(
            psm=PSM.SINGLE_LINE,
            path=TESSDATA_PREFIX
        ) as tessapi:
            for (n, input_file) in enumerate(self.input_files):
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(self.workspace.download_file(input_file))
                self.add_metadata(pcgts)
                page = pcgts.get_Page()
                
                page_image, page_coords, page_image_info = self.workspace.image_from_page(
                    page, page_id)
                # determine pixel density, in order of precedence:
                # parameter override > image meta-data > Tesseract's own estimate
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        # convert dots per centimetre to dots per inch
                        dpi = round(dpi * 2.54)
                    LOG.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi)
                else:
                    dpi = 0
                    LOG.info("Page '%s' images will use DPI estimated from segmentation", page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))
                
                for region in page.get_TextRegion():
                    region_image, region_coords = self.workspace.image_from_segment(
                        region, page_image, page_coords)
                    for line in region.get_TextLine():
                        if line.get_Word():
                            if overwrite_words:
                                LOG.info('removing existing Words in line "%s"', line.id)
                                line.set_Word([])
                            else:
                                # NOTE: detection still runs below, so newly
                                # detected Words get added next to the kept ones
                                LOG.warning('keeping existing Words in line "%s"', line.id)
                        LOG.debug("Detecting words in line '%s'", line.id)
                        line_image, line_coords = self.workspace.image_from_segment(
                            line, region_image, region_coords)
                        tessapi.SetImage(line_image)
                        for word_no, component in enumerate(tessapi.GetComponentImages(RIL.WORD, True, raw_image=True)):
                            word_id = '%s_word%04d' % (line.id, word_no)
                            # component[1] is the line-relative bounding box;
                            # convert it to absolute page coordinates
                            word_polygon = polygon_from_xywh(component[1])
                            word_polygon = coordinates_for_segment(word_polygon, line_image, line_coords)
                            # clip to the parent line polygon (may yield None)
                            word_polygon2 = polygon_for_parent(word_polygon, line)
                            if word_polygon2 is not None:
                                word_polygon = word_polygon2
                            word_points = points_from_polygon(word_polygon)
                            if word_polygon2 is None:
                                # could happen due to rotation
                                LOG.info('Ignoring extant word: %s', word_points)
                                continue
                            line.add_Word(WordType(
                                id=word_id, Coords=CoordsType(word_points)))
                            
                # serialise the enriched PAGE tree and register it in METS
                file_id = make_file_id(input_file, self.output_file_grp)
                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(
                    ID=file_id,
                    file_grp=self.output_file_grp,
                    pageId=input_file.pageId,
                    mimetype=MIMETYPE_PAGE,
                    local_filename=os.path.join(self.output_file_grp,
                                                file_id + '.xml'),
                    content=to_xml(pcgts))
Example #21
0
    def process(self):
        """Replace everything below the page level with another annotation.
        
        Open and deserialize PAGE input files from both input file groups,
        then go to the page hierarchy level.
        
        Replace all regions (and their reading order) from the page of
        the first input file group with all regions from the page of
        the second input file group. Keep page-level annotations unchanged
        (i.e. Border, orientation, type, AlternativeImage etc).
        
        If ``transform_coordinates`` is true, then also retrieve the
        coordinate transform of the (cropped, deskewed, dewarped) page
        from the first input fileGrp, and use it to adjust all segment
        coordinates from the second input fileGrp, accordingly.
        (This assumes both are consistent, i.e. the second input was derived
        from the first input via ``ocrd-segment-replace-original`` or similar.)
        
        Produce a new output file by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.ReplacePage')
        assert_file_grp_cardinality(self.input_file_grp, 2, 'original, page')
        assert_file_grp_cardinality(self.output_file_grp, 1)
        adapt_coords = self.parameter['transform_coordinates']

        # collect input file tuples
        ifts = self.zip_input_files()  # input file tuples
        # process input file tuples
        for n, ift in enumerate(ifts):
            input_file, page_file = ift
            if input_file is None or page_file is None:
                # incomplete tuple: no counterpart on this page
                continue
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            pcgts2 = page_from_file(self.workspace.download_file(page_file))
            page2 = pcgts2.get_Page()
            # adjust all coordinates (recursively)
            if adapt_coords:
                def _adapt(segment):
                    """Convert one segment's coords via the page transform, in place."""
                    coords = segment.get_Coords()
                    polygon = polygon_from_points(coords.points)
                    polygon = coordinates_for_segment(
                        polygon, None, page_coords)
                    coords.set_points(points_from_polygon(polygon))
                    # re-clip children into the new parent polygon
                    ensure_consistent(segment)
                try:
                    _, page_coords, _ = self.workspace.image_from_page(
                        page, page_id)
                    for region in page2.get_AllRegions():
                        _adapt(region)
                        if isinstance(region, TextRegionType):
                            # descend the full text hierarchy
                            for line in region.get_TextLine():
                                _adapt(line)
                                for word in line.get_Word():
                                    _adapt(word)
                                    for glyph in word.get_Glyph():
                                        _adapt(glyph)
                # was a bare `except:`, which would also swallow
                # KeyboardInterrupt/SystemExit
                except Exception:
                    LOG.error('invalid coordinates on page %s', page_id)
                    continue
            # replace all regions
            page.set_ReadingOrder(page2.get_ReadingOrder())
            page.set_TextRegion(page2.get_TextRegion())
            page.set_ImageRegion(page2.get_ImageRegion())
            page.set_LineDrawingRegion(page2.get_LineDrawingRegion())
            page.set_GraphicRegion(page2.get_GraphicRegion())
            page.set_TableRegion(page2.get_TableRegion())
            page.set_ChartRegion(page2.get_ChartRegion())
            page.set_MapRegion(page2.get_MapRegion())
            page.set_SeparatorRegion(page2.get_SeparatorRegion())
            page.set_MathsRegion(page2.get_MathsRegion())
            page.set_ChemRegion(page2.get_ChemRegion())
            page.set_MusicRegion(page2.get_MusicRegion())
            page.set_AdvertRegion(page2.get_AdvertRegion())
            page.set_NoiseRegion(page2.get_NoiseRegion())
            page.set_UnknownRegion(page2.get_UnknownRegion())
            page.set_CustomRegion(page2.get_CustomRegion())

            # update METS (add the PAGE file):
            file_id = make_file_id(page_file, self.output_file_grp)
            out = self.workspace.add_file(ID=file_id,
                                          file_grp=self.output_file_grp,
                                          pageId=input_file.pageId,
                                          local_filename=os.path.join(
                                              self.output_file_grp,
                                              file_id + '.xml'),
                                          mimetype=MIMETYPE_PAGE,
                                          content=to_xml(pcgts))
            LOG.info('created file ID: %s, file_grp: %s, path: %s', file_id,
                     self.output_file_grp, out.local_filename)
Example #22
0
 def sanitize_page(self, page, page_id):
     """Shrink each text region polygon to the hull of its textlines.

     For every text region on *page*, rasterize all its lines into a mask,
     morphologically close the mask across interline gaps, and replace the
     region's coordinates by the (simplified) outer contour — but only if
     that contour is contiguous (a single dominant part).
     """
     LOG = getLogger('processor.RepairSegmentation')
     regions = page.get_AllRegions(classes=['Text'])
     page_image, page_coords, _ = self.workspace.image_from_page(
         page, page_id)
     for region in regions:
         LOG.info('Sanitizing region "%s"', region.id)
         lines = region.get_TextLine()
         if not lines:
             # nothing to derive a hull from
             LOG.warning('Page "%s" region "%s" contains no textlines',
                         page_id, region.id)
             continue
         heights = []
         tops = []
         # get labels:
         # binary mask of all line pixels, in page-image coordinates
         region_mask = np.zeros((page_image.height, page_image.width),
                                dtype=np.uint8)
         for line in lines:
             line_polygon = coordinates_of_segment(line, page_image,
                                                   page_coords)
             line_xywh = xywh_from_polygon(line_polygon)
             heights.append(line_xywh['h'])
             tops.append(line_xywh['y'])
             # fill polygon interior and perimeter (perimeter separately,
             # since draw.polygon excludes some boundary pixels)
             region_mask[draw.polygon(line_polygon[:, 1], line_polygon[:,
                                                                       0],
                                      region_mask.shape)] = 1
             region_mask[draw.polygon_perimeter(line_polygon[:, 1],
                                                line_polygon[:, 0],
                                                region_mask.shape)] = 1
         # estimate scale:
         # closing distance = the largest line height seen in this region
         heights = np.array(heights)
         scale = int(np.max(heights))
         tops = np.array(tops)
         # sort lines vertically so interline gaps can be measured
         order = np.argsort(tops)
         heights = heights[order]
         tops = tops[order]
         if len(lines) > 1:
             # if interline spacing is larger than line height, use this
             bottoms = tops + heights
             deltas = tops[1:] - bottoms[:-1]
             scale = max(scale, int(np.max(deltas)))
         # close labels:
         region_mask = np.pad(region_mask, scale)  # protect edges
         # vertical closing merges the lines into one region blob
         region_mask = np.array(morphology.binary_closing(
             region_mask, np.ones((scale, 1))),
                                dtype=np.uint8)
         region_mask = region_mask[scale:-scale, scale:-scale]  # unprotect
         # extend margins (to ensure simplified hull polygon is outside children):
         region_mask = filters.maximum_filter(region_mask,
                                              3)  # 1px in each direction
         # find outer contour (parts):
         contours, _ = cv2.findContours(region_mask, cv2.RETR_EXTERNAL,
                                        cv2.CHAIN_APPROX_SIMPLE)
         # determine areas of parts:
         areas = [cv2.contourArea(contour) for contour in contours]
         total_area = sum(areas)
         if not total_area:
             # ignore if too small
             LOG.warning('Zero contour area in region "%s"', region.id)
             continue
         # pick contour and convert to absolute:
         region_polygon = None
         for i, contour in enumerate(contours):
             area = areas[i]
             # drop parts below 10% of the total mask area (noise specks)
             if area / total_area < 0.1:
                 LOG.warning(
                     'Ignoring contour %d too small (%d/%d) in region "%s"',
                     i, area, total_area, region.id)
                 continue
             # simplify shape (until valid):
             # can produce invalid (self-intersecting) polygons:
             #polygon = cv2.approxPolyDP(contour, 2, False)[:, 0, ::] # already ordered x,y
             polygon = contour[:, 0, ::]  # already ordered x,y
             polygon = Polygon(polygon).simplify(1)
             polygon = make_valid(polygon)
             polygon = polygon.exterior.coords[:-1]  # keep open
             if len(polygon) < 4:
                 LOG.warning(
                     'Ignoring contour %d less than 4 points in region "%s"',
                     i, region.id)
                 continue
             if region_polygon is not None:
                 # more than one significant part: cannot represent the
                 # region by a single polygon, so leave coords untouched
                 LOG.error(
                     'Skipping region "%s" due to non-contiguous contours',
                     region.id)
                 region_polygon = None
                 break
             region_polygon = coordinates_for_segment(
                 polygon, page_image, page_coords)
         if region_polygon is not None:
             LOG.info('Using new coordinates for region "%s"', region.id)
             region.get_Coords().set_points(
                 points_from_polygon(region_polygon))
Example #23
0
def spread_dist(lines,
                old_labels,
                new_labels,
                binarized,
                components,
                coords,
                scale=43,
                loc='',
                threshold=0.9):
    """Redefine line coordinates by contourizing the spread of connected
    components propagated from new labels.

    Arguments:
        lines: list of TextLine objects to (potentially) re-polygonize
        old_labels: per-line boolean masks of the current segmentation
        new_labels: label array of the candidate segmentation
            (label ``i+1`` belongs to ``lines[i]``)
        binarized: boolean mask of foreground pixels
        components: label array of foreground connected components
        coords: page coordinate transform (for ``coordinates_for_segment``)
        scale (int): estimated line height in pixels (controls spreading)
        loc (str): caller context label (currently unused here)
        threshold (float): minimum fraction of the old foreground a new
            label must cover in order to be accepted

    Modifies the ``Coords`` of each accepted line in place.
    """
    LOG = getLogger('processor.OcropyResegment')
    # allocate to connected components consistently (by majority,
    # ignoring smallest components like punctuation)
    #new_labels = morph.propagate_labels_majority(binarized, new_labels)
    new_labels = morph.propagate_labels_majority(components > 0, new_labels)
    # dilate/grow labels from connected components against each other and bg
    new_labels = morph.spread_labels(new_labels, maxdist=scale / 2)
    # find polygon hull and modify line coords
    for i, line in enumerate(lines):
        new_label = new_labels == i + 1
        old_label = old_labels[i]
        if np.equal(new_label, old_label).all():
            # segmentation unchanged for this line: keep its coords
            continue
        count = np.count_nonzero(old_label)
        if not count:
            LOG.warning("skipping zero-area line '%s'", line.id)
            continue
        # plausibility check 1: new label must cover enough of the old area
        covers = np.count_nonzero(new_label) / count
        if covers < threshold / 3:
            LOG.debug("new line for '%s' only covers %.1f%% bg", line.id,
                      covers * 100)
            continue
        count = np.count_nonzero(old_label * binarized)
        if not count:
            # fixed typo in log message ("binarizy-empty")
            LOG.warning("skipping binarization-empty line '%s'", line.id)
            continue
        # plausibility check 2: new label must cover enough old foreground
        covers = np.count_nonzero(new_label * binarized) / count
        if covers < threshold:
            LOG.debug("new line for '%s' only covers %.1f%% fg", line.id,
                      covers * 100)
            continue
        LOG.debug('Black pixels before/after resegment of line "%s": %d/%d',
                  line.id, count, covers * count)
        contours = [
            contour[:, ::-1]  # get x,y order again
            for contour, area in morph.find_contours(new_label)
        ]
        #LOG.debug("joining %d subsegments for %s", len(contours), line.id)
        if not contours:
            LOG.warning("no contours for %s - keeping", line.id)
            continue
        # get alpha shape joining all contour parts
        poly = join_polygons(
            [make_valid(Polygon(contour)) for contour in contours],
            loc=line.id)
        poly = poly.exterior.coords[:-1]  # keep polygon open
        polygon = coordinates_for_segment(poly, None, coords)
        # clip to the parent segment polygon (may yield None)
        polygon = polygon_for_parent(polygon, line.parent_object_)
        if polygon is None:
            LOG.warning("Ignoring extant line for %s", line.id)
            continue
        line.get_Coords().set_points(points_from_polygon(polygon))
Example #24
0
    def _process_page(self, page, page_image, page_xywh, input_file, zoom=1.0):
        """Detect the page frame on one page image and annotate it as Border.

        Optionally scale the image by *zoom* (targeting ~300 DPI), mask out
        any measuring rule placed next to the page, detect the page frame
        with a line segment detector, and — if the line detector is not
        preferred — fall back to merged text boxes. Pad the resulting
        polygon inwards by the ``padding`` parameter, set it as the page
        Border, and save the cropped page image as an AlternativeImage.
        """
        padding = self.parameter['padding']
        img_array = pil2array(page_image)
        # ensure RGB image
        if len(img_array.shape) == 2:
            img_array = np.stack((img_array, ) * 3, axis=-1)
        height, width, _ = img_array.shape
        size = height * width
        # zoom to 300 DPI (larger density: faster; most fixed parameters here expect 300)
        if zoom != 1.0:
            self.logger.info("scaling %dx%d image by %.2f", width, height,
                             zoom)
            img_array = cv2.resize(img_array,
                                   None,
                                   fx=zoom,
                                   fy=zoom,
                                   interpolation=cv2.INTER_CUBIC)

        # detect rule placed in image next to page for scale reference:
        mask_array, mask_box = self.detect_ruler(img_array)
        # detect page frame via line segment detector:
        border_polygon, prefer_border = self.select_borderLine(
            img_array, mask_box)
        border_polygon = np.array(border_polygon) / zoom  # unzoom
        # pad inwards:
        # negative buffer shrinks the polygon by `padding` pixels
        border_polygon = Polygon(border_polygon).buffer(
            -padding).exterior.coords[:-1]
        # get the bounding box from the border polygon:
        # min_x, min_y = border_polygon.min(axis=0)
        # max_x, max_y = border_polygon.max(axis=0)
        # get the inner rectangle from the border polygon:
        # _, min_x, max_x, _ = np.sort(border_polygon[:,0])
        # _, min_y, max_y, _ = np.sort(border_polygon[:,1])
        if prefer_border:
            self.logger.info("Preferring line detector")
        else:
            self.logger.info("Falling back to text detector")
            textboxes = self.detect_textboxes(img_array, mask_array)
            if len(textboxes) > 1:
                textboxes = self.merge_boxes(textboxes, img_array)
            textboxes = np.array(textboxes) / zoom  # unzoom

            # only replace the line-detector polygon if exactly one text
            # box remains and it is large enough (columnAreaMin fraction)
            if (len(textboxes) == 1 and self.parameter['columnAreaMin'] * size
                    < self.get_area(textboxes[0])):
                self.logger.info("Using text area (%d%% area)",
                                 100 * self.get_area(textboxes[0]) / size)
                min_x, min_y, max_x, max_y = textboxes[0]
                # pad outwards
                border_polygon = polygon_from_bbox(min_x - padding,
                                                   min_y - padding,
                                                   max_x + padding,
                                                   max_y + padding)

        def clip(point):
            # clamp a point to the page image bounds
            # NOTE(review): border_polygon is already converted to absolute
            # coordinates when clip is applied — assumes page_image dimensions
            # match that coordinate system; confirm for derived images
            x, y = point
            x = max(0, min(page_image.width, x))
            y = max(0, min(page_image.height, y))
            return x, y

        # convert to absolute (page) coordinates and annotate as Border
        border_polygon = coordinates_for_segment(border_polygon, page_image,
                                                 page_xywh)
        border_polygon = list(map(clip, border_polygon))
        border_points = points_from_polygon(border_polygon)
        border = BorderType(Coords=CoordsType(border_points))
        page.set_Border(border)
        # get clipped relative coordinates for current image
        page_image, page_xywh, _ = self.workspace.image_from_page(
            page, input_file.pageId, fill='background', transparency=True)
        # save the cropped derived image and reference it as AlternativeImage
        file_id = make_file_id(input_file, self.output_file_grp)
        file_path = self.workspace.save_image_file(
            page_image,
            file_id + '.IMG-CROP',
            page_id=input_file.pageId,
            file_grp=self.output_file_grp)
        page.add_AlternativeImage(
            AlternativeImageType(filename=file_path,
                                 comments=page_xywh['features']))
Example #25
0
    def process(self):
        """
        Perform text recognition with Calamari on the workspace.

        If ``texequiv_level`` is ``word`` or ``glyph``, then additionally create word / glyph level segments by
        splitting at white space characters / glyph boundaries. In the case of ``glyph``, add all alternative character
        hypotheses down to ``glyph_conf_cutoff`` confidence threshold.
        """
        log = getLogger('processor.CalamariRecognize')

        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            log.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))

            page = pcgts.get_Page()
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_selector=self.features)

            for region in page.get_AllRegions(classes=['Text']):
                region_image, region_coords = self.workspace.image_from_segment(
                    region,
                    page_image,
                    page_coords,
                    feature_selector=self.features)

                textlines = region.get_TextLine()
                log.info("About to recognize %i lines of region '%s'",
                         len(textlines), region.id)
                line_images_np = []
                line_coordss = []
                for line in textlines:
                    log.debug("Recognizing line '%s' in region '%s'", line.id,
                              region.id)

                    line_image, line_coords = self.workspace.image_from_segment(
                        line,
                        region_image,
                        region_coords,
                        feature_selector=self.features)
                    if ('binarized' not in line_coords['features']
                            and 'grayscale_normalized'
                            not in line_coords['features']
                            and self.network_input_channels == 1):
                        # We cannot use a feature selector for this since we don't
                        # know whether the model expects (has been trained on)
                        # binarized or grayscale images; but raw images are likely
                        # always inadequate:
                        log.warning(
                            "Using raw image for line '%s' in region '%s'",
                            line.id, region.id)

                    line_image = line_image if all(line_image.size) else [[0]]
                    line_image_np = np.array(line_image, dtype=np.uint8)
                    line_images_np.append(line_image_np)
                    line_coordss.append(line_coords)
                raw_results_all = self.predictor.predict_raw(
                    line_images_np, progress_bar=False)

                for line, line_coords, raw_results in zip(
                        textlines, line_coordss, raw_results_all):

                    for i, p in enumerate(raw_results):
                        p.prediction.id = "fold_{}".format(i)

                    prediction = self.voter.vote_prediction_result(raw_results)
                    prediction.id = "voted"

                    # Build line text on our own
                    #
                    # Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
                    # on prediction.positions. Do it on our own to have consistency.
                    #
                    # XXX Check Calamari's built-in post-processing on prediction.sentence

                    def _sort_chars(p):
                        """Filter and sort chars of prediction p"""
                        chars = p.chars
                        chars = [
                            c for c in chars if c.char
                        ]  # XXX Note that omission probabilities are not normalized?!
                        chars = [
                            c for c in chars if c.probability >=
                            self.parameter['glyph_conf_cutoff']
                        ]
                        chars = sorted(chars,
                                       key=lambda k: k.probability,
                                       reverse=True)
                        return chars

                    def _drop_leading_spaces(positions):
                        return list(
                            itertools.dropwhile(
                                lambda p: _sort_chars(p)[0].char == " ",
                                positions))

                    def _drop_trailing_spaces(positions):
                        return list(
                            reversed(_drop_leading_spaces(
                                reversed(positions))))

                    def _drop_double_spaces(positions):
                        def _drop_double_spaces_generator(positions):
                            last_was_space = False
                            for p in positions:
                                if p.chars[0].char == " ":
                                    if not last_was_space:
                                        yield p
                                    last_was_space = True
                                else:
                                    yield p
                                    last_was_space = False

                        return list(_drop_double_spaces_generator(positions))

                    positions = prediction.positions
                    positions = _drop_leading_spaces(positions)
                    positions = _drop_trailing_spaces(positions)
                    positions = _drop_double_spaces(positions)
                    positions = list(positions)

                    line_text = ''.join(
                        _sort_chars(p)[0].char for p in positions)
                    if line_text != prediction.sentence:
                        log.warning(
                            "Our own line text is not the same as Calamari's: '%s' != '%s'",
                            line_text, prediction.sentence)

                    # Delete existing results
                    if line.get_TextEquiv():
                        log.warning("Line '%s' already contained text results",
                                    line.id)
                    line.set_TextEquiv([])
                    if line.get_Word():
                        log.warning(
                            "Line '%s' already contained word segmentation",
                            line.id)
                    line.set_Word([])

                    # Save line results
                    line_conf = prediction.avg_char_probability
                    line.set_TextEquiv(
                        [TextEquivType(Unicode=line_text, conf=line_conf)])

                    # Save word results
                    #
                    # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
                    # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
                    # hierarchy of lines > words > glyphs.

                    def _words(s):
                        """Split words based on spaces and include spaces as 'words'"""
                        spaces = None
                        word = ''
                        for c in s:
                            if c == ' ' and spaces is True:
                                word += c
                            elif c != ' ' and spaces is False:
                                word += c
                            else:
                                if word:
                                    yield word
                                word = c
                                spaces = (c == ' ')
                        yield word

                    if self.parameter['textequiv_level'] in ['word', 'glyph']:
                        word_no = 0
                        i = 0

                        for word_text in _words(line_text):
                            word_length = len(word_text)
                            if not all(c == ' ' for c in word_text):
                                word_positions = positions[i:i + word_length]
                                word_start = word_positions[0].global_start
                                word_end = word_positions[-1].global_end

                                polygon = polygon_from_x0y0x1y1([
                                    word_start, 0, word_end, line_image.height
                                ])
                                points = points_from_polygon(
                                    coordinates_for_segment(
                                        polygon, None, line_coords))
                                # XXX Crop to line polygon?

                                word = WordType(id='%s_word%04d' %
                                                (line.id, word_no),
                                                Coords=CoordsType(points))
                                word.add_TextEquiv(
                                    TextEquivType(Unicode=word_text))

                                if self.parameter[
                                        'textequiv_level'] == 'glyph':
                                    for glyph_no, p in enumerate(
                                            word_positions):
                                        glyph_start = p.global_start
                                        glyph_end = p.global_end

                                        polygon = polygon_from_x0y0x1y1([
                                            glyph_start, 0, glyph_end,
                                            line_image.height
                                        ])
                                        points = points_from_polygon(
                                            coordinates_for_segment(
                                                polygon, None, line_coords))

                                        glyph = GlyphType(
                                            id='%s_glyph%04d' %
                                            (word.id, glyph_no),
                                            Coords=CoordsType(points))

                                        # Add predictions (= TextEquivs)
                                        char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                        for char_index, char in enumerate(
                                                _sort_chars(p),
                                                start=char_index_start):
                                            glyph.add_TextEquiv(
                                                TextEquivType(
                                                    Unicode=char.char,
                                                    index=char_index,
                                                    conf=char.probability))

                                        word.add_Glyph(glyph)

                                line.add_Word(word)
                                word_no += 1

                            i += word_length

            _page_update_higher_textequiv_levels('line', pcgts)

            # Add metadata about this operation and its runtime parameters:
            self.add_metadata(pcgts)
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
    def process(self):
        """Performs (text) line segmentation with Tesseract on the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the (text) region level
        (both top-level text regions and those nested in table regions),
        and remove any existing TextLine elements if ``overwrite_lines`` is
        True (otherwise keep them — newly detected lines are still appended).
        
        Set up Tesseract to detect lines, clip each detected line outline to
        its parent region polygon (intersecting where necessary, e.g. due to
        rotation), and add each one to the region at the detected coordinates.
        
        Produce a new output file by serialising the resulting hierarchy.
        """
        # exactly one input and one output fileGrp are supported
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        overwrite_lines = self.parameter['overwrite_lines']
        
        # SINGLE_BLOCK: treat each region image as one uniform block of text
        with PyTessBaseAPI(
                psm=PSM.SINGLE_BLOCK,
                path=TESSDATA_PREFIX
        ) as tessapi:
            for (n, input_file) in enumerate(self.input_files):
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(self.workspace.download_file(input_file))
                page = pcgts.get_Page()
                
                # add metadata about this operation and its runtime parameters:
                metadata = pcgts.get_Metadata() # ensured by from_file()
                metadata.add_MetadataItem(
                    MetadataItemType(type_="processingStep",
                                     name=self.ocrd_tool['steps'][0],
                                     value=TOOL,
                                     Labels=[LabelsType(
                                         externalModel="ocrd-tool",
                                         externalId="parameters",
                                         Label=[LabelType(type_=name,
                                                          value=self.parameter[name])
                                                for name in self.parameter.keys()])]))
                
                page_image, page_coords, page_image_info = self.workspace.image_from_page(
                    page, page_id)
                # determine pixel density for Tesseract, by precedence:
                # parameter override > image meta-data > let Tesseract estimate
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        # convert dots per centimetre to dots per inch
                        dpi = round(dpi * 2.54)
                    LOG.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi)
                else:
                    dpi = 0
                    LOG.info("Page '%s' images will use DPI estimated from segmentation", page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))
                
                # iterate over top-level text regions and text regions inside tables
                for region in itertools.chain.from_iterable(
                        [page.get_TextRegion()] +
                        [subregion.get_TextRegion() for subregion in page.get_TableRegion()]):
                    if region.get_TextLine():
                        if overwrite_lines:
                            LOG.info('removing existing TextLines in region "%s"', region.id)
                            region.set_TextLine([])
                        else:
                            # NOTE(review): newly detected lines below are still
                            # appended in addition to the existing ones here
                            LOG.warning('keeping existing TextLines in region "%s"', region.id)
                    LOG.debug("Detecting lines in region '%s'", region.id)
                    region_image, region_coords = self.workspace.image_from_segment(
                        region, page_image, page_coords)
                    region_polygon = coordinates_of_segment(region, region_image, region_coords)
                    region_poly = Polygon(region_polygon)
                    tessapi.SetImage(region_image)
                    for line_no, component in enumerate(tessapi.GetComponentImages(RIL.TEXTLINE, True, raw_image=True)):
                        line_id = '%s_line%04d' % (region.id, line_no)
                        # component[1] is the detected line's bounding box (x, y, w, h)
                        line_polygon = polygon_from_xywh(component[1])
                        line_poly = Polygon(line_polygon)
                        if not line_poly.within(region_poly):
                            # this could happen due to rotation
                            interline = line_poly.intersection(region_poly)
                            if interline.is_empty:
                                continue # ignore this line
                            if hasattr(interline, 'geoms'):
                                # is (heterogeneous) GeometryCollection
                                # keep only the largest sub-geometry
                                area = 0
                                for geom in interline.geoms:
                                    if geom.area > area:
                                        area = geom.area
                                        interline = geom
                                if not area:
                                    continue
                            # use a simple (convex) outline of the clipped line
                            line_poly = interline.convex_hull
                            line_polygon = line_poly.exterior.coords
                        # convert back to absolute (page) coordinates:
                        line_polygon = coordinates_for_segment(line_polygon, region_image, region_coords)
                        line_points = points_from_polygon(line_polygon)
                        region.add_TextLine(TextLineType(
                            id=line_id, Coords=CoordsType(line_points)))
                
                file_id = make_file_id(input_file, self.output_file_grp)
                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(
                    ID=file_id,
                    file_grp=self.output_file_grp,
                    pageId=input_file.pageId,
                    mimetype=MIMETYPE_PAGE,
                    local_filename=os.path.join(self.output_file_grp,
                                                file_id + '.xml'),
                    content=to_xml(pcgts))
Exemple #27
0
    def process(self):
        """
        Performs the recognition.

        Open and deserialize each PAGE input file and its image, iterate
        over all text lines of all text regions, run the (voting) Calamari
        predictor on each line image, and annotate the result as TextEquiv
        on the line. Depending on ``textequiv_level``, word and glyph
        segments (with per-glyph alternative TextEquivs) are inferred from
        the glyph positions as well.

        Produce a new output file by serialising the resulting hierarchy.
        """

        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        # sets up self.predictor and self.voter
        self._init_calamari()

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            log.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))

            page = pcgts.get_Page()
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id)

            for region in pcgts.get_Page().get_TextRegion():
                region_image, region_xywh = self.workspace.image_from_segment(
                    region, page_image, page_xywh)

                textlines = region.get_TextLine()
                log.info("About to recognize %i lines of region '%s'",
                         len(textlines), region.id)
                for (line_no, line) in enumerate(textlines):
                    log.debug("Recognizing line '%s' in region '%s'", line.id,
                              region.id)

                    line_image, line_coords = self.workspace.image_from_segment(
                        line, region_image, region_xywh)
                    line_image_np = np.array(line_image, dtype=np.uint8)

                    # predict with each model fold, then vote on the results
                    raw_results = list(
                        self.predictor.predict_raw([line_image_np],
                                                   progress_bar=False))[0]
                    for i, p in enumerate(raw_results):
                        p.prediction.id = "fold_{}".format(i)

                    prediction = self.voter.vote_prediction_result(raw_results)
                    prediction.id = "voted"

                    # Build line text on our own
                    #
                    # Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
                    # on prediction.positions. Do it on our own to have consistency.
                    #
                    # XXX Check Calamari's built-in post-processing on prediction.sentence

                    def _sort_chars(p):
                        """Filter and sort chars of prediction p.

                        Drop empty chars and chars below the confidence
                        cutoff, then sort descending by probability (so
                        index 0 is the most likely character).
                        """
                        chars = p.chars
                        chars = [
                            c for c in chars if c.char
                        ]  # XXX Note that omission probabilities are not normalized?!
                        chars = [
                            c for c in chars if c.probability >=
                            self.parameter['glyph_conf_cutoff']
                        ]
                        chars = sorted(chars,
                                       key=lambda k: k.probability,
                                       reverse=True)
                        return chars

                    def _drop_leading_spaces(positions):
                        # drop positions whose most likely char is a space
                        # from the start of the sequence
                        return list(
                            itertools.dropwhile(
                                lambda p: _sort_chars(p)[0].char == " ",
                                positions))

                    def _drop_trailing_spaces(positions):
                        # same as above, from the end of the sequence
                        return list(
                            reversed(_drop_leading_spaces(
                                reversed(positions))))

                    def _drop_double_spaces(positions):
                        # collapse runs of consecutive spaces to one
                        def _drop_double_spaces_generator(positions):
                            last_was_space = False
                            for p in positions:
                                # NOTE(review): checks raw p.chars[0], not
                                # _sort_chars(p)[0] as elsewhere — confirm intended
                                if p.chars[0].char == " ":
                                    if not last_was_space:
                                        yield p
                                    last_was_space = True
                                else:
                                    yield p
                                    last_was_space = False

                        return list(_drop_double_spaces_generator(positions))

                    # whitespace normalization pipeline over glyph positions
                    positions = prediction.positions
                    positions = _drop_leading_spaces(positions)
                    positions = _drop_trailing_spaces(positions)
                    positions = _drop_double_spaces(positions)
                    positions = list(positions)

                    line_text = ''.join(
                        _sort_chars(p)[0].char for p in positions)
                    if line_text != prediction.sentence:
                        log.warning(
                            "Our own line text is not the same as Calamari's: '%s' != '%s'",
                            line_text, prediction.sentence)

                    # Delete existing results
                    if line.get_TextEquiv():
                        log.warning("Line '%s' already contained text results",
                                    line.id)
                    line.set_TextEquiv([])
                    if line.get_Word():
                        log.warning(
                            "Line '%s' already contained word segmentation",
                            line.id)
                    line.set_Word([])

                    # Save line results
                    line_conf = prediction.avg_char_probability
                    line.set_TextEquiv(
                        [TextEquivType(Unicode=line_text, conf=line_conf)])

                    # Save word results
                    #
                    # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
                    # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
                    # hierarchy of lines > words > glyphs.

                    def _words(s):
                        """Split words based on spaces and include spaces as 'words'"""
                        spaces = None
                        word = ''
                        for c in s:
                            if c == ' ' and spaces is True:
                                word += c
                            elif c != ' ' and spaces is False:
                                word += c
                            else:
                                # character class changed: flush current run
                                if word:
                                    yield word
                                word = c
                                spaces = (c == ' ')
                        yield word

                    if self.parameter['textequiv_level'] in ['word', 'glyph']:
                        word_no = 0
                        i = 0  # running offset into positions

                        for word_text in _words(line_text):
                            word_length = len(word_text)
                            # skip all-space "words" (they only advance the offset)
                            if not all(c == ' ' for c in word_text):
                                word_positions = positions[i:i + word_length]
                                word_start = word_positions[0].global_start
                                word_end = word_positions[-1].global_end

                                # word box spans the full line height between
                                # the first and last glyph x-offsets
                                polygon = polygon_from_x0y0x1y1([
                                    word_start, 0, word_end, line_image.height
                                ])
                                points = points_from_polygon(
                                    coordinates_for_segment(
                                        polygon, None, line_coords))
                                # XXX Crop to line polygon?

                                word = WordType(id='%s_word%04d' %
                                                (line.id, word_no),
                                                Coords=CoordsType(points))
                                word.add_TextEquiv(
                                    TextEquivType(Unicode=word_text))

                                if self.parameter[
                                        'textequiv_level'] == 'glyph':
                                    for glyph_no, p in enumerate(
                                            word_positions):
                                        glyph_start = p.global_start
                                        glyph_end = p.global_end

                                        polygon = polygon_from_x0y0x1y1([
                                            glyph_start, 0, glyph_end,
                                            line_image.height
                                        ])
                                        points = points_from_polygon(
                                            coordinates_for_segment(
                                                polygon, None, line_coords))

                                        glyph = GlyphType(
                                            id='%s_glyph%04d' %
                                            (word.id, glyph_no),
                                            Coords=CoordsType(points))

                                        # Add predictions (= TextEquivs)
                                        char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                        for char_index, char in enumerate(
                                                _sort_chars(p),
                                                start=char_index_start):
                                            glyph.add_TextEquiv(
                                                TextEquivType(
                                                    Unicode=char.char,
                                                    index=char_index,
                                                    conf=char.probability))

                                        word.add_Glyph(glyph)

                                line.add_Word(word)
                                word_no += 1

                            i += word_length

            # propagate line texts up to region/page level
            _page_update_higher_textequiv_levels('line', pcgts)

            # Add metadata about this operation and its runtime parameters:
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
Exemple #28
0
 def _process_page(self, it, page, page_image, page_coords, page_id):
     """Detect and classify layout regions on one page via the Tesseract iterator.

     Walk ``it`` (a Tesseract page/result iterator) at the BLOCK level.
     For each block, take its outline (``BlockPolygon`` if
     ``crop_polygons``, else the padded bounding box), convert it to
     absolute page coordinates, clip it to the parent page, and add a
     region element of the PAGE type matching the block's
     classification (Text/Image/Separator/Maths/Table/Noise) to
     ``page``. Text-type regions are also appended to the page's
     ReadingOrder (continuing any existing OrderedGroup indexing);
     Separator and Noise regions are excluded from it again.
     """
     LOG = getLogger('processor.TesserocrSegmentRegion')
     # equivalent to GetComponentImages with raw_image=True,
     # (which would also give raw coordinates),
     # except we are also interested in the iterator's BlockType() here,
     # and its BlockPolygon()
     index = 0
     ro = page.get_ReadingOrder()
     if not ro:
         ro = ReadingOrderType()
         page.set_ReadingOrder(ro)
     og = ro.get_OrderedGroup()
     if og:
         # start counting from largest existing index
         for elem in (og.get_RegionRefIndexed() +
                      og.get_OrderedGroupIndexed() +
                      og.get_UnorderedGroupIndexed()):
             if elem.index >= index:
                 index = elem.index + 1
     else:
         # new top-level group
         og = OrderedGroupType(id="reading-order")
         ro.set_OrderedGroup(og)
     while it and not it.Empty(RIL.BLOCK):
         # (padding will be passed to both BoundingBox and GetImage)
         # (actually, Tesseract honours padding only on the left and bottom,
         #  whereas right and top are increased less!)
         bbox = it.BoundingBox(RIL.BLOCK, padding=self.parameter['padding'])
         # sometimes these polygons are not planar, which causes
         # PIL.ImageDraw.Draw.polygon (and likely others as well)
         # to misbehave; however, PAGE coordinate semantics prohibit
         # multi-path polygons!
         # (probably a bug in Tesseract itself, cf. tesseract#2826):
         if self.parameter['crop_polygons']:
             polygon = it.BlockPolygon()
         else:
             polygon = polygon_from_x0y0x1y1(bbox)
         # convert to absolute (page) coordinates and clip to the page:
         polygon = coordinates_for_segment(polygon, page_image, page_coords)
         polygon2 = polygon_for_parent(polygon, page)
         if polygon2 is not None:
             polygon = polygon2
         points = points_from_polygon(polygon)
         coords = CoordsType(points=points)
         if polygon2 is None:
             # clipping against the parent page yielded nothing usable
             LOG.info('Ignoring extant region: %s', points)
             it.Next(RIL.BLOCK)
             continue
         # if xywh['w'] < 30 or xywh['h'] < 30:
         #     LOG.info('Ignoring too small region: %s', points)
         #     it.Next(RIL.BLOCK)
         #     continue
         # region_image_bin = it.GetBinaryImage(RIL.BLOCK)
         # if not region_image_bin.getbbox():
         #     LOG.info('Ignoring binary-empty region: %s', points)
         #     it.Next(RIL.BLOCK)
         #     continue
         #
         # add the region reference in the reading order element
         # (will be removed again if Separator/Noise region below)
         ID = "region%04d" % index
         og.add_RegionRefIndexed(
             RegionRefIndexedType(regionRef=ID, index=index))
         #
         # region type switch
         #
         block_type = it.BlockType()
         if block_type in [
                 PT.FLOWING_TEXT,
                 PT.HEADING_TEXT,
                 PT.PULLOUT_TEXT,
                 PT.CAPTION_TEXT,
                 # TABLE is contained in PTIsTextType, but
                 # it is a bad idea to create a TextRegion
                 # for it (better set `find_tables` False):
                 # PT.TABLE,
                 # will also get a 90° @orientation
                 # (but that can be overridden by deskew/OSD):
                 PT.VERTICAL_TEXT
         ]:
             region = TextRegionType(id=ID,
                                     Coords=coords,
                                     type=TextTypeSimpleType.PARAGRAPH)
             if block_type == PT.VERTICAL_TEXT:
                 region.set_orientation(90.0)
             elif block_type == PT.HEADING_TEXT:
                 region.set_type(TextTypeSimpleType.HEADING)
             elif block_type == PT.PULLOUT_TEXT:
                 region.set_type(TextTypeSimpleType.FLOATING)
             elif block_type == PT.CAPTION_TEXT:
                 region.set_type(TextTypeSimpleType.CAPTION)
             page.add_TextRegion(region)
             if self.parameter['sparse_text']:
                 # sparse mode: annotate a single line per region directly
                 region.set_type(TextTypeSimpleType.OTHER)
                 region.add_TextLine(
                     TextLineType(id=region.id + '_line', Coords=coords))
         elif block_type in [
                 PT.FLOWING_IMAGE, PT.HEADING_IMAGE, PT.PULLOUT_IMAGE
         ]:
             region = ImageRegionType(id=ID, Coords=coords)
             page.add_ImageRegion(region)
         elif block_type in [PT.HORZ_LINE, PT.VERT_LINE]:
             region = SeparatorRegionType(id=ID, Coords=coords)
             page.add_SeparatorRegion(region)
             # undo appending in ReadingOrder
             og.set_RegionRefIndexed(og.get_RegionRefIndexed()[:-1])
         elif block_type in [PT.INLINE_EQUATION, PT.EQUATION]:
             region = MathsRegionType(id=ID, Coords=coords)
             page.add_MathsRegion(region)
         elif block_type == PT.TABLE:
             # without API access to StructuredTable we cannot
             # do much for a TableRegionType (i.e. nrows, ncols,
             # coordinates of cells for recursive regions etc),
             # but this can be achieved afterwards by segment-table
             region = TableRegionType(id=ID, Coords=coords)
             page.add_TableRegion(region)
         else:
             region = NoiseRegionType(id=ID, Coords=coords)
             # BUG FIX: was `page.add_NoiseRegion()` without the region
             # argument, which silently dropped the detected noise region
             page.add_NoiseRegion(region)
             # undo appending in ReadingOrder
             og.set_RegionRefIndexed(og.get_RegionRefIndexed()[:-1])
         LOG.info("Detected region '%s': %s (%s)", ID, points,
                  membername(PT, block_type))
         #
         # iterator increment
         #
         index += 1
         it.Next(RIL.BLOCK)
     if (not og.get_RegionRefIndexed() and not og.get_OrderedGroupIndexed()
             and not og.get_UnorderedGroupIndexed()):
         # schema forbids empty OrderedGroup
         ro.set_OrderedGroup(None)
Exemple #29
0
    def _process_segment(self, page_image, page, page_xywh, page_id,
                         input_file, n, mrcnn_model, class_names, mask):
        LOG = getLogger('OcrdAnybaseocrBlockSegmenter')
        # check for existing text regions and whether to overwrite them
        border = None
        if page.get_TextRegion():
            if self.parameter['overwrite']:
                LOG.info('removing existing TextRegions in page "%s"', page_id)
                page.set_TextRegion([])
            else:
                LOG.warning('keeping existing TextRegions in page "%s"',
                            page_id)
                return
        # check if border exists
        if page.get_Border():
            border_coords = page.get_Border().get_Coords()
            border_points = polygon_from_points(border_coords.get_points())
            border = Polygon(border_points)


#            page_image, page_xy = self.workspace.image_from_segment(page.get_Border(), page_image, page_xywh)

        img_array = ocrolib.pil2array(page_image)
        page_image.save('./checkthis.png')
        if len(img_array.shape) <= 2:
            img_array = np.stack((img_array, ) * 3, axis=-1)
        results = mrcnn_model.detect([img_array], verbose=1)
        r = results[0]

        th = self.parameter['th']
        # check for existing semgentation mask
        # this code executes only when use_deeplr is set to True in ocrd-tool.json file
        if mask:
            mask = ocrolib.pil2array(mask)
            mask = mask // 255
            mask = 1 - mask
            # multiply all the bounding box part with 2
            for i in range(len(r['rois'])):

                min_x = r['rois'][i][0]
                min_y = r['rois'][i][1]
                max_x = r['rois'][i][2]
                max_y = r['rois'][i][3]
                mask[min_x:max_x, min_y:max_y] *= i + 2
            cv2.imwrite('mask_check.png', mask * (255 / (len(r['rois']) + 2)))

            # check for left over pixels and add them to the bounding boxes
            pixel_added = True

            while pixel_added:

                pixel_added = False
                left_over = np.where(mask == 1)
                for x, y in zip(left_over[0], left_over[1]):
                    local_mask = mask[x - th:x + th, y - th:y + th]
                    candidates = np.where(local_mask > 1)
                    candidates = [k for k in zip(candidates[0], candidates[1])]
                    if len(candidates) > 0:
                        pixel_added = True
                        # find closest pixel with x>1
                        candidates.sort(key=lambda j: np.sqrt((j[0] - th)**2 +
                                                              (j[1] - th)**2))
                        index = local_mask[candidates[0]] - 2

                        # add pixel to mask/bbox
                        # x,y to bbox with index
                        if x < r['rois'][index][0]:
                            r['rois'][index][0] = x

                        elif x > r['rois'][index][2]:
                            r['rois'][index][2] = x

                        if y < r['rois'][index][1]:
                            r['rois'][index][1] = y

                        elif y > r['rois'][index][3]:
                            r['rois'][index][3] = y

                        # update the mask
                        mask[x, y] = index + 2

        # resolving overlapping problem
        bbox_dict = {}  # to check any overlapping bbox
        class_id_check = []

        for i in range(len(r['rois'])):
            min_x = r['rois'][i][0]
            min_y = r['rois'][i][1]
            max_x = r['rois'][i][2]
            max_y = r['rois'][i][3]

            region_bbox = [min_y, min_x, max_y, max_x]

            for key in bbox_dict:
                for bbox in bbox_dict[key]:

                    # checking for ymax case with vertical overlapping
                    # along with y, check both for xmax and xmin
                    if (region_bbox[3] <= bbox[3] and region_bbox[3] >= bbox[1]
                            and ((region_bbox[0] >= bbox[0]
                                  and region_bbox[0] <= bbox[2]) or
                                 (region_bbox[2] >= bbox[0]
                                  and region_bbox[2] <= bbox[2]) or
                                 (region_bbox[0] <= bbox[0]
                                  and region_bbox[2] >= bbox[2]))
                            and r['class_ids'][i] != 5):

                        r['rois'][i][2] = bbox[1] - 1

                    # checking for ymin now
                    # along with y, check both for xmax and xmin
                    if (region_bbox[1] <= bbox[3] and region_bbox[1] >= bbox[1]
                            and ((region_bbox[0] >= bbox[0]
                                  and region_bbox[0] <= bbox[2]) or
                                 (region_bbox[2] >= bbox[0]
                                  and region_bbox[2] <= bbox[2]) or
                                 (region_bbox[0] <= bbox[0]
                                  and region_bbox[2] >= bbox[2]))
                            and r['class_ids'][i] != 5):

                        r['rois'][i][0] = bbox[3] + 1

            if r['class_ids'][i] not in class_id_check:
                bbox_dict[r['class_ids'][i]] = []
                class_id_check.append(r['class_ids'][i])

            bbox_dict[r['class_ids'][i]].append(region_bbox)

        # resolving overlapping problem code

        # define reading order on basis of coordinates
        reading_order = []

        for i in range(len(r['rois'])):
            width, height, _ = img_array.shape
            min_x = r['rois'][i][0]
            min_y = r['rois'][i][1]
            max_x = r['rois'][i][2]
            max_y = r['rois'][i][3]

            if (min_y - 5) > width and r['class_ids'][i] == 2:
                min_y -= 5
            if (max_y + 10) < width and r['class_ids'][i] == 2:
                min_y += 10
            reading_order.append((min_y, min_x, max_y, max_x))

        reading_order = sorted(reading_order,
                               key=lambda reading_order:
                               (reading_order[1], reading_order[0]))
        for i in range(len(reading_order)):
            min_y, min_x, max_y, max_x = reading_order[i]
            min_y = 0
            i_poly = Polygon([[min_x, min_y], [max_x, min_y], [max_x, max_y],
                              [min_x, max_y]])
            for j in range(i + 1, len(reading_order)):
                min_y, min_x, max_y, max_x = reading_order[j]
                j_poly = Polygon([[min_x, min_y], [max_x, min_y],
                                  [max_x, max_y], [min_x, max_y]])
                inter = i_poly.intersection(j_poly)
                if inter:
                    reading_order.insert(j + 1, reading_order[i])
                    del reading_order[i]

        # Creating Reading Order object in PageXML
        order_group = OrderedGroupType(caption="Regions reading order",
                                       id=page_id)

        for i in range(len(r['rois'])):
            min_x = r['rois'][i][0]
            min_y = r['rois'][i][1]
            max_x = r['rois'][i][2]
            max_y = r['rois'][i][3]
            if (min_y - 5) > width and r['class_ids'][i] == 2:
                min_y -= 5
            if (max_y + 10) < width and r['class_ids'][i] == 2:
                min_y += 10

            region_polygon = [[min_x, min_y], [max_x, min_y], [max_x, max_y],
                              [min_x, max_y]]

            if border:
                cut_region_polygon = border.intersection(
                    Polygon(region_polygon))
                if cut_region_polygon.is_empty:
                    continue
            else:
                cut_region_polygon = Polygon(region_polygon)

            order_index = reading_order.index((min_y, min_x, max_y, max_x))
            region_id = '%s_region%04d' % (page_id, i)
            regionRefIndex = RegionRefIndexedType(index=order_index,
                                                  regionRef=region_id)
            order_group.add_RegionRefIndexed(regionRefIndex)

        reading_order_object = ReadingOrderType()
        reading_order_object.set_OrderedGroup(order_group)
        page.set_ReadingOrder(reading_order_object)

        for i in range(len(r['rois'])):
            width, height, _ = img_array.shape
            min_x = r['rois'][i][0]
            min_y = r['rois'][i][1]
            max_x = r['rois'][i][2]
            max_y = r['rois'][i][3]

            if (min_y - 5) > width and r['class_ids'][i] == 2:
                min_y -= 5
            if (max_y + 10) < width and r['class_ids'][i] == 2:
                min_y += 10

            # one change here to resolve flipped coordinates
            region_polygon = [[min_y, min_x], [max_y, min_x], [max_y, max_x],
                              [min_y, max_x]]

            cut_region_polygon = border.intersection(Polygon(region_polygon))

            if cut_region_polygon.is_empty:
                continue
            cut_region_polygon = [
                j for j in zip(list(cut_region_polygon.exterior.coords.xy[0]),
                               list(cut_region_polygon.exterior.coords.xy[1]))
            ][:-1]

            # checking whether coordinates are flipped

            region_polygon = coordinates_for_segment(cut_region_polygon,
                                                     page_image, page_xywh)
            region_points = points_from_polygon(region_polygon)

            read_order = reading_order.index((min_y, min_x, max_y, max_x))

            # this can be tested, provided whether we need previous comments or not?
            # resolving overlapping problem

            region_img = img_array[min_x:max_x, min_y:
                                   max_y]  # extract from points and img_array

            region_img = ocrolib.array2pil(region_img)

            file_id = make_file_id(input_file, self.output_file_grp)
            file_path = self.workspace.save_image_file(
                region_img,
                file_id + "_" + str(i),
                page_id=page_id,
                file_grp=self.output_file_grp)

            # ai = AlternativeImageType(filename=file_path, comments=page_xywh['features'])
            region_id = '%s_region%04d' % (page_id, i)
            coords = CoordsType(region_points)

            # incase of imageRegion
            if r['class_ids'][i] == 15:
                image_region = ImageRegionType(
                    custom='readingOrder {index:' + str(read_order) + ';}',
                    id=region_id,
                    Coords=coords,
                    type_=class_names[r['class_ids'][i]])
                # image_region.add_AlternativeImage(ai)
                page.add_ImageRegion(image_region)
                continue
            if r['class_ids'][i] == 16:
                table_region = TableRegionType(
                    custom='readingOrder {index:' + str(read_order) + ';}',
                    id=region_id,
                    Coords=coords,
                    type_=class_names[r['class_ids'][i]])
                # table_region.add_AlternativeImage(ai)
                page.add_TableRegion(table_region)
                continue
            if r['class_ids'][i] == 17:
                graphic_region = GraphicRegionType(
                    custom='readingOrder {index:' + str(read_order) + ';}',
                    id=region_id,
                    Coords=coords,
                    type_=class_names[r['class_ids'][i]])
                # graphic_region.add_AlternativeImage(ai)
                page.add_GraphicRegion(graphic_region)
                continue

            textregion = TextRegionType(custom='readingOrder {index:' +
                                        str(read_order) + ';}',
                                        id=region_id,
                                        Coords=coords,
                                        type_=class_names[r['class_ids'][i]])
            # textregion.add_AlternativeImage(ai)

            #border = page.get_Border()
            # if border:
            #    border.add_TextRegion(textregion)
            # else:
            page.add_TextRegion(textregion)
Exemple #30
0
    def process(self):
        """Resegment lines of the workspace.

        Open and deserialise PAGE input files and their respective images,
        then iterate over the element hierarchy down to the line level.

        Next, get each region image according to the layout annotation (from
        the alternative image of the region, or by cropping via coordinates
        into the higher-level image), and compute a new line segmentation
        from that (as a label mask).

        Then for each line within the region, find the label with the largest
        foreground area in the binarized image within the annotated polygon
        (or rectangle) of the line. Unless its relative area is too small,
        or its center is far off, convert that label's mask into a polygon
        outline, intersect with the old polygon, and find the contour of that
        segment. Annotate the result as new coordinates of the line.

        Add the new image file to the workspace along with the output fileGrp,
        and using a file ID with suffix ``.IMG-RESEG`` along with further
        identification of the input element.

        Produce a new output file by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.OcropyResegment')
        # This makes best sense for bad/coarse line segmentation, like current GT
        # or as postprocessing for bbox-only steps.
        # Most notably, it can convert rectangles to polygons (polygonalization).
        # It depends on a decent line segmentation from ocropy though. So it
        # _should_ ideally be run after deskewing (on the page or region level),
        # _must_ be run after binarization (on page or region level). Also, the
        # method's accuracy crucially depends on a good estimate of the images'
        # pixel density (at least if source input is not 300 DPI).
        threshold = self.parameter['min_fraction']
        margin = self.parameter['extend_margins']
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for (n, input_file) in enumerate(self.input_files):
            LOG.info("INPUT FILE %i / %s", n, input_file.pageId
                     or input_file.ID)
            file_id = make_file_id(input_file, self.output_file_grp)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID  # (PageType has no id)
            page = pcgts.get_Page()

            # require a binarized page image, since the segmenter operates
            # on a foreground/background mask:
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_selector='binarized')
            # estimate zoom relative to the 300 DPI the segmenter is tuned
            # for; a resolution of 1 means "unset" in the PAGE image info:
            if self.parameter['dpi'] > 0:
                zoom = 300.0 / self.parameter['dpi']
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi *= 2.54
                LOG.info('Page "%s" uses %f DPI', page_id, dpi)
                zoom = 300.0 / dpi
            else:
                zoom = 1

            regions = page.get_AllRegions(classes=['Text'])
            if not regions:
                LOG.warning('Page "%s" contains no text regions', page_id)
            for region in regions:
                lines = region.get_TextLine()
                if not lines:
                    LOG.warning('Page "%s" region "%s" contains no text lines',
                                page_id, region.id)
                    continue
                if len(lines) == 1:
                    # resegmenting a single line cannot improve anything
                    # (the whole region would become that line):
                    LOG.warning('Page "%s" region "%s" contains only one line',
                                page_id, region.id)
                    continue
                region_image, region_xywh = self.workspace.image_from_segment(
                    region,
                    page_image,
                    page_xywh,
                    feature_selector='binarized')
                region_array = pil2array(region_image)
                #region_array, _ = common.binarize(region_array, maxskew=0) # just in case still raw
                # foreground mask of dark pixels; use the builtin ``bool``
                # here — the ``np.bool`` alias is deprecated since NumPy 1.20
                # and was removed in 1.24 (would raise AttributeError):
                region_bin = np.array(region_array <= midrange(region_array),
                                      bool)
                report = check_region(region_bin, zoom)
                try:
                    if report:
                        raise Exception(report)
                    region_labels, _, _, _, _, _ = compute_segmentation(
                        region_bin, zoom=zoom)
                except Exception as err:
                    LOG.warning(
                        'Cannot line-segment page "%s" region "%s": %s',
                        page_id, region.id, err)
                    # fallback option 1: borderclean
                    # label margins vs interior, but with the interior
                    # extended into the margin by its connected components
                    # to remove noise from neighbouring regions:
                    #region_labels = borderclean_bin(region_bin, margin=round(4/zoom)) + 1
                    # too dangerous, because we risk losing dots from i or punctuation;
                    # fallback option2: only extend_margins
                    # instead, just provide a uniform label, so at least we get
                    # to extend the polygon margins:
                    #region_labels = np.ones_like(region_bin)
                    # fallback option3: keep unchanged
                    continue
                for line in lines:
                    if line.get_AlternativeImage():
                        # get cropped line image:
                        line_image, line_xywh = self.workspace.image_from_segment(
                            line,
                            region_image,
                            region_xywh,
                            feature_selector='binarized')
                        LOG.debug("Using AlternativeImage (%s) for line '%s'",
                                  line_xywh['features'], line.id)
                        # crop region arrays accordingly:
                        line_polygon = coordinates_of_segment(
                            line, region_image, region_xywh)
                        line_bbox = bbox_from_polygon(line_polygon)
                        line_labels = region_labels[line_bbox[1]:line_bbox[3],
                                                    line_bbox[0]:line_bbox[2]]
                        line_bin = region_bin[line_bbox[1]:line_bbox[3],
                                              line_bbox[0]:line_bbox[2]]
                        # get polygon in relative (line) coordinates:
                        line_polygon = coordinates_of_segment(
                            line, line_image, line_xywh)
                        line_polygon = resegment(line_polygon,
                                                 line_labels,
                                                 line_bin,
                                                 line.id,
                                                 extend_margins=margin,
                                                 threshold_relative=threshold)
                        if line_polygon is None:
                            continue  # not good enough – keep
                        # convert back to absolute (page) coordinates:
                        line_polygon = coordinates_for_segment(
                            line_polygon, line_image, line_xywh)
                    else:
                        # get polygon in relative (region) coordinates:
                        line_polygon = coordinates_of_segment(
                            line, region_image, region_xywh)
                        line_polygon = resegment(line_polygon,
                                                 region_labels,
                                                 region_bin,
                                                 line.id,
                                                 extend_margins=margin,
                                                 threshold_relative=threshold)
                        if line_polygon is None:
                            continue  # not good enough – keep
                        # convert back to absolute (page) coordinates:
                        line_polygon = coordinates_for_segment(
                            line_polygon, region_image, region_xywh)
                    # annotate result:
                    line.get_Coords().points = points_from_polygon(
                        line_polygon)
                    # create new image (re-crop with the updated coordinates):
                    line_image, line_xywh = self.workspace.image_from_segment(
                        line,
                        region_image,
                        region_xywh,
                        feature_selector='binarized')
                    # update METS (add the image file):
                    file_path = self.workspace.save_image_file(
                        line_image,
                        file_id=file_id + '_' + region.id + '_' + line.id +
                        '.IMG-RESEG',
                        page_id=page_id,
                        file_grp=self.output_file_grp)
                    # update PAGE (reference the image file):
                    line.add_AlternativeImage(
                        AlternativeImageType(filename=file_path,
                                             comments=region_xywh['features']))

            # update METS (add the PAGE file):
            file_path = os.path.join(self.output_file_grp, file_id + '.xml')
            pcgts.set_pcGtsId(file_id)
            out = self.workspace.add_file(ID=file_id,
                                          file_grp=self.output_file_grp,
                                          pageId=input_file.pageId,
                                          local_filename=file_path,
                                          mimetype=MIMETYPE_PAGE,
                                          content=to_xml(pcgts))
            LOG.info('created file ID: %s, file_grp: %s, path: %s', file_id,
                     self.output_file_grp, out.local_filename)