def sanitize_page(self, page, page_id):
    regions = page.get_TextRegion()
    page_image, page_coords, _ = self.workspace.image_from_page(
        page, page_id)
    for region in regions:
        LOG.info('Sanitizing region "%s"', region.id)
        lines = region.get_TextLine()
        heights = []
        # get labels:
        region_mask = np.zeros((page_image.height, page_image.width), dtype=np.uint8)
        for line in lines:
            line_polygon = coordinates_of_segment(line, page_image, page_coords)
            heights.append(xywh_from_polygon(line_polygon)['h'])
            region_mask[draw.polygon(line_polygon[:, 1],
                                     line_polygon[:, 0],
                                     region_mask.shape)] = 1
            region_mask[draw.polygon_perimeter(line_polygon[:, 1],
                                               line_polygon[:, 0],
                                               region_mask.shape)] = 1
        # estimate scale:
        scale = int(np.median(np.array(heights)))
        # close labels:
        region_mask = np.pad(region_mask, scale)  # protect edges
        region_mask = filters.maximum_filter(region_mask, (scale, 1), origin=0)
        region_mask = filters.minimum_filter(region_mask, (scale, 1), origin=0)
        region_mask = region_mask[scale:-scale, scale:-scale]  # unprotect
        # find outer contour (parts):
        contours, _ = cv2.findContours(region_mask,
                                       cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        # determine areas of parts:
        areas = [cv2.contourArea(contour) for contour in contours]
        total_area = sum(areas)
        if not total_area:
            # ignore if too small
            LOG.warning('Zero contour area in region "%s"', region.id)
            continue
        # pick contour and convert to absolute:
        region_polygon = None
        for i, contour in enumerate(contours):
            area = areas[i]
            if area / total_area < 0.1:
                LOG.warning('Ignoring contour %d too small (%d/%d) in region "%s"',
                            i, area, total_area, region.id)
                continue
            # simplify shape:
            polygon = cv2.approxPolyDP(contour, 2, False)[:, 0, ::]  # already ordered x,y
            if len(polygon) < 4:
                LOG.warning('Ignoring contour %d less than 4 points in region "%s"',
                            i, region.id)
                continue
            if region_polygon is not None:
                LOG.error('Skipping region "%s" due to non-contiguous contours',
                          region.id)
                region_polygon = None
                break
            region_polygon = coordinates_for_segment(polygon, page_image, page_coords)
        if region_polygon is not None:
            LOG.info('Using new coordinates for region "%s"', region.id)
            region.get_Coords().points = points_from_polygon(region_polygon)
def _set_Border(self, page, page_image, page_xywh, border_polygon):
    # Convert to absolute (page) coordinates:
    border_polygon = coordinates_for_segment(border_polygon, page_image, page_xywh)
    # Save border:
    page.set_Border(BorderType(Coords=CoordsType(
        points=points_from_polygon(border_polygon))))
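# What coordinates_for_segment does in the snippet above, shown in isolation:
# it applies the inverse of the page-to-image affine transform so that
# image-relative pixel coordinates become absolute PAGE coordinates. A minimal
# runnable sketch, assuming ocrd_utils is installed; the translation matrix
# here is made up (as if the image had been cropped at offset (50, 80)):
import numpy as np
from ocrd_utils import coordinates_for_segment, points_from_polygon

coords = {'transform': np.array([[1, 0, -50],
                                 [0, 1, -80],
                                 [0, 0,   1]])}  # page -> image: shift by (-50, -80)
polygon = [[0, 0], [100, 0], [100, 30], [0, 30]]  # relative to the derived image
print(points_from_polygon(coordinates_for_segment(polygon, None, coords)))
# -> '50,80 150,80 150,110 50,110'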
def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n):
    img_array = ocrolib.pil2array(page_image)
    # Check if image is RGB or not  # FIXME: check not needed anymore?
    if len(img_array.shape) == 2:
        img_array = np.stack((img_array,) * 3, axis=-1)
    img_array_bin = np.array(
        img_array > ocrolib.midrange(img_array), 'i')
    lineDetectH = []
    lineDetectV = []
    img_array_rr = self.remove_rular(img_array)
    textarea, img_array_rr_ta, height, width = self.detect_textarea(
        img_array_rr)
    colSeparator = int(
        width * self.parameter['colSeparator'])
    if len(textarea) > 1:
        textarea = self.crop_area(
            textarea, img_array_bin, img_array_rr_ta, colSeparator)
        if len(textarea) == 0:
            min_x, min_y, max_x, max_y = self.select_borderLine(
                img_array_rr, lineDetectH, lineDetectV)
        else:
            min_x, min_y, max_x, max_y = textarea[0]
    elif len(textarea) == 1 and (height * width * 0.5 <
                                 (abs(textarea[0][2] - textarea[0][0]) *
                                  abs(textarea[0][3] - textarea[0][1]))):
        x1, y1, x2, y2 = textarea[0]
        x1 = x1 - 20 if x1 > 20 else 0
        x2 = x2 + 20 if x2 < width - 20 else width
        y1 = y1 - 40 if y1 > 40 else 0
        y2 = y2 + 40 if y2 < height - 40 else height
        # note: the padded box (x1, y1, x2, y2) is computed but unused below;
        # the unpadded textarea[0] is what gets annotated:
        min_x, min_y, max_x, max_y = textarea[0]
    else:
        min_x, min_y, max_x, max_y = self.select_borderLine(
            img_array_rr, lineDetectH, lineDetectV)
    border_polygon = [[min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y]]
    border_polygon = coordinates_for_segment(border_polygon, page_image, page_xywh)
    border_points = points_from_polygon(border_polygon)
    brd = BorderType(Coords=CoordsType(border_points))
    page.set_Border(brd)
    page_image = crop_image(page_image, box=(min_x, min_y, max_x, max_y))
    page_xywh['features'] += ',cropped'
    file_id = make_file_id(input_file, self.output_file_grp)
    file_path = self.workspace.save_image_file(page_image,
                                               file_id + '-IMG',
                                               page_id=page_id,
                                               file_grp=self.output_file_grp)
    page.add_AlternativeImage(AlternativeImageType(
        filename=file_path, comments=page_xywh['features']))
def _process_region(self, it, region, rogroup, region_image, region_coords):
    LOG = getLogger('processor.TesserocrSegmentTable')
    # equivalent to GetComponentImages with raw_image=True,
    # (which would also give raw coordinates),
    # except we are also interested in the iterator's BlockType() here,
    index = 0
    if rogroup:
        for elem in (rogroup.get_RegionRefIndexed() +
                     rogroup.get_OrderedGroupIndexed() +
                     rogroup.get_UnorderedGroupIndexed()):
            if elem.index >= index:
                index = elem.index + 1
    while it and not it.Empty(RIL.BLOCK):
        bbox = it.BoundingBox(RIL.BLOCK)
        polygon = polygon_from_x0y0x1y1(bbox)
        polygon = coordinates_for_segment(polygon, region_image, region_coords)
        points = points_from_polygon(polygon)
        coords = CoordsType(points=points)
        # if xywh['w'] < 30 or xywh['h'] < 30:
        #     LOG.info('Ignoring too small region: %s', points)
        #     it.Next(RIL.BLOCK)
        #     continue
        #
        # add the region reference in the reading order element
        # (but ignore non-text regions entirely)
        ID = region.id + "_%04d" % index
        subregion = TextRegionType(id=ID, Coords=coords,
                                   type_=TextTypeSimpleType.PARAGRAPH)
        block_type = it.BlockType()
        if block_type == PT.FLOWING_TEXT:
            pass
        elif block_type == PT.HEADING_TEXT:
            subregion.set_type(TextTypeSimpleType.HEADING)
        elif block_type == PT.PULLOUT_TEXT:
            subregion.set_type(TextTypeSimpleType.FLOATING)
        elif block_type == PT.CAPTION_TEXT:
            subregion.set_type(TextTypeSimpleType.CAPTION)
        elif block_type == PT.VERTICAL_TEXT:
            subregion.set_orientation(90.0)
        else:
            it.Next(RIL.BLOCK)
            continue
        LOG.info("Detected cell '%s': %s (%s)", ID, points, membername(PT, block_type))
        region.add_TextRegion(subregion)
        if rogroup:
            rogroup.add_RegionRefIndexed(
                RegionRefIndexedType(regionRef=ID, index=index))
        #
        # iterator increment
        #
        index += 1
        it.Next(RIL.BLOCK)
def _add_TextRegion(self, page, page_image, page_xywh, page_id, region_polygon, region_id):
    # Convert to absolute (page) coordinates:
    region_polygon = coordinates_for_segment(region_polygon, page_image, page_xywh)
    # Save text region:
    page.add_TextRegion(TextRegionType(
        id=page_id + region_id,
        Coords=CoordsType(points=points_from_polygon(region_polygon))))
def _process_words_in_line(self, result_it, line, line_xywh):
    LOG = getLogger('processor.TesserocrRecognize')
    if not result_it or result_it.Empty(RIL.WORD):
        LOG.warning("No text in line '%s'", line.id)
        return
    # iterate until IsAtFinalElement(RIL.LINE, RIL.WORD):
    word_no = 0
    while result_it and not result_it.Empty(RIL.WORD):
        word_id = '%s_word%04d' % (line.id, word_no)
        LOG.debug("Decoding text in word '%s'", word_id)
        bbox = result_it.BoundingBox(RIL.WORD)
        # convert to absolute coordinates:
        polygon = coordinates_for_segment(polygon_from_x0y0x1y1(bbox),
                                          None, line_xywh) - self.parameter['padding']
        polygon2 = polygon_for_parent(polygon, line)
        if polygon2 is not None:
            polygon = polygon2
        points = points_from_polygon(polygon)
        word = WordType(id=word_id, Coords=CoordsType(points))
        if polygon2 is None:
            # could happen due to rotation
            LOG.info('Ignoring extant word: %s', points)
        else:
            line.add_Word(word)
        # TODO: determine if font attributes available for word level will work with LSTM models
        word_attributes = result_it.WordFontAttributes()
        if word_attributes:
            word_style = TextStyleType(
                fontSize=word_attributes['pointsize']
                if 'pointsize' in word_attributes else None,
                fontFamily=word_attributes['font_name']
                if 'font_name' in word_attributes else None,
                bold=word_attributes['bold']
                if 'bold' in word_attributes else None,
                italic=word_attributes['italic']
                if 'italic' in word_attributes else None,
                underlined=word_attributes['underlined']
                if 'underlined' in word_attributes else None,
                monospace=word_attributes['monospace']
                if 'monospace' in word_attributes else None,
                serif=word_attributes['serif']
                if 'serif' in word_attributes else None)
            word.set_TextStyle(word_style)  # (or somewhere in custom attribute?)
        # add word annotation unconditionally (i.e. even for glyph level):
        word.add_TextEquiv(TextEquivType(
            Unicode=result_it.GetUTF8Text(RIL.WORD),
            conf=result_it.Confidence(RIL.WORD) / 100))
        if self.parameter['textequiv_level'] != 'word':
            self._process_glyphs_in_word(result_it, word, line_xywh)
        if result_it.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
            break
        else:
            word_no += 1
            result_it.Next(RIL.WORD)
def _add_TextLine(self, page_id, region, region_image, region_xywh, region_id,
                  line_polygon, line_id):
    # Convert to absolute (page) coordinates:
    line_polygon = coordinates_for_segment(line_polygon, region_image, region_xywh)
    # Save text line:
    region.add_TextLine(TextLineType(
        id=page_id + region_id + line_id,
        Coords=CoordsType(points=points_from_polygon(line_polygon))))
def adapt_coords(segment, parent, transform):
    points = segment.get_Coords().get_points()
    polygon = polygon_from_points(points)
    # polygon absolute coords (after transforming back from page coords, e.g. deskewing)
    polygon_new = coordinates_for_segment(polygon, None, transform)
    # intersection with parent polygon
    polygon_new = polygon_for_parent(polygon_new, parent)
    if polygon_new is None:
        return None
    points_new = points_from_polygon(polygon_new)
    segment.set_Coords(CoordsType(points=points_new))
    return segment
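# The polygon_for_parent helper used above (and in several other snippets here)
# is not shown in this collection. A minimal sketch of what such a helper might
# do, assuming shapely; the real implementations (e.g. in ocrd_tesserocr and
# ocrd_segment) are more elaborate, handling PageType parents, validity repair,
# and heterogeneous intersection results:
from shapely.geometry import Polygon

def polygon_for_parent_sketch(polygon, parent_polygon):
    """Clip a child polygon to its parent; return None if they do not intersect."""
    childp = Polygon(polygon)
    parentp = Polygon(parent_polygon)
    if childp.within(parentp):
        return polygon  # already consistent, keep as-is
    interp = childp.intersection(parentp)
    if interp.is_empty or interp.area == 0.0:
        return None  # caller logs and ignores the segment
    if interp.geom_type == 'MultiPolygon':
        interp = interp.convex_hull  # connect disjoint parts
    return interp.exterior.coords[:-1]  # drop the closing point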
def ensure_consistent(child):
    """Clip segment element polygon to parent polygon range."""
    points = child.get_Coords().points
    polygon = polygon_from_points(points)
    parent = child.parent_object_
    childp = Polygon(polygon)
    if isinstance(parent, PageType):
        if parent.get_Border():
            parentp = Polygon(polygon_from_points(
                parent.get_Border().get_Coords().points))
        else:
            parentp = Polygon([[0, 0],
                               [0, parent.get_imageHeight()],
                               [parent.get_imageWidth(), parent.get_imageHeight()],
                               [parent.get_imageWidth(), 0]])
    else:
        parentp = Polygon(polygon_from_points(parent.get_Coords().points))
    # ensure input coords have valid paths (without self-intersection)
    # (this can happen when shapes valid in floating point are rounded)
    childp = make_valid(childp)
    parentp = make_valid(parentp)
    # check if clipping is necessary
    if childp.within(parentp):
        return
    # clip to parent
    interp = childp.intersection(parentp)
    if interp.is_empty or interp.area == 0.0:
        if hasattr(parent, 'pcGtsId'):
            parent_id = parent.pcGtsId
        elif hasattr(parent, 'imageFilename'):
            parent_id = parent.imageFilename
        else:
            parent_id = parent.id
        raise Exception("Segment '%s' does not intersect its parent '%s'" % (
            child.id, parent_id))
    if interp.type == 'GeometryCollection':
        # heterogeneous result: filter zero-area shapes (LineString, Point)
        interp = unary_union([geom for geom in interp.geoms if geom.area > 0])
    if interp.type == 'MultiPolygon':
        # homogeneous result: construct convex hull to connect
        # FIXME: construct concave hull / alpha shape
        interp = interp.convex_hull
    if interp.minimum_clearance < 1.0:
        # follow-up calculations will necessarily be integer;
        # so anticipate rounding here and then ensure validity
        interp = asPolygon(np.round(interp.exterior.coords))
        interp = make_valid(interp)
    polygon = interp.exterior.coords[:-1]  # keep open
    points = points_from_polygon(polygon)
    child.get_Coords().set_points(points)
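# make_valid is called above but not defined in this collection. A minimal
# sketch of such a helper, assuming shapely >= 1.8 (which ships
# shapely.validation.make_valid); ocrd_segment's own variant instead retries
# simplification from different starting vertices:
from shapely.geometry import Polygon
from shapely.validation import make_valid as shapely_make_valid

def make_valid_sketch(poly):
    """Return a valid polygon, repairing self-intersections if necessary."""
    if poly.is_valid:
        return poly
    fixed = shapely_make_valid(poly)  # may yield MultiPolygon/GeometryCollection
    if fixed.geom_type == 'Polygon':
        return fixed
    # fall back to the largest polygonal part (assumes one exists)
    parts = [geom for geom in fixed.geoms if geom.geom_type == 'Polygon']
    return max(parts, key=lambda geom: geom.area)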
def add_region(region: RectSegment, index: int, region_type: str):
    from ocrd_utils import coordinates_for_segment, points_from_polygon
    polygon = polygon_from_segment(region)
    polygon = coordinates_for_segment(polygon, page_image, page_coords)
    points = points_from_polygon(polygon)
    indexed_id = "region%04d" % index
    coords = CoordsType(points=points)
    if region_type == "text":
        page.add_TextRegion(TextRegionType(id=indexed_id, Coords=coords))
    elif region_type == "image":
        page.add_ImageRegion(ImageRegionType(id=indexed_id, Coords=coords))
    else:
        page.add_NoiseRegion(NoiseRegionType(id=indexed_id, Coords=coords))
def _process_glyphs_in_word(self, result_it, word, word_xywh):
    LOG = getLogger('processor.TesserocrRecognize')
    if not result_it or result_it.Empty(RIL.SYMBOL):
        LOG.debug("No glyph in word '%s'", word.id)
        return
    # iterate until IsAtFinalElement(RIL.WORD, RIL.SYMBOL):
    glyph_no = 0
    while result_it and not result_it.Empty(RIL.SYMBOL):
        glyph_id = '%s_glyph%04d' % (word.id, glyph_no)
        LOG.debug("Decoding text in glyph '%s'", glyph_id)
        # glyph_text = result_it.GetUTF8Text(RIL.SYMBOL)  # equals first choice?
        glyph_conf = result_it.Confidence(RIL.SYMBOL) / 100  # equals first choice?
        # LOG.debug('best glyph: "%s" [%f]', glyph_text, glyph_conf)
        bbox = result_it.BoundingBox(RIL.SYMBOL)
        # convert to absolute coordinates:
        polygon = coordinates_for_segment(polygon_from_x0y0x1y1(bbox),
                                          None, word_xywh) - self.parameter['padding']
        polygon2 = polygon_for_parent(polygon, word)
        if polygon2 is not None:
            polygon = polygon2
        points = points_from_polygon(polygon)
        glyph = GlyphType(id=glyph_id, Coords=CoordsType(points))
        if polygon2 is None:
            # could happen due to rotation
            LOG.info('Ignoring extant glyph: %s', points)
        else:
            word.add_Glyph(glyph)
        choice_it = result_it.GetChoiceIterator()
        for (choice_no, choice) in enumerate(choice_it):
            alternative_text = choice.GetUTF8Text()
            alternative_conf = choice.Confidence() / 100
            # LOG.debug('alternative glyph: "%s" [%f]', alternative_text, alternative_conf)
            if (glyph_conf - alternative_conf > CHOICE_THRESHOLD_CONF
                    or choice_no > CHOICE_THRESHOLD_NUM):
                break
            # TODO: consider SymbolIsSuperscript (TextStyle), SymbolIsDropcap (RelationType) etc
            glyph.add_TextEquiv(TextEquivType(
                index=choice_no, Unicode=alternative_text, conf=alternative_conf))
        if result_it.IsAtFinalElement(RIL.WORD, RIL.SYMBOL):
            break
        else:
            glyph_no += 1
            result_it.Next(RIL.SYMBOL)
def ensure_valid(element):
    changed = False
    coords = element.get_Coords()
    points = coords.points
    polygon = polygon_from_points(points)
    array = np.array(polygon, int)  # (np.int is removed in newer NumPy)
    if array.min() < 0:
        array = np.maximum(0, array)
        changed = True
    if array.shape[0] < 3:
        array = np.concatenate([array, array[::-1] + 1])
        changed = True
    polygon = array.tolist()
    poly = Polygon(polygon)
    if not poly.is_valid:
        poly = make_valid(poly)
        polygon = poly.exterior.coords[:-1]
        changed = True
    if changed:
        points = points_from_polygon(polygon)
        coords.set_points(points)
def process_page(self, page, page_image, page_xywh, bounds, file_id, page_id):
    """Set the identified page border, if valid."""
    LOG = getLogger('processor.TesserocrCrop')
    left, top, right, bottom = bounds
    if left >= right or top >= bottom:
        LOG.error("Cannot find valid extent for page '%s'", page_id)
        return
    padding = self.parameter['padding']
    # add padding:
    left = max(left - padding, 0)
    right = min(right + padding, page_image.width)
    top = max(top - padding, 0)
    bottom = min(bottom + padding, page_image.height)
    LOG.info("Padded page border: %i:%i,%i:%i", left, right, top, bottom)
    polygon = polygon_from_bbox(left, top, right, bottom)
    polygon = coordinates_for_segment(polygon, page_image, page_xywh)
    polygon = polygon_for_parent(polygon, page)
    if polygon is None:
        LOG.error("Ignoring extant border")
        return
    border = BorderType(Coords=CoordsType(
        points_from_polygon(polygon)))
    # intersection with parent could have changed bbox, so recalculate:
    bbox = bbox_from_polygon(coordinates_of_segment(border, page_image, page_xywh))
    # update PAGE (annotate border):
    page.set_Border(border)
    # update METS (add the image file):
    page_image = crop_image(page_image, box=bbox)
    page_xywh['features'] += ',cropped'
    file_path = self.workspace.save_image_file(
        page_image, file_id + '.IMG-CROP',
        page_id=page_id, file_grp=self.output_file_grp)
    # update PAGE (reference the image file):
    page.add_AlternativeImage(AlternativeImageType(
        filename=file_path, comments=page_xywh['features']))
def convert(cocofile, directory):
    """Convert MS-COCO JSON to METS/PAGE XML files.

    Load JSON ``cocofile`` (in MS-COCO format) and chdir to ``directory``
    (which it refers to).

    Start a METS file mets.xml with references to the image files (under
    fileGrp ``OCR-D-IMG``) and their corresponding PAGE-XML annotations
    (under fileGrp ``OCR-D-GT-SEG-BLOCK``), as parsed from ``cocofile``
    and written using the same basename.
    """
    resolver = Resolver()
    with pushd_popd(directory):
        workspace = resolver.workspace_from_nothing('.')
        # https://github.com/ibm-aur-nlp/PubLayNet
        workspace.mets.unique_identifier = 'ocrd_PubLayNet_' + directory
        coco = json.load(cocofile)
        LOG.info('Loaded JSON for %d images with %d regions in %d categories',
                 len(coco['images']), len(coco['annotations']), len(coco['categories']))
        categories = dict()
        for cat in coco['categories']:
            categories[cat['id']] = cat['name']
        images = dict()
        for image in coco['images']:
            images[image['id']] = image
        for annotation in coco['annotations']:
            image = images[annotation['image_id']]
            regions = image.setdefault('regions', list())
            regions.append(annotation)
        del coco
        LOG.info('Parsing annotations into PAGE-XML')
        for image in images.values():
            page_id = 'p' + str(image['id'])
            file_base, file_ext = os.path.splitext(image['file_name'])
            filename = file_base + '.xml'
            image_file = workspace.add_file(
                'OCR-D-IMG',
                ID='OCR-D-IMG_' + page_id,
                pageId=page_id,
                mimetype=EXT_TO_MIME[file_ext],
                local_filename=image['file_name'])
            LOG.info('Added page %s file %s of type %s',
                     image_file.pageId, image_file.local_filename, image_file.mimetype)
            pcgts = page_from_image(image_file)
            pcgts.set_pcGtsId(page_id)
            page = pcgts.get_Page()
            assert page.imageWidth == image['width']
            assert page.imageHeight == image['height']
            for region in image['regions']:
                polygon = np.array(region['segmentation'])
                polygon = np.reshape(polygon, (polygon.shape[1] // 2, 2))
                coords = CoordsType(points=points_from_polygon(polygon))
                category = categories[region['category_id']]
                region_id = 'r' + str(region['id'])
                if category == 'text':
                    region_obj = TextRegionType(id=region_id, Coords=coords,
                                                type_=TextTypeSimpleType.PARAGRAPH)
                    page.add_TextRegion(region_obj)
                elif category == 'title':
                    region_obj = TextRegionType(id=region_id, Coords=coords,
                                                type_=TextTypeSimpleType.HEADING)  # CAPTION?
                    page.add_TextRegion(region_obj)
                elif category == 'list':
                    region_obj = TextRegionType(id=region_id, Coords=coords,
                                                type_=TextTypeSimpleType.LISTLABEL)  # OTHER?
                    page.add_TextRegion(region_obj)
                elif category == 'table':
                    region_obj = TableRegionType(id=region_id, Coords=coords)
                    page.add_TableRegion(region_obj)
                elif category == 'figure':
                    region_obj = ImageRegionType(id=region_id, Coords=coords)
                    page.add_ImageRegion(region_obj)
                else:
                    raise Exception('unknown image category: %s' % category)
            page_file = workspace.add_file(
                'OCR-D-GT-SEG-BLOCK',
                ID='OCR-D-GT-SEG-BLOCK_' + page_id,
                pageId=page_id,
                mimetype=MIMETYPE_PAGE,
                local_filename=filename,
                content=to_xml(pcgts))
            LOG.info('Added page %s file %s with %d regions',
                     page_file.pageId, page_file.local_filename, len(image['regions']))
        LOG.info('All done')
        workspace.save_mets()
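# The reshape in convert() relies on COCO storing each polygon as a flat
# [x0, y0, x1, y1, ...] list. A quick self-contained illustration (the
# coordinate values here are made up):
import numpy as np
from ocrd_utils import points_from_polygon

segmentation = [[10, 10, 110, 10, 110, 60, 10, 60]]  # one flat COCO polygon
polygon = np.array(segmentation)
polygon = np.reshape(polygon, (polygon.shape[1] // 2, 2))  # -> (N, 2) point pairs
assert points_from_polygon(polygon) == '10,10 110,10 110,60 10,60'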
def process(self):
    """
    Perform text recognition with Calamari on the workspace.

    If ``textequiv_level`` is ``word`` or ``glyph``, then additionally
    create word / glyph level segments by splitting at white space
    characters / glyph boundaries. In the case of ``glyph``, add all
    alternative character hypotheses down to ``glyph_conf_cutoff``
    confidence threshold.
    """
    log = getLogger('processor.CalamariRecognize')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        log.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_selector=self.features)
        for region in page.get_AllRegions(classes=['Text']):
            region_image, region_coords = self.workspace.image_from_segment(
                region, page_image, page_coords, feature_selector=self.features)
            textlines = region.get_TextLine()
            log.info("About to recognize %i lines of region '%s'",
                     len(textlines), region.id)
            line_images_np = []
            line_coordss = []
            for line in textlines:
                log.debug("Recognizing line '%s' in region '%s'", line.id, region.id)
                line_image, line_coords = self.workspace.image_from_segment(
                    line, region_image, region_coords, feature_selector=self.features)
                if ('binarized' not in line_coords['features']
                        and 'grayscale_normalized' not in line_coords['features']
                        and self.network_input_channels == 1):
                    # We cannot use a feature selector for this since we don't
                    # know whether the model expects (has been trained on)
                    # binarized or grayscale images; but raw images are likely
                    # always inadequate:
                    log.warning("Using raw image for line '%s' in region '%s'",
                                line.id, region.id)
                line_image = line_image if all(line_image.size) else [[0]]
                line_image_np = np.array(line_image, dtype=np.uint8)
                line_images_np.append(line_image_np)
                line_coordss.append(line_coords)
            raw_results_all = self.predictor.predict_raw(
                line_images_np, progress_bar=False)
            for line, line_coords, raw_results in zip(
                    textlines, line_coordss, raw_results_all):
                for i, p in enumerate(raw_results):
                    p.prediction.id = "fold_{}".format(i)
                prediction = self.voter.vote_prediction_result(raw_results)
                prediction.id = "voted"

                # Build line text on our own
                #
                # Calamari does whitespace post-processing on prediction.sentence,
                # while it does not do the same on prediction.positions.
                # Do it on our own to have consistency.
                #
                # XXX Check Calamari's built-in post-processing on prediction.sentence

                def _sort_chars(p):
                    """Filter and sort chars of prediction p"""
                    chars = p.chars
                    chars = [c for c in chars if c.char]
                    # XXX Note that omission probabilities are not normalized?!
                    chars = [c for c in chars
                             if c.probability >= self.parameter['glyph_conf_cutoff']]
                    chars = sorted(chars, key=lambda k: k.probability, reverse=True)
                    return chars

                def _drop_leading_spaces(positions):
                    return list(itertools.dropwhile(
                        lambda p: _sort_chars(p)[0].char == " ", positions))

                def _drop_trailing_spaces(positions):
                    return list(reversed(_drop_leading_spaces(reversed(positions))))

                def _drop_double_spaces(positions):
                    def _drop_double_spaces_generator(positions):
                        last_was_space = False
                        for p in positions:
                            if p.chars[0].char == " ":
                                if not last_was_space:
                                    yield p
                                last_was_space = True
                            else:
                                yield p
                                last_was_space = False
                    return list(_drop_double_spaces_generator(positions))

                positions = prediction.positions
                positions = _drop_leading_spaces(positions)
                positions = _drop_trailing_spaces(positions)
                positions = _drop_double_spaces(positions)
                positions = list(positions)

                line_text = ''.join(_sort_chars(p)[0].char for p in positions)
                if line_text != prediction.sentence:
                    log.warning("Our own line text is not the same as Calamari's: "
                                "'%s' != '%s'", line_text, prediction.sentence)

                # Delete existing results
                if line.get_TextEquiv():
                    log.warning("Line '%s' already contained text results", line.id)
                line.set_TextEquiv([])
                if line.get_Word():
                    log.warning("Line '%s' already contained word segmentation", line.id)
                line.set_Word([])

                # Save line results
                line_conf = prediction.avg_char_probability
                line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])

                # Save word results
                #
                # Calamari OCR does not provide word positions, so we infer word
                # positions from a. text segmentation and b. the glyph positions.
                # This is necessary because the PAGE XML format enforces a strict
                # hierarchy of lines > words > glyphs.

                def _words(s):
                    """Split words based on spaces and include spaces as 'words'"""
                    spaces = None
                    word = ''
                    for c in s:
                        if c == ' ' and spaces is True:
                            word += c
                        elif c != ' ' and spaces is False:
                            word += c
                        else:
                            if word:
                                yield word
                            word = c
                        spaces = (c == ' ')
                    yield word

                if self.parameter['textequiv_level'] in ['word', 'glyph']:
                    word_no = 0
                    i = 0
                    for word_text in _words(line_text):
                        word_length = len(word_text)
                        if not all(c == ' ' for c in word_text):
                            word_positions = positions[i:i + word_length]
                            word_start = word_positions[0].global_start
                            word_end = word_positions[-1].global_end
                            polygon = polygon_from_x0y0x1y1(
                                [word_start, 0, word_end, line_image.height])
                            points = points_from_polygon(
                                coordinates_for_segment(polygon, None, line_coords))
                            # XXX Crop to line polygon?
                            word = WordType(id='%s_word%04d' % (line.id, word_no),
                                            Coords=CoordsType(points))
                            word.add_TextEquiv(TextEquivType(Unicode=word_text))
                            if self.parameter['textequiv_level'] == 'glyph':
                                for glyph_no, p in enumerate(word_positions):
                                    glyph_start = p.global_start
                                    glyph_end = p.global_end
                                    polygon = polygon_from_x0y0x1y1(
                                        [glyph_start, 0, glyph_end, line_image.height])
                                    points = points_from_polygon(
                                        coordinates_for_segment(polygon, None, line_coords))
                                    glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no),
                                                      Coords=CoordsType(points))
                                    # Add predictions (= TextEquivs)
                                    # Must start with 1, see
                                    # https://ocr-d.github.io/page#multiple-textequivs
                                    char_index_start = 1
                                    for char_index, char in enumerate(
                                            _sort_chars(p), start=char_index_start):
                                        glyph.add_TextEquiv(TextEquivType(
                                            Unicode=char.char,
                                            index=char_index,
                                            conf=char.probability))
                                    word.add_Glyph(glyph)
                            line.add_Word(word)
                            word_no += 1
                        i += word_length
        _page_update_higher_textequiv_levels('line', pcgts)

        # Add metadata about this operation and its runtime parameters:
        self.add_metadata(pcgts)
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            mimetype=MIMETYPE_PAGE,
            local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
            content=to_xml(pcgts))
def test_points_from_polygon(self):
    self.assertEqual(
        points_from_polygon([[100, 100], [200, 100], [200, 200], [100, 200]]),
        '100,100 200,100 200,200 100,200')
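# Complementary to the test above: points_from_polygon and polygon_from_points
# are inverses of each other. A quick standalone sketch, assuming ocrd_utils
# is installed (polygon_from_points returns float pairs, which still compare
# equal to the integers here):
from ocrd_utils import points_from_polygon, polygon_from_points

points = '100,100 200,100 200,200 100,200'
assert polygon_from_points(points) == [[100, 100], [200, 100], [200, 200], [100, 200]]
assert points_from_polygon(polygon_from_points(points)) == points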
def _process_page(self, page, page_image, page_xywh, input_file, zoom=1.0):
    padding = self.parameter['padding']
    img_array = pil2array(page_image)
    # ensure RGB image
    if len(img_array.shape) == 2:
        img_array = np.stack((img_array,) * 3, axis=-1)
    height, width, _ = img_array.shape
    size = height * width
    # zoom to 300 DPI (larger density: faster; most fixed parameters here expect 300)
    if zoom != 1.0:
        self.logger.info("scaling %dx%d image by %.2f", width, height, zoom)
        img_array = cv2.resize(img_array, None, fx=zoom, fy=zoom,
                               interpolation=cv2.INTER_CUBIC)
    # detect rule placed in image next to page for scale reference:
    mask_array, mask_box = self.detect_ruler(img_array)
    # detect page frame via line segment detector:
    border_polygon, prefer_border = self.select_borderLine(img_array, mask_box)
    border_polygon = np.array(border_polygon) / zoom  # unzoom
    # pad inwards:
    border_polygon = Polygon(border_polygon).buffer(-padding).exterior.coords[:-1]
    # get the bounding box from the border polygon:
    # min_x, min_y = border_polygon.min(axis=0)
    # max_x, max_y = border_polygon.max(axis=0)
    # get the inner rectangle from the border polygon:
    # _, min_x, max_x, _ = np.sort(border_polygon[:, 0])
    # _, min_y, max_y, _ = np.sort(border_polygon[:, 1])
    if prefer_border:
        self.logger.info("Preferring line detector")
    else:
        self.logger.info("Falling back to text detector")
        textboxes = self.detect_textboxes(img_array, mask_array)
        if len(textboxes) > 1:
            textboxes = self.merge_boxes(textboxes, img_array)
        textboxes = np.array(textboxes) / zoom  # unzoom
        if (len(textboxes) == 1 and
                self.parameter['columnAreaMin'] * size < self.get_area(textboxes[0])):
            self.logger.info("Using text area (%d%% area)",
                             100 * self.get_area(textboxes[0]) / size)
            min_x, min_y, max_x, max_y = textboxes[0]
            # pad outwards
            border_polygon = polygon_from_bbox(min_x - padding,
                                               min_y - padding,
                                               max_x + padding,
                                               max_y + padding)

    def clip(point):
        x, y = point
        x = max(0, min(page_image.width, x))
        y = max(0, min(page_image.height, y))
        return x, y

    border_polygon = coordinates_for_segment(border_polygon, page_image, page_xywh)
    border_polygon = list(map(clip, border_polygon))
    border_points = points_from_polygon(border_polygon)
    border = BorderType(Coords=CoordsType(border_points))
    page.set_Border(border)
    # get clipped relative coordinates for current image
    page_image, page_xywh, _ = self.workspace.image_from_page(
        page, input_file.pageId, fill='background', transparency=True)
    file_id = make_file_id(input_file, self.output_file_grp)
    file_path = self.workspace.save_image_file(
        page_image, file_id + '.IMG-CROP',
        page_id=input_file.pageId, file_grp=self.output_file_grp)
    page.add_AlternativeImage(
        AlternativeImageType(filename=file_path, comments=page_xywh['features']))
def process(self):
    """Performs region segmentation by reading mask images in pseudo-colour.

    Open and deserialize each PAGE input file (or generate from image input
    file) from the first input file group, as well as mask image file from
    the second.

    Then iterate over all connected (equally colored) mask segments and
    compute convex hull contours for them. Convert them to polygons, and
    look up their color value in ``colordict`` to instantiate the appropriate
    region types (optionally with subtype). Instantiate and annotate regions
    accordingly.

    Produce a new output file by serialising the resulting hierarchy.
    """
    colordict = self.parameter['colordict']
    if not colordict:
        LOG.info('Using default PAGE colordict')
        colordict = dict(('#' + col, name)
                         for name, col in CLASSES.items() if name)
    typedict = {"TextRegion": TextTypeSimpleType,
                "GraphicRegion": GraphicsTypeSimpleType,
                "ChartType": ChartTypeSimpleType}
    ifgs = self.input_file_grp.split(",")  # input file groups
    if len(ifgs) != 2:
        raise Exception("need 2 input file groups (base and mask)")
    # collect input file tuples
    ifts = self.zip_input_files(ifgs)  # input file tuples
    # process input file tuples
    for n, ift in enumerate(ifts):
        input_file, segmentation_file = ift
        LOG.info("processing page %s", input_file.pageId)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        # add metadata about this operation and its runtime parameters:
        metadata = pcgts.get_Metadata()  # ensured by from_file()
        metadata.add_MetadataItem(
            MetadataItemType(type_="processingStep",
                             name=self.ocrd_tool['steps'][0],
                             value=TOOL,
                             Labels=[LabelsType(
                                 externalModel="ocrd-tool",
                                 externalId="parameters",
                                 Label=[LabelType(type_=name,
                                                  value=self.parameter[name])
                                        for name in self.parameter.keys()])]))
        # import mask image
        segmentation_filename = self.workspace.download_file(
            segmentation_file).local_filename
        with pushd_popd(self.workspace.directory):
            segmentation_pil = Image.open(segmentation_filename)
        has_alpha = segmentation_pil.mode == 'RGBA'
        if has_alpha:
            colorformat = "#%08X"
        else:
            colorformat = "#%06X"
            if segmentation_pil.mode != 'RGB':
                segmentation_pil = segmentation_pil.convert('RGB')
        # convert to array
        segmentation_array = np.array(segmentation_pil)
        # collapse 3 color channels
        segmentation_array = segmentation_array.dot(
            np.array([2 ** 24, 2 ** 16, 2 ** 8, 1], np.uint32)[0 if has_alpha else 1:])
        # partition mapped colors vs background
        colors = np.unique(segmentation_array)
        bgcolors = []
        for i, color in enumerate(colors):
            colorname = colorformat % color
            if (colorname not in colordict or
                    not colordict[colorname]):
                # raise Exception("Unknown color %s (not in colordict)" % colorname)
                LOG.info("Ignoring background color %s", colorname)
                bgcolors.append(i)
        background = np.zeros_like(segmentation_array, np.uint8)
        if bgcolors:
            for i in bgcolors:
                background += np.array(segmentation_array == colors[i], np.uint8)
            colors = np.delete(colors, bgcolors, 0)
        # iterate over mask for each mapped color/class
        regionno = 0
        for color in colors:
            # get region (sub)type
            colorname = colorformat % color
            classname = colordict[colorname]
            regiontype = None
            custom = None
            if ":" in classname:
                classname, regiontype = classname.split(":")
                if classname in typedict:
                    typename = membername(typedict[classname], regiontype)
                    if typename == regiontype:
                        # not predefined in PAGE: use other + custom
                        custom = "subtype:%s" % regiontype
                        regiontype = "other"
                else:
                    custom = "subtype:%s" % regiontype
            if classname + "Type" not in globals():
                raise Exception("Unknown class '%s' for color %s in colordict" % (
                    classname, colorname))
            classtype = globals()[classname + "Type"]
            if classtype is BorderType:
                # mask from all non-background regions
                classmask = 1 - background
            else:
                # mask from current color/class
                classmask = np.array(segmentation_array == color, np.uint8)
            if not np.count_nonzero(classmask):
                continue
            # now get the contours and make polygons for them
            contours, _ = cv2.findContours(classmask,
                                           cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            for contour in contours:
                # (could also just take bounding boxes to avoid islands/inclusions...)
                area = cv2.contourArea(contour)
                # filter too small regions
                area_pct = area / np.prod(segmentation_array.shape) * 100
                if area < 100 and area_pct < 0.1:
                    LOG.warning('ignoring contour of only %.1f%% area for %s',
                                area_pct, classname)
                    continue
                LOG.info('found region %s:%s:%s with area %.1f%%',
                         classname, regiontype or '', custom or '', area_pct)
                # simplify shape
                poly = cv2.approxPolyDP(contour, 2, False)[:, 0, ::]  # already ordered x,y
                if len(poly) < 4:
                    LOG.warning('ignoring contour of only %d points (area %.1f%%) for %s',
                                len(poly), area_pct, classname)
                    continue
                if classtype is BorderType:
                    # add Border
                    page.set_Border(BorderType(
                        Coords=CoordsType(points=points_from_polygon(poly))))
                    break
                else:
                    # instantiate region
                    regionno += 1
                    region = classtype(id="region_%d" % regionno,
                                       type_=regiontype,
                                       custom=custom,
                                       Coords=CoordsType(points=points_from_polygon(poly)))
                    # add region
                    getattr(page, 'add_%s' % classname)(region)
        # Use input_file's basename for the new file -
        # this way the files retain the same basenames:
        file_id = input_file.ID.replace(ifgs[0], self.output_file_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            mimetype=MIMETYPE_PAGE,
            local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
            content=to_xml(pcgts))
def process_lines(self, textlines, maxlevel, region_image, region_coords):
    edits = 0
    lengs = 0
    for line in textlines:
        line_image, line_coords = self.workspace.image_from_segment(
            line, region_image, region_coords)
        self.logger.info("Recognizing text in line '%s'", line.id)
        if line.get_TextEquiv():
            linegt = line.TextEquiv[0].Unicode
        else:
            linegt = ''
        self.logger.debug("GT '%s': '%s'", line.id, linegt)
        # remove existing annotation below line level:
        line.set_TextEquiv([])
        line.set_Word([])
        if line_image.size[1] < 16:
            self.logger.debug("ERROR: bounding box is too narrow at line %s", line.id)
            continue
        # resize image to 48 pixel height
        final_img, scale = resize_keep_ratio(line_image)
        # process ocropy:
        try:
            linepred, clist, rlist, confidlist = recognize(
                final_img, self.pad, self.network, check=True)
        except Exception as err:
            self.logger.debug('error processing line "%s": %s', line.id, err)
            continue
        self.logger.debug("OCR '%s': '%s'", line.id, linepred)
        edits += Levenshtein.distance(linepred, linegt)
        lengs += len(linegt)
        words = [x.strip() for x in linepred.split(' ') if x.strip()]
        word_r_list = [[0]]  # r-positions of every glyph in every word
        word_conf_list = [[]]  # confidences of every glyph in every word
        if words != []:
            w_no = 0
            found_char = False
            for i, c in enumerate(clist):
                if c != ' ':
                    found_char = True
                    word_conf_list[w_no].append(confidlist[i])
                    word_r_list[w_no].append(rlist[i])
                if c == ' ' and found_char:
                    if i == 0:
                        word_r_list[0][0] = rlist[i]
                    elif i + 1 <= len(clist) - 1 and clist[i + 1] != ' ':
                        word_conf_list.append([])
                        word_r_list.append([rlist[i]])
                        w_no += 1
        else:
            word_conf_list = [[0]]
            word_r_list = [[0, line_image.width]]
        # conf for each word
        wordsconf = [(min(x) + max(x)) / 2 for x in word_conf_list]
        # conf for the line
        line_conf = (min(wordsconf) + max(wordsconf)) / 2
        # line text
        line.add_TextEquiv(TextEquivType(Unicode=linepred, conf=line_conf))
        if maxlevel in ['word', 'glyph']:
            for word_no, word_str in enumerate(words):
                word_points = points_from_polygon(
                    coordinates_for_segment(
                        np.array(polygon_from_bbox(
                            word_r_list[word_no][0] / scale, 0,
                            word_r_list[word_no][-1] / scale, 0 + line_image.height)),
                        line_image, line_coords))
                word_id = '%s_word%04d' % (line.id, word_no)
                word = WordType(id=word_id, Coords=CoordsType(word_points))
                line.add_Word(word)
                word.add_TextEquiv(TextEquivType(
                    Unicode=word_str, conf=wordsconf[word_no]))
                if maxlevel == 'glyph':
                    for glyph_no, glyph_str in enumerate(word_str):
                        glyph_points = points_from_polygon(
                            coordinates_for_segment(
                                np.array(polygon_from_bbox(
                                    word_r_list[word_no][glyph_no] / scale, 0,
                                    word_r_list[word_no][glyph_no + 1] / scale,
                                    0 + line_image.height)),
                                line_image, line_coords))
                        glyph_id = '%s_glyph%04d' % (word.id, glyph_no)
                        glyph = GlyphType(id=glyph_id, Coords=CoordsType(glyph_points))
                        word.add_Glyph(glyph)
                        glyph.add_TextEquiv(TextEquivType(
                            Unicode=glyph_str,
                            conf=word_conf_list[word_no][glyph_no]))
    return edits, lengs
def _process_element(self, element, ignore, image, coords, element_id,
                     file_id, page_id, zoom=1.0, rogroup=None):
    """Add PAGE layout elements by segmenting an image.

    Given a PageType, TableRegionType or TextRegionType ``element``, and
    a corresponding binarized PIL.Image object ``image`` with coordinate
    metadata ``coords``, run line segmentation with Ocropy.

    If operating on the full page (or table), then also detect horizontal
    and vertical separators, and aggregate the lines into text regions
    afterwards.

    Add the resulting sub-segments to the parent ``element``.

    If ``ignore`` is not empty, then first suppress all foreground components
    in any of those segments' coordinates during segmentation, and if also
    in full page/table mode, then combine all separators among them with the
    newly detected separators to guide region segmentation.
    """
    LOG = getLogger('processor.OcropySegment')
    if not image.width or not image.height:
        LOG.warning("Skipping '%s' with zero size", element_id)
        return
    element_array = pil2array(image)
    # (builtin bool/int here: np.bool/np.int are removed in newer NumPy)
    element_bin = np.array(element_array <= midrange(element_array), bool)
    sep_bin = np.zeros_like(element_bin, bool)
    ignore_labels = np.zeros_like(element_bin, int)
    for i, segment in enumerate(ignore):
        LOG.debug('masking foreground of %s "%s" for "%s"',
                  type(segment).__name__[:-4], segment.id, element_id)
        # mark these segments (e.g. separator regions, tables, images)
        # for workflows where they have been detected already;
        # these will be:
        # - ignored during text line segmentation (but not h/v-line detection)
        # - kept and reading-ordered during region segmentation (but not seps)
        segment_polygon = coordinates_of_segment(segment, image, coords)
        # If segment_polygon lies outside of element (causing
        # negative/above-max indices), either fully or partially,
        # then this will silently ignore them. The caller does
        # not need to concern herself with this.
        if isinstance(segment, SeparatorRegionType):
            sep_bin[draw.polygon(segment_polygon[:, 1],
                                 segment_polygon[:, 0],
                                 sep_bin.shape)] = True
        ignore_labels[draw.polygon(segment_polygon[:, 1],
                                   segment_polygon[:, 0],
                                   ignore_labels.shape)] = i + 1  # mapped back for RO
    if isinstance(element, PageType):
        element_name = 'page'
        fullpage = True
        report = check_page(element_bin, zoom)
    elif isinstance(element, TableRegionType) or (
            # sole/congruent text region of a table region?
            element.id.endswith('_text') and
            isinstance(element.parent_object_, TableRegionType)):
        element_name = 'table'
        fullpage = True
        report = check_region(element_bin, zoom)
    else:
        element_name = 'region'
        fullpage = False
        report = check_region(element_bin, zoom)
    LOG.info('computing line segmentation for %s "%s"', element_name, element_id)
    # TODO: we should downscale if DPI is large enough to save time
    try:
        if report:
            raise Exception(report)
        line_labels, hlines, vlines, images, colseps, scale = compute_segmentation(
            # suppress separators and ignored regions for textline estimation
            # but keep them for h/v-line detection (in fullpage mode):
            element_bin,
            seps=(sep_bin + ignore_labels) > 0,
            zoom=zoom,
            fullpage=fullpage,
            spread_dist=round(self.parameter['spread'] / zoom * 300 / 72),  # in pt
            # these are ignored when not in fullpage mode:
            maxcolseps=self.parameter['maxcolseps'],
            maxseps=self.parameter['maxseps'],
            maximages=self.parameter['maximages'] if element_name != 'table' else 0,
            csminheight=self.parameter['csminheight'],
            hlminwidth=self.parameter['hlminwidth'])
    except Exception as err:
        if isinstance(element, TextRegionType):
            LOG.error('Cannot line-segment region "%s": %s', element_id, err)
            # as a fallback, add a single text line comprising the whole region:
            element.add_TextLine(TextLineType(id=element_id + "_line",
                                              Coords=element.get_Coords()))
        else:
            LOG.error('Cannot line-segment %s "%s": %s', element_name, element_id, err)
        return
    LOG.info('Found %d text lines for %s "%s"',
             len(np.unique(line_labels)) - 1, element_name, element_id)
    # post-process line labels
    if isinstance(element, (PageType, TableRegionType)):
        # aggregate text lines to text regions
        try:
            # pass ignored regions as "line labels with initial assignment",
            # i.e. identical line and region labels
            # to detect their reading order among the others
            # (these cannot be split or grouped together with other regions)
            line_labels = np.where(line_labels, line_labels + len(ignore), ignore_labels)
            # suppress separators/images in fg and try to use for partitioning slices
            sepmask = np.maximum(np.maximum(hlines, vlines),
                                 np.maximum(sep_bin, images))
            region_labels = lines2regions(
                element_bin, line_labels,
                rlabels=ignore_labels,
                sepmask=np.maximum(sepmask, colseps),  # add bg
                # decide horizontal vs vertical cut when gaps of similar size
                prefer_vertical=not isinstance(element, TableRegionType),
                gap_height=self.parameter['gap_height'],
                gap_width=self.parameter['gap_width'],
                scale=scale, zoom=zoom)
            LOG.info('Found %d text regions for %s "%s"',
                     len(np.unique(region_labels)) - 1, element_name, element_id)
        except Exception as err:
            LOG.error('Cannot region-segment %s "%s": %s',
                      element_name, element_id, err)
            region_labels = np.where(line_labels > len(ignore),
                                     1 + len(ignore), line_labels)
        # prepare reading order group index
        if rogroup:
            if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):
                index = 0
                # start counting from largest existing index
                for elem in (rogroup.get_RegionRefIndexed() +
                             rogroup.get_OrderedGroupIndexed() +
                             rogroup.get_UnorderedGroupIndexed()):
                    if elem.index >= index:
                        index = elem.index + 1
            else:
                index = None
        # find contours around region labels (can be non-contiguous):
        region_no = 0
        for region_label in np.unique(region_labels):
            if not region_label:
                continue  # no bg
            region_mask = region_labels == region_label
            region_line_labels = line_labels * region_mask
            region_line_labels0 = np.setdiff1d(region_line_labels, [0])
            if not np.all(region_line_labels0 > len(ignore)):
                # existing region from `ignore` merely to be ordered
                # (no new region, no actual text lines)
                region_line_labels0 = np.intersect1d(region_line_labels0, ignore_labels)
                assert len(region_line_labels0) == 1, \
                    "region label %d has both existing regions and new lines (%s)" % (
                        region_label, str(region_line_labels0))
                region = ignore[region_line_labels0[0] - 1]
                if rogroup and region.parent_object_ == element and not isinstance(
                        region, SeparatorRegionType):
                    index = page_add_to_reading_order(rogroup, region.id, index)
                LOG.debug('Region label %d is for ignored region "%s"',
                          region_label, region.id)
                continue
            # normal case: new lines inside new regions
            # remove binary-empty labels, and re-order locally
            order = morph.reading_order(region_line_labels)
            order[np.setdiff1d(region_line_labels0,
                               element_bin * region_line_labels)] = 0
            region_line_labels = order[region_line_labels]
            # avoid horizontal gaps
            region_line_labels = hmerge_line_seeds(element_bin, region_line_labels,
                                                   scale,
                                                   seps=np.maximum(sepmask, colseps))
            region_mask |= region_line_labels > 0
            # find contours for region (can be non-contiguous)
            regions, _ = masks2polygons(region_mask * region_label, element_bin,
                                        '%s "%s"' % (element_name, element_id),
                                        min_area=6000 / zoom / zoom,
                                        simplify=ignore_labels * ~(sep_bin))
            # find contours for lines (can be non-contiguous)
            lines, _ = masks2polygons(region_line_labels, element_bin,
                                      'region "%s"' % element_id,
                                      min_area=640 / zoom / zoom)
            # create new lines in new regions (allocating by intersection)
            line_polys = [Polygon(polygon) for _, polygon in lines]
            for _, region_polygon in regions:
                region_poly = prep(Polygon(region_polygon))
                # convert back to absolute (page) coordinates:
                region_polygon = coordinates_for_segment(region_polygon, image, coords)
                region_polygon = polygon_for_parent(region_polygon, element)
                if region_polygon is None:
                    LOG.warning('Ignoring extant region contour for region label %d',
                                region_label)
                    continue
                # annotate result:
                region_no += 1
                region_id = element_id + "_region%04d" % region_no
                LOG.debug('Region label %d becomes ID "%s"', region_label, region_id)
                region = TextRegionType(
                    id=region_id,
                    Coords=CoordsType(points=points_from_polygon(region_polygon)))
                # find out which line (contours) belong to which region (contours)
                line_no = 0
                for i, line_poly in enumerate(line_polys):
                    if not region_poly.intersects(line_poly):  # .contains
                        continue
                    line_label, line_polygon = lines[i]
                    # convert back to absolute (page) coordinates:
                    line_polygon = coordinates_for_segment(line_polygon, image, coords)
                    line_polygon = polygon_for_parent(line_polygon, region)
                    if line_polygon is None:
                        LOG.warning('Ignoring extant line contour for region label %d '
                                    'line label %d', region_label, line_label)
                        continue
                    # annotate result:
                    line_no += 1
                    line_id = region_id + "_line%04d" % line_no
                    LOG.debug('Line label %d becomes ID "%s"', line_label, line_id)
                    line = TextLineType(
                        id=line_id,
                        Coords=CoordsType(points=points_from_polygon(line_polygon)))
                    region.add_TextLine(line)
                # if the region has received text lines, keep it
                if region.get_TextLine():
                    element.add_TextRegion(region)
                    LOG.info('Added region "%s" with %d lines for %s "%s"',
                             region_id, line_no, element_name, element_id)
                    if rogroup:
                        index = page_add_to_reading_order(rogroup, region.id, index)
        # add additional image/non-text regions from compute_segmentation
        # (e.g. drop-capitals or images) ...
        image_labels, num_images = morph.label(images)
        LOG.info('Found %d large non-text/image regions for %s "%s"',
                 num_images, element_name, element_id)
        # find contours around region labels (can be non-contiguous):
        image_polygons, _ = masks2polygons(image_labels, element_bin,
                                           '%s "%s"' % (element_name, element_id))
        for image_label, polygon in image_polygons:
            # convert back to absolute (page) coordinates:
            region_polygon = coordinates_for_segment(polygon, image, coords)
            region_polygon = polygon_for_parent(region_polygon, element)
            if region_polygon is None:
                LOG.warning('Ignoring extant region contour for image label %d',
                            image_label)
                continue
            region_no += 1
            # annotate result:
            region_id = element_id + "_image%04d" % region_no
            element.add_ImageRegion(ImageRegionType(
                id=region_id,
                Coords=CoordsType(points=points_from_polygon(region_polygon))))
        # split rulers into separator regions:
        hline_labels, num_hlines = morph.label(hlines)
        vline_labels, num_vlines = morph.label(vlines)
        LOG.info('Found %d/%d h/v-lines for %s "%s"',
                 num_hlines, num_vlines, element_name, element_id)
        # find contours around region labels (can be non-contiguous):
        hline_polygons, _ = masks2polygons(hline_labels, element_bin,
                                           '%s "%s"' % (element_name, element_id))
        vline_polygons, _ = masks2polygons(vline_labels, element_bin,
                                           '%s "%s"' % (element_name, element_id))
        for _, polygon in hline_polygons + vline_polygons:
            # convert back to absolute (page) coordinates:
            region_polygon = coordinates_for_segment(polygon, image, coords)
            region_polygon = polygon_for_parent(region_polygon, element)
            if region_polygon is None:
                LOG.warning('Ignoring extant region contour for separator')
                continue
            # annotate result:
            region_no += 1
            region_id = element_id + "_sep%04d" % region_no
            element.add_SeparatorRegion(SeparatorRegionType(
                id=region_id,
                Coords=CoordsType(points=points_from_polygon(region_polygon))))
        # annotate a text/image-separated image
        element_array[sepmask] = np.amax(element_array)  # clip to white/bg
        image_clipped = array2pil(element_array)
        file_path = self.workspace.save_image_file(
            image_clipped, file_id + '.IMG-CLIP',
            page_id=page_id, file_grp=self.output_file_grp)
        element.add_AlternativeImage(AlternativeImageType(
            filename=file_path, comments=coords['features'] + ',clipped'))
    else:
        # get mask from region polygon:
        region_polygon = coordinates_of_segment(element, image, coords)
        region_mask = np.zeros_like(element_bin, bool)
        region_mask[draw.polygon(region_polygon[:, 1],
                                 region_polygon[:, 0],
                                 region_mask.shape)] = True
        # ensure the new line labels do not extrude from the region:
        line_labels = line_labels * region_mask
        # find contours around labels (can be non-contiguous):
        line_polygons, _ = masks2polygons(line_labels, element_bin,
                                          'region "%s"' % element_id,
                                          min_area=640 / zoom / zoom)
        line_no = 0
        for line_label, polygon in line_polygons:
            # convert back to absolute (page) coordinates:
            line_polygon = coordinates_for_segment(polygon, image, coords)
            line_polygon = polygon_for_parent(line_polygon, element)
            if line_polygon is None:
                LOG.warning('Ignoring extant line contour for line label %d',
                            line_label)
                continue
            # annotate result:
            line_no += 1
            line_id = element_id + "_line%04d" % line_no
            element.add_TextLine(TextLineType(
                id=line_id,
                Coords=CoordsType(points=points_from_polygon(line_polygon))))
        if not sep_bin.any():
            return  # no derived image
        # annotate a text/image-separated image
        element_array[sep_bin] = np.amax(element_array)  # clip to white/bg
        image_clipped = array2pil(element_array)
        file_path = self.workspace.save_image_file(
            image_clipped, file_id + '.IMG-CLIP',
            page_id=page_id, file_grp=self.output_file_grp)
        # update PAGE (reference the image file):
        element.add_AlternativeImage(AlternativeImageType(
            filename=file_path, comments=coords['features'] + ',clipped'))
def _plausibilize_group(regionspolys, rogroup, mark_for_deletion, mark_for_merging):
    wait_for_deletion = list()
    reading_order = dict()
    ordered = False
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):
        regionrefs = (rogroup.get_RegionRefIndexed() +
                      rogroup.get_OrderedGroupIndexed() +
                      rogroup.get_UnorderedGroupIndexed())
        ordered = True
    if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)):
        regionrefs = (rogroup.get_RegionRef() +
                      rogroup.get_OrderedGroup() +
                      rogroup.get_UnorderedGroup())
    for elem in regionrefs:
        reading_order[elem.get_regionRef()] = elem
        if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):
            # recursive reading order element (un/ordered group):
            _plausibilize_group(regionspolys, elem, mark_for_deletion, mark_for_merging)
    for regionpoly in regionspolys:
        delete = regionpoly.region.id in mark_for_deletion
        merge = regionpoly.region.id in mark_for_merging
        if delete or merge:
            region = regionpoly.region
            poly = regionpoly.polygon
            if merge:
                # merge region with super region:
                superreg = mark_for_merging[region.id]
                # granularity will necessarily be lost here --
                # this is not for workflows/processors that already
                # provide good/correct segmentation and reading order
                # (in which case orientation, script and style detection
                # can be expected as well), but rather as a postprocessor
                # for suboptimal segmentation (possibly before reading order
                # detection/correction); hence, all we now do here is
                # show warnings when granularity is lost; but there might
                # be good reasons to do more here when we have better processors
                # and use-cases in the future
                superpoly = Polygon(polygon_from_points(superreg.get_Coords().points))
                superpoly = superpoly.union(poly)
                superreg.get_Coords().points = points_from_polygon(
                    superpoly.exterior.coords)
                # FIXME should we merge/mix attributes and features?
                if region.get_orientation() != superreg.get_orientation():
                    LOG.warning('Merging region "%s" with orientation %f into "%s" with %f',
                                region.id, region.get_orientation(),
                                superreg.id, superreg.get_orientation())
                if region.get_type() != superreg.get_type():
                    LOG.warning('Merging region "%s" with type %s into "%s" with %s',
                                region.id, region.get_type(),
                                superreg.id, superreg.get_type())
                if region.get_primaryScript() != superreg.get_primaryScript():
                    LOG.warning('Merging region "%s" with primaryScript %s into "%s" with %s',
                                region.id, region.get_primaryScript(),
                                superreg.id, superreg.get_primaryScript())
                if region.get_primaryLanguage() != superreg.get_primaryLanguage():
                    LOG.warning('Merging region "%s" with primaryLanguage %s into "%s" with %s',
                                region.id, region.get_primaryLanguage(),
                                superreg.id, superreg.get_primaryLanguage())
                if region.get_TextStyle():
                    LOG.warning('Merging region "%s" with TextStyle %s into "%s" with %s',
                                region.id, region.get_TextStyle(),  # FIXME needs repr...
                                superreg.id, superreg.get_TextStyle())  # ...to be informative
                if region.get_TextEquiv():
                    LOG.warning('Merging region "%s" with TextEquiv %s into "%s" with %s',
                                region.id, region.get_TextEquiv(),  # FIXME needs repr...
                                superreg.id, superreg.get_TextEquiv())  # ...to be informative
            wait_for_deletion.append(region)
            if region.id in reading_order:
                regionref = reading_order[region.id]
                # TODO: re-assign regionref.continuation and regionref.type to other?
                # could be any of the 6 types above:
                regionrefs = rogroup.__getattribute__(
                    regionref.__class__.__name__.replace('Type', ''))
                # remove in-place
                regionrefs.remove(regionref)
                if ordered:
                    # re-index the reading order!
                    regionrefs.sort(key=RegionRefIndexedType.get_index)
                    for i, regionref in enumerate(regionrefs):
                        regionref.set_index(i)
    for region in wait_for_deletion:
        if region.parent_object_:
            # remove in-place
            region.parent_object_.get_TextRegion().remove(region)
def process(self):
    """Performs word segmentation with Tesseract on the workspace.

    Open and deserialize PAGE input files and their respective images,
    then iterate over the element hierarchy down to the textline level,
    and remove any existing Word elements (unless ``overwrite_words``
    is False).

    Set up Tesseract to detect words, and add each one to the line
    at the detected coordinates.

    Produce a new output file by serialising the resulting hierarchy.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    overwrite_words = self.parameter['overwrite_words']
    with PyTessBaseAPI(psm=PSM.SINGLE_LINE, path=TESSDATA_PREFIX) as tessapi:
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            # add metadata about this operation and its runtime parameters:
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(type_="processingStep",
                                 name=self.ocrd_tool['steps'][0],
                                 value=TOOL,
                                 Labels=[LabelsType(
                                     externalModel="ocrd-tool",
                                     externalId="parameters",
                                     Label=[LabelType(type_=name,
                                                      value=self.parameter[name])
                                            for name in self.parameter.keys()])]))
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id)
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info("Page '%s' images will use %d DPI from parameter override",
                         page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
                LOG.info("Page '%s' images will use %d DPI from image meta-data",
                         page_id, dpi)
            else:
                dpi = 0
                LOG.info("Page '%s' images will use DPI estimated from segmentation",
                         page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))
            for region in page.get_TextRegion():
                region_image, region_coords = self.workspace.image_from_segment(
                    region, page_image, page_coords)
                for line in region.get_TextLine():
                    if line.get_Word():
                        if overwrite_words:
                            LOG.info('removing existing Words in line "%s"', line.id)
                            line.set_Word([])
                        else:
                            LOG.warning('keeping existing Words in line "%s"', line.id)
                    LOG.debug("Detecting words in line '%s'", line.id)
                    line_image, line_coords = self.workspace.image_from_segment(
                        line, region_image, region_coords)
                    tessapi.SetImage(line_image)
                    for word_no, component in enumerate(
                            tessapi.GetComponentImages(RIL.WORD, True, raw_image=True)):
                        word_id = '%s_word%04d' % (line.id, word_no)
                        word_polygon = polygon_from_xywh(component[1])
                        word_polygon = coordinates_for_segment(word_polygon,
                                                               line_image, line_coords)
                        word_points = points_from_polygon(word_polygon)
                        line.add_Word(WordType(
                            id=word_id, Coords=CoordsType(word_points)))
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                content=to_xml(pcgts))
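# The GetComponentImages tuple's second element is an x/y/w/h dict; the
# polygon_from_xywh utility expands that box to a 4-point polygon. A quick
# standalone sketch of the two helpers chained, assuming ocrd_utils (the box
# values here are made up):
from ocrd_utils import polygon_from_xywh, points_from_polygon

word_polygon = polygon_from_xywh({'x': 5, 'y': 7, 'w': 20, 'h': 10})
assert points_from_polygon(word_polygon) == '5,7 25,7 25,17 5,17'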
def process(self):
    log = getLogger('processor.OcrdSbbTextlineDetectorRecognize')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        log.info("INPUT FILE %i / %s", n, input_file)
        file_id = make_file_id(input_file, self.output_file_grp)
        # Process the files
        try:
            os.mkdir(self.output_file_grp)
        except FileExistsError:
            pass
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id,
            feature_filter='cropped,binarized,grayscale_normalized')
        with tempfile.TemporaryDirectory() as tmp_dirname:
            # Save the image
            image_file = tempfile.mkstemp(dir=tmp_dirname, suffix='.png')[1]
            page_image.save(image_file)
            # Segment the image
            model = self.parameter['model']
            x = textline_detector(image_file, tmp_dirname, file_id, model)
            x.run()
            # Read segmentation results
            tmp_filename = os.path.join(tmp_dirname, file_id) + '.xml'
            tmp_pcgts = ocrd_models.ocrd_page.parse(tmp_filename, silence=True)
            tmp_page = tmp_pcgts.get_Page()
        # Create a new PAGE file from the input file
        pcgts.set_pcGtsId(file_id)
        page = pcgts.get_Page()
        # Merge results → PAGE file
        # 1. Border
        if page.get_Border():
            log.warning("Page already contained a border")
        # We need to translate the coordinates:
        text_border = tmp_page.get_Border()
        coords = text_border.get_Coords().get_points()
        polygon = polygon_from_points(coords)
        polygon_new = coordinates_for_segment(polygon, page_image, page_coords)
        points_new = points_from_polygon(polygon_new)
        coords_new = CoordsType(points=points_new)
        text_border.set_Coords(coords_new)
        page.set_Border(text_border)
        # 2. ReadingOrder
        if page.get_ReadingOrder():
            log.warning("Page already contained a reading order")
        page.set_ReadingOrder(tmp_page.get_ReadingOrder())
        # 3. TextRegion
        if page.get_TextRegion():
            log.warning("Page already contained text regions")
        # We need to translate the coordinates:
        text_regions_new = []
        for text_region in tmp_page.get_TextRegion():
            coords = text_region.get_Coords().get_points()
            polygon = polygon_from_points(coords)
            polygon_new = coordinates_for_segment(polygon, page_image, page_coords)
            points_new = points_from_polygon(polygon_new)
            coords_new = CoordsType(points=points_new)
            text_region.set_Coords(coords_new)
            text_regions_new.append(text_region)
        page.set_TextRegion(text_regions_new)
        # Save metadata about this operation
        metadata = pcgts.get_Metadata()
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[LabelsType(
                    externalModel="ocrd-tool",
                    externalId="parameters",
                    Label=[LabelType(type_=name, value=self.parameter[name])
                           for name in self.parameter.keys()])]))
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=page_id,
            mimetype='application/vnd.prima.page+xml',
            local_filename=os.path.join(self.output_file_grp, file_id) + '.xml',
            content=ocrd_models.ocrd_page.to_xml(pcgts))
def process(self):
    """Performs (text) line segmentation with Tesseract on the workspace.

    Open and deserialize PAGE input files and their respective images,
    then iterate over the element hierarchy down to the (text) region level,
    and remove any existing TextLine elements (unless ``overwrite_lines``
    is False).

    Set up Tesseract to detect lines, and add each one to the region
    at the detected coordinates.

    Produce a new output file by serialising the resulting hierarchy.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)

    overwrite_lines = self.parameter['overwrite_lines']

    with PyTessBaseAPI(psm=PSM.SINGLE_BLOCK, path=TESSDATA_PREFIX) as tessapi:
        for n, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            # add metadata about this operation and its runtime parameters:
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(type_="processingStep",
                                 name=self.ocrd_tool['steps'][0],
                                 value=TOOL,
                                 Labels=[LabelsType(
                                     externalModel="ocrd-tool",
                                     externalId="parameters",
                                     Label=[LabelType(type_=name, value=self.parameter[name])
                                            for name in self.parameter.keys()])]))
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id)
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
                LOG.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi)
            else:
                dpi = 0
                LOG.info("Page '%s' images will use DPI estimated from segmentation", page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))

            # iterate over top-level text regions and text regions nested in tables:
            for region in itertools.chain.from_iterable(
                    [page.get_TextRegion()] +
                    [subregion.get_TextRegion()
                     for subregion in page.get_TableRegion()]):
                if region.get_TextLine():
                    if overwrite_lines:
                        LOG.info('removing existing TextLines in region "%s"', region.id)
                        region.set_TextLine([])
                    else:
                        LOG.warning('keeping existing TextLines in region "%s"', region.id)
                LOG.debug("Detecting lines in region '%s'", region.id)
                region_image, region_coords = self.workspace.image_from_segment(
                    region, page_image, page_coords)
                region_polygon = coordinates_of_segment(region, region_image, region_coords)
                region_poly = Polygon(region_polygon)
                tessapi.SetImage(region_image)
                for line_no, component in enumerate(
                        tessapi.GetComponentImages(RIL.TEXTLINE, True, raw_image=True)):
                    line_id = '%s_line%04d' % (region.id, line_no)
                    line_polygon = polygon_from_xywh(component[1])
                    line_poly = Polygon(line_polygon)
                    if not line_poly.within(region_poly):
                        # this could happen due to rotation
                        interline = line_poly.intersection(region_poly)
                        if interline.is_empty:
                            continue  # ignore this line
                        if hasattr(interline, 'geoms'):
                            # is (heterogeneous) GeometryCollection
                            area = 0
                            for geom in interline.geoms:
                                if geom.area > area:
                                    area = geom.area
                                    interline = geom
                            if not area:
                                continue
                        line_poly = interline.convex_hull
                        line_polygon = line_poly.exterior.coords
                    line_polygon = coordinates_for_segment(line_polygon, region_image, region_coords)
                    line_points = points_from_polygon(line_polygon)
                    region.add_TextLine(TextLineType(
                        id=line_id, Coords=CoordsType(line_points)))

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                content=to_xml(pcgts))
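# A minimal shapely sketch of the clipping strategy above: when a detected
# line leaks outside its region polygon (e.g. after deskewing), keep only the
# largest piece of the intersection and use its convex hull as the new line
# outline. Toy polygons only, not taken from any actual page:
from shapely.geometry import Polygon

region = Polygon([(0, 0), (100, 0), (100, 40), (0, 40)])
line = Polygon([(-10, 5), (110, 5), (110, 15), (-10, 15)])  # overshoots left/right

interline = line.intersection(region)
if hasattr(interline, 'geoms'):  # MultiPolygon/GeometryCollection
    interline = max(interline.geoms, key=lambda geom: geom.area)
line_poly = interline.convex_hull
print(list(line_poly.exterior.coords))  # clipped to 0 <= x <= 100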
def process(self):
    """Extract page image and replace original with it.

    Open and deserialize PAGE input files and their respective images,
    then go to the page hierarchy level.

    Retrieve the image of the (cropped, deskewed, dewarped) page, preferring
    the last annotated form (which, depending on the workflow, could be
    binarized or raw). Add that image file to the workspace with the fileGrp
    USE given in the second position of the output fileGrp, or
    ``OCR-D-IMG-SUBST``. Reference that file in the page (not as
    AlternativeImage but) as original image. Adjust all segment coordinates
    accordingly.

    Produce a new output file by serialising the resulting hierarchy.
    """
    try:
        page_grp, image_grp = self.output_file_grp.split(',')
    except ValueError:
        page_grp = self.output_file_grp
        image_grp = FALLBACK_FILEGRP_IMG
        LOG.info("No output file group for images specified, falling back to '%s'", image_grp)
    feature_selector = self.parameter['feature_selector']
    feature_filter = self.parameter['feature_filter']
    adapt_coords = self.parameter['transform_coordinates']
    # pylint: disable=attribute-defined-outside-init
    for n, input_file in enumerate(self.input_files):
        file_id = input_file.ID.replace(self.input_file_grp, page_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(page_grp, n)
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        metadata = pcgts.get_Metadata()  # ensured by from_file()
        metadata.add_MetadataItem(
            MetadataItemType(type_="processingStep",
                             name=self.ocrd_tool['steps'][0],
                             value=TOOL,
                             Labels=[LabelsType(
                                 externalModel="ocrd-tool",
                                 externalId="parameters",
                                 Label=[LabelType(type_=name, value=self.parameter[name])
                                        for name in self.parameter])]))
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id,
            feature_filter=feature_filter,
            feature_selector=feature_selector)
        # determine DPI from image meta-data (currently not used below):
        if page_image_info.resolution != 1:
            dpi = page_image_info.resolution
            if page_image_info.resolutionUnit == 'cm':
                dpi = round(dpi * 2.54)
        else:
            dpi = None
        # annotate extracted image
        file_path = self.workspace.save_image_file(
            page_image, file_id.replace(page_grp, image_grp), image_grp,
            page_id=input_file.pageId, mimetype='image/png')
        # replace original image
        page.set_imageFilename(file_path)
        # adjust all coordinates
        if adapt_coords:
            for region in page.get_AllRegions():
                region_polygon = coordinates_of_segment(region, page_image, page_coords)
                region.get_Coords().points = points_from_polygon(region_polygon)
                if isinstance(region, TextRegionType):
                    for line in region.get_TextLine():
                        line_polygon = coordinates_of_segment(line, page_image, page_coords)
                        line.get_Coords().points = points_from_polygon(line_polygon)
                        for word in line.get_Word():
                            word_polygon = coordinates_of_segment(word, page_image, page_coords)
                            word.get_Coords().points = points_from_polygon(word_polygon)
                            for glyph in word.get_Glyph():
                                glyph_polygon = coordinates_of_segment(glyph, page_image, page_coords)
                                glyph.get_Coords().points = points_from_polygon(glyph_polygon)
        # update METS (add the PAGE file):
        file_path = os.path.join(page_grp, file_id + '.xml')
        out = self.workspace.add_file(
            ID=file_id,
            file_grp=page_grp,
            pageId=input_file.pageId,
            local_filename=file_path,
            mimetype=MIMETYPE_PAGE,
            content=to_xml(pcgts))
        LOG.info('created file ID: %s, file_grp: %s, path: %s',
                 file_id, page_grp, out.local_filename)
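# The coordinate fix-up above repeats the same two calls at four nesting
# levels (region, line, word, glyph). A sketch of a generic walker that
# flattens the PAGE hierarchy so the fix-up becomes a single loop;
# walk_segments is hypothetical, not an OCR-D helper, and only assumes the
# usual PAGE accessor methods seen in the code above:
def walk_segments(region):
    """Yield a region and all its lines, words and glyphs, depth-first."""
    yield region
    if not hasattr(region, 'get_TextLine'):
        return  # non-text regions have no nested segments
    for line in region.get_TextLine():
        yield line
        for word in line.get_Word():
            yield word
            yield from word.get_Glyph()

# usage sketch:
# for segment in walk_segments(region):
#     polygon = coordinates_of_segment(segment, page_image, page_coords)
#     segment.get_Coords().points = points_from_polygon(polygon)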
def process(self):
    """Performs word segmentation with Tesseract on the workspace.

    Open and deserialize PAGE input files and their respective images,
    then iterate over the element hierarchy down to the textline level,
    and remove any existing Word elements (unless ``overwrite_words``
    is False).

    Set up Tesseract to detect words, and add each one to the line
    at the detected coordinates.

    Produce a new output file by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.TesserocrSegmentWord')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)

    overwrite_words = self.parameter['overwrite_words']

    with PyTessBaseAPI(psm=PSM.SINGLE_LINE, path=TESSDATA_PREFIX) as tessapi:
        for n, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id)
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
                LOG.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi)
            else:
                dpi = 0
                LOG.info("Page '%s' images will use DPI estimated from segmentation", page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))

            for region in page.get_TextRegion():
                region_image, region_coords = self.workspace.image_from_segment(
                    region, page_image, page_coords)
                for line in region.get_TextLine():
                    if line.get_Word():
                        if overwrite_words:
                            LOG.info('removing existing Words in line "%s"', line.id)
                            line.set_Word([])
                        else:
                            LOG.warning('keeping existing Words in line "%s"', line.id)
                    LOG.debug("Detecting words in line '%s'", line.id)
                    line_image, line_coords = self.workspace.image_from_segment(
                        line, region_image, region_coords)
                    tessapi.SetImage(line_image)
                    for word_no, component in enumerate(
                            tessapi.GetComponentImages(RIL.WORD, True, raw_image=True)):
                        word_id = '%s_word%04d' % (line.id, word_no)
                        word_polygon = polygon_from_xywh(component[1])
                        word_polygon = coordinates_for_segment(word_polygon, line_image, line_coords)
                        # clip to the parent line polygon (keep the points for logging):
                        word_polygon2 = polygon_for_parent(word_polygon, line)
                        if word_polygon2 is not None:
                            word_polygon = word_polygon2
                        word_points = points_from_polygon(word_polygon)
                        if word_polygon2 is None:
                            # could happen due to rotation
                            LOG.info('Ignoring extant word: %s', word_points)
                            continue
                        line.add_Word(WordType(
                            id=word_id, Coords=CoordsType(word_points)))

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                content=to_xml(pcgts))
def process(self):
    """
    Performs the recognition.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)

    self._init_calamari()

    for n, input_file in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        log.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id)

        for region in page.get_TextRegion():
            region_image, region_xywh = self.workspace.image_from_segment(
                region, page_image, page_xywh)

            textlines = region.get_TextLine()
            log.info("About to recognize %i lines of region '%s'",
                     len(textlines), region.id)
            for line in textlines:
                log.debug("Recognizing line '%s' in region '%s'", line.id, region.id)

                line_image, line_coords = self.workspace.image_from_segment(
                    line, region_image, region_xywh)
                line_image_np = np.array(line_image, dtype=np.uint8)

                raw_results = list(self.predictor.predict_raw(
                    [line_image_np], progress_bar=False))[0]
                for i, p in enumerate(raw_results):
                    p.prediction.id = "fold_{}".format(i)

                prediction = self.voter.vote_prediction_result(raw_results)
                prediction.id = "voted"

                # Build line text on our own.
                #
                # Calamari does whitespace post-processing on prediction.sentence,
                # while it does not do the same on prediction.positions. Do it on
                # our own to have consistency.
                #
                # XXX Check Calamari's built-in post-processing on prediction.sentence

                def _sort_chars(p):
                    """Filter and sort chars of prediction p"""
                    chars = p.chars
                    chars = [c for c in chars if c.char]
                    # XXX Note that omission probabilities are not normalized?!
                    chars = [c for c in chars
                             if c.probability >= self.parameter['glyph_conf_cutoff']]
                    chars = sorted(chars, key=lambda k: k.probability, reverse=True)
                    return chars

                def _drop_leading_spaces(positions):
                    return list(itertools.dropwhile(
                        lambda p: _sort_chars(p)[0].char == " ", positions))

                def _drop_trailing_spaces(positions):
                    return list(reversed(_drop_leading_spaces(reversed(positions))))

                def _drop_double_spaces(positions):
                    def _drop_double_spaces_generator(positions):
                        last_was_space = False
                        for p in positions:
                            if p.chars[0].char == " ":
                                if not last_was_space:
                                    yield p
                                last_was_space = True
                            else:
                                yield p
                                last_was_space = False
                    return list(_drop_double_spaces_generator(positions))

                positions = prediction.positions
                positions = _drop_leading_spaces(positions)
                positions = _drop_trailing_spaces(positions)
                positions = _drop_double_spaces(positions)
                positions = list(positions)

                line_text = ''.join(_sort_chars(p)[0].char for p in positions)
                if line_text != prediction.sentence:
                    log.warning("Our own line text is not the same as Calamari's: '%s' != '%s'",
                                line_text, prediction.sentence)

                # Delete existing results
                if line.get_TextEquiv():
                    log.warning("Line '%s' already contained text results", line.id)
                line.set_TextEquiv([])
                if line.get_Word():
                    log.warning("Line '%s' already contained word segmentation", line.id)
                line.set_Word([])

                # Save line results
                line_conf = prediction.avg_char_probability
                line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])

                # Save word results
                #
                # Calamari OCR does not provide word positions, so we infer word
                # positions from a. text segmentation and b. the glyph positions.
                # This is necessary because the PAGE XML format enforces a strict
                # hierarchy of lines > words > glyphs.

                def _words(s):
                    """Split words based on spaces and include spaces as 'words'"""
                    spaces = None
                    word = ''
                    for c in s:
                        if c == ' ' and spaces is True:
                            word += c
                        elif c != ' ' and spaces is False:
                            word += c
                        else:
                            if word:
                                yield word
                            word = c
                            spaces = (c == ' ')
                    yield word

                if self.parameter['textequiv_level'] in ['word', 'glyph']:
                    word_no = 0
                    i = 0
                    for word_text in _words(line_text):
                        word_length = len(word_text)
                        if not all(c == ' ' for c in word_text):
                            word_positions = positions[i:i + word_length]
                            word_start = word_positions[0].global_start
                            word_end = word_positions[-1].global_end

                            polygon = polygon_from_x0y0x1y1(
                                [word_start, 0, word_end, line_image.height])
                            points = points_from_polygon(
                                coordinates_for_segment(polygon, None, line_coords))
                            # XXX Crop to line polygon?

                            word = WordType(id='%s_word%04d' % (line.id, word_no),
                                            Coords=CoordsType(points))
                            word.add_TextEquiv(TextEquivType(Unicode=word_text))

                            if self.parameter['textequiv_level'] == 'glyph':
                                for glyph_no, p in enumerate(word_positions):
                                    glyph_start = p.global_start
                                    glyph_end = p.global_end

                                    polygon = polygon_from_x0y0x1y1(
                                        [glyph_start, 0, glyph_end, line_image.height])
                                    points = points_from_polygon(
                                        coordinates_for_segment(polygon, None, line_coords))

                                    glyph = GlyphType(id='%s_glyph%04d' % (word.id, glyph_no),
                                                      Coords=CoordsType(points))

                                    # Add predictions (= TextEquivs)
                                    char_index_start = 1
                                    # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                    for char_index, char in enumerate(_sort_chars(p),
                                                                      start=char_index_start):
                                        glyph.add_TextEquiv(TextEquivType(
                                            Unicode=char.char,
                                            index=char_index,
                                            conf=char.probability))

                                    word.add_Glyph(glyph)

                            line.add_Word(word)
                            word_no += 1

                        i += word_length

        _page_update_higher_textequiv_levels('line', pcgts)

        # Add metadata about this operation and its runtime parameters:
        metadata = pcgts.get_Metadata()  # ensured by from_file()
        metadata.add_MetadataItem(
            MetadataItemType(type_="processingStep",
                             name=self.ocrd_tool['steps'][0],
                             value=TOOL,
                             Labels=[LabelsType(
                                 externalModel="ocrd-tool",
                                 externalId="parameters",
                                 Label=[LabelType(type_=name, value=self.parameter[name])
                                        for name in self.parameter.keys()])]))

        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            mimetype=MIMETYPE_PAGE,
            local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
            content=to_xml(pcgts))
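# Usage sketch for the _words tokenizer above: runs of spaces come out as
# separate tokens, so summing token lengths stays aligned with the glyph
# positions list. The helper is restated verbatim so the example is
# self-contained:
def _words(s):
    spaces = None
    word = ''
    for c in s:
        if c == ' ' and spaces is True:
            word += c
        elif c != ' ' and spaces is False:
            word += c
        else:
            if word:
                yield word
            word = c
            spaces = (c == ' ')
    yield word

print(list(_words('to be  or')))  # ['to', ' ', 'be', '  ', 'or']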
def process(self):
    """Performs page cropping with Tesseract on the workspace.

    Open and deserialize PAGE input files and their respective images.
    Set up Tesseract to detect text blocks on each page, and find the
    largest coordinate extent spanning all of them. Use this extent in
    defining a Border, and add that to the page.

    Moreover, crop the original image accordingly, and reference the
    resulting image file as AlternativeImage in the Page element.

    Add the new image file to the workspace along with the output fileGrp,
    and using a file ID with suffix ``.IMG-CROP`` along with further
    identification of the input element.

    Produce new output files by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.TesserocrCrop')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)

    padding = self.parameter['padding']
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        # disable table detection here (tables count as text blocks),
        # because we do not want to risk confusing the spine with
        # a column separator and thus creeping into a neighbouring page:
        tessapi.SetVariable("textord_tabfind_find_tables", "0")
        for n, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()

            # warn of existing Border:
            border = page.get_Border()
            if border:
                left, top, right, bottom = bbox_from_points(border.get_Coords().points)
                LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                            left, top, right, bottom)

            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id,
                # image must not have been cropped already,
                # abort if no such image can be produced:
                feature_filter='cropped')
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
                LOG.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi)
            else:
                dpi = 0
                LOG.info("Page '%s' images will use DPI estimated from segmentation", page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))
                zoom = 300 / dpi
            else:
                zoom = 1

            # warn of existing segmentation:
            regions = page.get_TextRegion()
            if regions:
                min_x = page_image.width
                min_y = page_image.height
                max_x = 0
                max_y = 0
                for region in regions:
                    left, top, right, bottom = bbox_from_points(region.get_Coords().points)
                    min_x = min(min_x, left)
                    min_y = min(min_y, top)
                    max_x = max(max_x, right)
                    max_y = max(max_y, bottom)
                LOG.warning('Ignoring extent from existing TextRegions: %i:%i,%i:%i',
                            min_x, max_x, min_y, max_y)

            LOG.debug("Cropping with Tesseract")
            tessapi.SetImage(page_image)
            # PSM.SPARSE_TEXT: get as much text as possible in no particular order
            # PSM.AUTO (default): includes tables (dangerous)
            tessapi.SetPageSegMode(tesserocr.PSM.SPARSE_TEXT)
            #
            # helper variables for saving the box coordinates
            #
            min_x = page_image.width
            min_y = page_image.height
            max_x = 0
            max_y = 0
            # iterate over all text blocks and compare their
            # bbox extent to the running min and max values
            for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
                image, xywh, index, _ = component
                #
                # the region reference in the reading order element
                #
                ID = "region%04d" % index
                left, top, right, bottom = bbox_from_xywh(xywh)
                LOG.debug("Detected text region '%s': %i:%i,%i:%i",
                          ID, left, right, top, bottom)
                # filter region results:
                bin_bbox = image.getbbox()
                if not bin_bbox:
                    # this does happen!
                    LOG.info("Ignoring region '%s' because its binarization is empty", ID)
                    continue
                width = bin_bbox[2] - bin_bbox[0]
                if width < 25 / zoom:
                    # we must be conservative here: page numbers are tiny regions, too!
                    LOG.info("Ignoring region '%s' because its width is too small (%d)", ID, width)
                    continue
                height = bin_bbox[3] - bin_bbox[1]
                if height < 25 / zoom:
                    # we must be conservative here: page numbers are tiny regions, too!
                    LOG.debug("Ignoring region '%s' because its height is too small (%d)", ID, height)
                    continue
                min_x = min(min_x, left)
                min_y = min(min_y, top)
                max_x = max(max_x, right)
                max_y = max(max_y, bottom)
                LOG.info("Updated page border: %i:%i,%i:%i", min_x, max_x, min_y, max_y)

            file_id = make_file_id(input_file, self.output_file_grp)
            #
            # set the identified page border
            #
            if min_x < max_x and min_y < max_y:
                # add padding:
                min_x = max(min_x - padding, 0)
                max_x = min(max_x + padding, page_image.width)
                min_y = max(min_y - padding, 0)
                max_y = min(max_y + padding, page_image.height)
                LOG.info("Padded page border: %i:%i,%i:%i", min_x, max_x, min_y, max_y)
                polygon = polygon_from_bbox(min_x, min_y, max_x, max_y)
                polygon = coordinates_for_segment(polygon, page_image, page_xywh)
                polygon = polygon_for_parent(polygon, page)
                border = BorderType(Coords=CoordsType(points_from_polygon(polygon)))
                # intersection with parent could have changed bbox, so recalculate:
                bbox = bbox_from_polygon(coordinates_of_segment(border, page_image, page_xywh))
                # update PAGE (annotate border):
                page.set_Border(border)
                # update METS (add the image file):
                page_image = crop_image(page_image, box=bbox)
                page_xywh['features'] += ',cropped'
                file_path = self.workspace.save_image_file(
                    page_image, file_id + '.IMG-CROP',
                    page_id=input_file.pageId,
                    file_grp=self.output_file_grp)
                # update PAGE (reference the image file):
                page.add_AlternativeImage(AlternativeImageType(
                    filename=file_path, comments=page_xywh['features']))
            else:
                LOG.error("Cannot find valid extent for page '%s'", page_id)

            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                content=to_xml(pcgts))
def _process_segment(self, page_image, page, page_xywh, page_id,
                     input_file, n, mrcnn_model, class_names, mask):
    LOG = getLogger('OcrdAnybaseocrBlockSegmenter')
    # check for existing text regions and whether to overwrite them
    border = None
    if page.get_TextRegion():
        if self.parameter['overwrite']:
            LOG.info('removing existing TextRegions in page "%s"', page_id)
            page.set_TextRegion([])
        else:
            LOG.warning('keeping existing TextRegions in page "%s"', page_id)
            return
    # check if border exists
    if page.get_Border():
        border_coords = page.get_Border().get_Coords()
        border_points = polygon_from_points(border_coords.get_points())
        border = Polygon(border_points)

    img_array = ocrolib.pil2array(page_image)
    if len(img_array.shape) <= 2:
        img_array = np.stack((img_array,) * 3, axis=-1)
    results = mrcnn_model.detect([img_array], verbose=1)
    r = results[0]
    # note: Mask R-CNN ROIs are (y1, x1, y2, x2), so the names min_x/min_y
    # below actually hold row/column values; the polygon construction
    # further down compensates for this by flipping the coordinates.

    th = self.parameter['th']
    # check for existing segmentation mask
    # (this code executes only when use_deeplr is set to True in ocrd-tool.json)
    if mask:
        mask = ocrolib.pil2array(mask)
        mask = mask // 255
        mask = 1 - mask
        # label each bounding box area in the mask with i + 2
        for i in range(len(r['rois'])):
            min_x = r['rois'][i][0]
            min_y = r['rois'][i][1]
            max_x = r['rois'][i][2]
            max_y = r['rois'][i][3]
            mask[min_x:max_x, min_y:max_y] *= i + 2
        # check for left-over pixels and add them to the bounding boxes
        pixel_added = True
        while pixel_added:
            pixel_added = False
            left_over = np.where(mask == 1)
            for x, y in zip(left_over[0], left_over[1]):
                # note: x - th / y - th may go negative and wrap around
                local_mask = mask[x - th:x + th, y - th:y + th]
                candidates = np.where(local_mask > 1)
                candidates = [k for k in zip(candidates[0], candidates[1])]
                if len(candidates) > 0:
                    pixel_added = True
                    # find the closest already-labelled pixel
                    candidates.sort(key=lambda j: np.sqrt((j[0] - th)**2 + (j[1] - th)**2))
                    index = local_mask[candidates[0]] - 2
                    # grow bbox `index` to include (x, y)
                    if x < r['rois'][index][0]:
                        r['rois'][index][0] = x
                    elif x > r['rois'][index][2]:
                        r['rois'][index][2] = x
                    if y < r['rois'][index][1]:
                        r['rois'][index][1] = y
                    elif y > r['rois'][index][3]:
                        r['rois'][index][3] = y
                    # update the mask
                    mask[x, y] = index + 2

    # resolve overlapping bboxes
    bbox_dict = {}  # per class id, the bboxes seen so far
    class_id_check = []
    for i in range(len(r['rois'])):
        min_x = r['rois'][i][0]
        min_y = r['rois'][i][1]
        max_x = r['rois'][i][2]
        max_y = r['rois'][i][3]
        region_bbox = [min_y, min_x, max_y, max_x]
        for key in bbox_dict:
            for bbox in bbox_dict[key]:
                # checking for ymax case with vertical overlapping
                # along with y, check both for xmax and xmin
                if (region_bbox[3] <= bbox[3] and region_bbox[3] >= bbox[1]
                        and ((region_bbox[0] >= bbox[0] and region_bbox[0] <= bbox[2])
                             or (region_bbox[2] >= bbox[0] and region_bbox[2] <= bbox[2])
                             or (region_bbox[0] <= bbox[0] and region_bbox[2] >= bbox[2]))
                        and r['class_ids'][i] != 5):
                    r['rois'][i][2] = bbox[1] - 1
                # checking for ymin now
                # along with y, check both for xmax and xmin
                if (region_bbox[1] <= bbox[3] and region_bbox[1] >= bbox[1]
                        and ((region_bbox[0] >= bbox[0] and region_bbox[0] <= bbox[2])
                             or (region_bbox[2] >= bbox[0] and region_bbox[2] <= bbox[2])
                             or (region_bbox[0] <= bbox[0] and region_bbox[2] >= bbox[2]))
                        and r['class_ids'][i] != 5):
                    r['rois'][i][0] = bbox[3] + 1
        if r['class_ids'][i] not in class_id_check:
            bbox_dict[r['class_ids'][i]] = []
            class_id_check.append(r['class_ids'][i])
        bbox_dict[r['class_ids'][i]].append(region_bbox)

    # define reading order on basis of coordinates
    reading_order = []
    for i in range(len(r['rois'])):
        width, height, _ = img_array.shape
        min_x = r['rois'][i][0]
        min_y = r['rois'][i][1]
        max_x = r['rois'][i][2]
        max_y = r['rois'][i][3]
        if (min_y - 5) > width and r['class_ids'][i] == 2:
            min_y -= 5
        if (max_y + 10) < width and r['class_ids'][i] == 2:
            max_y += 10
        reading_order.append((min_y, min_x, max_y, max_x))
    reading_order = sorted(reading_order,
                           key=lambda reading_order: (reading_order[1], reading_order[0]))
    # push overlapping boxes behind their successors:
    for i in range(len(reading_order)):
        min_y, min_x, max_y, max_x = reading_order[i]
        min_y = 0
        i_poly = Polygon([[min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y]])
        for j in range(i + 1, len(reading_order)):
            min_y, min_x, max_y, max_x = reading_order[j]
            j_poly = Polygon([[min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y]])
            inter = i_poly.intersection(j_poly)
            if inter:
                reading_order.insert(j + 1, reading_order[i])
                del reading_order[i]

    # create the ReadingOrder object in PAGE-XML
    order_group = OrderedGroupType(caption="Regions reading order", id=page_id)
    for i in range(len(r['rois'])):
        min_x = r['rois'][i][0]
        min_y = r['rois'][i][1]
        max_x = r['rois'][i][2]
        max_y = r['rois'][i][3]
        if (min_y - 5) > width and r['class_ids'][i] == 2:
            min_y -= 5
        if (max_y + 10) < width and r['class_ids'][i] == 2:
            max_y += 10
        region_polygon = [[min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y]]
        if border:
            cut_region_polygon = border.intersection(Polygon(region_polygon))
            if cut_region_polygon.is_empty:
                continue
        else:
            cut_region_polygon = Polygon(region_polygon)
        order_index = reading_order.index((min_y, min_x, max_y, max_x))
        region_id = '%s_region%04d' % (page_id, i)
        regionRefIndex = RegionRefIndexedType(index=order_index, regionRef=region_id)
        order_group.add_RegionRefIndexed(regionRefIndex)
    reading_order_object = ReadingOrderType()
    reading_order_object.set_OrderedGroup(order_group)
    page.set_ReadingOrder(reading_order_object)

    for i in range(len(r['rois'])):
        width, height, _ = img_array.shape
        min_x = r['rois'][i][0]
        min_y = r['rois'][i][1]
        max_x = r['rois'][i][2]
        max_y = r['rois'][i][3]
        if (min_y - 5) > width and r['class_ids'][i] == 2:
            min_y -= 5
        if (max_y + 10) < width and r['class_ids'][i] == 2:
            max_y += 10
        # flip the coordinates into (x, y) order for the polygon:
        region_polygon = [[min_y, min_x], [max_y, min_x], [max_y, max_x], [min_y, max_x]]
        if border:
            cut_region_polygon = border.intersection(Polygon(region_polygon))
            if cut_region_polygon.is_empty:
                continue
        else:
            cut_region_polygon = Polygon(region_polygon)
        cut_region_polygon = [j for j in zip(list(cut_region_polygon.exterior.coords.xy[0]),
                                             list(cut_region_polygon.exterior.coords.xy[1]))][:-1]
        region_polygon = coordinates_for_segment(cut_region_polygon, page_image, page_xywh)
        region_points = points_from_polygon(region_polygon)
        read_order = reading_order.index((min_y, min_x, max_y, max_x))
        # extract the region image (rois are row-major, so this slices rows:cols)
        region_img = img_array[min_x:max_x, min_y:max_y]
        region_img = ocrolib.array2pil(region_img)
        file_id = make_file_id(input_file, self.output_file_grp)
        file_path = self.workspace.save_image_file(
            region_img, file_id + "_" + str(i),
            page_id=page_id, file_grp=self.output_file_grp)
        region_id = '%s_region%04d' % (page_id, i)
        coords = CoordsType(region_points)
        # annotate the region with the type matching its class id:
        if r['class_ids'][i] == 15:
            image_region = ImageRegionType(custom='readingOrder {index:' + str(read_order) + ';}',
                                           id=region_id, Coords=coords,
                                           type_=class_names[r['class_ids'][i]])
            page.add_ImageRegion(image_region)
            continue
        if r['class_ids'][i] == 16:
            table_region = TableRegionType(custom='readingOrder {index:' + str(read_order) + ';}',
                                           id=region_id, Coords=coords,
                                           type_=class_names[r['class_ids'][i]])
            page.add_TableRegion(table_region)
            continue
        if r['class_ids'][i] == 17:
            graphic_region = GraphicRegionType(custom='readingOrder {index:' + str(read_order) + ';}',
                                               id=region_id, Coords=coords,
                                               type_=class_names[r['class_ids'][i]])
            page.add_GraphicRegion(graphic_region)
            continue
        textregion = TextRegionType(custom='readingOrder {index:' + str(read_order) + ';}',
                                    id=region_id, Coords=coords,
                                    type_=class_names[r['class_ids'][i]])
        page.add_TextRegion(textregion)
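# A toy sketch of the coordinate sort used for reading order above: with
# boxes stored as (x_min, y_min, x_max, y_max), the key (b[1], b[0]) orders
# them top-to-bottom, then left-to-right, i.e. plain row-major reading
# order. Toy boxes only:
boxes = [(10, 50, 40, 90), (10, 10, 40, 40), (60, 10, 90, 40)]
order = sorted(boxes, key=lambda b: (b[1], b[0]))
print(order)  # [(10, 10, 40, 40), (60, 10, 90, 40), (10, 50, 40, 90)]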
def _process_segment(self, page_image, page, textregion, region_xywh,
                     page_id, input_file, n):
    LOG = getLogger('OcrdAnybaseocrTextline')
    # check for existing text lines and whether to overwrite them
    if textregion.get_TextLine():
        if self.parameter['overwrite']:
            LOG.info('removing existing TextLines in region "%s"', page_id)
            textregion.set_TextLine([])
        else:
            LOG.warning('keeping existing TextLines in region "%s"', page_id)
            return

    binary = ocrolib.pil2array(page_image)
    if len(binary.shape) > 2:
        binary = np.mean(binary, 2)
    binary = np.array(1 - binary / np.amax(binary), 'B')

    if self.parameter['scale'] == 0:
        scale = psegutils.estimate_scale(binary)
    else:
        scale = self.parameter['scale']

    if np.isnan(scale) or scale > 1000.0 or scale < self.parameter['minscale']:
        LOG.warning("%s: bad scale; skipping!", scale)
        return

    segmentation = self.compute_segmentation(binary, scale)
    if np.amax(segmentation) > self.parameter['maxlines']:
        LOG.warning("too many lines (%i); skipping!", np.amax(segmentation))
        return
    lines = psegutils.compute_lines(segmentation, scale)
    order = psegutils.reading_order([l.bounds for l in lines])
    lsort = psegutils.topsort(order)

    # renumber the labels so that they conform to the specs
    nlabels = np.amax(segmentation) + 1
    renumber = np.zeros(nlabels, 'i')
    for i, v in enumerate(lsort):
        renumber[lines[v].label] = 0x010000 + (i + 1)
    segmentation = renumber[segmentation]

    lines = [lines[i] for i in lsort]
    cleaned = ocrolib.remove_noise(binary, self.parameter['noise'])

    for i, l in enumerate(lines):
        # derive a rectangular polygon from the line's bounding slices
        # (note: in ocrolib, l.bounds[0] is the vertical slice, so these
        # names are transposed with respect to image axes)
        min_x, max_x = (l.bounds[0].start, l.bounds[0].stop)
        min_y, max_y = (l.bounds[1].start, l.bounds[1].stop)
        line_polygon = [[min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y]]
        line_polygon = coordinates_for_segment(line_polygon, page_image, region_xywh)
        line_points = points_from_polygon(line_polygon)

        # extract, re-binarize and invert the line image
        img = cleaned[l.bounds[0], l.bounds[1]]
        img = np.array(255 * (img > ocrolib.midrange(img)), 'B')
        img = 255 - img
        img = ocrolib.array2pil(img)

        file_id = make_file_id(input_file, self.output_file_grp)
        file_path = self.workspace.save_image_file(
            img, file_id + "_" + str(n) + "_" + str(i),
            page_id=page_id, file_grp=self.output_file_grp)
        ai = AlternativeImageType(filename=file_path, comments=region_xywh['features'])
        line_id = '%s_line%04d' % (page_id, i)
        line = TextLineType(custom='readingOrder {index:' + str(i) + ';}',
                            id=line_id, Coords=CoordsType(line_points))
        line.add_AlternativeImage(ai)
        textregion.add_TextLine(line)
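# A minimal NumPy sketch of the label renumbering above: a lookup table maps
# each old line label to 0x010000 plus its reading-order rank, applied to the
# whole label image in one fancy-indexing step. Toy labels only:
import numpy as np

segmentation = np.array([[0, 3, 3],
                         [0, 7, 7]])
lsort = [7, 3]  # labels in reading order (top line first)
renumber = np.zeros(np.amax(segmentation) + 1, dtype='i')
for i, label in enumerate(lsort):
    renumber[label] = 0x010000 + (i + 1)
print(renumber[segmentation])  # background stays 0; 7 -> 0x010001, 3 -> 0x010002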